all must go
[rrdtool.git] / libraries / libpng-1.2.0 / pnggccrd.c
1 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
2  *
3  * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
4  *
5  *     See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6  *     and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7  *     for Intel's performance analysis of the MMX vs. non-MMX code.
8  *
9  * libpng version 1.2.0 - September 1, 2001
10  * For conditions of distribution and use, see copyright notice in png.h
11  * Copyright (c) 1998-2001 Glenn Randers-Pehrson
12  * Copyright (c) 1998, Intel Corporation
13  *
14  * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15  * Interface to libpng contributed by Gilles Vollant, 1999.
16  * GNU C port by Greg Roelofs, 1999-2001.
17  *
18  * Lines 2350-4300 converted in place with intel2gas 1.3.1:
19  *
20  *   intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
21  *
22  * and then cleaned up by hand.  See http://hermes.terminal.at/intel2gas/ .
23  *
24  * NOTE:  A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25  *        is required to assemble the newer MMX instructions such as movq.
26  *        For djgpp, see
27  *
28  *           ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
29  *
30  *        (or a later version in the same directory).  For Linux, check your
31  *        distribution's web site(s) or try these links:
32  *
33  *           http://rufus.w3.org/linux/RPM/binutils.html
34  *           http://www.debian.org/Packages/stable/devel/binutils.html
35  *           ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
36  *             binutils.tgz
37  *
38  *        For other platforms, see the main GNU site:
39  *
40  *           ftp://ftp.gnu.org/pub/gnu/binutils/
41  *
42  *        Version 2.5.2l.15 is definitely too old...
43  */
44
45 /*
46  * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47  * =====================================
48  *
49  * 19991006:
50  *  - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
51  *
52  * 19991007:
53  *  - additional optimizations (possible or definite):
54  *     x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55  *     - write MMX code for 48-bit case (pixel_bytes == 6)
56  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
57  *        why subtract 8 from width_mmx in the pass 4/5 case?
58  *        (only width_mmx case) (near line 1606)
59  *     x [DONE] replace pixel_bytes within each block with the true
60  *        constant value (or are compilers smart enough to do that?)
61  *     - rewrite all MMX interlacing code so it's aligned with
62  *        the *beginning* of the row buffer, not the end.  This
63  *        would not only allow one to eliminate half of the memory
64  *        writes for odd passes (that is, pass == odd), it may also
65  *        eliminate some unaligned-data-access exceptions (assuming
66  *        there's a penalty for not aligning 64-bit accesses on
67  *        64-bit boundaries).  The only catch is that the "leftover"
68  *        pixel(s) at the end of the row would have to be saved,
69  *        but there are enough unused MMX registers in every case,
70  *        so this is not a problem.  A further benefit is that the
71  *        post-MMX cleanup code (C code) in at least some of the
72  *        cases could be done within the assembler block.
73  *  x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74  *     inconsistent, and don't match the MMX Programmer's Reference
75  *     Manual conventions anyway.  They should be changed to
76  *     "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77  *     was lowest in memory (e.g., corresponding to a left pixel)
78  *     and b7 is the byte that was highest (e.g., a right pixel).
79  *
80  * 19991016:
81  *  - Brennan's Guide notwithstanding, gcc under Linux does *not*
82  *     want globals prefixed by underscores when referencing them--
83  *     i.e., if the variable is const4, then refer to it as const4,
84  *     not _const4.  This seems to be a djgpp-specific requirement.
85  *     Also, such variables apparently *must* be declared outside
86  *     of functions; neither static nor automatic variables work if
87  *     defined within the scope of a single function, but both
88  *     static and truly global (multi-module) variables work fine.
89  *
90  * 19991023:
91  *  - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92  *  - switched from string-concatenation-with-macros to cleaner method of
93  *     renaming global variables for djgpp--i.e., always use prefixes in
94  *     inlined assembler code (== strings) and conditionally rename the
95  *     variables, not the other way around.  Hence _const4, _mask8_0, etc.
96  *
97  * 19991024:
98  *  - fixed mmxsupport()/png_do_read_interlace() first-row bug
99  *     This one was severely weird:  even though mmxsupport() doesn't touch
100  *     ebx (where "row" pointer was stored), it nevertheless managed to zero
101  *     the register (even in static/non-fPIC code--see below), which in turn
102  *     caused png_do_read_interlace() to return prematurely on the first row of
103  *     interlaced images (i.e., without expanding the interlaced pixels).
104  *     Inspection of the generated assembly code didn't turn up any clues,
105  *     although it did point at a minor optimization (i.e., get rid of
106  *     mmx_supported_local variable and just use eax).  Possibly the CPUID
107  *     instruction is more destructive than it looks?  (Not yet checked.)
108  *  - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109  *     listings...  Apparently register spillage has to do with ebx, since
110  *     it's used to index the global offset table.  Commenting it out of the
111  *     input-reg lists in png_combine_row() eliminated compiler barfage, so
112  *     ifdef'd with __PIC__ macro:  if defined, use a global for unmask
113  *
114  * 19991107:
115  *  - verified CPUID clobberage:  12-char string constant ("GenuineIntel",
116  *     "AuthenticAMD", etc.) placed in ebx:ecx:edx.  Still need to polish.
117  *
118  * 19991120:
119  *  - made "diff" variable (now "_dif") global to simplify conversion of
120  *     filtering routines (running out of regs, sigh).  "diff" is still used
121  *     in interlacing routines, however.
122  *  - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123  *     macro determines which is used); original not yet tested.
124  *
125  * 20000213:
126  *  - when compiling with gcc, be sure to use  -fomit-frame-pointer
127  *
128  * 20000319:
129  *  - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130  *     pass == 4 or 5, that caused visible corruption of interlaced images
131  *
132  * 20000623:
133  *  - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134  *     many of the form "forbidden register 0 (ax) was spilled for class AREG."
135  *     This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136  *     Chuck Wilson supplied a patch involving dummy output registers.  See
137  *     http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138  *     for the original (anonymous) SourceForge bug report.
139  *
140  * 20000706:
141  *  - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142  *       pnggccrd.c: In function `png_combine_row':
143  *       pnggccrd.c:525: more than 10 operands in `asm'
144  *       pnggccrd.c:669: more than 10 operands in `asm'
145  *       pnggccrd.c:828: more than 10 operands in `asm'
146  *       pnggccrd.c:994: more than 10 operands in `asm'
147  *       pnggccrd.c:1177: more than 10 operands in `asm'
148  *     They are all the same problem and can be worked around by using the
149  *     global _unmask variable unconditionally, not just in the -fPIC case.
150  *     Reportedly earlier versions of gcc also have the problem with more than
151  *     10 operands; they just don't report it.  Much strangeness ensues, etc.
152  *
153  * 20000729:
154  *  - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155  *     MMX routine); began converting png_read_filter_row_mmx_sub()
156  *  - to finish remaining sections:
157  *     - clean up indentation and comments
158  *     - preload local variables
159  *     - add output and input regs (order of former determines numerical
160  *        mapping of latter)
161  *     - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162  *     - remove "$" from addressing of Shift and Mask variables [20000823]
163  *
164  * 20000731:
165  *  - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
166  *
167  * 20000822:
168  *  - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169  *     shared-library (-fPIC) version!  Code works just fine as part of static
170  *     library.  Damn damn damn damn damn, should have tested that sooner.
171  *     ebx is getting clobbered again (explicitly this time); need to save it
172  *     on stack or rewrite asm code to avoid using it altogether.  Blargh!
173  *
174  * 20000823:
175  *  - first section was trickiest; all remaining sections have ebx -> edx now.
176  *     (-fPIC works again.)  Also added missing underscores to various Shift*
177  *     and *Mask* globals and got rid of leading "$" signs.
178  *
179  * 20000826:
180  *  - added visual separators to help navigate microscopic printed copies
181  *     (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182  *     on png_read_filter_row_mmx_avg()
183  *
184  * 20000828:
185  *  - finished png_read_filter_row_mmx_avg():  only Paeth left! (930 lines...)
186  *     What the hell, did png_read_filter_row_mmx_paeth(), too.  Comments not
187  *     cleaned up/shortened in either routine, but functionality is complete
188  *     and seems to be working fine.
189  *
190  * 20000829:
191  *  - ahhh, figured out last(?) bit of gcc/gas asm-fu:  if register is listed
192  *     as an input reg (with dummy output variables, etc.), then it *cannot*
193  *     also appear in the clobber list or gcc 2.95.2 will barf.  The solution
194  *     is simple enough...
195  *
196  * 20000914:
197  *  - bug in png_read_filter_row_mmx_avg():  16-bit grayscale not handled
198  *     correctly (but 48-bit RGB just fine)
199  *
200  * 20000916:
201  *  - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202  *     - "_ShiftBpp.use = 24;"      should have been   "_ShiftBpp.use = 16;"
203  *     - "_ShiftRem.use = 40;"      should have been   "_ShiftRem.use = 48;"
204  *     - "psllq _ShiftRem, %%mm2"   should have been   "psrlq _ShiftRem, %%mm2"
205  *
206  * 20010101:
207  *  - added new png_init_mmx_flags() function (here only because it needs to
208  *     call mmxsupport(), which should probably become global png_mmxsupport());
209  *     modified other MMX routines to run conditionally (png_ptr->asm_flags)
210  *
211  * 20010103:
212  *  - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
213  *     and made it public; moved png_init_mmx_flags() to png.c as internal func
214  *
215  * 20010104:
216  *  - removed dependency on png_read_filter_row_c() (C code already duplicated
217  *     within MMX version of png_read_filter_row()) so no longer necessary to
218  *     compile it into pngrutil.o
219  *
220  * 20010310:
221  *  - fixed buffer-overrun bug in png_combine_row() C code (non-MMX)
222  *
223  * STILL TO DO:
224  *     - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
225  *     - write MMX code for 48-bit case (pixel_bytes == 6)
226  *     - figure out what's up with 24-bit case (pixel_bytes == 3):
227  *        why subtract 8 from width_mmx in the pass 4/5 case?
228  *        (only width_mmx case) (near line 1606)
229  *     - rewrite all MMX interlacing code so it's aligned with beginning
230  *        of the row buffer, not the end (see 19991007 for details)
231  *     x pick one version of mmxsupport() and get rid of the other
232  *     - add error messages to any remaining bogus default cases
233  *     - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
234  *     x add support for runtime enable/disable/query of various MMX routines
235  */
236
237 #define PNG_INTERNAL
238 #include "png.h"
239
240 #if defined(PNG_USE_PNGGCCRD)
241
242 int PNGAPI png_mmx_support(void);
243
244 #ifdef PNG_USE_LOCAL_ARRAYS
/* File-local interlace tables, seven entries each, indexed by the pass
 * number (png_ptr->pass).  The C fallback of the 8-bit case in
 * png_combine_row() uses them as follows:
 *   png_pass_start[pass] - offset of the first group combined in a row,
 *   png_pass_inc[pass]   - stride from one group to the next,
 *   png_pass_width[pass] - bytes copied per group (rep_bytes).
 * Comments in png_combine_row() attribute tables with the same names and
 * values to png.c; these copies exist only when PNG_USE_LOCAL_ARRAYS is
 * defined. */
245 static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
246 static const int FARDATA png_pass_inc[7]   = {8, 8, 4, 4, 2, 2, 1};
247 static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
248 #endif
249
250 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
251 /* djgpp, Win32, and Cygwin add their own underscores to global variables,
252  * so define them without: */
253 #if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
/* The inline-assembly strings in this file always spell these globals with
 * a leading underscore (e.g. "movq _mask8_0, %%mm0" in png_combine_row()).
 * On the targets tested above the toolchain adds that underscore itself,
 * so the C-level identifiers are mapped to the un-prefixed spelling here;
 * on all other targets the C names keep the underscore unchanged.
 * NOTE(review): _LBCarryMask, _HBClearMask, _ActiveMask*, _ShiftBpp and
 * _ShiftRem are not defined in the part of the file reviewed here;
 * presumably they are defined further down -- confirm. */
254 #  define _mmx_supported  mmx_supported
255 #  define _const4         const4
256 #  define _const6         const6
257 #  define _mask8_0        mask8_0
258 #  define _mask16_1       mask16_1
259 #  define _mask16_0       mask16_0
260 #  define _mask24_2       mask24_2
261 #  define _mask24_1       mask24_1
262 #  define _mask24_0       mask24_0
263 #  define _mask32_3       mask32_3
264 #  define _mask32_2       mask32_2
265 #  define _mask32_1       mask32_1
266 #  define _mask32_0       mask32_0
267 #  define _mask48_5       mask48_5
268 #  define _mask48_4       mask48_4
269 #  define _mask48_3       mask48_3
270 #  define _mask48_2       mask48_2
271 #  define _mask48_1       mask48_1
272 #  define _mask48_0       mask48_0
273 #  define _LBCarryMask    LBCarryMask
274 #  define _HBClearMask    HBClearMask
275 #  define _ActiveMask     ActiveMask
276 #  define _ActiveMask2    ActiveMask2
277 #  define _ActiveMaskEnd  ActiveMaskEnd
278 #  define _ShiftBpp       ShiftBpp
279 #  define _ShiftRem       ShiftRem
/* The variables renamed below are themselves defined only when
 * PNG_THREAD_UNSAFE_OK is set, so their renames carry the same guard. */
280 #ifdef PNG_THREAD_UNSAFE_OK
281 #  define _unmask         unmask
282 #  define _FullLength     FullLength
283 #  define _MMXLength      MMXLength
284 #  define _dif            dif
285 #  define _patemp         patemp
286 #  define _pbtemp         pbtemp
287 #  define _pctemp         pctemp
288 #endif
289 #endif
290
291
292 /* These constants are used in the inlined MMX assembly code.
293    Ignore gcc's "At top level: defined but not used" warnings. */
294
295 /* GRR 20000706:  originally _unmask was needed only when compiling with -fPIC,
296  *  since that case uses the %ebx register for indexing the Global Offset Table
297  *  and there were no other registers available.  But gcc 2.95 and later emit
298  *  "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
299  *  in the non-PIC case, so we'll just use the global unconditionally now.
300  */
301 #ifdef PNG_THREAD_UNSAFE_OK
/* Written by png_combine_row() with ~mask immediately before each MMX asm
 * block, which then loads it by name ("movd _unmask, %%mm7").  Being a
 * single file-scope variable it is shared by every caller, hence the
 * PNG_THREAD_UNSAFE_OK guard. */
302 static int _unmask;
303 #endif
304
/* Bit-selection constants for the MMX paths of png_combine_row().
 *
 * Naming: _mask<depth>_<k>, where <depth> is the pixel depth in bits and
 * <k> numbers the 8-byte chunks that together cover one group of eight
 * pixels (k = 0 is the chunk lowest in memory).
 *
 * Layout: each byte holds the single interlace-mask bit that governs the
 * pixel occupying that byte position.  The least-significant byte of
 * _mask<depth>_0 is 0x80 (first pixel of the group) and the
 * most-significant byte of the highest-numbered chunk is 0x01 (eighth
 * pixel); a pixel that is N bytes wide repeats its bit N times.
 *
 * Use, as visible in the 8- and 16-bit cases: the constant is ANDed with
 * eight copies of the low byte of _unmask and compared against zero
 * (pcmpeqb), giving a per-byte 0xFF/0x00 select mask between the new row
 * and the existing row.  The 24/32/48-bit users are outside the part of
 * the file reviewed here; presumably they follow the same scheme. */
305 static unsigned long long _mask8_0  = 0x0102040810204080LL;
306
307 static unsigned long long _mask16_1 = 0x0101020204040808LL;
308 static unsigned long long _mask16_0 = 0x1010202040408080LL;
309
310 static unsigned long long _mask24_2 = 0x0101010202020404LL;
311 static unsigned long long _mask24_1 = 0x0408080810101020LL;
312 static unsigned long long _mask24_0 = 0x2020404040808080LL;
313
314 static unsigned long long _mask32_3 = 0x0101010102020202LL;
315 static unsigned long long _mask32_2 = 0x0404040408080808LL;
316 static unsigned long long _mask32_1 = 0x1010101020202020LL;
317 static unsigned long long _mask32_0 = 0x4040404080808080LL;
318
319 static unsigned long long _mask48_5 = 0x0101010101010202LL;
320 static unsigned long long _mask48_4 = 0x0202020204040404LL;
321 static unsigned long long _mask48_3 = 0x0404080808080808LL;
322 static unsigned long long _mask48_2 = 0x1010101010102020LL;
323 static unsigned long long _mask48_1 = 0x2020202040404040LL;
324 static unsigned long long _mask48_0 = 0x4040808080808080LL;
325
/* _const4 keeps the low three bytes of a quadword, _const6 the low byte.
 * NOTE(review): their users are not in the part of the file reviewed
 * here -- confirm which routines reference them before changing. */
326 static unsigned long long _const4   = 0x0000000000FFFFFFLL;
327 //static unsigned long long _const5 = 0x000000FFFFFF0000LL;     // NOT USED
328 static unsigned long long _const6   = 0x00000000000000FFLL;
329
330 // These are used in the row-filter routines and should/would be local
331 //  variables if not for gcc addressing limitations.
332 // WARNING: Their presence probably defeats the thread safety of libpng.
333
334 #ifdef PNG_THREAD_UNSAFE_OK
/* File-scope scratch variables, compiled only when the builder has opted
 * in via PNG_THREAD_UNSAFE_OK.  There is one instance of each for the
 * whole process.  In the part of the file reviewed here the only C-level
 * references are the self-assignments in png_squelch_warnings(), which
 * cover all of them except _FullLength. */
335 static png_uint_32  _FullLength;
336 static png_uint_32  _MMXLength;
337 static int          _dif;
338 static int          _patemp;    // temp variables for Paeth routine
339 static int          _pbtemp;
340 static int          _pctemp;
341 #endif
342
/* png_squelch_warnings - give every asm-only global a C-level reference.
 *
 * Each statement is a no-op self-assignment (x = x) of one of the
 * file-scope statics defined above.  Several of those statics are named
 * only inside inline-assembly strings, which is what leads to the
 * "defined but not used" warnings mentioned in the comment above their
 * definitions; referencing them here is apparently meant to quiet those
 * warnings.
 *
 * Takes no arguments and returns nothing.  No call to this function
 * appears in the part of the file reviewed here.
 *
 * Not covered by this list: _unmask (assigned directly in
 * png_combine_row()) and _FullLength.
 */
343 void /* PRIVATE */
344 png_squelch_warnings(void)
345 {
/* These statics exist only in PNG_THREAD_UNSAFE_OK builds. */
346 #ifdef PNG_THREAD_UNSAFE_OK
347    _dif = _dif;
348    _patemp = _patemp;
349    _pbtemp = _pbtemp;
350    _pctemp = _pctemp;
351    _MMXLength = _MMXLength;
352 #endif
/* Constants and combine-row bit masks, defined unconditionally within the
 * enclosing PNG_ASSEMBLER_CODE_SUPPORTED section. */
353    _const4  = _const4;
354    _const6  = _const6;
355    _mask8_0  = _mask8_0;
356    _mask16_1 = _mask16_1;
357    _mask16_0 = _mask16_0;
358    _mask24_2 = _mask24_2;
359    _mask24_1 = _mask24_1;
360    _mask24_0 = _mask24_0;
361    _mask32_3 = _mask32_3;
362    _mask32_2 = _mask32_2;
363    _mask32_1 = _mask32_1;
364    _mask32_0 = _mask32_0;
365    _mask48_5 = _mask48_5;
366    _mask48_4 = _mask48_4;
367    _mask48_3 = _mask48_3;
368    _mask48_2 = _mask48_2;
369    _mask48_1 = _mask48_1;
370    _mask48_0 = _mask48_0;
371 }
372 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
373
374
/* MMX capability state.  The initial value 2 is a "not yet determined"
 * sentinel: png_combine_row() checks for exactly 2, warns that asm_flags
 * may not have been initialized, and calls png_mmx_support().
 * NOTE(review): the changelog says png_mmx_support() sets this variable
 * automatically; its definition is not in the part of the file reviewed
 * here, so the other values it can take are unconfirmed.
 * This definition sits outside the PNG_ASSEMBLER_CODE_SUPPORTED section,
 * whereas the djgpp/Win32/Cygwin rename of the symbol sits inside it. */
375 static int _mmx_supported = 2;
376
377 /*===========================================================================*/
378 /*                                                                           */
379 /*                       P N G _ C O M B I N E _ R O W                       */
380 /*                                                                           */
381 /*===========================================================================*/
382
383 #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
384
385 #define BPP2  2
386 #define BPP3  3         /* bytes per pixel (a.k.a. pixel_bytes) */
387 #define BPP4  4
388 #define BPP6  6         /* (defined only to help avoid cut-and-paste errors) */
389 #define BPP8  8
390
391 /* Combines the row recently read in with the previous row.
392    This routine takes care of alpha and transparency if requested.
393    This routine also handles the two methods of progressive display
394    of interlaced images, depending on the mask value.
395    The mask value describes which pixels are to be combined with
396    the row.  The pattern always repeats every 8 pixels, so just 8
397    bits are needed.  A one indicates the pixel is to be combined; a
398    zero indicates the pixel is to be skipped.  This is in addition
399    to any alpha or transparency value associated with the pixel.
400    If you want all pixels to be combined, pass 0xff (255) in mask. */
401
402 /* Use this routine for the x86 platform - it uses a faster MMX routine
403    if the machine supports MMX. */
404
405 void /* PRIVATE */
406 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
407 {
408    png_debug(1, "in png_combine_row (pnggccrd.c)\n");
409
410 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
411    if (_mmx_supported == 2) {
412        /* this should have happened in png_init_mmx_flags() already */
413        png_warning(png_ptr, "asm_flags may not have been initialized");
414        png_mmx_support();
415    }
416 #endif
417
418    if (mask == 0xff)
419    {
420       png_debug(2,"mask == 0xff:  doing single png_memcpy()\n");
421       png_memcpy(row, png_ptr->row_buf + 1,
422        (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
423    }
424    else   /* (png_combine_row() is never called with mask == 0) */
425    {
426       switch (png_ptr->row_info.pixel_depth)
427       {
428          case 1:        /* png_ptr->row_info.pixel_depth */
429          {
430             png_bytep sp;
431             png_bytep dp;
432             int s_inc, s_start, s_end;
433             int m;
434             int shift;
435             png_uint_32 i;
436
437             sp = png_ptr->row_buf + 1;
438             dp = row;
439             m = 0x80;
440 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
441             if (png_ptr->transformations & PNG_PACKSWAP)
442             {
443                 s_start = 0;
444                 s_end = 7;
445                 s_inc = 1;
446             }
447             else
448 #endif
449             {
450                 s_start = 7;
451                 s_end = 0;
452                 s_inc = -1;
453             }
454
455             shift = s_start;
456
457             for (i = 0; i < png_ptr->width; i++)
458             {
459                if (m & mask)
460                {
461                   int value;
462
463                   value = (*sp >> shift) & 0x1;
464                   *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
465                   *dp |= (png_byte)(value << shift);
466                }
467
468                if (shift == s_end)
469                {
470                   shift = s_start;
471                   sp++;
472                   dp++;
473                }
474                else
475                   shift += s_inc;
476
477                if (m == 1)
478                   m = 0x80;
479                else
480                   m >>= 1;
481             }
482             break;
483          }
484
485          case 2:        /* png_ptr->row_info.pixel_depth */
486          {
487             png_bytep sp;
488             png_bytep dp;
489             int s_start, s_end, s_inc;
490             int m;
491             int shift;
492             png_uint_32 i;
493             int value;
494
495             sp = png_ptr->row_buf + 1;
496             dp = row;
497             m = 0x80;
498 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
499             if (png_ptr->transformations & PNG_PACKSWAP)
500             {
501                s_start = 0;
502                s_end = 6;
503                s_inc = 2;
504             }
505             else
506 #endif
507             {
508                s_start = 6;
509                s_end = 0;
510                s_inc = -2;
511             }
512
513             shift = s_start;
514
515             for (i = 0; i < png_ptr->width; i++)
516             {
517                if (m & mask)
518                {
519                   value = (*sp >> shift) & 0x3;
520                   *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
521                   *dp |= (png_byte)(value << shift);
522                }
523
524                if (shift == s_end)
525                {
526                   shift = s_start;
527                   sp++;
528                   dp++;
529                }
530                else
531                   shift += s_inc;
532                if (m == 1)
533                   m = 0x80;
534                else
535                   m >>= 1;
536             }
537             break;
538          }
539
540          case 4:        /* png_ptr->row_info.pixel_depth */
541          {
542             png_bytep sp;
543             png_bytep dp;
544             int s_start, s_end, s_inc;
545             int m;
546             int shift;
547             png_uint_32 i;
548             int value;
549
550             sp = png_ptr->row_buf + 1;
551             dp = row;
552             m = 0x80;
553 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
554             if (png_ptr->transformations & PNG_PACKSWAP)
555             {
556                s_start = 0;
557                s_end = 4;
558                s_inc = 4;
559             }
560             else
561 #endif
562             {
563                s_start = 4;
564                s_end = 0;
565                s_inc = -4;
566             }
567             shift = s_start;
568
569             for (i = 0; i < png_ptr->width; i++)
570             {
571                if (m & mask)
572                {
573                   value = (*sp >> shift) & 0xf;
574                   *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
575                   *dp |= (png_byte)(value << shift);
576                }
577
578                if (shift == s_end)
579                {
580                   shift = s_start;
581                   sp++;
582                   dp++;
583                }
584                else
585                   shift += s_inc;
586                if (m == 1)
587                   m = 0x80;
588                else
589                   m >>= 1;
590             }
591             break;
592          }
593
594          case 8:        /* png_ptr->row_info.pixel_depth */
595          {
596             png_bytep srcptr;
597             png_bytep dstptr;
598
599 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
600             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
601                 /* && _mmx_supported */ )
602             {
603                png_uint_32 len;
604                int diff;
605                int dummy_value_a;   // fix 'forbidden register spilled' error
606                int dummy_value_d;
607                int dummy_value_c;
608                int dummy_value_S;
609                int dummy_value_D;
610                _unmask = ~mask;            // global variable for -fPIC version
611                srcptr = png_ptr->row_buf + 1;
612                dstptr = row;
613                len  = png_ptr->width &~7;  // reduce to multiple of 8
614                diff = (int) (png_ptr->width & 7);  // amount lost
615
616                __asm__ __volatile__ (
617                   "movd      _unmask, %%mm7  \n\t" // load bit pattern
618                   "psubb     %%mm6, %%mm6    \n\t" // zero mm6
619                   "punpcklbw %%mm7, %%mm7    \n\t"
620                   "punpcklwd %%mm7, %%mm7    \n\t"
621                   "punpckldq %%mm7, %%mm7    \n\t" // fill reg with 8 masks
622
623                   "movq      _mask8_0, %%mm0 \n\t"
624                   "pand      %%mm7, %%mm0    \n\t" // nonzero if keep byte
625                   "pcmpeqb   %%mm6, %%mm0    \n\t" // zeros->1s, v versa
626
627 // preload        "movl      len, %%ecx      \n\t" // load length of line
628 // preload        "movl      srcptr, %%esi   \n\t" // load source
629 // preload        "movl      dstptr, %%edi   \n\t" // load dest
630
631                   "cmpl      $0, %%ecx       \n\t" // len == 0 ?
632                   "je        mainloop8end    \n\t"
633
634                 "mainloop8:                  \n\t"
635                   "movq      (%%esi), %%mm4  \n\t" // *srcptr
636                   "pand      %%mm0, %%mm4    \n\t"
637                   "movq      %%mm0, %%mm6    \n\t"
638                   "pandn     (%%edi), %%mm6  \n\t" // *dstptr
639                   "por       %%mm6, %%mm4    \n\t"
640                   "movq      %%mm4, (%%edi)  \n\t"
641                   "addl      $8, %%esi       \n\t" // inc by 8 bytes processed
642                   "addl      $8, %%edi       \n\t"
643                   "subl      $8, %%ecx       \n\t" // dec by 8 pixels processed
644                   "ja        mainloop8       \n\t"
645
646                 "mainloop8end:               \n\t"
647 // preload        "movl      diff, %%ecx     \n\t" // (diff is in eax)
648                   "movl      %%eax, %%ecx    \n\t"
649                   "cmpl      $0, %%ecx       \n\t"
650                   "jz        end8            \n\t"
651 // preload        "movl      mask, %%edx     \n\t"
652                   "sall      $24, %%edx      \n\t" // make low byte, high byte
653
654                 "secondloop8:                \n\t"
655                   "sall      %%edx           \n\t" // move high bit to CF
656                   "jnc       skip8           \n\t" // if CF = 0
657                   "movb      (%%esi), %%al   \n\t"
658                   "movb      %%al, (%%edi)   \n\t"
659
660                 "skip8:                      \n\t"
661                   "incl      %%esi           \n\t"
662                   "incl      %%edi           \n\t"
663                   "decl      %%ecx           \n\t"
664                   "jnz       secondloop8     \n\t"
665
666                 "end8:                       \n\t"
667                   "EMMS                      \n\t"  // DONE
668
669                   : "=a" (dummy_value_a),           // output regs (dummy)
670                     "=d" (dummy_value_d),
671                     "=c" (dummy_value_c),
672                     "=S" (dummy_value_S),
673                     "=D" (dummy_value_D)
674
675                   : "3" (srcptr),      // esi       // input regs
676                     "4" (dstptr),      // edi
677                     "0" (diff),        // eax
678 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
679                     "2" (len),         // ecx
680                     "1" (mask)         // edx
681
682 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
683                   : "%mm0", "%mm4", "%mm6", "%mm7"  // clobber list
684 #endif
685                );
686             }
687             else /* mmx _not supported - Use modified C routine */
688 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
689             {
690                register png_uint_32 i;
691                png_uint_32 initial_val = png_pass_start[png_ptr->pass];
692                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
693                register int stride = png_pass_inc[png_ptr->pass];
694                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
695                register int rep_bytes = png_pass_width[png_ptr->pass];
696                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
697                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
698                int diff = (int) (png_ptr->width & 7); /* amount lost */
699                register png_uint_32 final_val = len;  /* GRR bugfix */
700
701                srcptr = png_ptr->row_buf + 1 + initial_val;
702                dstptr = row + initial_val;
703
704                for (i = initial_val; i < final_val; i += stride)
705                {
706                   png_memcpy(dstptr, srcptr, rep_bytes);
707                   srcptr += stride;
708                   dstptr += stride;
709                }
710                if (diff)  /* number of leftover pixels:  3 for pngtest */
711                {
712                   final_val+=diff /* *BPP1 */ ;
713                   for (; i < final_val; i += stride)
714                   {
715                      if (rep_bytes > (int)(final_val-i))
716                         rep_bytes = (int)(final_val-i);
717                      png_memcpy(dstptr, srcptr, rep_bytes);
718                      srcptr += stride;
719                      dstptr += stride;
720                   }
721                }
722
723             } /* end of else (_mmx_supported) */
724
725             break;
726          }       /* end 8 bpp */
727
728          case 16:       /* png_ptr->row_info.pixel_depth */
729          {
                /* 16-bit pixels:  merge bytes of the row held in
                 * png_ptr->row_buf + 1 into row.  The MMX branch picks the
                 * pixels to copy from the bits of mask; the C branch copies
                 * runs of bytes positioned with the png_pass_* tables.
                 */
730             png_bytep srcptr;
731             png_bytep dstptr;
732
733 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
734             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
735                 /* && _mmx_supported */ )
736             {
737                png_uint_32 len;
738                int diff;
739                int dummy_value_a;   // fix 'forbidden register spilled' error
740                int dummy_value_d;
741                int dummy_value_c;
742                int dummy_value_S;
743                int dummy_value_D;
744                _unmask = ~mask;            // global variable for -fPIC version
745                srcptr = png_ptr->row_buf + 1;
746                dstptr = row;
747                len  = png_ptr->width &~7;  // reduce to multiple of 8
748                diff = (int) (png_ptr->width & 7); // amount lost
749
750                __asm__ __volatile__ (
                      // Setup:  the low byte of _unmask is replicated into
                      // all 8 bytes of mm7, ANDed with the _mask16_*
                      // constants (defined outside this block) and compared
                      // with zero, so every byte of mm0/mm1 ends up as 0xff
                      // or 0x00.
751                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
752                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
753                   "punpcklbw %%mm7, %%mm7     \n\t"
754                   "punpcklwd %%mm7, %%mm7     \n\t"
755                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
756
757                   "movq      _mask16_0, %%mm0 \n\t"
758                   "movq      _mask16_1, %%mm1 \n\t"
759
760                   "pand      %%mm7, %%mm0     \n\t"
761                   "pand      %%mm7, %%mm1     \n\t"
762
763                   "pcmpeqb   %%mm6, %%mm0     \n\t"
764                   "pcmpeqb   %%mm6, %%mm1     \n\t"
765
766 // preload        "movl      len, %%ecx       \n\t" // load length of line
767 // preload        "movl      srcptr, %%esi    \n\t" // load source
768 // preload        "movl      dstptr, %%edi    \n\t" // load dest
769
770                   "cmpl      $0, %%ecx        \n\t"
771                   "jz        mainloop16end    \n\t"
772
                      // Main loop:  8 pixels (16 bytes) per iteration.  Each
                      // 8-byte half is rewritten as
                      //    dst = (src & select) | (dst & ~select)
                      // with mm0 / mm1 as the byte-select masks.
773                 "mainloop16:                  \n\t"
774                   "movq      (%%esi), %%mm4   \n\t"
775                   "pand      %%mm0, %%mm4     \n\t"
776                   "movq      %%mm0, %%mm6     \n\t"
777                   "movq      (%%edi), %%mm7   \n\t"
778                   "pandn     %%mm7, %%mm6     \n\t"
779                   "por       %%mm6, %%mm4     \n\t"
780                   "movq      %%mm4, (%%edi)   \n\t"
781
782                   "movq      8(%%esi), %%mm5  \n\t"
783                   "pand      %%mm1, %%mm5     \n\t"
784                   "movq      %%mm1, %%mm7     \n\t"
785                   "movq      8(%%edi), %%mm6  \n\t"
786                   "pandn     %%mm6, %%mm7     \n\t"
787                   "por       %%mm7, %%mm5     \n\t"
788                   "movq      %%mm5, 8(%%edi)  \n\t"
789
790                   "addl      $16, %%esi       \n\t" // inc by 16 bytes processed
791                   "addl      $16, %%edi       \n\t"
792                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
793                   "ja        mainloop16       \n\t"
794
                      // Leftover pixels (diff = width & 7):  mask is moved to
                      // the top byte of edx, then one bit per pixel is
                      // shifted into the carry flag; a set bit copies the
                      // 2-byte pixel, a clear bit leaves the destination
                      // untouched.
795                 "mainloop16end:               \n\t"
796 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
797                   "movl      %%eax, %%ecx     \n\t"
798                   "cmpl      $0, %%ecx        \n\t"
799                   "jz        end16            \n\t"
800 // preload        "movl      mask, %%edx      \n\t"
801                   "sall      $24, %%edx       \n\t" // move low byte to high byte
802
803                 "secondloop16:                \n\t"
804                   "sall      %%edx            \n\t" // move high bit to CF
805                   "jnc       skip16           \n\t" // if CF = 0
806                   "movw      (%%esi), %%ax    \n\t"
807                   "movw      %%ax, (%%edi)    \n\t"
808
809                 "skip16:                      \n\t"
810                   "addl      $2, %%esi        \n\t"
811                   "addl      $2, %%edi        \n\t"
812                   "decl      %%ecx            \n\t"
813                   "jnz       secondloop16     \n\t"
814
815                 "end16:                       \n\t"
816                   "EMMS                       \n\t" // DONE
817
                      // All five registers are listed as (dummy) outputs
                      // because the code above modifies them; the inputs
                      // below are tied to them by operand number.
818                   : "=a" (dummy_value_a),           // output regs (dummy)
819                     "=c" (dummy_value_c),
820                     "=d" (dummy_value_d),
821                     "=S" (dummy_value_S),
822                     "=D" (dummy_value_D)
823
824                   : "0" (diff),        // eax       // input regs
825 // was (unmask)     " "    RESERVED    // ebx       // Global Offset Table idx
826                     "1" (len),         // ecx
827                     "2" (mask),        // edx
828                     "3" (srcptr),      // esi
829                     "4" (dstptr)       // edi
830
                      // NOTE(review): no clobber list is active for this
                      // statement although it stores through %edi and changes
                      // the flags and MMX registers -- confirm whether
                      // "memory" / "cc" clobbers are required.
831 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
832                   : "%mm0", "%mm1", "%mm4"          // clobber list
833                   , "%mm5", "%mm6", "%mm7"
834 #endif
835                );
836             }
837             else /* mmx _not supported - Use modified C routine */
838 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_THREAD_UNSAFE_OK */
839             {
                   /* All offsets below are byte counts:  pixel counts taken
                    * from the png_pass_* tables, scaled by BPP2 (defined
                    * outside this block; presumably 2 bytes per pixel).
                    */
840                register png_uint_32 i;
841                png_uint_32 initial_val = BPP2 * png_pass_start[png_ptr->pass];
842                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
843                register int stride = BPP2 * png_pass_inc[png_ptr->pass];
844                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
845                register int rep_bytes = BPP2 * png_pass_width[png_ptr->pass];
846                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
847                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
848                int diff = (int) (png_ptr->width & 7); /* amount lost */
849                register png_uint_32 final_val = BPP2 * len;   /* GRR bugfix */
850
851                srcptr = png_ptr->row_buf + 1 + initial_val;
852                dstptr = row + initial_val;
853
                   /* copy rep_bytes bytes every stride bytes across the part
                    * of the row made of complete groups of 8 pixels */
854                for (i = initial_val; i < final_val; i += stride)
855                {
856                   png_memcpy(dstptr, srcptr, rep_bytes);
857                   srcptr += stride;
858                   dstptr += stride;
859                }
860                if (diff)  /* number of leftover pixels:  3 for pngtest */
861                {
                      /* extend the limit to the full row width and shorten
                       * the copy so it never runs past that limit */
862                   final_val+=diff*BPP2;
863                   for (; i < final_val; i += stride)
864                   {
865                      if (rep_bytes > (int)(final_val-i))
866                         rep_bytes = (int)(final_val-i);
867                      png_memcpy(dstptr, srcptr, rep_bytes);
868                      srcptr += stride;
869                      dstptr += stride;
870                   }
871                }
872             } /* end of else (_mmx_supported) */
873
874             break;
875          }       /* end 16 bpp */
876
877          case 24:       /* png_ptr->row_info.pixel_depth */
878          {
                /* 24-bit pixels:  merge bytes of the row held in
                 * png_ptr->row_buf + 1 into row.  The MMX branch picks the
                 * pixels to copy from the bits of mask; the C branch copies
                 * runs of bytes positioned with the png_pass_* tables.
                 */
879             png_bytep srcptr;
880             png_bytep dstptr;
881
882 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
883             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
884                 /* && _mmx_supported */ )
885             {
886                png_uint_32 len;
887                int diff;
888                int dummy_value_a;   // fix 'forbidden register spilled' error
889                int dummy_value_d;
890                int dummy_value_c;
891                int dummy_value_S;
892                int dummy_value_D;
893                _unmask = ~mask;            // global variable for -fPIC version
894                srcptr = png_ptr->row_buf + 1;
895                dstptr = row;
896                len  = png_ptr->width &~7;  // reduce to multiple of 8
897                diff = (int) (png_ptr->width & 7); // amount lost
898
899                __asm__ __volatile__ (
                      // Setup:  the low byte of _unmask is replicated into
                      // all 8 bytes of mm7, ANDed with the _mask24_*
                      // constants (defined outside this block) and compared
                      // with zero, so every byte of mm0-mm2 ends up as 0xff
                      // or 0x00.
900                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
901                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
902                   "punpcklbw %%mm7, %%mm7     \n\t"
903                   "punpcklwd %%mm7, %%mm7     \n\t"
904                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
905
906                   "movq      _mask24_0, %%mm0 \n\t"
907                   "movq      _mask24_1, %%mm1 \n\t"
908                   "movq      _mask24_2, %%mm2 \n\t"
909
910                   "pand      %%mm7, %%mm0     \n\t"
911                   "pand      %%mm7, %%mm1     \n\t"
912                   "pand      %%mm7, %%mm2     \n\t"
913
914                   "pcmpeqb   %%mm6, %%mm0     \n\t"
915                   "pcmpeqb   %%mm6, %%mm1     \n\t"
916                   "pcmpeqb   %%mm6, %%mm2     \n\t"
917
918 // preload        "movl      len, %%ecx       \n\t" // load length of line
919 // preload        "movl      srcptr, %%esi    \n\t" // load source
920 // preload        "movl      dstptr, %%edi    \n\t" // load dest
921
922                   "cmpl      $0, %%ecx        \n\t"
923                   "jz        mainloop24end    \n\t"
924
                      // Main loop:  8 pixels (24 bytes) per iteration.  Each
                      // 8-byte group is rewritten as
                      //    dst = (src & select) | (dst & ~select)
                      // with mm0 / mm1 / mm2 as the byte-select masks.
925                 "mainloop24:                  \n\t"
926                   "movq      (%%esi), %%mm4   \n\t"
927                   "pand      %%mm0, %%mm4     \n\t"
928                   "movq      %%mm0, %%mm6     \n\t"
929                   "movq      (%%edi), %%mm7   \n\t"
930                   "pandn     %%mm7, %%mm6     \n\t"
931                   "por       %%mm6, %%mm4     \n\t"
932                   "movq      %%mm4, (%%edi)   \n\t"
933
934                   "movq      8(%%esi), %%mm5  \n\t"
935                   "pand      %%mm1, %%mm5     \n\t"
936                   "movq      %%mm1, %%mm7     \n\t"
937                   "movq      8(%%edi), %%mm6  \n\t"
938                   "pandn     %%mm6, %%mm7     \n\t"
939                   "por       %%mm7, %%mm5     \n\t"
940                   "movq      %%mm5, 8(%%edi)  \n\t"
941
942                   "movq      16(%%esi), %%mm6 \n\t"
943                   "pand      %%mm2, %%mm6     \n\t"
944                   "movq      %%mm2, %%mm4     \n\t"
945                   "movq      16(%%edi), %%mm7 \n\t"
946                   "pandn     %%mm7, %%mm4     \n\t"
947                   "por       %%mm4, %%mm6     \n\t"
948                   "movq      %%mm6, 16(%%edi) \n\t"
949
950                   "addl      $24, %%esi       \n\t" // inc by 24 bytes processed
951                   "addl      $24, %%edi       \n\t"
952                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
953
954                   "ja        mainloop24       \n\t"
955
                      // Leftover pixels (diff = width & 7):  mask is moved to
                      // the top byte of edx, then one bit per pixel is
                      // shifted into the carry flag; a set bit copies the
                      // 3-byte pixel (a 16-bit move plus an 8-bit move), a
                      // clear bit leaves the destination untouched.
956                 "mainloop24end:               \n\t"
957 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
958                   "movl      %%eax, %%ecx     \n\t"
959                   "cmpl      $0, %%ecx        \n\t"
960                   "jz        end24            \n\t"
961 // preload        "movl      mask, %%edx      \n\t"
962                   "sall      $24, %%edx       \n\t" // move low byte to high byte
963
964                 "secondloop24:                \n\t"
965                   "sall      %%edx            \n\t" // move high bit to CF
966                   "jnc       skip24           \n\t" // if CF = 0
967                   "movw      (%%esi), %%ax    \n\t"
968                   "movw      %%ax, (%%edi)    \n\t"
969                   "xorl      %%eax, %%eax     \n\t" // only al is used below
970                   "movb      2(%%esi), %%al   \n\t"
971                   "movb      %%al, 2(%%edi)   \n\t"
972
973                 "skip24:                      \n\t"
974                   "addl      $3, %%esi        \n\t"
975                   "addl      $3, %%edi        \n\t"
976                   "decl      %%ecx            \n\t"
977                   "jnz       secondloop24     \n\t"
978
979                 "end24:                       \n\t"
980                   "EMMS                       \n\t" // DONE
981
                      // All five registers are listed as (dummy) outputs
                      // because the code above modifies them; the inputs
                      // below are tied to them by operand number.
982                   : "=a" (dummy_value_a),           // output regs (dummy)
983                     "=d" (dummy_value_d),
984                     "=c" (dummy_value_c),
985                     "=S" (dummy_value_S),
986                     "=D" (dummy_value_D)
987
988                   : "3" (srcptr),      // esi       // input regs
989                     "4" (dstptr),      // edi
990                     "0" (diff),        // eax
991 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
992                     "2" (len),         // ecx
993                     "1" (mask)         // edx
994
                      // NOTE(review): no clobber list is active for this
                      // statement although it stores through %edi and changes
                      // the flags and MMX registers -- confirm whether
                      // "memory" / "cc" clobbers are required.
995 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
996                   : "%mm0", "%mm1", "%mm2"          // clobber list
997                   , "%mm4", "%mm5", "%mm6", "%mm7"
998 #endif
999                );
1000             }
1001             else /* mmx _not supported - Use modified C routine */
1002 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_THREAD_UNSAFE_OK */
1003             {
                    /* All offsets below are byte counts:  pixel counts taken
                     * from the png_pass_* tables, scaled by BPP3 (defined
                     * outside this block; presumably 3 bytes per pixel).
                     */
1004                register png_uint_32 i;
1005                png_uint_32 initial_val = BPP3 * png_pass_start[png_ptr->pass];
1006                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1007                register int stride = BPP3 * png_pass_inc[png_ptr->pass];
1008                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1009                register int rep_bytes = BPP3 * png_pass_width[png_ptr->pass];
1010                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1011                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1012                int diff = (int) (png_ptr->width & 7); /* amount lost */
1013                register png_uint_32 final_val = BPP3 * len;   /* GRR bugfix */
1014
1015                srcptr = png_ptr->row_buf + 1 + initial_val;
1016                dstptr = row + initial_val;
1017
                    /* copy rep_bytes bytes every stride bytes across the part
                     * of the row made of complete groups of 8 pixels */
1018                for (i = initial_val; i < final_val; i += stride)
1019                {
1020                   png_memcpy(dstptr, srcptr, rep_bytes);
1021                   srcptr += stride;
1022                   dstptr += stride;
1023                }
1024                if (diff)  /* number of leftover pixels:  3 for pngtest */
1025                {
                       /* extend the limit to the full row width and shorten
                        * the copy so it never runs past that limit */
1026                   final_val+=diff*BPP3;
1027                   for (; i < final_val; i += stride)
1028                   {
1029                      if (rep_bytes > (int)(final_val-i))
1030                         rep_bytes = (int)(final_val-i);
1031                      png_memcpy(dstptr, srcptr, rep_bytes);
1032                      srcptr += stride;
1033                      dstptr += stride;
1034                   }
1035                }
1036             } /* end of else (_mmx_supported) */
1037
1038             break;
1039          }       /* end 24 bpp */
1040
1041          case 32:       /* png_ptr->row_info.pixel_depth */
1042          {
                 /* 32-bit pixels:  merge bytes of the row held in
                  * png_ptr->row_buf + 1 into row.  The MMX branch picks the
                  * pixels to copy from the bits of mask; the C branch copies
                  * runs of bytes positioned with the png_pass_* tables.
                  */
1043             png_bytep srcptr;
1044             png_bytep dstptr;
1045
1046 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1047             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1048                 /* && _mmx_supported */ )
1049             {
1050                png_uint_32 len;
1051                int diff;
1052                int dummy_value_a;   // fix 'forbidden register spilled' error
1053                int dummy_value_d;
1054                int dummy_value_c;
1055                int dummy_value_S;
1056                int dummy_value_D;
1057                _unmask = ~mask;            // global variable for -fPIC version
1058                srcptr = png_ptr->row_buf + 1;
1059                dstptr = row;
1060                len  = png_ptr->width &~7;  // reduce to multiple of 8
1061                diff = (int) (png_ptr->width & 7); // amount lost
1062
1063                __asm__ __volatile__ (
                       // Setup:  the low byte of _unmask is replicated into
                       // all 8 bytes of mm7, ANDed with the _mask32_*
                       // constants (defined outside this block) and compared
                       // with zero, so every byte of mm0-mm3 ends up as 0xff
                       // or 0x00.
1064                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
1065                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1066                   "punpcklbw %%mm7, %%mm7     \n\t"
1067                   "punpcklwd %%mm7, %%mm7     \n\t"
1068                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1069
1070                   "movq      _mask32_0, %%mm0 \n\t"
1071                   "movq      _mask32_1, %%mm1 \n\t"
1072                   "movq      _mask32_2, %%mm2 \n\t"
1073                   "movq      _mask32_3, %%mm3 \n\t"
1074
1075                   "pand      %%mm7, %%mm0     \n\t"
1076                   "pand      %%mm7, %%mm1     \n\t"
1077                   "pand      %%mm7, %%mm2     \n\t"
1078                   "pand      %%mm7, %%mm3     \n\t"
1079
1080                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1081                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1082                   "pcmpeqb   %%mm6, %%mm2     \n\t"
1083                   "pcmpeqb   %%mm6, %%mm3     \n\t"
1084
1085 // preload        "movl      len, %%ecx       \n\t" // load length of line
1086 // preload        "movl      srcptr, %%esi    \n\t" // load source
1087 // preload        "movl      dstptr, %%edi    \n\t" // load dest
1088
1089                   "cmpl      $0, %%ecx        \n\t" // lcr
1090                   "jz        mainloop32end    \n\t"
1091
                       // Main loop:  8 pixels (32 bytes) per iteration.  Each
                       // 8-byte group is rewritten as
                       //    dst = (src & select) | (dst & ~select)
                       // with mm0 ... mm3 as the byte-select masks.
1092                 "mainloop32:                  \n\t"
1093                   "movq      (%%esi), %%mm4   \n\t"
1094                   "pand      %%mm0, %%mm4     \n\t"
1095                   "movq      %%mm0, %%mm6     \n\t"
1096                   "movq      (%%edi), %%mm7   \n\t"
1097                   "pandn     %%mm7, %%mm6     \n\t"
1098                   "por       %%mm6, %%mm4     \n\t"
1099                   "movq      %%mm4, (%%edi)   \n\t"
1100
1101                   "movq      8(%%esi), %%mm5  \n\t"
1102                   "pand      %%mm1, %%mm5     \n\t"
1103                   "movq      %%mm1, %%mm7     \n\t"
1104                   "movq      8(%%edi), %%mm6  \n\t"
1105                   "pandn     %%mm6, %%mm7     \n\t"
1106                   "por       %%mm7, %%mm5     \n\t"
1107                   "movq      %%mm5, 8(%%edi)  \n\t"
1108
1109                   "movq      16(%%esi), %%mm6 \n\t"
1110                   "pand      %%mm2, %%mm6     \n\t"
1111                   "movq      %%mm2, %%mm4     \n\t"
1112                   "movq      16(%%edi), %%mm7 \n\t"
1113                   "pandn     %%mm7, %%mm4     \n\t"
1114                   "por       %%mm4, %%mm6     \n\t"
1115                   "movq      %%mm6, 16(%%edi) \n\t"
1116
1117                   "movq      24(%%esi), %%mm7 \n\t"
1118                   "pand      %%mm3, %%mm7     \n\t"
1119                   "movq      %%mm3, %%mm5     \n\t"
1120                   "movq      24(%%edi), %%mm4 \n\t"
1121                   "pandn     %%mm4, %%mm5     \n\t"
1122                   "por       %%mm5, %%mm7     \n\t"
1123                   "movq      %%mm7, 24(%%edi) \n\t"
1124
1125                   "addl      $32, %%esi       \n\t" // inc by 32 bytes processed
1126                   "addl      $32, %%edi       \n\t"
1127                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1128                   "ja        mainloop32       \n\t"
1129
                       // Leftover pixels (diff = width & 7):  mask is moved to
                       // the top byte of edx, then one bit per pixel is
                       // shifted into the carry flag; a set bit copies the
                       // 4-byte pixel, a clear bit leaves the destination
                       // untouched.
1130                 "mainloop32end:               \n\t"
1131 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1132                   "movl      %%eax, %%ecx     \n\t"
1133                   "cmpl      $0, %%ecx        \n\t"
1134                   "jz        end32            \n\t"
1135 // preload        "movl      mask, %%edx      \n\t"
1136                   "sall      $24, %%edx       \n\t" // low byte => high byte
1137
1138                 "secondloop32:                \n\t"
1139                   "sall      %%edx            \n\t" // move high bit to CF
1140                   "jnc       skip32           \n\t" // if CF = 0
1141                   "movl      (%%esi), %%eax   \n\t"
1142                   "movl      %%eax, (%%edi)   \n\t"
1143
1144                 "skip32:                      \n\t"
1145                   "addl      $4, %%esi        \n\t"
1146                   "addl      $4, %%edi        \n\t"
1147                   "decl      %%ecx            \n\t"
1148                   "jnz       secondloop32     \n\t"
1149
1150                 "end32:                       \n\t"
1151                   "EMMS                       \n\t" // DONE
1152
                       // All five registers are listed as (dummy) outputs
                       // because the code above modifies them; the inputs
                       // below are tied to them by operand number.
1153                   : "=a" (dummy_value_a),           // output regs (dummy)
1154                     "=d" (dummy_value_d),
1155                     "=c" (dummy_value_c),
1156                     "=S" (dummy_value_S),
1157                     "=D" (dummy_value_D)
1158
1159                   : "3" (srcptr),      // esi       // input regs
1160                     "4" (dstptr),      // edi
1161                     "0" (diff),        // eax
1162 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1163                     "2" (len),         // ecx
1164                     "1" (mask)         // edx
1165
                       // NOTE(review): no clobber list is active for this
                       // statement although it stores through %edi and changes
                       // the flags and MMX registers -- confirm whether
                       // "memory" / "cc" clobbers are required.
1166 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1167                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
1168                   , "%mm4", "%mm5", "%mm6", "%mm7"
1169 #endif
1170                );
1171             }
1172             else /* mmx _not supported - Use modified C routine */
1173 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_THREAD_UNSAFE_OK */
1174             {
                    /* All offsets below are byte counts:  pixel counts taken
                     * from the png_pass_* tables, scaled by BPP4 (defined
                     * outside this block; presumably 4 bytes per pixel).
                     */
1175                register png_uint_32 i;
1176                png_uint_32 initial_val = BPP4 * png_pass_start[png_ptr->pass];
1177                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1178                register int stride = BPP4 * png_pass_inc[png_ptr->pass];
1179                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1180                register int rep_bytes = BPP4 * png_pass_width[png_ptr->pass];
1181                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1182                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1183                int diff = (int) (png_ptr->width & 7); /* amount lost */
1184                register png_uint_32 final_val = BPP4 * len;   /* GRR bugfix */
1185
1186                srcptr = png_ptr->row_buf + 1 + initial_val;
1187                dstptr = row + initial_val;
1188
                    /* copy rep_bytes bytes every stride bytes across the part
                     * of the row made of complete groups of 8 pixels */
1189                for (i = initial_val; i < final_val; i += stride)
1190                {
1191                   png_memcpy(dstptr, srcptr, rep_bytes);
1192                   srcptr += stride;
1193                   dstptr += stride;
1194                }
1195                if (diff)  /* number of leftover pixels:  3 for pngtest */
1196                {
                       /* extend the limit to the full row width and shorten
                        * the copy so it never runs past that limit */
1197                   final_val+=diff*BPP4;
1198                   for (; i < final_val; i += stride)
1199                   {
1200                      if (rep_bytes > (int)(final_val-i))
1201                         rep_bytes = (int)(final_val-i);
1202                      png_memcpy(dstptr, srcptr, rep_bytes);
1203                      srcptr += stride;
1204                      dstptr += stride;
1205                   }
1206                }
1207             } /* end of else (_mmx_supported) */
1208
1209             break;
1210          }       /* end 32 bpp */
1211
1212          case 48:       /* png_ptr->row_info.pixel_depth */
1213          {
                 /* 48-bit pixels (6 bytes each):  merge bytes of the row held
                  * in png_ptr->row_buf + 1 into row.  The MMX branch picks the
                  * pixels to copy from the bits of mask; the C branch copies
                  * runs of bytes positioned with the png_pass_* tables.
                  */
1214             png_bytep srcptr;
1215             png_bytep dstptr;
1216
1217 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
1218             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_COMBINE_ROW)
1219                 /* && _mmx_supported */ )
1220             {
1221                png_uint_32 len;
1222                int diff;
1223                int dummy_value_a;   // fix 'forbidden register spilled' error
1224                int dummy_value_d;
1225                int dummy_value_c;
1226                int dummy_value_S;
1227                int dummy_value_D;
1228                _unmask = ~mask;            // global variable for -fPIC version
1229                srcptr = png_ptr->row_buf + 1;
1230                dstptr = row;
1231                len  = png_ptr->width &~7;  // reduce to multiple of 8
1232                diff = (int) (png_ptr->width & 7); // amount lost
1233
1234                __asm__ __volatile__ (
                       // Setup:  the low byte of _unmask is replicated into
                       // all 8 bytes of mm7, ANDed with the _mask48_*
                       // constants (defined outside this block) and compared
                       // with zero, so every byte of mm0-mm5 ends up as 0xff
                       // or 0x00.
1235                   "movd      _unmask, %%mm7   \n\t" // load bit pattern
1236                   "psubb     %%mm6, %%mm6     \n\t" // zero mm6
1237                   "punpcklbw %%mm7, %%mm7     \n\t"
1238                   "punpcklwd %%mm7, %%mm7     \n\t"
1239                   "punpckldq %%mm7, %%mm7     \n\t" // fill reg with 8 masks
1240
1241                   "movq      _mask48_0, %%mm0 \n\t"
1242                   "movq      _mask48_1, %%mm1 \n\t"
1243                   "movq      _mask48_2, %%mm2 \n\t"
1244                   "movq      _mask48_3, %%mm3 \n\t"
1245                   "movq      _mask48_4, %%mm4 \n\t"
1246                   "movq      _mask48_5, %%mm5 \n\t"
1247
1248                   "pand      %%mm7, %%mm0     \n\t"
1249                   "pand      %%mm7, %%mm1     \n\t"
1250                   "pand      %%mm7, %%mm2     \n\t"
1251                   "pand      %%mm7, %%mm3     \n\t"
1252                   "pand      %%mm7, %%mm4     \n\t"
1253                   "pand      %%mm7, %%mm5     \n\t"
1254
1255                   "pcmpeqb   %%mm6, %%mm0     \n\t"
1256                   "pcmpeqb   %%mm6, %%mm1     \n\t"
1257                   "pcmpeqb   %%mm6, %%mm2     \n\t"
1258                   "pcmpeqb   %%mm6, %%mm3     \n\t"
1259                   "pcmpeqb   %%mm6, %%mm4     \n\t"
1260                   "pcmpeqb   %%mm6, %%mm5     \n\t"
1261
1262 // preload        "movl      len, %%ecx       \n\t" // load length of line
1263 // preload        "movl      srcptr, %%esi    \n\t" // load source
1264 // preload        "movl      dstptr, %%edi    \n\t" // load dest
1265
1266                   "cmpl      $0, %%ecx        \n\t"
1267                   "jz        mainloop48end    \n\t"
1268
                       // Main loop:  8 pixels (48 bytes) per iteration.  Each
                       // 8-byte group is rewritten as
                       //    dst = (src & select) | (dst & ~select)
                       // with mm0 ... mm5 as the byte-select masks; mm6 and
                       // mm7 are the only scratch registers left.
1269                 "mainloop48:                  \n\t"
1270                   "movq      (%%esi), %%mm7   \n\t"
1271                   "pand      %%mm0, %%mm7     \n\t"
1272                   "movq      %%mm0, %%mm6     \n\t"
1273                   "pandn     (%%edi), %%mm6   \n\t"
1274                   "por       %%mm6, %%mm7     \n\t"
1275                   "movq      %%mm7, (%%edi)   \n\t"
1276
1277                   "movq      8(%%esi), %%mm6  \n\t"
1278                   "pand      %%mm1, %%mm6     \n\t"
1279                   "movq      %%mm1, %%mm7     \n\t"
1280                   "pandn     8(%%edi), %%mm7  \n\t"
1281                   "por       %%mm7, %%mm6     \n\t"
1282                   "movq      %%mm6, 8(%%edi)  \n\t"
1283
1284                   "movq      16(%%esi), %%mm6 \n\t"
1285                   "pand      %%mm2, %%mm6     \n\t"
1286                   "movq      %%mm2, %%mm7     \n\t"
1287                   "pandn     16(%%edi), %%mm7 \n\t"
1288                   "por       %%mm7, %%mm6     \n\t"
1289                   "movq      %%mm6, 16(%%edi) \n\t"
1290
1291                   "movq      24(%%esi), %%mm7 \n\t"
1292                   "pand      %%mm3, %%mm7     \n\t"
1293                   "movq      %%mm3, %%mm6     \n\t"
1294                   "pandn     24(%%edi), %%mm6 \n\t"
1295                   "por       %%mm6, %%mm7     \n\t"
1296                   "movq      %%mm7, 24(%%edi) \n\t"
1297
1298                   "movq      32(%%esi), %%mm6 \n\t"
1299                   "pand      %%mm4, %%mm6     \n\t"
1300                   "movq      %%mm4, %%mm7     \n\t"
1301                   "pandn     32(%%edi), %%mm7 \n\t"
1302                   "por       %%mm7, %%mm6     \n\t"
1303                   "movq      %%mm6, 32(%%edi) \n\t"
1304
1305                   "movq      40(%%esi), %%mm7 \n\t"
1306                   "pand      %%mm5, %%mm7     \n\t"
1307                   "movq      %%mm5, %%mm6     \n\t"
1308                   "pandn     40(%%edi), %%mm6 \n\t"
1309                   "por       %%mm6, %%mm7     \n\t"
1310                   "movq      %%mm7, 40(%%edi) \n\t"
1311
1312                   "addl      $48, %%esi       \n\t" // inc by 48 bytes processed
1313                   "addl      $48, %%edi       \n\t"
1314                   "subl      $8, %%ecx        \n\t" // dec by 8 pixels processed
1315
1316                   "ja        mainloop48       \n\t"
1317
                       // Leftover pixels (diff = width & 7):  mask is moved to
                       // the top byte of edx, then one bit per pixel is
                       // shifted into the carry flag; a set bit copies the
                       // pixel, a clear bit leaves the destination untouched.
                       // A 48-bit pixel is 6 bytes, so it is copied as a
                       // 32-bit move plus a 16-bit move and both pointers
                       // advance by 6 (copying and stepping only 4 bytes
                       // would misplace every leftover pixel after the first).
1318                 "mainloop48end:               \n\t"
1319 // preload        "movl      diff, %%ecx      \n\t" // (diff is in eax)
1320                   "movl      %%eax, %%ecx     \n\t"
1321                   "cmpl      $0, %%ecx        \n\t"
1322                   "jz        end48            \n\t"
1323 // preload        "movl      mask, %%edx      \n\t"
1324                   "sall      $24, %%edx       \n\t" // move low byte to high byte
1325
1326                 "secondloop48:                \n\t"
1327                   "sall      %%edx            \n\t" // move high bit to CF
1328                   "jnc       skip48           \n\t" // if CF = 0
1329                   "movl      (%%esi), %%eax   \n\t" // bytes 0-3 of the pixel
1330                   "movl      %%eax, (%%edi)   \n\t"
                       "movw      4(%%esi), %%ax   \n\t" // bytes 4-5 of the pixel
                       "movw      %%ax, 4(%%edi)   \n\t"
1331
1332                 "skip48:                      \n\t"
1333                   "addl      $6, %%esi        \n\t" // 6 bytes per pixel
1334                   "addl      $6, %%edi        \n\t"
1335                   "decl      %%ecx            \n\t"
1336                   "jnz       secondloop48     \n\t"
1337
1338                 "end48:                       \n\t"
1339                   "EMMS                       \n\t" // DONE
1340
                       // All five registers are listed as (dummy) outputs
                       // because the code above modifies them; the inputs
                       // below are tied to them by operand number.
1341                   : "=a" (dummy_value_a),           // output regs (dummy)
1342                     "=d" (dummy_value_d),
1343                     "=c" (dummy_value_c),
1344                     "=S" (dummy_value_S),
1345                     "=D" (dummy_value_D)
1346
1347                   : "3" (srcptr),      // esi       // input regs
1348                     "4" (dstptr),      // edi
1349                     "0" (diff),        // eax
1350 // was (unmask)     "b"    RESERVED    // ebx       // Global Offset Table idx
1351                     "2" (len),         // ecx
1352                     "1" (mask)         // edx
1353
1354 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1355                   : "%mm0", "%mm1", "%mm2", "%mm3"  // clobber list
1356                   , "%mm4", "%mm5", "%mm6", "%mm7"
1357 #endif
1358                );
1359             }
1360             else /* mmx _not supported - Use modified C routine */
1361 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_THREAD_UNSAFE_OK */
1362             {
                    /* All offsets below are byte counts:  pixel counts taken
                     * from the png_pass_* tables, scaled by BPP6 (defined
                     * outside this block; presumably 6 bytes per pixel).
                     */
1363                register png_uint_32 i;
1364                png_uint_32 initial_val = BPP6 * png_pass_start[png_ptr->pass];
1365                  /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1366                register int stride = BPP6 * png_pass_inc[png_ptr->pass];
1367                  /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1368                register int rep_bytes = BPP6 * png_pass_width[png_ptr->pass];
1369                  /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1370                png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1371                int diff = (int) (png_ptr->width & 7); /* amount lost */
1372                register png_uint_32 final_val = BPP6 * len;   /* GRR bugfix */
1373
1374                srcptr = png_ptr->row_buf + 1 + initial_val;
1375                dstptr = row + initial_val;
1376
                    /* copy rep_bytes bytes every stride bytes across the part
                     * of the row made of complete groups of 8 pixels */
1377                for (i = initial_val; i < final_val; i += stride)
1378                {
1379                   png_memcpy(dstptr, srcptr, rep_bytes);
1380                   srcptr += stride;
1381                   dstptr += stride;
1382                }
1383                if (diff)  /* number of leftover pixels:  3 for pngtest */
1384                {
                       /* extend the limit to the full row width and shorten
                        * the copy so it never runs past that limit */
1385                   final_val+=diff*BPP6;
1386                   for (; i < final_val; i += stride)
1387                   {
1388                      if (rep_bytes > (int)(final_val-i))
1389                         rep_bytes = (int)(final_val-i);
1390                      png_memcpy(dstptr, srcptr, rep_bytes);
1391                      srcptr += stride;
1392                      dstptr += stride;
1393                   }
1394                }
1395             } /* end of else (_mmx_supported) */
1396
1397             break;
1398          }       /* end 48 bpp */
1399
1400          case 64:       /* png_ptr->row_info.pixel_depth */
1401          {
1402             png_bytep srcptr;
1403             png_bytep dstptr;
1404             register png_uint_32 i;
1405             png_uint_32 initial_val = BPP8 * png_pass_start[png_ptr->pass];
1406               /* png.c:  png_pass_start[] = {0, 4, 0, 2, 0, 1, 0}; */
1407             register int stride = BPP8 * png_pass_inc[png_ptr->pass];
1408               /* png.c:  png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1}; */
1409             register int rep_bytes = BPP8 * png_pass_width[png_ptr->pass];
1410               /* png.c:  png_pass_width[] = {8, 4, 4, 2, 2, 1, 1}; */
1411             png_uint_32 len = png_ptr->width &~7;  /* reduce to mult. of 8 */
1412             int diff = (int) (png_ptr->width & 7); /* amount lost */
1413             register png_uint_32 final_val = BPP8 * len;   /* GRR bugfix */
1414
1415             srcptr = png_ptr->row_buf + 1 + initial_val;
1416             dstptr = row + initial_val;
1417
1418             for (i = initial_val; i < final_val; i += stride)
1419             {
1420                png_memcpy(dstptr, srcptr, rep_bytes);
1421                srcptr += stride;
1422                dstptr += stride;
1423             }
1424             if (diff)  /* number of leftover pixels:  3 for pngtest */
1425             {
1426                final_val+=diff*BPP8;
1427                for (; i < final_val; i += stride)
1428                {
1429                   if (rep_bytes > (int)(final_val-i))
1430                      rep_bytes = (int)(final_val-i);
1431                   png_memcpy(dstptr, srcptr, rep_bytes);
1432                   srcptr += stride;
1433                   dstptr += stride;
1434                }
1435             }
1436
1437             break;
1438          }       /* end 64 bpp */
1439
1440          default: /* png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64 */
1441          {
1442             /* this should never happen */
1443             png_warning(png_ptr, "Invalid row_info.pixel_depth in pnggccrd");
1444             break;
1445          }
1446       } /* end switch (png_ptr->row_info.pixel_depth) */
1447
1448    } /* end if (non-trivial mask) */
1449
1450 } /* end png_combine_row() */
1451
1452 #endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1453
1454
1455
1456
1457 /*===========================================================================*/
1458 /*                                                                           */
1459 /*                 P N G _ D O _ R E A D _ I N T E R L A C E                 */
1460 /*                                                                           */
1461 /*===========================================================================*/
1462
1463 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1464 #if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1465
1466 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1467  * has taken place.  [GRR: what other steps come before and/or after?]
1468  */
1469
1470 void /* PRIVATE */
1471 png_do_read_interlace(png_structp png_ptr)
1472 {
1473    png_row_infop row_info = &(png_ptr->row_info);
1474    png_bytep row = png_ptr->row_buf + 1;
1475    int pass = png_ptr->pass;
1476 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1477    png_uint_32 transformations = png_ptr->transformations;
1478 #endif
1479
1480    png_debug(1, "in png_do_read_interlace (pnggccrd.c)\n");
1481
1482 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1483    if (_mmx_supported == 2) {
1484        /* this should have happened in png_init_mmx_flags() already */
1485        png_warning(png_ptr, "asm_flags may not have been initialized");
1486        png_mmx_support();
1487    }
1488 #endif
1489
1490    if (row != NULL && row_info != NULL)
1491    {
1492       png_uint_32 final_width;
1493
1494       final_width = row_info->width * png_pass_inc[pass];
1495
1496       switch (row_info->pixel_depth)
1497       {
1498          case 1:
1499          {
1500             png_bytep sp, dp;
1501             int sshift, dshift;
1502             int s_start, s_end, s_inc;
1503             png_byte v;
1504             png_uint_32 i;
1505             int j;
1506
1507             sp = row + (png_size_t)((row_info->width - 1) >> 3);
1508             dp = row + (png_size_t)((final_width - 1) >> 3);
1509 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1510             if (transformations & PNG_PACKSWAP)
1511             {
1512                sshift = (int)((row_info->width + 7) & 7);
1513                dshift = (int)((final_width + 7) & 7);
1514                s_start = 7;
1515                s_end = 0;
1516                s_inc = -1;
1517             }
1518             else
1519 #endif
1520             {
1521                sshift = 7 - (int)((row_info->width + 7) & 7);
1522                dshift = 7 - (int)((final_width + 7) & 7);
1523                s_start = 0;
1524                s_end = 7;
1525                s_inc = 1;
1526             }
1527
1528             for (i = row_info->width; i; i--)
1529             {
1530                v = (png_byte)((*sp >> sshift) & 0x1);
1531                for (j = 0; j < png_pass_inc[pass]; j++)
1532                {
1533                   *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1534                   *dp |= (png_byte)(v << dshift);
1535                   if (dshift == s_end)
1536                   {
1537                      dshift = s_start;
1538                      dp--;
1539                   }
1540                   else
1541                      dshift += s_inc;
1542                }
1543                if (sshift == s_end)
1544                {
1545                   sshift = s_start;
1546                   sp--;
1547                }
1548                else
1549                   sshift += s_inc;
1550             }
1551             break;
1552          }
1553
1554          case 2:
1555          {
1556             png_bytep sp, dp;
1557             int sshift, dshift;
1558             int s_start, s_end, s_inc;
1559             png_uint_32 i;
1560
1561             sp = row + (png_size_t)((row_info->width - 1) >> 2);
1562             dp = row + (png_size_t)((final_width - 1) >> 2);
1563 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1564             if (transformations & PNG_PACKSWAP)
1565             {
1566                sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1567                dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1568                s_start = 6;
1569                s_end = 0;
1570                s_inc = -2;
1571             }
1572             else
1573 #endif
1574             {
1575                sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1576                dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1577                s_start = 0;
1578                s_end = 6;
1579                s_inc = 2;
1580             }
1581
1582             for (i = row_info->width; i; i--)
1583             {
1584                png_byte v;
1585                int j;
1586
1587                v = (png_byte)((*sp >> sshift) & 0x3);
1588                for (j = 0; j < png_pass_inc[pass]; j++)
1589                {
1590                   *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1591                   *dp |= (png_byte)(v << dshift);
1592                   if (dshift == s_end)
1593                   {
1594                      dshift = s_start;
1595                      dp--;
1596                   }
1597                   else
1598                      dshift += s_inc;
1599                }
1600                if (sshift == s_end)
1601                {
1602                   sshift = s_start;
1603                   sp--;
1604                }
1605                else
1606                   sshift += s_inc;
1607             }
1608             break;
1609          }
1610
1611          case 4:
1612          {
1613             png_bytep sp, dp;
1614             int sshift, dshift;
1615             int s_start, s_end, s_inc;
1616             png_uint_32 i;
1617
1618             sp = row + (png_size_t)((row_info->width - 1) >> 1);
1619             dp = row + (png_size_t)((final_width - 1) >> 1);
1620 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1621             if (transformations & PNG_PACKSWAP)
1622             {
1623                sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1624                dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1625                s_start = 4;
1626                s_end = 0;
1627                s_inc = -4;
1628             }
1629             else
1630 #endif
1631             {
1632                sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1633                dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1634                s_start = 0;
1635                s_end = 4;
1636                s_inc = 4;
1637             }
1638
1639             for (i = row_info->width; i; i--)
1640             {
1641                png_byte v;
1642                int j;
1643
1644                v = (png_byte)((*sp >> sshift) & 0xf);
1645                for (j = 0; j < png_pass_inc[pass]; j++)
1646                {
1647                   *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1648                   *dp |= (png_byte)(v << dshift);
1649                   if (dshift == s_end)
1650                   {
1651                      dshift = s_start;
1652                      dp--;
1653                   }
1654                   else
1655                      dshift += s_inc;
1656                }
1657                if (sshift == s_end)
1658                {
1659                   sshift = s_start;
1660                   sp--;
1661                }
1662                else
1663                   sshift += s_inc;
1664             }
1665             break;
1666          }
1667
1668        /*====================================================================*/
1669
1670          default: /* 8-bit or larger (this is where the routine is modified) */
1671          {
1672 #if 0
1673 //          static unsigned long long _const4 = 0x0000000000FFFFFFLL;  no good
1674 //          static unsigned long long const4 = 0x0000000000FFFFFFLL;   no good
1675 //          unsigned long long _const4 = 0x0000000000FFFFFFLL;         no good
1676 //          unsigned long long const4 = 0x0000000000FFFFFFLL;          no good
1677 #endif
1678             png_bytep sptr, dp;
1679             png_uint_32 i;
1680             png_size_t pixel_bytes;
1681             int width = (int)row_info->width;
1682
1683             pixel_bytes = (row_info->pixel_depth >> 3);
1684
1685             /* point sptr at the last pixel in the pre-expanded row: */
1686             sptr = row + (width - 1) * pixel_bytes;
1687
1688             /* point dp at the last pixel position in the expanded row: */
1689             dp = row + (final_width - 1) * pixel_bytes;
1690
1691             /* New code by Nirav Chhatrapati - Intel Corporation */
1692
1693 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
1694             if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_INTERLACE)
1695                 /* && _mmx_supported */ )
1696             {
1697                //--------------------------------------------------------------
1698                if (pixel_bytes == 3)
1699                {
1700                   if (((pass == 0) || (pass == 1)) && width)
1701                   {
1702                      int dummy_value_c;   // fix 'forbidden register spilled'
1703                      int dummy_value_S;
1704                      int dummy_value_D;
1705
1706                      __asm__ __volatile__ (
1707                         "subl $21, %%edi         \n\t"
1708                                      // (png_pass_inc[pass] - 1)*pixel_bytes
1709
1710                      ".loop3_pass0:              \n\t"
1711                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
1712                         "pand _const4, %%mm0     \n\t" // z z z z z 2 1 0
1713                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
1714                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
1715                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
1716                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
1717                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
1718                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
1719                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
1720                         "movq %%mm0, %%mm3       \n\t" // 2 1 0 2 1 0 2 1
1721                         "psllq $16, %%mm0        \n\t" // 0 2 1 0 2 1 z z
1722                         "movq %%mm3, %%mm4       \n\t" // 2 1 0 2 1 0 2 1
1723                         "punpckhdq %%mm0, %%mm3  \n\t" // 0 2 1 0 2 1 0 2
1724                         "movq %%mm4, 16(%%edi)   \n\t"
1725                         "psrlq $32, %%mm0        \n\t" // z z z z 0 2 1 0
1726                         "movq %%mm3, 8(%%edi)    \n\t"
1727                         "punpckldq %%mm4, %%mm0  \n\t" // 1 0 2 1 0 2 1 0
1728                         "subl $3, %%esi          \n\t"
1729                         "movq %%mm0, (%%edi)     \n\t"
1730                         "subl $24, %%edi         \n\t"
1731                         "decl %%ecx              \n\t"
1732                         "jnz .loop3_pass0        \n\t"
1733                         "EMMS                    \n\t" // DONE
1734
1735                         : "=c" (dummy_value_c),        // output regs (dummy)
1736                           "=S" (dummy_value_S),
1737                           "=D" (dummy_value_D)
1738
1739                         : "1" (sptr),      // esi      // input regs
1740                           "2" (dp),        // edi
1741                           "0" (width)      // ecx
1742 // doesn't work           "i" (0x0000000000FFFFFFLL)   // %1 (a.k.a. _const4)
1743
1744 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1745                         : "%mm0", "%mm1", "%mm2"       // clobber list
1746                         , "%mm3", "%mm4"
1747 #endif
1748                      );
1749                   }
1750                   else if (((pass == 2) || (pass == 3)) && width)
1751                   {
1752                      int dummy_value_c;   // fix 'forbidden register spilled'
1753                      int dummy_value_S;
1754                      int dummy_value_D;
1755
1756                      __asm__ __volatile__ (
1757                         "subl $9, %%edi          \n\t"
1758                                      // (png_pass_inc[pass] - 1)*pixel_bytes
1759
1760                      ".loop3_pass2:              \n\t"
1761                         "movd (%%esi), %%mm0     \n\t" // x x x x x 2 1 0
1762                         "pand _const4, %%mm0     \n\t" // z z z z z 2 1 0
1763                         "movq %%mm0, %%mm1       \n\t" // z z z z z 2 1 0
1764                         "psllq $16, %%mm0        \n\t" // z z z 2 1 0 z z
1765                         "movq %%mm0, %%mm2       \n\t" // z z z 2 1 0 z z
1766                         "psllq $24, %%mm0        \n\t" // 2 1 0 z z z z z
1767                         "psrlq $8, %%mm1         \n\t" // z z z z z z 2 1
1768                         "por %%mm2, %%mm0        \n\t" // 2 1 0 2 1 0 z z
1769                         "por %%mm1, %%mm0        \n\t" // 2 1 0 2 1 0 2 1
1770                         "movq %%mm0, 4(%%edi)    \n\t"
1771                         "psrlq $16, %%mm0        \n\t" // z z 2 1 0 2 1 0
1772                         "subl $3, %%esi          \n\t"
1773                         "movd %%mm0, (%%edi)     \n\t"
1774                         "subl $12, %%edi         \n\t"
1775                         "decl %%ecx              \n\t"
1776                         "jnz .loop3_pass2        \n\t"
1777                         "EMMS                    \n\t" // DONE
1778
1779                         : "=c" (dummy_value_c),        // output regs (dummy)
1780                           "=S" (dummy_value_S),
1781                           "=D" (dummy_value_D)
1782
1783                         : "1" (sptr),      // esi      // input regs
1784                           "2" (dp),        // edi
1785                           "0" (width)      // ecx
1786
1787 #if 0  /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1788                         : "%mm0", "%mm1", "%mm2"       // clobber list
1789 #endif
1790                      );
1791                   }
1792                   else if (width) /* && ((pass == 4) || (pass == 5)) */
1793                   {
1794                      int width_mmx = ((width >> 1) << 1) - 8;   // GRR:  huh?
1795                      if (width_mmx < 0)
1796                          width_mmx = 0;
1797                      width -= width_mmx;        // 8 or 9 pix, 24 or 27 bytes
1798                      if (width_mmx)
1799                      {
1800                         // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1801                         // sptr points at last pixel in pre-expanded row
1802                         // dp points at last pixel position in expanded row
1803                         int dummy_value_c;  // fix 'forbidden register spilled'
1804                         int dummy_value_S;
1805                         int dummy_value_D;
1806
1807                         __asm__ __volatile__ (
1808                            "subl $3, %%esi          \n\t"
1809                            "subl $9, %%edi          \n\t"
1810                                         // (png_pass_inc[pass] + 1)*pixel_bytes
1811
1812                         ".loop3_pass4:              \n\t"
1813                            "movq (%%esi), %%mm0     \n\t" // x x 5 4 3 2 1 0
1814                            "movq %%mm0, %%mm1       \n\t" // x x 5 4 3 2 1 0
1815                            "movq %%mm0, %%mm2       \n\t" // x x 5 4 3 2 1 0
1816                            "psllq $24, %%mm0        \n\t" // 4 3 2 1 0 z z z
1817                            "pand _const4, %%mm1     \n\t" // z z z z z 2 1 0
1818                            "psrlq $24, %%mm2        \n\t" // z z z x x 5 4 3
1819                            "por %%mm1, %%mm0        \n\t" // 4 3 2 1 0 2 1 0
1820                            "movq %%mm2, %%mm3       \n\t" // z z z x x 5 4 3
1821                            "psllq $8, %%mm2         \n\t" // z z x x 5 4 3 z
1822                            "movq %%mm0, (%%edi)     \n\t"
1823                            "psrlq $16, %%mm3        \n\t" // z z z z z x x 5
1824                            "pand _const6, %%mm3     \n\t" // z z z z z z z 5
1825                            "por %%mm3, %%mm2        \n\t" // z z x x 5 4 3 5
1826                            "subl $6, %%esi          \n\t"
1827                            "movd %%mm2, 8(%%edi)    \n\t"
1828                            "subl $12, %%edi         \n\t"
1829                            "subl $2, %%ecx          \n\t"
1830                            "jnz .loop3_pass4        \n\t"
1831                            "EMMS                    \n\t" // DONE
1832
1833                            : "=c" (dummy_value_c),        // output regs (dummy)
1834                              "=S" (dummy_value_S),
1835                              "=D" (dummy_value_D)
1836
1837                            : "1" (sptr),      // esi      // input regs
1838                              "2" (dp),        // edi
1839                              "0" (width_mmx)  // ecx
1840
1841 #if 0  /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1842                            : "%mm0", "%mm1"               // clobber list
1843                            , "%mm2", "%mm3"
1844 #endif
1845                         );
1846                      }
1847
1848                      sptr -= width_mmx*3;
1849                      dp -= width_mmx*6;
1850                      for (i = width; i; i--)
1851                      {
1852                         png_byte v[8];
1853                         int j;
1854
1855                         png_memcpy(v, sptr, 3);
1856                         for (j = 0; j < png_pass_inc[pass]; j++)
1857                         {
1858                            png_memcpy(dp, v, 3);
1859                            dp -= 3;
1860                         }
1861                         sptr -= 3;
1862                      }
1863                   }
1864                } /* end of pixel_bytes == 3 */
1865
1866                //--------------------------------------------------------------
1867                else if (pixel_bytes == 1)
1868                {
1869                   if (((pass == 0) || (pass == 1)) && width)
1870                   {
1871                      int width_mmx = ((width >> 2) << 2);
1872                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
1873                      if (width_mmx)
1874                      {
1875                         int dummy_value_c;  // fix 'forbidden register spilled'
1876                         int dummy_value_S;
1877                         int dummy_value_D;
1878
1879                         __asm__ __volatile__ (
1880                            "subl $3, %%esi          \n\t"
1881                            "subl $31, %%edi         \n\t"
1882
1883                         ".loop1_pass0:              \n\t"
1884                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
1885                            "movq %%mm0, %%mm1       \n\t" // x x x x 3 2 1 0
1886                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
1887                            "movq %%mm0, %%mm2       \n\t" // 3 3 2 2 1 1 0 0
1888                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
1889                            "movq %%mm0, %%mm3       \n\t" // 1 1 1 1 0 0 0 0
1890                            "punpckldq %%mm0, %%mm0  \n\t" // 0 0 0 0 0 0 0 0
1891                            "punpckhdq %%mm3, %%mm3  \n\t" // 1 1 1 1 1 1 1 1
1892                            "movq %%mm0, (%%edi)     \n\t"
1893                            "punpckhwd %%mm2, %%mm2  \n\t" // 3 3 3 3 2 2 2 2
1894                            "movq %%mm3, 8(%%edi)    \n\t"
1895                            "movq %%mm2, %%mm4       \n\t" // 3 3 3 3 2 2 2 2
1896                            "punpckldq %%mm2, %%mm2  \n\t" // 2 2 2 2 2 2 2 2
1897                            "punpckhdq %%mm4, %%mm4  \n\t" // 3 3 3 3 3 3 3 3
1898                            "movq %%mm2, 16(%%edi)   \n\t"
1899                            "subl $4, %%esi          \n\t"
1900                            "movq %%mm4, 24(%%edi)   \n\t"
1901                            "subl $32, %%edi         \n\t"
1902                            "subl $4, %%ecx          \n\t"
1903                            "jnz .loop1_pass0        \n\t"
1904                            "EMMS                    \n\t" // DONE
1905
1906                            : "=c" (dummy_value_c),        // output regs (dummy)
1907                              "=S" (dummy_value_S),
1908                              "=D" (dummy_value_D)
1909
1910                            : "1" (sptr),      // esi      // input regs
1911                              "2" (dp),        // edi
1912                              "0" (width_mmx)  // ecx
1913
1914 #if 0  /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1915                            : "%mm0", "%mm1", "%mm2"       // clobber list
1916                            , "%mm3", "%mm4"
1917 #endif
1918                         );
1919                      }
1920
1921                      sptr -= width_mmx;
1922                      dp -= width_mmx*8;
1923                      for (i = width; i; i--)
1924                      {
1925                         int j;
1926
1927                        /* I simplified this part in version 1.0.4e
1928                         * here and in several other instances where
1929                         * pixel_bytes == 1  -- GR-P
1930                         *
1931                         * Original code:
1932                         *
1933                         * png_byte v[8];
1934                         * png_memcpy(v, sptr, pixel_bytes);
1935                         * for (j = 0; j < png_pass_inc[pass]; j++)
1936                         * {
1937                         *    png_memcpy(dp, v, pixel_bytes);
1938                         *    dp -= pixel_bytes;
1939                         * }
1940                         * sptr -= pixel_bytes;
1941                         *
1942                         * Replacement code is in the next three lines:
1943                         */
1944
1945                         for (j = 0; j < png_pass_inc[pass]; j++)
1946                         {
1947                            *dp-- = *sptr;
1948                         }
1949                         --sptr;
1950                      }
1951                   }
1952                   else if (((pass == 2) || (pass == 3)) && width)
1953                   {
1954                      int width_mmx = ((width >> 2) << 2);
1955                      width -= width_mmx;        // 0-3 pixels => 0-3 bytes
1956                      if (width_mmx)
1957                      {
1958                         int dummy_value_c;  // fix 'forbidden register spilled'
1959                         int dummy_value_S;
1960                         int dummy_value_D;
1961
1962                         __asm__ __volatile__ (
1963                            "subl $3, %%esi          \n\t"
1964                            "subl $15, %%edi         \n\t"
1965
1966                         ".loop1_pass2:              \n\t"
1967                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
1968                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
1969                            "movq %%mm0, %%mm1       \n\t" // 3 3 2 2 1 1 0 0
1970                            "punpcklwd %%mm0, %%mm0  \n\t" // 1 1 1 1 0 0 0 0
1971                            "punpckhwd %%mm1, %%mm1  \n\t" // 3 3 3 3 2 2 2 2
1972                            "movq %%mm0, (%%edi)     \n\t"
1973                            "subl $4, %%esi          \n\t"
1974                            "movq %%mm1, 8(%%edi)    \n\t"
1975                            "subl $16, %%edi         \n\t"
1976                            "subl $4, %%ecx          \n\t"
1977                            "jnz .loop1_pass2        \n\t"
1978                            "EMMS                    \n\t" // DONE
1979
1980                            : "=c" (dummy_value_c),        // output regs (dummy)
1981                              "=S" (dummy_value_S),
1982                              "=D" (dummy_value_D)
1983
1984                            : "1" (sptr),      // esi      // input regs
1985                              "2" (dp),        // edi
1986                              "0" (width_mmx)  // ecx
1987
1988 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1989                            : "%mm0", "%mm1"               // clobber list
1990 #endif
1991                         );
1992                      }
1993
1994                      sptr -= width_mmx;
1995                      dp -= width_mmx*4;
1996                      for (i = width; i; i--)
1997                      {
1998                         int j;
1999
2000                         for (j = 0; j < png_pass_inc[pass]; j++)
2001                         {
2002                            *dp-- = *sptr;
2003                         }
2004                         --sptr;
2005                      }
2006                   }
2007                   else if (width)  /* && ((pass == 4) || (pass == 5)) */
2008                   {
2009                      int width_mmx = ((width >> 3) << 3);
2010                      width -= width_mmx;        // 0-7 pixels => 0-7 bytes
2011                      if (width_mmx)
2012                      {
2013                         int dummy_value_c;  // fix 'forbidden register spilled'
2014                         int dummy_value_S;
2015                         int dummy_value_D;
2016
2017                         __asm__ __volatile__ (
2018                            "subl $7, %%esi          \n\t"
2019                            "subl $15, %%edi         \n\t"
2020
2021                         ".loop1_pass4:              \n\t"
2022                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2023                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2024                            "punpcklbw %%mm0, %%mm0  \n\t" // 3 3 2 2 1 1 0 0
2025                            "punpckhbw %%mm1, %%mm1  \n\t" // 7 7 6 6 5 5 4 4
2026                            "movq %%mm1, 8(%%edi)    \n\t"
2027                            "subl $8, %%esi          \n\t"
2028                            "movq %%mm0, (%%edi)     \n\t"
2029                            "subl $16, %%edi         \n\t"
2030                            "subl $8, %%ecx          \n\t"
2031                            "jnz .loop1_pass4        \n\t"
2032                            "EMMS                    \n\t" // DONE
2033
2034                            : "=c" (dummy_value_c),        // output regs (dummy)
2035                              "=S" (dummy_value_S),
2036                              "=D" (dummy_value_D)
2037
2038                            : "1" (sptr),      // esi      // input regs
2039                              "2" (dp),        // edi
2040                              "0" (width_mmx)  // ecx
2041
2042 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2043                            : "%mm0", "%mm1"               // clobber list
2044 #endif
2045                         );
2046                      }
2047
2048                      sptr -= width_mmx;
2049                      dp -= width_mmx*2;
2050                      for (i = width; i; i--)
2051                      {
2052                         int j;
2053
2054                         for (j = 0; j < png_pass_inc[pass]; j++)
2055                         {
2056                            *dp-- = *sptr;
2057                         }
2058                         --sptr;
2059                      }
2060                   }
2061                } /* end of pixel_bytes == 1 */
2062
2063                //--------------------------------------------------------------
2064                else if (pixel_bytes == 2)
2065                {
2066                   if (((pass == 0) || (pass == 1)) && width)
2067                   {
2068                      int width_mmx = ((width >> 1) << 1);
2069                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2070                      if (width_mmx)
2071                      {
2072                         int dummy_value_c;  // fix 'forbidden register spilled'
2073                         int dummy_value_S;
2074                         int dummy_value_D;
2075
2076                         __asm__ __volatile__ (
2077                            "subl $2, %%esi          \n\t"
2078                            "subl $30, %%edi         \n\t"
2079
2080                         ".loop2_pass0:              \n\t"
2081                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2082                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2083                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
2084                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
2085                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
2086                            "movq %%mm0, (%%edi)     \n\t"
2087                            "movq %%mm0, 8(%%edi)    \n\t"
2088                            "movq %%mm1, 16(%%edi)   \n\t"
2089                            "subl $4, %%esi          \n\t"
2090                            "movq %%mm1, 24(%%edi)   \n\t"
2091                            "subl $32, %%edi         \n\t"
2092                            "subl $2, %%ecx          \n\t"
2093                            "jnz .loop2_pass0        \n\t"
2094                            "EMMS                    \n\t" // DONE
2095
2096                            : "=c" (dummy_value_c),        // output regs (dummy)
2097                              "=S" (dummy_value_S),
2098                              "=D" (dummy_value_D)
2099
2100                            : "1" (sptr),      // esi      // input regs
2101                              "2" (dp),        // edi
2102                              "0" (width_mmx)  // ecx
2103
2104 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2105                            : "%mm0", "%mm1"               // clobber list
2106 #endif
2107                         );
2108                      }
2109
2110                      sptr -= (width_mmx*2 - 2); // sign fixed
2111                      dp -= (width_mmx*16 - 2);  // sign fixed
2112                      for (i = width; i; i--)
2113                      {
2114                         png_byte v[8];
2115                         int j;
2116                         sptr -= 2;
2117                         png_memcpy(v, sptr, 2);
2118                         for (j = 0; j < png_pass_inc[pass]; j++)
2119                         {
2120                            dp -= 2;
2121                            png_memcpy(dp, v, 2);
2122                         }
2123                      }
2124                   }
2125                   else if (((pass == 2) || (pass == 3)) && width)
2126                   {
2127                      int width_mmx = ((width >> 1) << 1) ;
2128                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2129                      if (width_mmx)
2130                      {
2131                         int dummy_value_c;  // fix 'forbidden register spilled'
2132                         int dummy_value_S;
2133                         int dummy_value_D;
2134
2135                         __asm__ __volatile__ (
2136                            "subl $2, %%esi          \n\t"
2137                            "subl $14, %%edi         \n\t"
2138
2139                         ".loop2_pass2:              \n\t"
2140                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2141                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2142                            "movq %%mm0, %%mm1       \n\t" // 3 2 3 2 1 0 1 0
2143                            "punpckldq %%mm0, %%mm0  \n\t" // 1 0 1 0 1 0 1 0
2144                            "punpckhdq %%mm1, %%mm1  \n\t" // 3 2 3 2 3 2 3 2
2145                            "movq %%mm0, (%%edi)     \n\t"
2146                            "subl $4, %%esi          \n\t"
2147                            "movq %%mm1, 8(%%edi)    \n\t"
2148                            "subl $16, %%edi         \n\t"
2149                            "subl $2, %%ecx          \n\t"
2150                            "jnz .loop2_pass2        \n\t"
2151                            "EMMS                    \n\t" // DONE
2152
2153                            : "=c" (dummy_value_c),        // output regs (dummy)
2154                              "=S" (dummy_value_S),
2155                              "=D" (dummy_value_D)
2156
2157                            : "1" (sptr),      // esi      // input regs
2158                              "2" (dp),        // edi
2159                              "0" (width_mmx)  // ecx
2160
2161 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2162                            : "%mm0", "%mm1"               // clobber list
2163 #endif
2164                         );
2165                      }
2166
2167                      sptr -= (width_mmx*2 - 2); // sign fixed
2168                      dp -= (width_mmx*8 - 2);   // sign fixed
2169                      for (i = width; i; i--)
2170                      {
2171                         png_byte v[8];
2172                         int j;
2173                         sptr -= 2;
2174                         png_memcpy(v, sptr, 2);
2175                         for (j = 0; j < png_pass_inc[pass]; j++)
2176                         {
2177                            dp -= 2;
2178                            png_memcpy(dp, v, 2);
2179                         }
2180                      }
2181                   }
2182                   else if (width)  // pass == 4 or 5
2183                   {
2184                      int width_mmx = ((width >> 1) << 1) ;
2185                      width -= width_mmx;        // 0,1 pixels => 0,2 bytes
2186                      if (width_mmx)
2187                      {
2188                         int dummy_value_c;  // fix 'forbidden register spilled'
2189                         int dummy_value_S;
2190                         int dummy_value_D;
2191
2192                         __asm__ __volatile__ (
2193                            "subl $2, %%esi          \n\t"
2194                            "subl $6, %%edi          \n\t"
2195
2196                         ".loop2_pass4:              \n\t"
2197                            "movd (%%esi), %%mm0     \n\t" // x x x x 3 2 1 0
2198                            "punpcklwd %%mm0, %%mm0  \n\t" // 3 2 3 2 1 0 1 0
2199                            "subl $4, %%esi          \n\t"
2200                            "movq %%mm0, (%%edi)     \n\t"
2201                            "subl $8, %%edi          \n\t"
2202                            "subl $2, %%ecx          \n\t"
2203                            "jnz .loop2_pass4        \n\t"
2204                            "EMMS                    \n\t" // DONE
2205
2206                            : "=c" (dummy_value_c),        // output regs (dummy)
2207                              "=S" (dummy_value_S),
2208                              "=D" (dummy_value_D)
2209
2210                            : "1" (sptr),      // esi      // input regs
2211                              "2" (dp),        // edi
2212                              "0" (width_mmx)  // ecx
2213
2214 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2215                            : "%mm0"                       // clobber list
2216 #endif
2217                         );
2218                      }
2219
2220                      sptr -= (width_mmx*2 - 2); // sign fixed
2221                      dp -= (width_mmx*4 - 2);   // sign fixed
2222                      for (i = width; i; i--)
2223                      {
2224                         png_byte v[8];
2225                         int j;
2226                         sptr -= 2;
2227                         png_memcpy(v, sptr, 2);
2228                         for (j = 0; j < png_pass_inc[pass]; j++)
2229                         {
2230                            dp -= 2;
2231                            png_memcpy(dp, v, 2);
2232                         }
2233                      }
2234                   }
2235                } /* end of pixel_bytes == 2 */
2236
2237                //--------------------------------------------------------------
2238                else if (pixel_bytes == 4)
2239                {
2240                   if (((pass == 0) || (pass == 1)) && width)
2241                   {
2242                      int width_mmx = ((width >> 1) << 1);
2243                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2244                      if (width_mmx)
2245                      {
2246                         int dummy_value_c;  // fix 'forbidden register spilled'
2247                         int dummy_value_S;
2248                         int dummy_value_D;
2249
2250                         __asm__ __volatile__ (
2251                            "subl $4, %%esi          \n\t"
2252                            "subl $60, %%edi         \n\t"
2253
2254                         ".loop4_pass0:              \n\t"
2255                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2256                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2257                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2258                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2259                            "movq %%mm0, (%%edi)     \n\t"
2260                            "movq %%mm0, 8(%%edi)    \n\t"
2261                            "movq %%mm0, 16(%%edi)   \n\t"
2262                            "movq %%mm0, 24(%%edi)   \n\t"
2263                            "movq %%mm1, 32(%%edi)   \n\t"
2264                            "movq %%mm1, 40(%%edi)   \n\t"
2265                            "movq %%mm1, 48(%%edi)   \n\t"
2266                            "subl $8, %%esi          \n\t"
2267                            "movq %%mm1, 56(%%edi)   \n\t"
2268                            "subl $64, %%edi         \n\t"
2269                            "subl $2, %%ecx          \n\t"
2270                            "jnz .loop4_pass0        \n\t"
2271                            "EMMS                    \n\t" // DONE
2272
2273                            : "=c" (dummy_value_c),        // output regs (dummy)
2274                              "=S" (dummy_value_S),
2275                              "=D" (dummy_value_D)
2276
2277                            : "1" (sptr),      // esi      // input regs
2278                              "2" (dp),        // edi
2279                              "0" (width_mmx)  // ecx
2280
2281 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2282                            : "%mm0", "%mm1"               // clobber list
2283 #endif
2284                         );
2285                      }
2286
2287                      sptr -= (width_mmx*4 - 4); // sign fixed
2288                      dp -= (width_mmx*32 - 4);  // sign fixed
2289                      for (i = width; i; i--)
2290                      {
2291                         png_byte v[8];
2292                         int j;
2293                         sptr -= 4;
2294                         png_memcpy(v, sptr, 4);
2295                         for (j = 0; j < png_pass_inc[pass]; j++)
2296                         {
2297                            dp -= 4;
2298                            png_memcpy(dp, v, 4);
2299                         }
2300                      }
2301                   }
2302                   else if (((pass == 2) || (pass == 3)) && width)
2303                   {
2304                      int width_mmx = ((width >> 1) << 1);
2305                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2306                      if (width_mmx)
2307                      {
2308                         int dummy_value_c;  // fix 'forbidden register spilled'
2309                         int dummy_value_S;
2310                         int dummy_value_D;
2311
2312                         __asm__ __volatile__ (
2313                            "subl $4, %%esi          \n\t"
2314                            "subl $28, %%edi         \n\t"
2315
2316                         ".loop4_pass2:              \n\t"
2317                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2318                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2319                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2320                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2321                            "movq %%mm0, (%%edi)     \n\t"
2322                            "movq %%mm0, 8(%%edi)    \n\t"
2323                            "movq %%mm1, 16(%%edi)   \n\t"
2324                            "movq %%mm1, 24(%%edi)   \n\t"
2325                            "subl $8, %%esi          \n\t"
2326                            "subl $32, %%edi         \n\t"
2327                            "subl $2, %%ecx          \n\t"
2328                            "jnz .loop4_pass2        \n\t"
2329                            "EMMS                    \n\t" // DONE
2330
2331                            : "=c" (dummy_value_c),        // output regs (dummy)
2332                              "=S" (dummy_value_S),
2333                              "=D" (dummy_value_D)
2334
2335                            : "1" (sptr),      // esi      // input regs
2336                              "2" (dp),        // edi
2337                              "0" (width_mmx)  // ecx
2338
2339 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2340                            : "%mm0", "%mm1"               // clobber list
2341 #endif
2342                         );
2343                      }
2344
2345                      sptr -= (width_mmx*4 - 4); // sign fixed
2346                      dp -= (width_mmx*16 - 4);  // sign fixed
2347                      for (i = width; i; i--)
2348                      {
2349                         png_byte v[8];
2350                         int j;
2351                         sptr -= 4;
2352                         png_memcpy(v, sptr, 4);
2353                         for (j = 0; j < png_pass_inc[pass]; j++)
2354                         {
2355                            dp -= 4;
2356                            png_memcpy(dp, v, 4);
2357                         }
2358                      }
2359                   }
2360                   else if (width)  // pass == 4 or 5
2361                   {
2362                      int width_mmx = ((width >> 1) << 1) ;
2363                      width -= width_mmx;        // 0,1 pixels => 0,4 bytes
2364                      if (width_mmx)
2365                      {
2366                         int dummy_value_c;  // fix 'forbidden register spilled'
2367                         int dummy_value_S;
2368                         int dummy_value_D;
2369
2370                         __asm__ __volatile__ (
2371                            "subl $4, %%esi          \n\t"
2372                            "subl $12, %%edi         \n\t"
2373
2374                         ".loop4_pass4:              \n\t"
2375                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2376                            "movq %%mm0, %%mm1       \n\t" // 7 6 5 4 3 2 1 0
2377                            "punpckldq %%mm0, %%mm0  \n\t" // 3 2 1 0 3 2 1 0
2378                            "punpckhdq %%mm1, %%mm1  \n\t" // 7 6 5 4 7 6 5 4
2379                            "movq %%mm0, (%%edi)     \n\t"
2380                            "subl $8, %%esi          \n\t"
2381                            "movq %%mm1, 8(%%edi)    \n\t"
2382                            "subl $16, %%edi         \n\t"
2383                            "subl $2, %%ecx          \n\t"
2384                            "jnz .loop4_pass4        \n\t"
2385                            "EMMS                    \n\t" // DONE
2386
2387                            : "=c" (dummy_value_c),        // output regs (dummy)
2388                              "=S" (dummy_value_S),
2389                              "=D" (dummy_value_D)
2390
2391                            : "1" (sptr),      // esi      // input regs
2392                              "2" (dp),        // edi
2393                              "0" (width_mmx)  // ecx
2394
2395 #if 0  /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2396                            : "%mm0", "%mm1"               // clobber list
2397 #endif
2398                         );
2399                      }
2400
2401                      sptr -= (width_mmx*4 - 4); // sign fixed
2402                      dp -= (width_mmx*8 - 4);   // sign fixed
2403                      for (i = width; i; i--)
2404                      {
2405                         png_byte v[8];
2406                         int j;
2407                         sptr -= 4;
2408                         png_memcpy(v, sptr, 4);
2409                         for (j = 0; j < png_pass_inc[pass]; j++)
2410                         {
2411                            dp -= 4;
2412                            png_memcpy(dp, v, 4);
2413                         }
2414                      }
2415                   }
2416                } /* end of pixel_bytes == 4 */
2417
2418                //--------------------------------------------------------------
2419                else if (pixel_bytes == 8)
2420                {
2421 // GRR TEST:  should work, but needs testing (special 64-bit version of rpng2?)
2422                   // GRR NOTE:  no need to combine passes here!
2423                   if (((pass == 0) || (pass == 1)) && width)
2424                   {
2425                      int dummy_value_c;  // fix 'forbidden register spilled'
2426                      int dummy_value_S;
2427                      int dummy_value_D;
2428
2429                      // source is 8-byte RRGGBBAA
2430                      // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2431                      __asm__ __volatile__ (
2432                         "subl $56, %%edi         \n\t" // start of last block
2433
2434                      ".loop8_pass0:              \n\t"
2435                         "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2436                         "movq %%mm0, (%%edi)     \n\t"
2437                         "movq %%mm0, 8(%%edi)    \n\t"
2438                         "movq %%mm0, 16(%%edi)   \n\t"
2439                         "movq %%mm0, 24(%%edi)   \n\t"
2440                         "movq %%mm0, 32(%%edi)   \n\t"
2441                         "movq %%mm0, 40(%%edi)   \n\t"
2442                         "movq %%mm0, 48(%%edi)   \n\t"
2443                         "subl $8, %%esi          \n\t"
2444                         "movq %%mm0, 56(%%edi)   \n\t"
2445                         "subl $64, %%edi         \n\t"
2446                         "decl %%ecx              \n\t"
2447                         "jnz .loop8_pass0        \n\t"
2448                         "EMMS                    \n\t" // DONE
2449
2450                         : "=c" (dummy_value_c),        // output regs (dummy)
2451                           "=S" (dummy_value_S),
2452                           "=D" (dummy_value_D)
2453
2454                         : "1" (sptr),      // esi      // input regs
2455                           "2" (dp),        // edi
2456                           "0" (width)      // ecx
2457
2458 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2459                         : "%mm0"                       // clobber list
2460 #endif
2461                      );
2462                   }
2463                   else if (((pass == 2) || (pass == 3)) && width)
2464                   {
2465                      // source is 8-byte RRGGBBAA
2466                      // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2467                      int width_mmx = ((width >> 1) << 1) ;
2468                      width -= width_mmx;
2469                      if (width_mmx)
2470                      {
2471                         int dummy_value_c;  // fix 'forbidden register spilled'
2472                         int dummy_value_S;
2473                         int dummy_value_D;
2474
2475                         __asm__ __volatile__ (
2476                            "subl $24, %%edi         \n\t" // start of last block
2477
2478                         ".loop8_pass2:              \n\t"
2479                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2480                            "movq %%mm0, (%%edi)     \n\t"
2481                            "movq %%mm0, 8(%%edi)    \n\t"
2482                            "movq %%mm0, 16(%%edi)   \n\t"
2483                            "subl $8, %%esi          \n\t"
2484                            "movq %%mm0, 24(%%edi)   \n\t"
2485                            "subl $32, %%edi         \n\t"
2486                            "decl %%ecx              \n\t"
2487                            "jnz .loop8_pass2        \n\t"
2488                            "EMMS                    \n\t" // DONE
2489
2490                            : "=c" (dummy_value_c),        // output regs (dummy)
2491                              "=S" (dummy_value_S),
2492                              "=D" (dummy_value_D)
2493
2494                            : "1" (sptr),      // esi      // input regs
2495                              "2" (dp),        // edi
2496                              "0" (width)      // ecx
2497
2498 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2499                            : "%mm0"                       // clobber list
2500 #endif
2501                         );
2502                      }
2503                   }
2504                   else if (width)  // pass == 4 or 5
2505                   {
2506                      // source is 8-byte RRGGBBAA
2507                      // dest is 16-byte RRGGBBAA RRGGBBAA
2508                      int width_mmx = ((width >> 1) << 1) ;
2509                      width -= width_mmx;
2510                      if (width_mmx)
2511                      {
2512                         int dummy_value_c;  // fix 'forbidden register spilled'
2513                         int dummy_value_S;
2514                         int dummy_value_D;
2515
2516                         __asm__ __volatile__ (
2517                            "subl $8, %%edi          \n\t" // start of last block
2518
2519                         ".loop8_pass4:              \n\t"
2520                            "movq (%%esi), %%mm0     \n\t" // 7 6 5 4 3 2 1 0
2521                            "movq %%mm0, (%%edi)     \n\t"
2522                            "subl $8, %%esi          \n\t"
2523                            "movq %%mm0, 8(%%edi)    \n\t"
2524                            "subl $16, %%edi         \n\t"
2525                            "decl %%ecx              \n\t"
2526                            "jnz .loop8_pass4        \n\t"
2527                            "EMMS                    \n\t" // DONE
2528
2529                            : "=c" (dummy_value_c),        // output regs (dummy)
2530                              "=S" (dummy_value_S),
2531                              "=D" (dummy_value_D)
2532
2533                            : "1" (sptr),      // esi      // input regs
2534                              "2" (dp),        // edi
2535                              "0" (width)      // ecx
2536
2537 #if 0  /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2538                            : "%mm0"                       // clobber list
2539 #endif
2540                         );
2541                      }
2542                   }
2543
2544                } /* end of pixel_bytes == 8 */
2545
2546                //--------------------------------------------------------------
2547                else if (pixel_bytes == 6)
2548                {
2549                   for (i = width; i; i--)
2550                   {
2551                      png_byte v[8];
2552                      int j;
2553                      png_memcpy(v, sptr, 6);
2554                      for (j = 0; j < png_pass_inc[pass]; j++)
2555                      {
2556                         png_memcpy(dp, v, 6);
2557                         dp -= 6;
2558                      }
2559                      sptr -= 6;
2560                   }
2561                } /* end of pixel_bytes == 6 */
2562
2563                //--------------------------------------------------------------
2564                else
2565                {
2566                   for (i = width; i; i--)
2567                   {
2568                      png_byte v[8];
2569                      int j;
2570                      png_memcpy(v, sptr, pixel_bytes);
2571                      for (j = 0; j < png_pass_inc[pass]; j++)
2572                      {
2573                         png_memcpy(dp, v, pixel_bytes);
2574                         dp -= pixel_bytes;
2575                      }
2576                      sptr-= pixel_bytes;
2577                   }
2578                }
2579             } // end of _mmx_supported ========================================
2580
2581             else /* MMX not supported:  use modified C code - takes advantage
2582                   *   of inlining of png_memcpy for a constant */
2583                  /* GRR 19991007:  does it?  or should pixel_bytes in each
2584                   *   block be replaced with immediate value (e.g., 1)? */
2585                  /* GRR 19991017:  replaced with constants in each case */
2586 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
2587             {
2588                if (pixel_bytes == 1)
2589                {
2590                   for (i = width; i; i--)
2591                   {
2592                      int j;
2593                      for (j = 0; j < png_pass_inc[pass]; j++)
2594                      {
2595                         *dp-- = *sptr;
2596                      }
2597                      --sptr;
2598                   }
2599                }
2600                else if (pixel_bytes == 3)
2601                {
2602                   for (i = width; i; i--)
2603                   {
2604                      png_byte v[8];
2605                      int j;
2606                      png_memcpy(v, sptr, 3);
2607                      for (j = 0; j < png_pass_inc[pass]; j++)
2608                      {
2609                         png_memcpy(dp, v, 3);
2610                         dp -= 3;
2611                      }
2612                      sptr -= 3;
2613                   }
2614                }
2615                else if (pixel_bytes == 2)
2616                {
2617                   for (i = width; i; i--)
2618                   {
2619                      png_byte v[8];
2620                      int j;
2621                      png_memcpy(v, sptr, 2);
2622                      for (j = 0; j < png_pass_inc[pass]; j++)
2623                      {
2624                         png_memcpy(dp, v, 2);
2625                         dp -= 2;
2626                      }
2627                      sptr -= 2;
2628                   }
2629                }
2630                else if (pixel_bytes == 4)
2631                {
2632                   for (i = width; i; i--)
2633                   {
2634                      png_byte v[8];
2635                      int j;
2636                      png_memcpy(v, sptr, 4);
2637                      for (j = 0; j < png_pass_inc[pass]; j++)
2638                      {
2639 #ifdef PNG_DEBUG
2640                         if (dp < row || dp+3 > row+png_ptr->row_buf_size)
2641                         {
2642                            printf("dp out of bounds: row=%d, dp=%d, rp=%d\n",
2643                              row, dp, row+png_ptr->row_buf_size);
2644                            printf("row_buf=%d\n",png_ptr->row_buf_size);
2645                         }
2646 #endif
2647                         png_memcpy(dp, v, 4);
2648                         dp -= 4;
2649                      }
2650                      sptr -= 4;
2651                   }
2652                }
2653                else if (pixel_bytes == 6)
2654                {
2655                   for (i = width; i; i--)
2656                   {
2657                      png_byte v[8];
2658                      int j;
2659                      png_memcpy(v, sptr, 6);
2660                      for (j = 0; j < png_pass_inc[pass]; j++)
2661                      {
2662                         png_memcpy(dp, v, 6);
2663                         dp -= 6;
2664                      }
2665                      sptr -= 6;
2666                   }
2667                }
2668                else if (pixel_bytes == 8)
2669                {
2670                   for (i = width; i; i--)
2671                   {
2672                      png_byte v[8];
2673                      int j;
2674                      png_memcpy(v, sptr, 8);
2675                      for (j = 0; j < png_pass_inc[pass]; j++)
2676                      {
2677                         png_memcpy(dp, v, 8);
2678                         dp -= 8;
2679                      }
2680                      sptr -= 8;
2681                   }
2682                }
2683                else     /* GRR:  should never be reached */
2684                {
2685                   for (i = width; i; i--)
2686                   {
2687                      png_byte v[8];
2688                      int j;
2689                      png_memcpy(v, sptr, pixel_bytes);
2690                      for (j = 0; j < png_pass_inc[pass]; j++)
2691                      {
2692                         png_memcpy(dp, v, pixel_bytes);
2693                         dp -= pixel_bytes;
2694                      }
2695                      sptr -= pixel_bytes;
2696                   }
2697                }
2698
2699             } /* end if (MMX not supported) */
2700             break;
2701          }
2702       } /* end switch (row_info->pixel_depth) */
2703
2704       row_info->width = final_width;
2705       row_info->rowbytes = ((final_width *
2706          (png_uint_32)row_info->pixel_depth + 7) >> 3);
2707    }
2708
2709 } /* end png_do_read_interlace() */
2710
2711 #endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2712 #endif /* PNG_READ_INTERLACING_SUPPORTED */
2713
2714
2715
2716 #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
2717 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
2718
2719 // These variables are utilized in the functions below.  They are declared
2720 // globally here to ensure alignment on 8-byte boundaries.
2721
2722 union uAll {              // overlays a long long and a double in one object
2723    long long use;         // first member, so the brace initializers below set it
2724    double  align;         // presumably present only to request 8-byte alignment - TODO confirm
2725 } _LBCarryMask = {0x0101010101010101LL},  // 0x01 in each of the 8 bytes
2726   _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},  // 0x7f in each byte (top bit of every byte clear)
2727   _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem; // no initializer: zero at startup
2728
2729 #ifdef PNG_THREAD_UNSAFE_OK
2730 //===========================================================================//
2731 //                                                                           //
2732 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G           //
2733 //                                                                           //
2734 //===========================================================================//
2735
2736 // Optimized code for PNG Average filter decoder
2737
2738 static void /* PRIVATE */
2739 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2740                             png_bytep prev_row)
2741 {
2742    int bpp;
2743    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
2744    int dummy_value_S;
2745    int dummy_value_D;
2746
2747    bpp = (row_info->pixel_depth + 7) >> 3;  // get # bytes per pixel
2748    _FullLength  = row_info->rowbytes;       // # of bytes to filter
2749
2750    __asm__ __volatile__ (
2751       // initialize address pointers and offset
2752 #ifdef __PIC__
2753       "pushl %%ebx                 \n\t" // save index to Global Offset Table
2754 #endif
2755 //pre "movl row, %%edi             \n\t" // edi:  Avg(x)
2756       "xorl %%ebx, %%ebx           \n\t" // ebx:  x
2757       "movl %%edi, %%edx           \n\t"
2758 //pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2759 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
2760       "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
2761
2762       "xorl %%eax,%%eax            \n\t"
2763
2764       // Compute the Raw value for the first bpp bytes
2765       //    Raw(x) = Avg(x) + (Prior(x)/2)
2766    "avg_rlp:                       \n\t"
2767       "movb (%%esi,%%ebx,),%%al    \n\t" // load al with Prior(x)
2768       "incl %%ebx                  \n\t"
2769       "shrb %%al                   \n\t" // divide by 2
2770       "addb -1(%%edi,%%ebx,),%%al  \n\t" // add Avg(x); -1 to offset inc ebx
2771 //pre "cmpl bpp, %%ebx             \n\t" // (bpp is preloaded into ecx)
2772       "cmpl %%ecx, %%ebx           \n\t"
2773       "movb %%al,-1(%%edi,%%ebx,)  \n\t" // write Raw(x); -1 to offset inc ebx
2774       "jb avg_rlp                  \n\t" // mov does not affect flags
2775
2776       // get # of bytes to alignment
2777       "movl %%edi, _dif            \n\t" // take start of row
2778       "addl %%ebx, _dif            \n\t" // add bpp
2779       "addl $0xf, _dif             \n\t" // add 7+8 to incr past alignment bdry
2780       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
2781       "subl %%edi, _dif            \n\t" // subtract from start => value ebx at
2782       "jz avg_go                   \n\t" //  alignment
2783
2784       // fix alignment
2785       // Compute the Raw value for the bytes up to the alignment boundary
2786       //    Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2787       "xorl %%ecx, %%ecx           \n\t"
2788
2789    "avg_lp1:                       \n\t"
2790       "xorl %%eax, %%eax           \n\t"
2791       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
2792       "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
2793       "addw %%cx, %%ax             \n\t"
2794       "incl %%ebx                  \n\t"
2795       "shrw %%ax                   \n\t" // divide by 2
2796       "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2797       "cmpl _dif, %%ebx            \n\t" // check if at alignment boundary
2798       "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2799       "jb avg_lp1                  \n\t" // repeat until at alignment boundary
2800
2801    "avg_go:                        \n\t"
2802       "movl _FullLength, %%eax     \n\t"
2803       "movl %%eax, %%ecx           \n\t"
2804       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
2805       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
2806       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
2807       "movl %%ecx, _MMXLength      \n\t"
2808 #ifdef __PIC__
2809       "popl %%ebx                  \n\t" // restore index to Global Offset Table
2810 #endif
2811
2812       : "=c" (dummy_value_c),            // output regs (dummy)
2813         "=S" (dummy_value_S),
2814         "=D" (dummy_value_D)
2815
2816       : "0" (bpp),       // ecx          // input regs
2817         "1" (prev_row),  // esi
2818         "2" (row)        // edi
2819
2820       : "%eax", "%edx"                   // clobber list
2821 #ifndef __PIC__
2822       , "%ebx"
2823 #endif
2824       // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2825       // (seems to work fine without...)
2826    );
2827
2828    // now do the math for the rest of the row
2829    switch (bpp)
2830    {
2831       case 3:
2832       {
2833          _ActiveMask.use  = 0x0000000000ffffffLL;
2834          _ShiftBpp.use = 24;    // == 3 * 8
2835          _ShiftRem.use = 40;    // == 64 - 24
2836
2837          __asm__ __volatile__ (
2838             // re-init address pointers and offset
2839             "movq _ActiveMask, %%mm7      \n\t"
2840             "movl _dif, %%ecx             \n\t" // ecx:  x = offset to
2841             "movq _LBCarryMask, %%mm5     \n\t" //  alignment boundary
2842 // preload  "movl row, %%edi              \n\t" // edi:  Avg(x)
2843             "movq _HBClearMask, %%mm4     \n\t"
2844 // preload  "movl prev_row, %%esi         \n\t" // esi:  Prior(x)
2845
2846             // prime the pump:  load the first Raw(x-bpp) data set
2847             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2848                                                 // (correct pos. in loop below)
2849          "avg_3lp:                        \n\t"
2850             "movq (%%edi,%%ecx,), %%mm0   \n\t" // load mm0 with Avg(x)
2851             "movq %%mm5, %%mm3            \n\t"
2852             "psrlq _ShiftRem, %%mm2       \n\t" // correct position Raw(x-bpp)
2853                                                 // data
2854             "movq (%%esi,%%ecx,), %%mm1   \n\t" // load mm1 with Prior(x)
2855             "movq %%mm7, %%mm6            \n\t"
2856             "pand %%mm1, %%mm3            \n\t" // get lsb for each prev_row byte
2857             "psrlq $1, %%mm1              \n\t" // divide prev_row bytes by 2
2858             "pand  %%mm4, %%mm1           \n\t" // clear invalid bit 7 of each
2859                                                 // byte
2860             "paddb %%mm1, %%mm0           \n\t" // add (Prev_row/2) to Avg for
2861                                                 // each byte
2862             // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2863             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
2864                                                 // LBCarrys
2865             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
2866                                                 // where both
2867                                // lsb's were == 1 (only valid for active group)
2868             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2869             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
2870                                                 // byte
2871             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2872                                                 // for each byte
2873             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 1
2874                                                 // bytes to add to Avg
2875             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
2876                                                 // Avg for each Active
2877                                //  byte
2878             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2879             "psllq _ShiftBpp, %%mm6       \n\t" // shift the mm6 mask to cover
2880                                                 // bytes 3-5
2881             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
2882             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
2883             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
2884                                                 // LBCarrys
2885             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
2886                                                 // where both
2887                                // lsb's were == 1 (only valid for active group)
2888             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2889             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
2890                                                 // byte
2891             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2892                                                 // for each byte
2893             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
2894                                                 // bytes to add to Avg
2895             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
2896                                                 // Avg for each Active
2897                                //  byte
2898
2899             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2900             "psllq _ShiftBpp, %%mm6       \n\t" // shift mm6 mask to cover last
2901                                                 // two
2902                                  // bytes
2903             "movq %%mm0, %%mm2            \n\t" // mov updated Raws to mm2
2904             "psllq _ShiftBpp, %%mm2       \n\t" // shift data to pos. correctly
2905                               // Data only needs to be shifted once here to
2906                               // get the correct x-bpp offset.
2907             "movq %%mm3, %%mm1            \n\t" // now use mm1 for getting
2908                                                 // LBCarrys
2909             "pand %%mm2, %%mm1            \n\t" // get LBCarrys for each byte
2910                                                 // where both
2911                               // lsb's were == 1 (only valid for active group)
2912             "psrlq $1, %%mm2              \n\t" // divide raw bytes by 2
2913             "pand  %%mm4, %%mm2           \n\t" // clear invalid bit 7 of each
2914                                                 // byte
2915             "paddb %%mm1, %%mm2           \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2916                                                 // for each byte
2917             "pand %%mm6, %%mm2            \n\t" // leave only Active Group 2
2918                                                 // bytes to add to Avg
2919             "addl $8, %%ecx               \n\t"
2920             "paddb %%mm2, %%mm0           \n\t" // add (Raw/2) + LBCarrys to
2921                                                 // Avg for each Active
2922                                                 // byte
2923             // now ready to write back to memory
2924             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2925             // move updated Raw(x) to use as Raw(x-bpp) for next loop
2926             "cmpl _MMXLength, %%ecx       \n\t"
2927             "movq %%mm0, %%mm2            \n\t" // mov updated Raw(x) to mm2
2928             "jb avg_3lp                   \n\t"
2929
2930             : "=S" (dummy_value_S),             // output regs (dummy)
2931               "=D" (dummy_value_D)
2932
2933             : "0" (prev_row),  // esi           // input regs
2934               "1" (row)        // edi
2935
2936             : "%ecx"                            // clobber list
2937 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2938             , "%mm0", "%mm1", "%mm2", "%mm3"
2939             , "%mm4", "%mm5", "%mm6", "%mm7"
2940 #endif
2941          );
2942       }
2943       break;  // end 3 bpp
2944
2945       case 6:
2946       case 4:
2947       //case 7:   // who wrote this?  PNG doesn't support 5 or 7 bytes/pixel
2948       //case 5:   // GRR BOGUS
2949       {
2950          _ActiveMask.use  = 0xffffffffffffffffLL; // use shift below to clear
2951                                                   // appropriate inactive bytes
2952          _ShiftBpp.use = bpp << 3;
2953          _ShiftRem.use = 64 - _ShiftBpp.use;
2954
2955          __asm__ __volatile__ (
2956             "movq _HBClearMask, %%mm4    \n\t"
2957
2958             // re-init address pointers and offset
2959             "movl _dif, %%ecx            \n\t" // ecx:  x = offset to
2960                                                // alignment boundary
2961
2962             // load _ActiveMask and clear all bytes except for 1st active group
2963             "movq _ActiveMask, %%mm7     \n\t"
2964 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
2965             "psrlq _ShiftRem, %%mm7      \n\t"
2966 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
2967             "movq %%mm7, %%mm6           \n\t"
2968             "movq _LBCarryMask, %%mm5    \n\t"
2969             "psllq _ShiftBpp, %%mm6      \n\t" // create mask for 2nd active
2970                                                // group
2971
2972             // prime the pump:  load the first Raw(x-bpp) data set
2973             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2974                                           // (we correct pos. in loop below)
2975          "avg_4lp:                       \n\t"
2976             "movq (%%edi,%%ecx,), %%mm0  \n\t"
2977             "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
2978             "movq (%%esi,%%ecx,), %%mm1  \n\t"
2979             // add (Prev_row/2) to average
2980             "movq %%mm5, %%mm3           \n\t"
2981             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
2982             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
2983             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
2984                                                // byte
2985             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
2986                                                // each byte
2987             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
2988             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
2989                                                // LBCarrys
2990             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
2991                                                // where both
2992                               // lsb's were == 1 (only valid for active group)
2993             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
2994             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
2995                                                // byte
2996             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
2997                                                // for each byte
2998             "pand %%mm7, %%mm2           \n\t" // leave only Active Group 1
2999                                                // bytes to add to Avg
3000             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
3001                                                // for each Active
3002                               // byte
3003             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3004             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3005             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3006             "addl $8, %%ecx              \n\t"
3007             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3008                                                // LBCarrys
3009             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3010                                                // where both
3011                               // lsb's were == 1 (only valid for active group)
3012             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3013             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3014                                                // byte
3015             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3016                                                // for each byte
3017             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3018                                                // bytes to add to Avg
3019             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3020                                                // Avg for each Active
3021                               // byte
3022             "cmpl _MMXLength, %%ecx      \n\t"
3023             // now ready to write back to memory
3024             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3025             // prep Raw(x-bpp) for next loop
3026             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3027             "jb avg_4lp                  \n\t"
3028
3029             : "=S" (dummy_value_S),            // output regs (dummy)
3030               "=D" (dummy_value_D)
3031
3032             : "0" (prev_row),  // esi          // input regs
3033               "1" (row)        // edi
3034
3035             : "%ecx"                           // clobber list
3036 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3037             , "%mm0", "%mm1", "%mm2", "%mm3"
3038             , "%mm4", "%mm5", "%mm6", "%mm7"
3039 #endif
3040          );
3041       }
3042       break;  // end 4,6 bpp
3043
3044       case 2:
3045       {
3046          _ActiveMask.use  = 0x000000000000ffffLL;
3047          _ShiftBpp.use = 16;   // == 2 * 8
3048          _ShiftRem.use = 48;   // == 64 - 16
3049
3050          __asm__ __volatile__ (
3051             // load _ActiveMask
3052             "movq _ActiveMask, %%mm7     \n\t"
3053             // re-init address pointers and offset
3054             "movl _dif, %%ecx            \n\t" // ecx:  x = offset to alignment
3055                                                // boundary
3056             "movq _LBCarryMask, %%mm5    \n\t"
3057 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
3058             "movq _HBClearMask, %%mm4    \n\t"
3059 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3060
3061             // prime the pump:  load the first Raw(x-bpp) data set
3062             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3063                               // (we correct pos. in loop below)
3064          "avg_2lp:                       \n\t"
3065             "movq (%%edi,%%ecx,), %%mm0  \n\t"
3066             "psrlq _ShiftRem, %%mm2      \n\t" // shift data to pos. correctly
3067             "movq (%%esi,%%ecx,), %%mm1  \n\t" //  (GRR BUGFIX:  was psllq)
3068             // add (Prev_row/2) to average
3069             "movq %%mm5, %%mm3           \n\t"
3070             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3071             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3072             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3073                                                // byte
3074             "movq %%mm7, %%mm6           \n\t"
3075             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3076                                                // each byte
3077
3078             // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
3079             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3080                                                // LBCarrys
3081             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3082                                                // where both
3083                                                // lsb's were == 1 (only valid
3084                                                // for active group)
3085             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3086             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3087                                                // byte
3088             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3089                                                // for each byte
3090             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 1
3091                                                // bytes to add to Avg
3092             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to Avg
3093                                                // for each Active byte
3094
3095             // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
3096             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
3097                                                // bytes 2 & 3
3098             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3099             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3100             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3101                                                // LBCarrys
3102             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3103                                                // where both
3104                                                // lsb's were == 1 (only valid
3105                                                // for active group)
3106             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3107             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3108                                                // byte
3109             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3110                                                // for each byte
3111             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3112                                                // bytes to add to Avg
3113             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3114                                                // Avg for each Active byte
3115
3116             // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
3117             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
3118                                                // bytes 4 & 5
3119             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3120             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3121             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3122                                                // LBCarrys
3123             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3124                                                // where both lsb's were == 1
3125                                                // (only valid for active group)
3126             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3127             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3128                                                // byte
3129             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3130                                                // for each byte
3131             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3132                                                // bytes to add to Avg
3133             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3134                                                // Avg for each Active byte
3135
3136             // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
3137             "psllq _ShiftBpp, %%mm6      \n\t" // shift the mm6 mask to cover
3138                                                // bytes 6 & 7
3139             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3140             "psllq _ShiftBpp, %%mm2      \n\t" // shift data to pos. correctly
3141             "addl $8, %%ecx              \n\t"
3142             "movq %%mm3, %%mm1           \n\t" // now use mm1 for getting
3143                                                // LBCarrys
3144             "pand %%mm2, %%mm1           \n\t" // get LBCarrys for each byte
3145                                                // where both
3146                                                // lsb's were == 1 (only valid
3147                                                // for active group)
3148             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3149             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3150                                                // byte
3151             "paddb %%mm1, %%mm2          \n\t" // add LBCarrys to (Raw(x-bpp)/2)
3152                                                // for each byte
3153             "pand %%mm6, %%mm2           \n\t" // leave only Active Group 2
3154                                                // bytes to add to Avg
3155             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) + LBCarrys to
3156                                                // Avg for each Active byte
3157
3158             "cmpl _MMXLength, %%ecx      \n\t"
3159             // now ready to write back to memory
3160             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3161             // prep Raw(x-bpp) for next loop
3162             "movq %%mm0, %%mm2           \n\t" // mov updated Raws to mm2
3163             "jb avg_2lp                  \n\t"
3164
3165             : "=S" (dummy_value_S),            // output regs (dummy)
3166               "=D" (dummy_value_D)
3167
3168             : "0" (prev_row),  // esi          // input regs
3169               "1" (row)        // edi
3170
3171             : "%ecx"                           // clobber list
3172 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3173             , "%mm0", "%mm1", "%mm2", "%mm3"
3174             , "%mm4", "%mm5", "%mm6", "%mm7"
3175 #endif
3176          );
3177       }
3178       break;  // end 2 bpp
3179
3180       case 1:
3181       {
3182          __asm__ __volatile__ (
3183             // re-init address pointers and offset
3184 #ifdef __PIC__
3185             "pushl %%ebx                 \n\t" // save Global Offset Table index
3186 #endif
3187             "movl _dif, %%ebx            \n\t" // ebx:  x = offset to alignment
3188                                                // boundary
3189 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
3190             "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
3191             "jnb avg_1end                \n\t"
3192             // do Avg decode for remaining bytes
3193 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3194             "movl %%edi, %%edx           \n\t"
3195 // preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
3196             "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
3197             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
3198                                                //  in loop below
3199          "avg_1lp:                       \n\t"
3200             // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3201             "xorl %%eax, %%eax           \n\t"
3202             "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
3203             "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
3204             "addw %%cx, %%ax             \n\t"
3205             "incl %%ebx                  \n\t"
3206             "shrw %%ax                   \n\t" // divide by 2
3207             "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset
3208                                                // inc ebx
3209             "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
3210             "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
3211                          // mov does not affect flags; -1 to offset inc ebx
3212             "jb avg_1lp                  \n\t"
3213
3214          "avg_1end:                      \n\t"
3215 #ifdef __PIC__
3216             "popl %%ebx                  \n\t" // Global Offset Table index
3217 #endif
3218
3219             : "=c" (dummy_value_c),            // output regs (dummy)
3220               "=S" (dummy_value_S),
3221               "=D" (dummy_value_D)
3222
3223             : "0" (bpp),       // ecx          // input regs
3224               "1" (prev_row),  // esi
3225               "2" (row)        // edi
3226
3227             : "%eax", "%edx"                   // clobber list
3228 #ifndef __PIC__
3229             , "%ebx"
3230 #endif
3231          );
3232       }
3233       return;  // end 1 bpp
3234
3235       case 8:
3236       {
3237          __asm__ __volatile__ (
3238             // re-init address pointers and offset
3239             "movl _dif, %%ecx            \n\t" // ecx:  x == offset to alignment
3240             "movq _LBCarryMask, %%mm5    \n\t" //            boundary
3241 // preload  "movl row, %%edi             \n\t" // edi:  Avg(x)
3242             "movq _HBClearMask, %%mm4    \n\t"
3243 // preload  "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3244
3245             // prime the pump:  load the first Raw(x-bpp) data set
3246             "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
3247                                       // (NO NEED to correct pos. in loop below)
3248
3249          "avg_8lp:                       \n\t"
3250             "movq (%%edi,%%ecx,), %%mm0  \n\t"
3251             "movq %%mm5, %%mm3           \n\t"
3252             "movq (%%esi,%%ecx,), %%mm1  \n\t"
3253             "addl $8, %%ecx              \n\t"
3254             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3255             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3256             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
3257                                                //  where both lsb's were == 1
3258             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3259             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7, each byte
3260             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg, each byte
3261             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7, each byte
3262             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg, each
3263             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
3264             "cmpl _MMXLength, %%ecx      \n\t"
3265             "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3266             "movq %%mm0, %%mm2           \n\t" // reuse as Raw(x-bpp)
3267             "jb avg_8lp                  \n\t"
3268
3269             : "=S" (dummy_value_S),            // output regs (dummy)
3270               "=D" (dummy_value_D)
3271
3272             : "0" (prev_row),  // esi          // input regs
3273               "1" (row)        // edi
3274
3275             : "%ecx"                           // clobber list
3276 #if 0  /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3277             , "%mm0", "%mm1", "%mm2"
3278             , "%mm3", "%mm4", "%mm5"
3279 #endif
3280          );
3281       }
3282       break;  // end 8 bpp
3283
3284       default:                  // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3285       {
3286
3287 #ifdef PNG_DEBUG
3288          // GRR:  PRINT ERROR HERE:  SHOULD NEVER BE REACHED
3289         png_debug(1,
3290         "Internal logic error in pnggccrd (png_read_filter_row_mmx_avg())\n");
3291 #endif
3292
3293 #if 0
3294         __asm__ __volatile__ (
3295             "movq _LBCarryMask, %%mm5    \n\t"
3296             // re-init address pointers and offset
3297             "movl _dif, %%ebx            \n\t" // ebx:  x = offset to
3298                                                // alignment boundary
3299             "movl row, %%edi             \n\t" // edi:  Avg(x)
3300             "movq _HBClearMask, %%mm4    \n\t"
3301             "movl %%edi, %%edx           \n\t"
3302             "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3303             "subl bpp, %%edx             \n\t" // edx:  Raw(x-bpp)
3304          "avg_Alp:                       \n\t"
3305             "movq (%%edi,%%ebx,), %%mm0  \n\t"
3306             "movq %%mm5, %%mm3           \n\t"
3307             "movq (%%esi,%%ebx,), %%mm1  \n\t"
3308             "pand %%mm1, %%mm3           \n\t" // get lsb for each prev_row byte
3309             "movq (%%edx,%%ebx,), %%mm2  \n\t"
3310             "psrlq $1, %%mm1             \n\t" // divide prev_row bytes by 2
3311             "pand %%mm2, %%mm3           \n\t" // get LBCarrys for each byte
3312                                                // where both lsb's were == 1
3313             "psrlq $1, %%mm2             \n\t" // divide raw bytes by 2
3314             "pand  %%mm4, %%mm1          \n\t" // clear invalid bit 7 of each
3315                                                // byte
3316             "paddb %%mm3, %%mm0          \n\t" // add LBCarrys to Avg for each
3317                                                // byte
3318             "pand  %%mm4, %%mm2          \n\t" // clear invalid bit 7 of each
3319                                                // byte
3320             "paddb %%mm1, %%mm0          \n\t" // add (Prev_row/2) to Avg for
3321                                                // each byte
3322             "addl $8, %%ebx              \n\t"
3323             "paddb %%mm2, %%mm0          \n\t" // add (Raw/2) to Avg for each
3324                                                // byte
3325             "cmpl _MMXLength, %%ebx      \n\t"
3326             "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3327             "jb avg_Alp                  \n\t"
3328
3329             : // FIXASM: output regs/vars go here, e.g.:  "=m" (memory_var)
3330
3331             : // FIXASM: input regs, e.g.:  "c" (count), "S" (src), "D" (dest)
3332
3333             : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3334          );
3335 #endif /* 0 - NEVER REACHED */
3336       }
3337       break;
3338
3339    } // end switch (bpp)
3340
3341    __asm__ __volatile__ (
3342       // MMX acceleration complete; now do clean-up
3343       // check if any remaining bytes left to decode
3344 #ifdef __PIC__
3345       "pushl %%ebx                 \n\t" // save index to Global Offset Table
3346 #endif
3347       "movl _MMXLength, %%ebx      \n\t" // ebx:  x == offset bytes after MMX
3348 //pre "movl row, %%edi             \n\t" // edi:  Avg(x)
3349       "cmpl _FullLength, %%ebx     \n\t" // test if offset at end of array
3350       "jnb avg_end                 \n\t"
3351
3352       // do Avg decode for remaining bytes
3353 //pre "movl prev_row, %%esi        \n\t" // esi:  Prior(x)
3354       "movl %%edi, %%edx           \n\t"
3355 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
3356       "subl %%ecx, %%edx           \n\t" // edx:  Raw(x-bpp)
3357       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
3358
3359    "avg_lp2:                       \n\t"
3360       // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3361       "xorl %%eax, %%eax           \n\t"
3362       "movb (%%esi,%%ebx,), %%cl   \n\t" // load cl with Prior(x)
3363       "movb (%%edx,%%ebx,), %%al   \n\t" // load al with Raw(x-bpp)
3364       "addw %%cx, %%ax             \n\t"
3365       "incl %%ebx                  \n\t"
3366       "shrw %%ax                   \n\t" // divide by 2
3367       "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3368       "cmpl _FullLength, %%ebx     \n\t" // check if at end of array
3369       "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3370       "jb avg_lp2                  \n\t" //  affect flags; -1 to offset inc ebx]
3371
3372    "avg_end:                       \n\t"
3373       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
3374 #ifdef __PIC__
3375       "popl %%ebx                  \n\t" // restore index to Global Offset Table
3376 #endif
3377
3378       : "=c" (dummy_value_c),            // output regs (dummy)
3379         "=S" (dummy_value_S),
3380         "=D" (dummy_value_D)
3381
3382       : "0" (bpp),       // ecx          // input regs
3383         "1" (prev_row),  // esi
3384         "2" (row)        // edi
3385
3386       : "%eax", "%edx"                   // clobber list
3387 #ifndef __PIC__
3388       , "%ebx"
3389 #endif
3390    );
3391
3392 } /* end png_read_filter_row_mmx_avg() */
3393 #endif
3394
3395
3396
3397 #ifdef PNG_THREAD_UNSAFE_OK
3398 //===========================================================================//
3399 //                                                                           //
3400 //         P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H         //
3401 //                                                                           //
3402 //===========================================================================//
3403
3404 // Optimized code for PNG Paeth filter decoder
3405
3406 static void /* PRIVATE */
3407 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3408                               png_bytep prev_row)
3409 {
3410    int bpp;
3411    int dummy_value_c;   // fix 'forbidden register 2 (cx) was spilled' error
3412    int dummy_value_S;
3413    int dummy_value_D;
3414
3415    bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3416    _FullLength  = row_info->rowbytes; // # of bytes to filter
3417
3418    __asm__ __volatile__ (
3419 #ifdef __PIC__
3420       "pushl %%ebx                 \n\t" // save index to Global Offset Table
3421 #endif
3422       "xorl %%ebx, %%ebx           \n\t" // ebx:  x offset
3423 //pre "movl row, %%edi             \n\t"
3424       "xorl %%edx, %%edx           \n\t" // edx:  x-bpp offset
3425 //pre "movl prev_row, %%esi        \n\t"
3426       "xorl %%eax, %%eax           \n\t"
3427
3428       // Compute the Raw value for the first bpp bytes
3429       // Note: the formula works out to be always
3430       //   Raw(x) = Paeth(x) + Prior(x)      where x < bpp
3431    "paeth_rlp:                     \n\t"
3432       "movb (%%edi,%%ebx,), %%al   \n\t"
3433       "addb (%%esi,%%ebx,), %%al   \n\t"
3434       "incl %%ebx                  \n\t"
3435 //pre "cmpl bpp, %%ebx             \n\t" (bpp is preloaded into ecx)
3436       "cmpl %%ecx, %%ebx           \n\t"
3437       "movb %%al, -1(%%edi,%%ebx,) \n\t"
3438       "jb paeth_rlp                \n\t"
3439       // get # of bytes to alignment
3440       "movl %%edi, _dif            \n\t" // take start of row
3441       "addl %%ebx, _dif            \n\t" // add bpp
3442       "xorl %%ecx, %%ecx           \n\t"
3443       "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past alignment
3444                                          // boundary
3445       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
3446       "subl %%edi, _dif            \n\t" // subtract from start ==> value ebx
3447                                          // at alignment
3448       "jz paeth_go                 \n\t"
3449       // fix alignment
3450
3451    "paeth_lp1:                     \n\t"
3452       "xorl %%eax, %%eax           \n\t"
3453       // pav = p - a = (a + b - c) - a = b - c
3454       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
3455       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3456       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
3457       "movl %%eax, _patemp         \n\t" // Save pav for later use
3458       "xorl %%eax, %%eax           \n\t"
3459       // pbv = p - b = (a + b - c) - b = a - c
3460       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
3461       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
3462       "movl %%eax, %%ecx           \n\t"
3463       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3464       "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
3465       // pc = abs(pcv)
3466       "testl $0x80000000, %%eax    \n\t"
3467       "jz paeth_pca                \n\t"
3468       "negl %%eax                  \n\t" // reverse sign of neg values
3469
3470    "paeth_pca:                     \n\t"
3471       "movl %%eax, _pctemp         \n\t" // save pc for later use
3472       // pb = abs(pbv)
3473       "testl $0x80000000, %%ecx    \n\t"
3474       "jz paeth_pba                \n\t"
3475       "negl %%ecx                  \n\t" // reverse sign of neg values
3476
3477    "paeth_pba:                     \n\t"
3478       "movl %%ecx, _pbtemp         \n\t" // save pb for later use
3479       // pa = abs(pav)
3480       "movl _patemp, %%eax         \n\t"
3481       "testl $0x80000000, %%eax    \n\t"
3482       "jz paeth_paa                \n\t"
3483       "negl %%eax                  \n\t" // reverse sign of neg values
3484
3485    "paeth_paa:                     \n\t"
3486       "movl %%eax, _patemp         \n\t" // save pa for later use
3487       // test if pa <= pb
3488       "cmpl %%ecx, %%eax           \n\t"
3489       "jna paeth_abb               \n\t"
3490       // pa > pb; now test if pb <= pc
3491       "cmpl _pctemp, %%ecx         \n\t"
3492       "jna paeth_bbc               \n\t"
3493       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3494       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3495       "jmp paeth_paeth             \n\t"
3496
3497    "paeth_bbc:                     \n\t"
3498       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3499       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
3500       "jmp paeth_paeth             \n\t"
3501
3502    "paeth_abb:                     \n\t"
3503       // pa <= pb; now test if pa <= pc
3504       "cmpl _pctemp, %%eax         \n\t"
3505       "jna paeth_abc               \n\t"
3506       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3507       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
3508       "jmp paeth_paeth             \n\t"
3509
3510    "paeth_abc:                     \n\t"
3511       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3512       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
3513
3514    "paeth_paeth:                   \n\t"
3515       "incl %%ebx                  \n\t"
3516       "incl %%edx                  \n\t"
3517       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3518       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3519       "cmpl _dif, %%ebx            \n\t"
3520       "jb paeth_lp1                \n\t"
3521
3522    "paeth_go:                      \n\t"
3523       "movl _FullLength, %%ecx     \n\t"
3524       "movl %%ecx, %%eax           \n\t"
3525       "subl %%ebx, %%eax           \n\t" // subtract alignment fix
3526       "andl $0x00000007, %%eax     \n\t" // calc bytes over mult of 8
3527       "subl %%eax, %%ecx           \n\t" // drop over bytes from original length
3528       "movl %%ecx, _MMXLength      \n\t"
3529 #ifdef __PIC__
3530       "popl %%ebx                  \n\t" // restore index to Global Offset Table
3531 #endif
3532
3533       : "=c" (dummy_value_c),            // output regs (dummy)
3534         "=S" (dummy_value_S),
3535         "=D" (dummy_value_D)
3536
3537       : "0" (bpp),       // ecx          // input regs
3538         "1" (prev_row),  // esi
3539         "2" (row)        // edi
3540
3541       : "%eax", "%edx"                   // clobber list
3542 #ifndef __PIC__
3543       , "%ebx"
3544 #endif
3545    );
3546
3547    // now do the math for the rest of the row
3548    switch (bpp)
3549    {
3550       case 3:
3551       {
3552          _ActiveMask.use = 0x0000000000ffffffLL;
3553          _ActiveMaskEnd.use = 0xffff000000000000LL;
3554          _ShiftBpp.use = 24;    // == bpp(3) * 8
3555          _ShiftRem.use = 40;    // == 64 - 24
3556
3557          __asm__ __volatile__ (
3558             "movl _dif, %%ecx            \n\t"
3559 // preload  "movl row, %%edi             \n\t"
3560 // preload  "movl prev_row, %%esi        \n\t"
3561             "pxor %%mm0, %%mm0           \n\t"
3562             // prime the pump:  load the first Raw(x-bpp) data set
3563             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3564          "paeth_3lp:                     \n\t"
3565             "psrlq _ShiftRem, %%mm1      \n\t" // shift last 3 bytes to 1st
3566                                                // 3 bytes
3567             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3568             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
3569             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3570             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
3571             "psrlq _ShiftRem, %%mm3      \n\t" // shift last 3 bytes to 1st
3572                                                // 3 bytes
3573             // pav = p - a = (a + b - c) - a = b - c
3574             "movq %%mm2, %%mm4           \n\t"
3575             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
3576             // pbv = p - b = (a + b - c) - b = a - c
3577             "movq %%mm1, %%mm5           \n\t"
3578             "psubw %%mm3, %%mm4          \n\t"
3579             "pxor %%mm7, %%mm7           \n\t"
3580             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3581             "movq %%mm4, %%mm6           \n\t"
3582             "psubw %%mm3, %%mm5          \n\t"
3583
3584             // pa = abs(p-a) = abs(pav)
3585             // pb = abs(p-b) = abs(pbv)
3586             // pc = abs(p-c) = abs(pcv)
3587             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3588             "paddw %%mm5, %%mm6          \n\t"
3589             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
3590             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3591             "psubw %%mm0, %%mm4          \n\t"
3592             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
3593             "psubw %%mm0, %%mm4          \n\t"
3594             "psubw %%mm7, %%mm5          \n\t"
3595             "pxor %%mm0, %%mm0           \n\t"
3596             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3597             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
3598             "psubw %%mm7, %%mm5          \n\t"
3599             "psubw %%mm0, %%mm6          \n\t"
3600             //  test pa <= pb
3601             "movq %%mm4, %%mm7           \n\t"
3602             "psubw %%mm0, %%mm6          \n\t"
3603             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3604             "movq %%mm7, %%mm0           \n\t"
3605             // use mm7 mask to merge pa & pb
3606             "pand %%mm7, %%mm5           \n\t"
3607             // use mm0 mask copy to merge a & b
3608             "pand %%mm0, %%mm2           \n\t"
3609             "pandn %%mm4, %%mm7          \n\t"
3610             "pandn %%mm1, %%mm0          \n\t"
3611             "paddw %%mm5, %%mm7          \n\t"
3612             "paddw %%mm2, %%mm0          \n\t"
3613             //  test  ((pa <= pb)? pa:pb) <= pc
3614             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3615             "pxor %%mm1, %%mm1           \n\t"
3616             "pand %%mm7, %%mm3           \n\t"
3617             "pandn %%mm0, %%mm7          \n\t"
3618             "paddw %%mm3, %%mm7          \n\t"
3619             "pxor %%mm0, %%mm0           \n\t"
3620             "packuswb %%mm1, %%mm7       \n\t"
3621             "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
3622             "pand _ActiveMask, %%mm7     \n\t"
3623             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
3624             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3625             "punpcklbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3626             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3627             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as
3628                                                // Raw(x-bpp)
3629             // now do Paeth for 2nd set of bytes (3-5)
3630             "psrlq _ShiftBpp, %%mm2      \n\t" // load b=Prior(x) step 2
3631             "punpcklbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3632             "pxor %%mm7, %%mm7           \n\t"
3633             "punpcklbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3634             // pbv = p - b = (a + b - c) - b = a - c
3635             "movq %%mm1, %%mm5           \n\t"
3636             // pav = p - a = (a + b - c) - a = b - c
3637             "movq %%mm2, %%mm4           \n\t"
3638             "psubw %%mm3, %%mm5          \n\t"
3639             "psubw %%mm3, %%mm4          \n\t"
3640             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3641             //       pav + pbv = pbv + pav
3642             "movq %%mm5, %%mm6           \n\t"
3643             "paddw %%mm4, %%mm6          \n\t"
3644
3645             // pa = abs(p-a) = abs(pav)
3646             // pb = abs(p-b) = abs(pbv)
3647             // pc = abs(p-c) = abs(pcv)
3648             "pcmpgtw %%mm5, %%mm0        \n\t" // create mask pbv bytes < 0
3649             "pcmpgtw %%mm4, %%mm7        \n\t" // create mask pav bytes < 0
3650             "pand %%mm5, %%mm0           \n\t" // only pbv bytes < 0 in mm0
3651             "pand %%mm4, %%mm7           \n\t" // only pav bytes < 0 in mm7
3652             "psubw %%mm0, %%mm5          \n\t"
3653             "psubw %%mm7, %%mm4          \n\t"
3654             "psubw %%mm0, %%mm5          \n\t"
3655             "psubw %%mm7, %%mm4          \n\t"
3656             "pxor %%mm0, %%mm0           \n\t"
3657             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3658             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3659             "psubw %%mm0, %%mm6          \n\t"
3660             //  test pa <= pb
3661             "movq %%mm4, %%mm7           \n\t"
3662             "psubw %%mm0, %%mm6          \n\t"
3663             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3664             "movq %%mm7, %%mm0           \n\t"
3665             // use mm7 mask to merge pa & pb
3666             "pand %%mm7, %%mm5           \n\t"
3667             // use mm0 mask copy to merge a & b
3668             "pand %%mm0, %%mm2           \n\t"
3669             "pandn %%mm4, %%mm7          \n\t"
3670             "pandn %%mm1, %%mm0          \n\t"
3671             "paddw %%mm5, %%mm7          \n\t"
3672             "paddw %%mm2, %%mm0          \n\t"
3673             //  test  ((pa <= pb)? pa:pb) <= pc
3674             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3675             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3676             "pand %%mm7, %%mm3           \n\t"
3677             "pandn %%mm0, %%mm7          \n\t"
3678             "pxor %%mm1, %%mm1           \n\t"
3679             "paddw %%mm3, %%mm7          \n\t"
3680             "pxor %%mm0, %%mm0           \n\t"
3681             "packuswb %%mm1, %%mm7       \n\t"
3682             "movq %%mm2, %%mm3           \n\t" // load c=Prior(x-bpp) step 1
3683             "pand _ActiveMask, %%mm7     \n\t"
3684             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3685             "psllq _ShiftBpp, %%mm7      \n\t" // shift bytes to 2nd group of
3686                                                // 3 bytes
3687              // pav = p - a = (a + b - c) - a = b - c
3688             "movq %%mm2, %%mm4           \n\t"
3689             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3690             "psllq _ShiftBpp, %%mm3      \n\t" // load c=Prior(x-bpp) step 2
3691             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3692             "movq %%mm7, %%mm1           \n\t"
3693             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3694             "psllq _ShiftBpp, %%mm1      \n\t" // shift bytes
3695                                     // now mm1 will be used as Raw(x-bpp)
3696             // now do Paeth for 3rd, and final, set of bytes (6-7)
3697             "pxor %%mm7, %%mm7           \n\t"
3698             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3699             "psubw %%mm3, %%mm4          \n\t"
3700             // pbv = p - b = (a + b - c) - b = a - c
3701             "movq %%mm1, %%mm5           \n\t"
3702             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3703             "movq %%mm4, %%mm6           \n\t"
3704             "psubw %%mm3, %%mm5          \n\t"
3705             "pxor %%mm0, %%mm0           \n\t"
3706             "paddw %%mm5, %%mm6          \n\t"
3707
3708             // pa = abs(p-a) = abs(pav)
3709             // pb = abs(p-b) = abs(pbv)
3710             // pc = abs(p-c) = abs(pcv)
3711             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3712             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3713             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3714             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3715             "psubw %%mm0, %%mm4          \n\t"
3716             "psubw %%mm7, %%mm5          \n\t"
3717             "psubw %%mm0, %%mm4          \n\t"
3718             "psubw %%mm7, %%mm5          \n\t"
3719             "pxor %%mm0, %%mm0           \n\t"
3720             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3721             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3722             "psubw %%mm0, %%mm6          \n\t"
3723             //  test pa <= pb
3724             "movq %%mm4, %%mm7           \n\t"
3725             "psubw %%mm0, %%mm6          \n\t"
3726             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3727             "movq %%mm7, %%mm0           \n\t"
3728             // use mm0 mask copy to merge a & b
3729             "pand %%mm0, %%mm2           \n\t"
3730             // use mm7 mask to merge pa & pb
3731             "pand %%mm7, %%mm5           \n\t"
3732             "pandn %%mm1, %%mm0          \n\t"
3733             "pandn %%mm4, %%mm7          \n\t"
3734             "paddw %%mm2, %%mm0          \n\t"
3735             "paddw %%mm5, %%mm7          \n\t"
3736             //  test  ((pa <= pb)? pa:pb) <= pc
3737             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3738             "pand %%mm7, %%mm3           \n\t"
3739             "pandn %%mm0, %%mm7          \n\t"
3740             "paddw %%mm3, %%mm7          \n\t"
3741             "pxor %%mm1, %%mm1           \n\t"
3742             "packuswb %%mm7, %%mm1       \n\t"
3743             // step ecx to next set of 8 bytes and repeat loop til done
3744             "addl $8, %%ecx              \n\t"
3745             "pand _ActiveMaskEnd, %%mm1  \n\t"
3746             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with
3747                                                  // Raw(x)
3748
3749             "cmpl _MMXLength, %%ecx      \n\t"
3750             "pxor %%mm0, %%mm0           \n\t" // pxor does not affect flags
3751             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3752                                  // mm1 will be used as Raw(x-bpp) next loop
3753                            // mm3 ready to be used as Prior(x-bpp) next loop
3754             "jb paeth_3lp                \n\t"
3755
3756             : "=S" (dummy_value_S),             // output regs (dummy)
3757               "=D" (dummy_value_D)
3758
3759             : "0" (prev_row),  // esi           // input regs
3760               "1" (row)        // edi
3761
3762             : "%ecx"                            // clobber list
3763 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3764             , "%mm0", "%mm1", "%mm2", "%mm3"
3765             , "%mm4", "%mm5", "%mm6", "%mm7"
3766 #endif
3767          );
3768       }
3769       break;  // end 3 bpp
3770
3771       case 6:
3772       //case 7:   // GRR BOGUS
3773       //case 5:   // GRR BOGUS
3774       {
3775          _ActiveMask.use  = 0x00000000ffffffffLL;
3776          _ActiveMask2.use = 0xffffffff00000000LL;
3777          _ShiftBpp.use = bpp << 3;    // == bpp * 8
3778          _ShiftRem.use = 64 - _ShiftBpp.use;
3779
3780          __asm__ __volatile__ (
3781             "movl _dif, %%ecx            \n\t"
3782 // preload  "movl row, %%edi             \n\t"
3783 // preload  "movl prev_row, %%esi        \n\t"
3784             // prime the pump:  load the first Raw(x-bpp) data set
3785             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3786             "pxor %%mm0, %%mm0           \n\t"
3787
3788          "paeth_6lp:                     \n\t"
3789             // must shift to position Raw(x-bpp) data
3790             "psrlq _ShiftRem, %%mm1      \n\t"
3791             // do first set of 4 bytes
3792             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3793             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
3794             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3795             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
3796             // must shift to position Prior(x-bpp) data
3797             "psrlq _ShiftRem, %%mm3      \n\t"
3798             // pav = p - a = (a + b - c) - a = b - c
3799             "movq %%mm2, %%mm4           \n\t"
3800             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
3801             // pbv = p - b = (a + b - c) - b = a - c
3802             "movq %%mm1, %%mm5           \n\t"
3803             "psubw %%mm3, %%mm4          \n\t"
3804             "pxor %%mm7, %%mm7           \n\t"
3805             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3806             "movq %%mm4, %%mm6           \n\t"
3807             "psubw %%mm3, %%mm5          \n\t"
3808             // pa = abs(p-a) = abs(pav)
3809             // pb = abs(p-b) = abs(pbv)
3810             // pc = abs(p-c) = abs(pcv)
3811             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3812             "paddw %%mm5, %%mm6          \n\t"
3813             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3814             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3815             "psubw %%mm0, %%mm4          \n\t"
3816             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3817             "psubw %%mm0, %%mm4          \n\t"
3818             "psubw %%mm7, %%mm5          \n\t"
3819             "pxor %%mm0, %%mm0           \n\t"
3820             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3821             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3822             "psubw %%mm7, %%mm5          \n\t"
3823             "psubw %%mm0, %%mm6          \n\t"
3824             //  test pa <= pb
3825             "movq %%mm4, %%mm7           \n\t"
3826             "psubw %%mm0, %%mm6          \n\t"
3827             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3828             "movq %%mm7, %%mm0           \n\t"
3829             // use mm7 mask to merge pa & pb
3830             "pand %%mm7, %%mm5           \n\t"
3831             // use mm0 mask copy to merge a & b
3832             "pand %%mm0, %%mm2           \n\t"
3833             "pandn %%mm4, %%mm7          \n\t"
3834             "pandn %%mm1, %%mm0          \n\t"
3835             "paddw %%mm5, %%mm7          \n\t"
3836             "paddw %%mm2, %%mm0          \n\t"
3837             //  test  ((pa <= pb)? pa:pb) <= pc
3838             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3839             "pxor %%mm1, %%mm1           \n\t"
3840             "pand %%mm7, %%mm3           \n\t"
3841             "pandn %%mm0, %%mm7          \n\t"
3842             "paddw %%mm3, %%mm7          \n\t"
3843             "pxor %%mm0, %%mm0           \n\t"
3844             "packuswb %%mm1, %%mm7       \n\t"
3845             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3846             "pand _ActiveMask, %%mm7     \n\t"
3847             "psrlq _ShiftRem, %%mm3      \n\t"
3848             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x) step 1
3849             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3850             "movq %%mm2, %%mm6           \n\t"
3851             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
3852             "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3853             "psllq _ShiftBpp, %%mm6      \n\t"
3854             "movq %%mm7, %%mm5           \n\t"
3855             "psrlq _ShiftRem, %%mm1      \n\t"
3856             "por %%mm6, %%mm3            \n\t"
3857             "psllq _ShiftBpp, %%mm5      \n\t"
3858             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3859             "por %%mm5, %%mm1            \n\t"
3860             // do second set of 4 bytes
3861             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
3862             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3863             // pav = p - a = (a + b - c) - a = b - c
3864             "movq %%mm2, %%mm4           \n\t"
3865             // pbv = p - b = (a + b - c) - b = a - c
3866             "movq %%mm1, %%mm5           \n\t"
3867             "psubw %%mm3, %%mm4          \n\t"
3868             "pxor %%mm7, %%mm7           \n\t"
3869             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3870             "movq %%mm4, %%mm6           \n\t"
3871             "psubw %%mm3, %%mm5          \n\t"
3872             // pa = abs(p-a) = abs(pav)
3873             // pb = abs(p-b) = abs(pbv)
3874             // pc = abs(p-c) = abs(pcv)
3875             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3876             "paddw %%mm5, %%mm6          \n\t"
3877             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm7
3878             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3879             "psubw %%mm0, %%mm4          \n\t"
3880             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm0
3881             "psubw %%mm0, %%mm4          \n\t"
3882             "psubw %%mm7, %%mm5          \n\t"
3883             "pxor %%mm0, %%mm0           \n\t"
3884             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3885             "pand %%mm6, %%mm0           \n\t" // only pav bytes < 0 in mm7
3886             "psubw %%mm7, %%mm5          \n\t"
3887             "psubw %%mm0, %%mm6          \n\t"
3888             //  test pa <= pb
3889             "movq %%mm4, %%mm7           \n\t"
3890             "psubw %%mm0, %%mm6          \n\t"
3891             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3892             "movq %%mm7, %%mm0           \n\t"
3893             // use mm7 mask to merge pa & pb
3894             "pand %%mm7, %%mm5           \n\t"
3895             // use mm0 mask copy to merge a & b
3896             "pand %%mm0, %%mm2           \n\t"
3897             "pandn %%mm4, %%mm7          \n\t"
3898             "pandn %%mm1, %%mm0          \n\t"
3899             "paddw %%mm5, %%mm7          \n\t"
3900             "paddw %%mm2, %%mm0          \n\t"
3901             //  test  ((pa <= pb)? pa:pb) <= pc
3902             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3903             "pxor %%mm1, %%mm1           \n\t"
3904             "pand %%mm7, %%mm3           \n\t"
3905             "pandn %%mm0, %%mm7          \n\t"
3906             "pxor %%mm1, %%mm1           \n\t"
3907             "paddw %%mm3, %%mm7          \n\t"
3908             "pxor %%mm0, %%mm0           \n\t"
3909             // step ecx to next set of 8 bytes and repeat loop til done
3910             "addl $8, %%ecx              \n\t"
3911             "packuswb %%mm7, %%mm1       \n\t"
3912             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3913             "cmpl _MMXLength, %%ecx      \n\t"
3914             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3915                                 // mm1 will be used as Raw(x-bpp) next loop
3916             "jb paeth_6lp                \n\t"
3917
3918             : "=S" (dummy_value_S),             // output regs (dummy)
3919               "=D" (dummy_value_D)
3920
3921             : "0" (prev_row),  // esi           // input regs
3922               "1" (row)        // edi
3923
3924             : "%ecx"                            // clobber list
3925 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3926             , "%mm0", "%mm1", "%mm2", "%mm3"
3927             , "%mm4", "%mm5", "%mm6", "%mm7"
3928 #endif
3929          );
3930       }
3931       break;  // end 6 bpp
3932
3933       case 4:
3934       {
3935          _ActiveMask.use  = 0x00000000ffffffffLL;
3936
3937          __asm__ __volatile__ (
3938             "movl _dif, %%ecx            \n\t"
3939 // preload  "movl row, %%edi             \n\t"
3940 // preload  "movl prev_row, %%esi        \n\t"
3941             "pxor %%mm0, %%mm0           \n\t"
3942             // prime the pump:  load the first Raw(x-bpp) data set
3943             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3944                                      //  a=Raw(x-bpp) bytes
3945          "paeth_4lp:                     \n\t"
3946             // do first set of 4 bytes
3947             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3948             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
3949             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
3950             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
3951             // pav = p - a = (a + b - c) - a = b - c
3952             "movq %%mm2, %%mm4           \n\t"
3953             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
3954             // pbv = p - b = (a + b - c) - b = a - c
3955             "movq %%mm1, %%mm5           \n\t"
3956             "psubw %%mm3, %%mm4          \n\t"
3957             "pxor %%mm7, %%mm7           \n\t"
3958             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3959             "movq %%mm4, %%mm6           \n\t"
3960             "psubw %%mm3, %%mm5          \n\t"
3961             // pa = abs(p-a) = abs(pav)
3962             // pb = abs(p-b) = abs(pbv)
3963             // pc = abs(p-c) = abs(pcv)
3964             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
3965             "paddw %%mm5, %%mm6          \n\t"
3966             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
3967             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
3968             "psubw %%mm0, %%mm4          \n\t"
3969             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
3970             "psubw %%mm0, %%mm4          \n\t"
3971             "psubw %%mm7, %%mm5          \n\t"
3972             "pxor %%mm0, %%mm0           \n\t"
3973             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
3974             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
3975             "psubw %%mm7, %%mm5          \n\t"
3976             "psubw %%mm0, %%mm6          \n\t"
3977             //  test pa <= pb
3978             "movq %%mm4, %%mm7           \n\t"
3979             "psubw %%mm0, %%mm6          \n\t"
3980             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
3981             "movq %%mm7, %%mm0           \n\t"
3982             // use mm7 mask to merge pa & pb
3983             "pand %%mm7, %%mm5           \n\t"
3984             // use mm0 mask copy to merge a & b
3985             "pand %%mm0, %%mm2           \n\t"
3986             "pandn %%mm4, %%mm7          \n\t"
3987             "pandn %%mm1, %%mm0          \n\t"
3988             "paddw %%mm5, %%mm7          \n\t"
3989             "paddw %%mm2, %%mm0          \n\t"
3990             //  test  ((pa <= pb)? pa:pb) <= pc
3991             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
3992             "pxor %%mm1, %%mm1           \n\t"
3993             "pand %%mm7, %%mm3           \n\t"
3994             "pandn %%mm0, %%mm7          \n\t"
3995             "paddw %%mm3, %%mm7          \n\t"
3996             "pxor %%mm0, %%mm0           \n\t"
3997             "packuswb %%mm1, %%mm7       \n\t"
3998             "movq (%%esi,%%ecx,), %%mm3  \n\t" // load c=Prior(x-bpp)
3999             "pand _ActiveMask, %%mm7     \n\t"
4000             "movq %%mm3, %%mm2           \n\t" // load b=Prior(x) step 1
4001             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4002             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
4003             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
4004             "movq %%mm7, %%mm1           \n\t" // now mm1 will be used as Raw(x-bpp)
4005             // do second set of 4 bytes
4006             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4007             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
4008             // pav = p - a = (a + b - c) - a = b - c
4009             "movq %%mm2, %%mm4           \n\t"
4010             // pbv = p - b = (a + b - c) - b = a - c
4011             "movq %%mm1, %%mm5           \n\t"
4012             "psubw %%mm3, %%mm4          \n\t"
4013             "pxor %%mm7, %%mm7           \n\t"
4014             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4015             "movq %%mm4, %%mm6           \n\t"
4016             "psubw %%mm3, %%mm5          \n\t"
4017             // pa = abs(p-a) = abs(pav)
4018             // pb = abs(p-b) = abs(pbv)
4019             // pc = abs(p-c) = abs(pcv)
4020             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4021             "paddw %%mm5, %%mm6          \n\t"
4022             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4023             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4024             "psubw %%mm0, %%mm4          \n\t"
4025             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4026             "psubw %%mm0, %%mm4          \n\t"
4027             "psubw %%mm7, %%mm5          \n\t"
4028             "pxor %%mm0, %%mm0           \n\t"
4029             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4030             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4031             "psubw %%mm7, %%mm5          \n\t"
4032             "psubw %%mm0, %%mm6          \n\t"
4033             //  test pa <= pb
4034             "movq %%mm4, %%mm7           \n\t"
4035             "psubw %%mm0, %%mm6          \n\t"
4036             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4037             "movq %%mm7, %%mm0           \n\t"
4038             // use mm7 mask to merge pa & pb
4039             "pand %%mm7, %%mm5           \n\t"
4040             // use mm0 mask copy to merge a & b
4041             "pand %%mm0, %%mm2           \n\t"
4042             "pandn %%mm4, %%mm7          \n\t"
4043             "pandn %%mm1, %%mm0          \n\t"
4044             "paddw %%mm5, %%mm7          \n\t"
4045             "paddw %%mm2, %%mm0          \n\t"
4046             //  test  ((pa <= pb)? pa:pb) <= pc
4047             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4048             "pxor %%mm1, %%mm1           \n\t"
4049             "pand %%mm7, %%mm3           \n\t"
4050             "pandn %%mm0, %%mm7          \n\t"
4051             "pxor %%mm1, %%mm1           \n\t"
4052             "paddw %%mm3, %%mm7          \n\t"
4053             "pxor %%mm0, %%mm0           \n\t"
4054             // step ecx to next set of 8 bytes and repeat loop til done
4055             "addl $8, %%ecx              \n\t"
4056             "packuswb %%mm7, %%mm1       \n\t"
4057             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
4058             "cmpl _MMXLength, %%ecx      \n\t"
4059             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4060                                 // mm1 will be used as Raw(x-bpp) next loop
4061             "jb paeth_4lp                \n\t"
4062
4063             : "=S" (dummy_value_S),             // output regs (dummy)
4064               "=D" (dummy_value_D)
4065
4066             : "0" (prev_row),  // esi           // input regs
4067               "1" (row)        // edi
4068
4069             : "%ecx"                            // clobber list
4070 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4071             , "%mm0", "%mm1", "%mm2", "%mm3"
4072             , "%mm4", "%mm5", "%mm6", "%mm7"
4073 #endif
4074          );
4075       }
4076       break;  // end 4 bpp
4077
4078       case 8:                          // bpp == 8
4079       {
4080          _ActiveMask.use  = 0x00000000ffffffffLL;
4081
4082          __asm__ __volatile__ (
4083             "movl _dif, %%ecx            \n\t"
4084 // preload  "movl row, %%edi             \n\t"
4085 // preload  "movl prev_row, %%esi        \n\t"
4086             "pxor %%mm0, %%mm0           \n\t"
4087             // prime the pump:  load the first Raw(x-bpp) data set
4088             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
4089                                        //  a=Raw(x-bpp) bytes
4090          "paeth_8lp:                     \n\t"
4091             // do first set of 4 bytes
4092             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4093             "punpcklbw %%mm0, %%mm1      \n\t" // unpack Low bytes of a
4094             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
4095             "punpcklbw %%mm0, %%mm2      \n\t" // unpack Low bytes of b
4096             // pav = p - a = (a + b - c) - a = b - c
4097             "movq %%mm2, %%mm4           \n\t"
4098             "punpcklbw %%mm0, %%mm3      \n\t" // unpack Low bytes of c
4099             // pbv = p - b = (a + b - c) - b = a - c
4100             "movq %%mm1, %%mm5           \n\t"
4101             "psubw %%mm3, %%mm4          \n\t"
4102             "pxor %%mm7, %%mm7           \n\t"
4103             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4104             "movq %%mm4, %%mm6           \n\t"
4105             "psubw %%mm3, %%mm5          \n\t"
4106             // pa = abs(p-a) = abs(pav)
4107             // pb = abs(p-b) = abs(pbv)
4108             // pc = abs(p-c) = abs(pcv)
4109             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4110             "paddw %%mm5, %%mm6          \n\t"
4111             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4112             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4113             "psubw %%mm0, %%mm4          \n\t"
4114             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4115             "psubw %%mm0, %%mm4          \n\t"
4116             "psubw %%mm7, %%mm5          \n\t"
4117             "pxor %%mm0, %%mm0           \n\t"
4118             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4119             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4120             "psubw %%mm7, %%mm5          \n\t"
4121             "psubw %%mm0, %%mm6          \n\t"
4122             //  test pa <= pb
4123             "movq %%mm4, %%mm7           \n\t"
4124             "psubw %%mm0, %%mm6          \n\t"
4125             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4126             "movq %%mm7, %%mm0           \n\t"
4127             // use mm7 mask to merge pa & pb
4128             "pand %%mm7, %%mm5           \n\t"
4129             // use mm0 mask copy to merge a & b
4130             "pand %%mm0, %%mm2           \n\t"
4131             "pandn %%mm4, %%mm7          \n\t"
4132             "pandn %%mm1, %%mm0          \n\t"
4133             "paddw %%mm5, %%mm7          \n\t"
4134             "paddw %%mm2, %%mm0          \n\t"
4135             //  test  ((pa <= pb)? pa:pb) <= pc
4136             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4137             "pxor %%mm1, %%mm1           \n\t"
4138             "pand %%mm7, %%mm3           \n\t"
4139             "pandn %%mm0, %%mm7          \n\t"
4140             "paddw %%mm3, %%mm7          \n\t"
4141             "pxor %%mm0, %%mm0           \n\t"
4142             "packuswb %%mm1, %%mm7       \n\t"
4143             "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
4144             "pand _ActiveMask, %%mm7     \n\t"
4145             "movq (%%esi,%%ecx,), %%mm2  \n\t" // load b=Prior(x)
4146             "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
4147             "punpckhbw %%mm0, %%mm3      \n\t" // unpack High bytes of c
4148             "movq %%mm7, (%%edi,%%ecx,)  \n\t" // write back updated value
4149             "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
4150
4151             // do second set of 4 bytes
4152             "punpckhbw %%mm0, %%mm2      \n\t" // unpack High bytes of b
4153             "punpckhbw %%mm0, %%mm1      \n\t" // unpack High bytes of a
4154             // pav = p - a = (a + b - c) - a = b - c
4155             "movq %%mm2, %%mm4           \n\t"
4156             // pbv = p - b = (a + b - c) - b = a - c
4157             "movq %%mm1, %%mm5           \n\t"
4158             "psubw %%mm3, %%mm4          \n\t"
4159             "pxor %%mm7, %%mm7           \n\t"
4160             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4161             "movq %%mm4, %%mm6           \n\t"
4162             "psubw %%mm3, %%mm5          \n\t"
4163             // pa = abs(p-a) = abs(pav)
4164             // pb = abs(p-b) = abs(pbv)
4165             // pc = abs(p-c) = abs(pcv)
4166             "pcmpgtw %%mm4, %%mm0        \n\t" // create mask pav bytes < 0
4167             "paddw %%mm5, %%mm6          \n\t"
4168             "pand %%mm4, %%mm0           \n\t" // only pav bytes < 0 in mm0
4169             "pcmpgtw %%mm5, %%mm7        \n\t" // create mask pbv bytes < 0
4170             "psubw %%mm0, %%mm4          \n\t"
4171             "pand %%mm5, %%mm7           \n\t" // only pbv bytes < 0 in mm7
4172             "psubw %%mm0, %%mm4          \n\t"
4173             "psubw %%mm7, %%mm5          \n\t"
4174             "pxor %%mm0, %%mm0           \n\t"
4175             "pcmpgtw %%mm6, %%mm0        \n\t" // create mask pcv bytes < 0
4176             "pand %%mm6, %%mm0           \n\t" // only pcv bytes < 0 in mm0
4177             "psubw %%mm7, %%mm5          \n\t"
4178             "psubw %%mm0, %%mm6          \n\t"
4179             //  test pa <= pb
4180             "movq %%mm4, %%mm7           \n\t"
4181             "psubw %%mm0, %%mm6          \n\t"
4182             "pcmpgtw %%mm5, %%mm7        \n\t" // pa > pb?
4183             "movq %%mm7, %%mm0           \n\t"
4184             // use mm7 mask to merge pa & pb
4185             "pand %%mm7, %%mm5           \n\t"
4186             // use mm0 mask copy to merge a & b
4187             "pand %%mm0, %%mm2           \n\t"
4188             "pandn %%mm4, %%mm7          \n\t"
4189             "pandn %%mm1, %%mm0          \n\t"
4190             "paddw %%mm5, %%mm7          \n\t"
4191             "paddw %%mm2, %%mm0          \n\t"
4192             //  test  ((pa <= pb)? pa:pb) <= pc
4193             "pcmpgtw %%mm6, %%mm7        \n\t" // pab > pc?
4194             "pxor %%mm1, %%mm1           \n\t"
4195             "pand %%mm7, %%mm3           \n\t"
4196             "pandn %%mm0, %%mm7          \n\t"
4197             "pxor %%mm1, %%mm1           \n\t"
4198             "paddw %%mm3, %%mm7          \n\t"
4199             "pxor %%mm0, %%mm0           \n\t"
4200             // step ecx to next set of 8 bytes and repeat loop til done
4201             "addl $8, %%ecx              \n\t"
4202             "packuswb %%mm7, %%mm1       \n\t"
4203             "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
4204             "cmpl _MMXLength, %%ecx      \n\t"
4205             "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
4206                             // mm1 will be used as Raw(x-bpp) next loop
4207             "jb paeth_8lp                \n\t"
4208
4209             : "=S" (dummy_value_S),             // output regs (dummy)
4210               "=D" (dummy_value_D)
4211
4212             : "0" (prev_row),  // esi           // input regs
4213               "1" (row)        // edi
4214
4215             : "%ecx"                            // clobber list
4216 #if 0  /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
4217             , "%mm0", "%mm1", "%mm2", "%mm3"
4218             , "%mm4", "%mm5", "%mm6", "%mm7"
4219 #endif
4220          );
4221       }
4222       break;  // end 8 bpp
4223
4224       case 1:                // bpp = 1
4225       case 2:                // bpp = 2
4226       default:               // bpp > 8
4227       {
4228          __asm__ __volatile__ (
4229 #ifdef __PIC__
4230             "pushl %%ebx                 \n\t" // save Global Offset Table index
4231 #endif
4232             "movl _dif, %%ebx            \n\t"
4233             "cmpl _FullLength, %%ebx     \n\t"
4234             "jnb paeth_dend              \n\t"
4235
4236 // preload  "movl row, %%edi             \n\t"
4237 // preload  "movl prev_row, %%esi        \n\t"
4238             // do Paeth decode for remaining bytes
4239             "movl %%ebx, %%edx           \n\t"
4240 // preload  "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
4241             "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
4242             "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx
4243
4244          "paeth_dlp:                     \n\t"
4245             "xorl %%eax, %%eax           \n\t"
4246             // pav = p - a = (a + b - c) - a = b - c
4247             "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
4248             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4249             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4250             "movl %%eax, _patemp         \n\t" // Save pav for later use
4251             "xorl %%eax, %%eax           \n\t"
4252             // pbv = p - b = (a + b - c) - b = a - c
4253             "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
4254             "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4255             "movl %%eax, %%ecx           \n\t"
4256             // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4257             "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
4258             // pc = abs(pcv)
4259             "testl $0x80000000, %%eax    \n\t"
4260             "jz paeth_dpca               \n\t"
4261             "negl %%eax                  \n\t" // reverse sign of neg values
4262
4263          "paeth_dpca:                    \n\t"
4264             "movl %%eax, _pctemp         \n\t" // save pc for later use
4265             // pb = abs(pbv)
4266             "testl $0x80000000, %%ecx    \n\t"
4267             "jz paeth_dpba               \n\t"
4268             "negl %%ecx                  \n\t" // reverse sign of neg values
4269
4270          "paeth_dpba:                    \n\t"
4271             "movl %%ecx, _pbtemp         \n\t" // save pb for later use
4272             // pa = abs(pav)
4273             "movl _patemp, %%eax         \n\t"
4274             "testl $0x80000000, %%eax    \n\t"
4275             "jz paeth_dpaa               \n\t"
4276             "negl %%eax                  \n\t" // reverse sign of neg values
4277
4278          "paeth_dpaa:                    \n\t"
4279             "movl %%eax, _patemp         \n\t" // save pa for later use
4280             // test if pa <= pb
4281             "cmpl %%ecx, %%eax           \n\t"
4282             "jna paeth_dabb              \n\t"
4283             // pa > pb; now test if pb <= pc
4284             "cmpl _pctemp, %%ecx         \n\t"
4285             "jna paeth_dbbc              \n\t"
4286             // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4287             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4288             "jmp paeth_dpaeth            \n\t"
4289
4290          "paeth_dbbc:                    \n\t"
4291             // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4292             "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
4293             "jmp paeth_dpaeth            \n\t"
4294
4295          "paeth_dabb:                    \n\t"
4296             // pa <= pb; now test if pa <= pc
4297             "cmpl _pctemp, %%eax         \n\t"
4298             "jna paeth_dabc              \n\t"
4299             // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4300             "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4301             "jmp paeth_dpaeth            \n\t"
4302
4303          "paeth_dabc:                    \n\t"
4304             // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4305             "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
4306
4307          "paeth_dpaeth:                  \n\t"
4308             "incl %%ebx                  \n\t"
4309             "incl %%edx                  \n\t"
4310             // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4311             "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4312             "cmpl _FullLength, %%ebx     \n\t"
4313             "jb paeth_dlp                \n\t"
4314
4315          "paeth_dend:                    \n\t"
4316 #ifdef __PIC__
4317             "popl %%ebx                  \n\t" // index to Global Offset Table
4318 #endif
4319
4320             : "=c" (dummy_value_c),            // output regs (dummy)
4321               "=S" (dummy_value_S),
4322               "=D" (dummy_value_D)
4323
4324             : "0" (bpp),       // ecx          // input regs
4325               "1" (prev_row),  // esi
4326               "2" (row)        // edi
4327
4328             : "%eax", "%edx"                   // clobber list
4329 #ifndef __PIC__
4330             , "%ebx"
4331 #endif
4332          );
4333       }
4334       return;                   // No need to go further with this one
4335
4336    } // end switch (bpp)
4337
4338    __asm__ __volatile__ (
4339       // MMX acceleration complete; now do clean-up
4340       // check if any remaining bytes left to decode
4341 #ifdef __PIC__
4342       "pushl %%ebx                 \n\t" // save index to Global Offset Table
4343 #endif
4344       "movl _MMXLength, %%ebx      \n\t"
4345       "cmpl _FullLength, %%ebx     \n\t"
4346       "jnb paeth_end               \n\t"
4347 //pre "movl row, %%edi             \n\t"
4348 //pre "movl prev_row, %%esi        \n\t"
4349       // do Paeth decode for remaining bytes
4350       "movl %%ebx, %%edx           \n\t"
4351 //pre "subl bpp, %%edx             \n\t" // (bpp is preloaded into ecx)
4352       "subl %%ecx, %%edx           \n\t" // edx = ebx - bpp
4353       "xorl %%ecx, %%ecx           \n\t" // zero ecx before using cl & cx below
4354
4355    "paeth_lp2:                     \n\t"
4356       "xorl %%eax, %%eax           \n\t"
4357       // pav = p - a = (a + b - c) - a = b - c
4358       "movb (%%esi,%%ebx,), %%al   \n\t" // load Prior(x) into al
4359       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4360       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4361       "movl %%eax, _patemp         \n\t" // Save pav for later use
4362       "xorl %%eax, %%eax           \n\t"
4363       // pbv = p - b = (a + b - c) - b = a - c
4364       "movb (%%edi,%%edx,), %%al   \n\t" // load Raw(x-bpp) into al
4365       "subl %%ecx, %%eax           \n\t" // subtract Prior(x-bpp)
4366       "movl %%eax, %%ecx           \n\t"
4367       // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4368       "addl _patemp, %%eax         \n\t" // pcv = pav + pbv
4369       // pc = abs(pcv)
4370       "testl $0x80000000, %%eax    \n\t"
4371       "jz paeth_pca2               \n\t"
4372       "negl %%eax                  \n\t" // reverse sign of neg values
4373
4374    "paeth_pca2:                    \n\t"
4375       "movl %%eax, _pctemp         \n\t" // save pc for later use
4376       // pb = abs(pbv)
4377       "testl $0x80000000, %%ecx    \n\t"
4378       "jz paeth_pba2               \n\t"
4379       "negl %%ecx                  \n\t" // reverse sign of neg values
4380
4381    "paeth_pba2:                    \n\t"
4382       "movl %%ecx, _pbtemp         \n\t" // save pb for later use
4383       // pa = abs(pav)
4384       "movl _patemp, %%eax         \n\t"
4385       "testl $0x80000000, %%eax    \n\t"
4386       "jz paeth_paa2               \n\t"
4387       "negl %%eax                  \n\t" // reverse sign of neg values
4388
4389    "paeth_paa2:                    \n\t"
4390       "movl %%eax, _patemp         \n\t" // save pa for later use
4391       // test if pa <= pb
4392       "cmpl %%ecx, %%eax           \n\t"
4393       "jna paeth_abb2              \n\t"
4394       // pa > pb; now test if pb <= pc
4395       "cmpl _pctemp, %%ecx         \n\t"
4396       "jna paeth_bbc2              \n\t"
4397       // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4398       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4399       "jmp paeth_paeth2            \n\t"
4400
4401    "paeth_bbc2:                    \n\t"
4402       // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4403       "movb (%%esi,%%ebx,), %%cl   \n\t" // load Prior(x) into cl
4404       "jmp paeth_paeth2            \n\t"
4405
4406    "paeth_abb2:                    \n\t"
4407       // pa <= pb; now test if pa <= pc
4408       "cmpl _pctemp, %%eax         \n\t"
4409       "jna paeth_abc2              \n\t"
4410       // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4411       "movb (%%esi,%%edx,), %%cl   \n\t" // load Prior(x-bpp) into cl
4412       "jmp paeth_paeth2            \n\t"
4413
4414    "paeth_abc2:                    \n\t"
4415       // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4416       "movb (%%edi,%%edx,), %%cl   \n\t" // load Raw(x-bpp) into cl
4417
4418    "paeth_paeth2:                  \n\t"
4419       "incl %%ebx                  \n\t"
4420       "incl %%edx                  \n\t"
4421       // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4422       "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4423       "cmpl _FullLength, %%ebx     \n\t"
4424       "jb paeth_lp2                \n\t"
4425
4426    "paeth_end:                     \n\t"
4427       "EMMS                        \n\t" // end MMX; prep for poss. FP instrs.
4428 #ifdef __PIC__
4429       "popl %%ebx                  \n\t" // restore index to Global Offset Table
4430 #endif
4431
4432       : "=c" (dummy_value_c),            // output regs (dummy)
4433         "=S" (dummy_value_S),
4434         "=D" (dummy_value_D)
4435
4436       : "0" (bpp),       // ecx          // input regs
4437         "1" (prev_row),  // esi
4438         "2" (row)        // edi
4439
4440       : "%eax", "%edx"                   // clobber list (no input regs!)
4441 #ifndef __PIC__
4442       , "%ebx"
4443 #endif
4444    );
4445
4446 } /* end png_read_filter_row_mmx_paeth() */
4447 #endif
4448
4449
4450
4451
4452 #ifdef PNG_THREAD_UNSAFE_OK
4453 //===========================================================================//
4454 //                                                                           //
4455 //           P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B           //
4456 //                                                                           //
4457 //===========================================================================//
4458
4459 // Optimized code for PNG Sub filter decoder
4460
4461 static void /* PRIVATE */
4462 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4463 {
4464    int bpp;
4465    int dummy_value_a;
4466    int dummy_value_D;
4467
4468    bpp = (row_info->pixel_depth + 7) >> 3;   // calc number of bytes per pixel
4469    _FullLength = row_info->rowbytes - bpp;   // number of bytes to filter
4470
4471    __asm__ __volatile__ (
4472 //pre "movl row, %%edi             \n\t"
4473       "movl %%edi, %%esi           \n\t" // lp = row
4474 //pre "movl bpp, %%eax             \n\t"
4475       "addl %%eax, %%edi           \n\t" // rp = row + bpp
4476 //irr "xorl %%eax, %%eax           \n\t"
4477       // get # of bytes to alignment
4478       "movl %%edi, _dif            \n\t" // take start of row
4479       "addl $0xf, _dif             \n\t" // add 7 + 8 to incr past
4480                                          //  alignment boundary
4481       "xorl %%ecx, %%ecx           \n\t"
4482       "andl $0xfffffff8, _dif      \n\t" // mask to alignment boundary
4483       "subl %%edi, _dif            \n\t" // subtract from start ==> value
4484       "jz sub_go                   \n\t" //  ecx at alignment
4485
4486    "sub_lp1:                       \n\t" // fix alignment
4487       "movb (%%esi,%%ecx,), %%al   \n\t"
4488       "addb %%al, (%%edi,%%ecx,)   \n\t"
4489       "incl %%ecx                  \n\t"
4490       "cmpl _dif, %%ecx            \n\t"
4491       "jb sub_lp1                  \n\t"
4492
4493    "sub_go:                        \n\t"
4494       "movl _FullLength, %%eax     \n\t"
4495       "movl %%eax, %%edx           \n\t"
4496       "subl %%ecx, %%edx           \n\t" // subtract alignment fix
4497       "andl $0x00000007, %%edx     \n\t" // calc bytes over mult of 8
4498       "subl %%edx, %%eax           \n\t" // drop over bytes from length
4499       "movl %%eax, _MMXLength      \n\t"
4500
4501       : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4502         "=D" (dummy_value_D)    // 1
4503
4504       : "0" (bpp),              // eax    // input regs
4505         "1" (row)               // edi
4506
4507       : "%ebx", "%ecx", "%edx"            // clobber list
4508       , "%esi"
4509
4510 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4511       , "%mm0", "%mm1", "%mm2", "%mm3"
4512       , "%mm4", "%mm5", "%mm6", "%mm7"
4513 #endif
4514    );
4515
4516    // now do the math for the rest of the row
4517    switch (bpp)
4518    {
4519       case 3:
4520       {
4521          _ActiveMask.use  = 0x0000ffffff000000LL;
4522          _ShiftBpp.use = 24;       // == 3 * 8
4523          _ShiftRem.use  = 40;      // == 64 - 24
4524
4525          __asm__ __volatile__ (
4526 // preload  "movl row, %%edi              \n\t"
4527             "movq _ActiveMask, %%mm7       \n\t" // load _ActiveMask for 2nd
4528                                                 //  active byte group
4529             "movl %%edi, %%esi            \n\t" // lp = row
4530 // preload  "movl bpp, %%eax              \n\t"
4531             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4532             "movq %%mm7, %%mm6            \n\t"
4533             "movl _dif, %%edx             \n\t"
4534             "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
4535                                                 //  3rd active byte group
4536             // prime the pump:  load the first Raw(x-bpp) data set
4537             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4538
4539          "sub_3lp:                        \n\t" // shift data for adding first
4540             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4541                                                 //  shift clears inactive bytes)
4542             // add 1st active group
4543             "movq (%%edi,%%edx,), %%mm0   \n\t"
4544             "paddb %%mm1, %%mm0           \n\t"
4545
4546             // add 2nd active group
4547             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4548             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4549             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
4550             "paddb %%mm1, %%mm0           \n\t"
4551
4552             // add 3rd active group
4553             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4554             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4555             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
4556             "addl $8, %%edx               \n\t"
4557             "paddb %%mm1, %%mm0           \n\t"
4558
4559             "cmpl _MMXLength, %%edx       \n\t"
4560             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4561             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
4562             "jb sub_3lp                   \n\t"
4563
4564             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4565               "=D" (dummy_value_D)    // 1
4566
4567             : "0" (bpp),              // eax    // input regs
4568               "1" (row)               // edi
4569
4570             : "%edx", "%esi"                    // clobber list
4571 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4572             , "%mm0", "%mm1", "%mm6", "%mm7"
4573 #endif
4574          );
4575       }
4576       break;
4577
4578       case 1:
4579       {
4580          __asm__ __volatile__ (
4581             "movl _dif, %%edx            \n\t"
4582 // preload  "movl row, %%edi             \n\t"
4583             "cmpl _FullLength, %%edx     \n\t"
4584             "jnb sub_1end                \n\t"
4585             "movl %%edi, %%esi           \n\t" // lp = row
4586             "xorl %%eax, %%eax           \n\t"
4587 // preload  "movl bpp, %%eax             \n\t"
4588             "addl %%eax, %%edi           \n\t" // rp = row + bpp
4589
4590          "sub_1lp:                       \n\t"
4591             "movb (%%esi,%%edx,), %%al   \n\t"
4592             "addb %%al, (%%edi,%%edx,)   \n\t"
4593             "incl %%edx                  \n\t"
4594             "cmpl _FullLength, %%edx     \n\t"
4595             "jb sub_1lp                  \n\t"
4596
4597          "sub_1end:                      \n\t"
4598
4599             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4600               "=D" (dummy_value_D)    // 1
4601
4602             : "0" (bpp),              // eax    // input regs
4603               "1" (row)               // edi
4604
4605             : "%edx", "%esi"                    // clobber list
4606          );
4607       }
4608       return;
4609
4610       case 6:
4611       case 4:
4612       //case 7:   // GRR BOGUS
4613       //case 5:   // GRR BOGUS
4614       {
4615          _ShiftBpp.use = bpp << 3;
4616          _ShiftRem.use = 64 - _ShiftBpp.use;
4617
4618          __asm__ __volatile__ (
4619 // preload  "movl row, %%edi              \n\t"
4620             "movl _dif, %%edx             \n\t"
4621             "movl %%edi, %%esi            \n\t" // lp = row
4622 // preload  "movl bpp, %%eax              \n\t"
4623             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4624
4625             // prime the pump:  load the first Raw(x-bpp) data set
4626             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4627
4628          "sub_4lp:                        \n\t" // shift data for adding first
4629             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4630                                                 //  shift clears inactive bytes)
4631             "movq (%%edi,%%edx,), %%mm0   \n\t"
4632             "paddb %%mm1, %%mm0           \n\t"
4633
4634             // add 2nd active group
4635             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4636             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4637             "addl $8, %%edx               \n\t"
4638             "paddb %%mm1, %%mm0           \n\t"
4639
4640             "cmpl _MMXLength, %%edx       \n\t"
4641             "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4642             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
4643             "jb sub_4lp                   \n\t"
4644
4645             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4646               "=D" (dummy_value_D)    // 1
4647
4648             : "0" (bpp),              // eax    // input regs
4649               "1" (row)               // edi
4650
4651             : "%edx", "%esi"                    // clobber list
4652 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4653             , "%mm0", "%mm1"
4654 #endif
4655          );
4656       }
4657       break;
4658
4659       case 2:
4660       {
4661          _ActiveMask.use = 0x00000000ffff0000LL;
4662          _ShiftBpp.use = 16;       // == 2 * 8
4663          _ShiftRem.use = 48;       // == 64 - 16
4664
4665          __asm__ __volatile__ (
4666             "movq _ActiveMask, %%mm7      \n\t" // load _ActiveMask for 2nd
4667                                                 //  active byte group
4668             "movl _dif, %%edx             \n\t"
4669             "movq %%mm7, %%mm6            \n\t"
4670 // preload  "movl row, %%edi              \n\t"
4671             "psllq _ShiftBpp, %%mm6       \n\t" // move mask in mm6 to cover
4672                                                 //  3rd active byte group
4673             "movl %%edi, %%esi            \n\t" // lp = row
4674             "movq %%mm6, %%mm5            \n\t"
4675 // preload  "movl bpp, %%eax              \n\t"
4676             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4677             "psllq _ShiftBpp, %%mm5       \n\t" // move mask in mm5 to cover
4678                                                 //  4th active byte group
4679             // prime the pump:  load the first Raw(x-bpp) data set
4680             "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4681
4682          "sub_2lp:                        \n\t" // shift data for adding first
4683             "psrlq _ShiftRem, %%mm1       \n\t" //  bpp bytes (no need for mask;
4684                                                 //  shift clears inactive bytes)
4685             // add 1st active group
4686             "movq (%%edi,%%edx,), %%mm0   \n\t"
4687             "paddb %%mm1, %%mm0           \n\t"
4688
4689             // add 2nd active group
4690             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4691             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4692             "pand %%mm7, %%mm1            \n\t" // mask to use 2nd active group
4693             "paddb %%mm1, %%mm0           \n\t"
4694
4695             // add 3rd active group
4696             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4697             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4698             "pand %%mm6, %%mm1            \n\t" // mask to use 3rd active group
4699             "paddb %%mm1, %%mm0           \n\t"
4700
4701             // add 4th active group
4702             "movq %%mm0, %%mm1            \n\t" // mov updated Raws to mm1
4703             "psllq _ShiftBpp, %%mm1       \n\t" // shift data to pos. correctly
4704             "pand %%mm5, %%mm1            \n\t" // mask to use 4th active group
4705             "addl $8, %%edx               \n\t"
4706             "paddb %%mm1, %%mm0           \n\t"
4707             "cmpl _MMXLength, %%edx       \n\t"
4708             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4709             "movq %%mm0, %%mm1            \n\t" // prep 1st add at top of loop
4710             "jb sub_2lp                   \n\t"
4711
4712             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4713               "=D" (dummy_value_D)    // 1
4714
4715             : "0" (bpp),              // eax    // input regs
4716               "1" (row)               // edi
4717
4718             : "%edx", "%esi"                    // clobber list
4719 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4720             , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4721 #endif
4722          );
4723       }
4724       break;
4725
4726       case 8:
4727       {
4728          __asm__ __volatile__ (
4729 // preload  "movl row, %%edi              \n\t"
4730             "movl _dif, %%edx             \n\t"
4731             "movl %%edi, %%esi            \n\t" // lp = row
4732 // preload  "movl bpp, %%eax              \n\t"
4733             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4734             "movl _MMXLength, %%ecx       \n\t"
4735
4736             // prime the pump:  load the first Raw(x-bpp) data set
4737             "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4738             "andl $0x0000003f, %%ecx      \n\t" // calc bytes over mult of 64
4739
4740          "sub_8lp:                        \n\t"
4741             "movq (%%edi,%%edx,), %%mm0   \n\t" // load Sub(x) for 1st 8 bytes
4742             "paddb %%mm7, %%mm0           \n\t"
4743             "movq 8(%%edi,%%edx,), %%mm1  \n\t" // load Sub(x) for 2nd 8 bytes
4744             "movq %%mm0, (%%edi,%%edx,)   \n\t" // write Raw(x) for 1st 8 bytes
4745
4746             // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4747             // This will be repeated for each group of 8 bytes with the 8th
4748             // group being used as the Raw(x-bpp) for the 1st group of the
4749             // next loop.
4750
4751             "paddb %%mm0, %%mm1           \n\t"
4752             "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4753             "movq %%mm1, 8(%%edi,%%edx,)  \n\t" // write Raw(x) for 2nd 8 bytes
4754             "paddb %%mm1, %%mm2           \n\t"
4755             "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4756             "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4757             "paddb %%mm2, %%mm3           \n\t"
4758             "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4759             "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4760             "paddb %%mm3, %%mm4           \n\t"
4761             "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4762             "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4763             "paddb %%mm4, %%mm5           \n\t"
4764             "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4765             "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4766             "paddb %%mm5, %%mm6           \n\t"
4767             "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4768             "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4769             "addl $64, %%edx              \n\t"
4770             "paddb %%mm6, %%mm7           \n\t"
4771             "cmpl %%ecx, %%edx            \n\t"
4772             "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4773             "jb sub_8lp                   \n\t"
4774
4775             "cmpl _MMXLength, %%edx       \n\t"
4776             "jnb sub_8lt8                 \n\t"
4777
4778          "sub_8lpA:                       \n\t"
4779             "movq (%%edi,%%edx,), %%mm0   \n\t"
4780             "addl $8, %%edx               \n\t"
4781             "paddb %%mm7, %%mm0           \n\t"
4782             "cmpl _MMXLength, %%edx       \n\t"
4783             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4784             "movq %%mm0, %%mm7            \n\t" // move calculated Raw(x) data
4785                                                 //  to mm1 to be new Raw(x-bpp)
4786                                                 //  for next loop
4787             "jb sub_8lpA                  \n\t"
4788
4789          "sub_8lt8:                       \n\t"
4790
4791             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4792               "=D" (dummy_value_D)    // 1
4793
4794             : "0" (bpp),              // eax    // input regs
4795               "1" (row)               // edi
4796
4797             : "%ecx", "%edx", "%esi"            // clobber list
4798 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4799             , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4800 #endif
4801          );
4802       }
4803       break;
4804
4805       default:                // bpp greater than 8 bytes       GRR BOGUS
4806       {
4807          __asm__ __volatile__ (
4808             "movl _dif, %%edx             \n\t"
4809 // preload  "movl row, %%edi              \n\t"
4810             "movl %%edi, %%esi            \n\t" // lp = row
4811 // preload  "movl bpp, %%eax              \n\t"
4812             "addl %%eax, %%edi            \n\t" // rp = row + bpp
4813
4814          "sub_Alp:                        \n\t"
4815             "movq (%%edi,%%edx,), %%mm0   \n\t"
4816             "movq (%%esi,%%edx,), %%mm1   \n\t"
4817             "addl $8, %%edx               \n\t"
4818             "paddb %%mm1, %%mm0           \n\t"
4819             "cmpl _MMXLength, %%edx       \n\t"
4820             "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4821                                                 //  -8 to offset addl edx
4822             "jb sub_Alp                   \n\t"
4823
4824             : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4825               "=D" (dummy_value_D)    // 1
4826
4827             : "0" (bpp),              // eax    // input regs
4828               "1" (row)               // edi
4829
4830             : "%edx", "%esi"                    // clobber list
4831 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4832             , "%mm0", "%mm1"
4833 #endif
4834          );
4835       }
4836       break;
4837
4838    } // end switch (bpp)
4839
4840    __asm__ __volatile__ (
4841       "movl _MMXLength, %%edx       \n\t"
4842 //pre "movl row, %%edi              \n\t"
4843       "cmpl _FullLength, %%edx      \n\t"
4844       "jnb sub_end                  \n\t"
4845
4846       "movl %%edi, %%esi            \n\t" // lp = row
4847 //pre "movl bpp, %%eax              \n\t"
4848       "addl %%eax, %%edi            \n\t" // rp = row + bpp
4849       "xorl %%eax, %%eax            \n\t"
4850
4851    "sub_lp2:                        \n\t"
4852       "movb (%%esi,%%edx,), %%al    \n\t"
4853       "addb %%al, (%%edi,%%edx,)    \n\t"
4854       "incl %%edx                   \n\t"
4855       "cmpl _FullLength, %%edx      \n\t"
4856       "jb sub_lp2                   \n\t"
4857
4858    "sub_end:                        \n\t"
4859       "EMMS                         \n\t" // end MMX instructions
4860
4861       : "=a" (dummy_value_a),   // 0      // output regs (dummy)
4862         "=D" (dummy_value_D)    // 1
4863
4864       : "0" (bpp),              // eax    // input regs
4865         "1" (row)               // edi
4866
4867       : "%edx", "%esi"                    // clobber list
4868    );
4869
4870 } // end of png_read_filter_row_mmx_sub()
4871 #endif
4872
4873
4874
4875
4876 //===========================================================================//
4877 //                                                                           //
4878 //            P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P            //
4879 //                                                                           //
4880 //===========================================================================//
4881
4882 // Optimized code for PNG Up filter decoder
4883
4884 static void /* PRIVATE */
4885 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4886                            png_bytep prev_row)
4887 {
4888    png_uint_32 len;
4889    int dummy_value_d;   // fix 'forbidden register 3 (dx) was spilled' error
4890    int dummy_value_S;
4891    int dummy_value_D;
4892
4893    len = row_info->rowbytes;              // number of bytes to filter
4894
4895    __asm__ __volatile__ (
4896 //pre "movl row, %%edi              \n\t"
4897       // get # of bytes to alignment
4898       "movl %%edi, %%ecx            \n\t"
4899       "xorl %%ebx, %%ebx            \n\t"
4900       "addl $0x7, %%ecx             \n\t"
4901       "xorl %%eax, %%eax            \n\t"
4902       "andl $0xfffffff8, %%ecx      \n\t"
4903 //pre "movl prev_row, %%esi         \n\t"
4904       "subl %%edi, %%ecx            \n\t"
4905       "jz up_go                     \n\t"
4906
4907    "up_lp1:                         \n\t" // fix alignment
4908       "movb (%%edi,%%ebx,), %%al    \n\t"
4909       "addb (%%esi,%%ebx,), %%al    \n\t"
4910       "incl %%ebx                   \n\t"
4911       "cmpl %%ecx, %%ebx            \n\t"
4912       "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
4913       "jb up_lp1                    \n\t" //  offset incl ebx
4914
4915    "up_go:                          \n\t"
4916 //pre "movl len, %%edx              \n\t"
4917       "movl %%edx, %%ecx            \n\t"
4918       "subl %%ebx, %%edx            \n\t" // subtract alignment fix
4919       "andl $0x0000003f, %%edx      \n\t" // calc bytes over mult of 64
4920       "subl %%edx, %%ecx            \n\t" // drop over bytes from length
4921
4922       // unrolled loop - use all MMX registers and interleave to reduce
4923       // number of branch instructions (loops) and reduce partial stalls
4924    "up_loop:                        \n\t"
4925       "movq (%%esi,%%ebx,), %%mm1   \n\t"
4926       "movq (%%edi,%%ebx,), %%mm0   \n\t"
4927       "movq 8(%%esi,%%ebx,), %%mm3  \n\t"
4928       "paddb %%mm1, %%mm0           \n\t"
4929       "movq 8(%%edi,%%ebx,), %%mm2  \n\t"
4930       "movq %%mm0, (%%edi,%%ebx,)   \n\t"
4931       "paddb %%mm3, %%mm2           \n\t"
4932       "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4933       "movq %%mm2, 8(%%edi,%%ebx,)  \n\t"
4934       "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4935       "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4936       "paddb %%mm5, %%mm4           \n\t"
4937       "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4938       "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4939       "paddb %%mm7, %%mm6           \n\t"
4940       "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4941       "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4942       "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4943       "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4944       "paddb %%mm1, %%mm0           \n\t"
4945       "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4946       "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4947       "paddb %%mm3, %%mm2           \n\t"
4948       "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4949       "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4950       "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4951       "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4952       "paddb %%mm5, %%mm4           \n\t"
4953       "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
4954       "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
4955       "addl $64, %%ebx              \n\t"
4956       "paddb %%mm7, %%mm6           \n\t"
4957       "cmpl %%ecx, %%ebx            \n\t"
4958       "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
4959       "jb up_loop                   \n\t" //  -8 to offset addl ebx
4960
4961       "cmpl $0, %%edx               \n\t" // test for bytes over mult of 64
4962       "jz up_end                    \n\t"
4963
4964       "cmpl $8, %%edx               \n\t" // test for less than 8 bytes
4965       "jb up_lt8                    \n\t" //  [added by lcreeve@netins.net]
4966
4967       "addl %%edx, %%ecx            \n\t"
4968       "andl $0x00000007, %%edx      \n\t" // calc bytes over mult of 8
4969       "subl %%edx, %%ecx            \n\t" // drop over bytes from length
4970       "jz up_lt8                    \n\t"
4971
4972    "up_lpA:                         \n\t" // use MMX regs to update 8 bytes sim.
4973       "movq (%%esi,%%ebx,), %%mm1   \n\t"
4974       "movq (%%edi,%%ebx,), %%mm0   \n\t"
4975       "addl $8, %%ebx               \n\t"
4976       "paddb %%mm1, %%mm0           \n\t"
4977       "cmpl %%ecx, %%ebx            \n\t"
4978       "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
4979       "jb up_lpA                    \n\t" //  offset add ebx
4980       "cmpl $0, %%edx               \n\t" // test for bytes over mult of 8
4981       "jz up_end                    \n\t"
4982
4983    "up_lt8:                         \n\t"
4984       "xorl %%eax, %%eax            \n\t"
4985       "addl %%edx, %%ecx            \n\t" // move over byte count into counter
4986
4987    "up_lp2:                         \n\t" // use x86 regs for remaining bytes
4988       "movb (%%edi,%%ebx,), %%al    \n\t"
4989       "addb (%%esi,%%ebx,), %%al    \n\t"
4990       "incl %%ebx                   \n\t"
4991       "cmpl %%ecx, %%ebx            \n\t"
4992       "movb %%al, -1(%%edi,%%ebx,)  \n\t" // mov does not affect flags; -1 to
4993       "jb up_lp2                    \n\t" //  offset inc ebx
4994
4995    "up_end:                         \n\t"
4996       "EMMS                         \n\t" // conversion of filtered row complete
4997
4998       : "=d" (dummy_value_d),   // 0      // output regs (dummy)
4999         "=S" (dummy_value_S),   // 1
5000         "=D" (dummy_value_D)    // 2
5001
5002       : "0" (len),              // edx    // input regs
5003         "1" (prev_row),         // esi
5004         "2" (row)               // edi
5005
5006       : "%eax", "%ebx", "%ecx"            // clobber list (no input regs!)
5007
5008 #if 0  /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
5009       , "%mm0", "%mm1", "%mm2", "%mm3"
5010       , "%mm4", "%mm5", "%mm6", "%mm7"
5011 #endif
5012    );
5013
5014 } // end of png_read_filter_row_mmx_up()
5015
5016 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5017
5018
5019
5020
5021 /*===========================================================================*/
5022 /*                                                                           */
5023 /*                   P N G _ R E A D _ F I L T E R _ R O W                   */
5024 /*                                                                           */
5025 /*===========================================================================*/
5026
5027
5028 /* Optimized png_read_filter_row routines */
5029
5030 void /* PRIVATE */
5031 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
5032    row, png_bytep prev_row, int filter)
5033 {
5034 #ifdef PNG_DEBUG
5035    char filnm[10];
5036 #endif
5037
5038 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5039 /* GRR:  these are superseded by png_ptr->asm_flags: */
5040 #define UseMMX_sub    1   // GRR:  converted 20000730
5041 #define UseMMX_up     1   // GRR:  converted 20000729
5042 #define UseMMX_avg    1   // GRR:  converted 20000828 (+ 16-bit bugfix 20000916)
5043 #define UseMMX_paeth  1   // GRR:  converted 20000828
5044
5045    if (_mmx_supported == 2) {
5046        /* this should have happened in png_init_mmx_flags() already */
5047        png_warning(png_ptr, "asm_flags may not have been initialized");
5048        png_mmx_support();
5049    }
5050 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5051
5052 #ifdef PNG_DEBUG
5053    png_debug(1, "in png_read_filter_row (pnggccrd.c)\n");
5054    switch (filter)
5055    {
5056       case 0: sprintf(filnm, "none");
5057          break;
5058       case 1: sprintf(filnm, "sub-%s",
5059 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5060         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB)? "MMX" : 
5061 #endif
5062 "x86");
5063          break;
5064       case 2: sprintf(filnm, "up-%s",
5065 #ifdef PNG_ASSEMBLER_CODE_SUPPORTED
5066         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP)? "MMX" :
5067 #endif
5068  "x86");
5069          break;
5070       case 3: sprintf(filnm, "avg-%s",
5071 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5072         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG)? "MMX" :
5073 #endif
5074  "x86");
5075          break;
5076       case 4: sprintf(filnm, "Paeth-%s",
5077 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5078         (png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH)? "MMX":
5079 #endif
5080 "x86");
5081          break;
5082       default: sprintf(filnm, "unknw");
5083          break;
5084    }
5085    png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
5086    png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
5087    png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
5088       (int)((row_info->pixel_depth + 7) >> 3));
5089    png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
5090 #endif /* PNG_DEBUG */
5091
5092    switch (filter)
5093    {
5094       case PNG_FILTER_VALUE_NONE:
5095          break;
5096
5097       case PNG_FILTER_VALUE_SUB:
5098 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5099          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_SUB) &&
5100              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5101              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5102          {
5103             png_read_filter_row_mmx_sub(row_info, row);
5104          }
5105          else
5106 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5107          {
5108             png_uint_32 i;
5109             png_uint_32 istop = row_info->rowbytes;
5110             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5111             png_bytep rp = row + bpp;
5112             png_bytep lp = row;
5113
5114             for (i = bpp; i < istop; i++)
5115             {
5116                *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
5117                rp++;
5118             }
5119          }  /* end !UseMMX_sub */
5120          break;
5121
5122       case PNG_FILTER_VALUE_UP:
5123 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED)
5124          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_UP) &&
5125              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5126              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5127          {
5128             png_read_filter_row_mmx_up(row_info, row, prev_row);
5129          }
5130           else
5131 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5132          {
5133             png_uint_32 i;
5134             png_uint_32 istop = row_info->rowbytes;
5135             png_bytep rp = row;
5136             png_bytep pp = prev_row;
5137
5138             for (i = 0; i < istop; ++i)
5139             {
5140                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5141                rp++;
5142             }
5143          }  /* end !UseMMX_up */
5144          break;
5145
5146       case PNG_FILTER_VALUE_AVG:
5147 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5148          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_AVG) &&
5149              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5150              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5151          {
5152             png_read_filter_row_mmx_avg(row_info, row, prev_row);
5153          }
5154          else
5155 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5156          {
5157             png_uint_32 i;
5158             png_bytep rp = row;
5159             png_bytep pp = prev_row;
5160             png_bytep lp = row;
5161             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5162             png_uint_32 istop = row_info->rowbytes - bpp;
5163
5164             for (i = 0; i < bpp; i++)
5165             {
5166                *rp = (png_byte)(((int)(*rp) +
5167                   ((int)(*pp++) >> 1)) & 0xff);
5168                rp++;
5169             }
5170
5171             for (i = 0; i < istop; i++)
5172             {
5173                *rp = (png_byte)(((int)(*rp) +
5174                   ((int)(*pp++ + *lp++) >> 1)) & 0xff);
5175                rp++;
5176             }
5177          }  /* end !UseMMX_avg */
5178          break;
5179
5180       case PNG_FILTER_VALUE_PAETH:
5181 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_THREAD_UNSAFE_OK)
5182          if ((png_ptr->asm_flags & PNG_ASM_FLAG_MMX_READ_FILTER_PAETH) &&
5183              (row_info->pixel_depth >= png_ptr->mmx_bitdepth_threshold) &&
5184              (row_info->rowbytes >= png_ptr->mmx_rowbytes_threshold))
5185          {
5186             png_read_filter_row_mmx_paeth(row_info, row, prev_row);
5187          }
5188          else
5189 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED */
5190          {
5191             png_uint_32 i;
5192             png_bytep rp = row;
5193             png_bytep pp = prev_row;
5194             png_bytep lp = row;
5195             png_bytep cp = prev_row;
5196             png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
5197             png_uint_32 istop = row_info->rowbytes - bpp;
5198
5199             for (i = 0; i < bpp; i++)
5200             {
5201                *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
5202                rp++;
5203             }
5204
5205             for (i = 0; i < istop; i++)   /* use leftover rp,pp */
5206             {
5207                int a, b, c, pa, pb, pc, p;
5208
5209                a = *lp++;
5210                b = *pp++;
5211                c = *cp++;
5212
5213                p = b - c;
5214                pc = a - c;
5215
5216 #ifdef PNG_USE_ABS
5217                pa = abs(p);
5218                pb = abs(pc);
5219                pc = abs(p + pc);
5220 #else
5221                pa = p < 0 ? -p : p;
5222                pb = pc < 0 ? -pc : pc;
5223                pc = (p + pc) < 0 ? -(p + pc) : p + pc;
5224 #endif
5225
5226                /*
5227                   if (pa <= pb && pa <= pc)
5228                      p = a;
5229                   else if (pb <= pc)
5230                      p = b;
5231                   else
5232                      p = c;
5233                 */
5234
5235                p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
5236
5237                *rp = (png_byte)(((int)(*rp) + p) & 0xff);
5238                rp++;
5239             }
5240          }  /* end !UseMMX_paeth */
5241          break;
5242
5243       default:
5244          png_warning(png_ptr, "Ignoring bad row-filter type");
5245          *row=0;
5246          break;
5247    }
5248 }
5249
5250 #endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
5251
5252
5253 /*===========================================================================*/
5254 /*                                                                           */
5255 /*                      P N G _ M M X _ S U P P O R T                        */
5256 /*                                                                           */
5257 /*===========================================================================*/
5258
5259 /* GRR NOTES:  (1) the following code assumes 386 or better (pushfl/popfl)
5260  *             (2) all instructions compile with gcc 2.7.2.3 and later
5261  *             (3) the function is moved down here to prevent gcc from
5262  *                  inlining it in multiple places and then barfing be-
5263  *                  cause the ".NOT_SUPPORTED" label is multiply defined
5264  *             [is there a way to signal that a *single* function should
5265  *              not be inlined?  is there a way to modify the label for
5266  *              each inlined instance, e.g., by appending _1, _2, etc.?
5267  *              maybe if don't use leading "." in label name? (nope...sigh)]
5268  */
5269
5270 int PNGAPI
5271 png_mmx_support(void)
5272 {
5273 #if defined(PNG_MMX_CODE_SUPPORTED)
5274     __asm__ __volatile__ (
5275         "pushl %%ebx          \n\t"  // ebx gets clobbered by CPUID instruction
5276         "pushl %%ecx          \n\t"  // so does ecx...
5277         "pushl %%edx          \n\t"  // ...and edx (but ecx & edx safe on Linux)
5278 //      ".byte  0x66          \n\t"  // convert 16-bit pushf to 32-bit pushfd
5279 //      "pushf                \n\t"  // 16-bit pushf
5280         "pushfl               \n\t"  // save Eflag to stack
5281         "popl %%eax           \n\t"  // get Eflag from stack into eax
5282         "movl %%eax, %%ecx    \n\t"  // make another copy of Eflag in ecx
5283         "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
5284         "pushl %%eax          \n\t"  // save modified Eflag back to stack
5285 //      ".byte  0x66          \n\t"  // convert 16-bit popf to 32-bit popfd
5286 //      "popf                 \n\t"  // 16-bit popf
5287         "popfl                \n\t"  // restore modified value to Eflag reg
5288         "pushfl               \n\t"  // save Eflag to stack
5289         "popl %%eax           \n\t"  // get Eflag from stack
5290         "xorl %%ecx, %%eax    \n\t"  // compare new Eflag with original Eflag
5291         "jz .NOT_SUPPORTED    \n\t"  // if same, CPUID instr. is not supported
5292
5293         "xorl %%eax, %%eax    \n\t"  // set eax to zero
5294 //      ".byte  0x0f, 0xa2    \n\t"  // CPUID instruction (two-byte opcode)
5295         "cpuid                \n\t"  // get the CPU identification info
5296         "cmpl $1, %%eax       \n\t"  // make sure eax return non-zero value
5297         "jl .NOT_SUPPORTED    \n\t"  // if eax is zero, MMX is not supported
5298
5299         "xorl %%eax, %%eax    \n\t"  // set eax to zero and...
5300         "incl %%eax           \n\t"  // ...increment eax to 1.  This pair is
5301                                      // faster than the instruction "mov eax, 1"
5302         "cpuid                \n\t"  // get the CPU identification info again
5303         "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
5304         "cmpl $0, %%edx       \n\t"  // 0 = MMX not supported
5305         "jz .NOT_SUPPORTED    \n\t"  // non-zero = yes, MMX IS supported
5306
5307         "movl $1, %%eax       \n\t"  // set return value to 1
5308         "jmp  .RETURN         \n\t"  // DONE:  have MMX support
5309
5310     ".NOT_SUPPORTED:          \n\t"  // target label for jump instructions
5311         "movl $0, %%eax       \n\t"  // set return value to 0
5312     ".RETURN:          \n\t"  // target label for jump instructions
5313         "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
5314         "popl %%edx           \n\t"  // restore edx
5315         "popl %%ecx           \n\t"  // restore ecx
5316         "popl %%ebx           \n\t"  // restore ebx
5317
5318 //      "ret                  \n\t"  // DONE:  no MMX support
5319                                      // (fall through to standard C "ret")
5320
5321         :                            // output list (none)
5322
5323         :                            // any variables used on input (none)
5324
5325         : "%eax"                     // clobber list
5326 //      , "%ebx", "%ecx", "%edx"     // GRR:  we handle these manually
5327 //      , "memory"   // if write to a variable gcc thought was in a reg
5328 //      , "cc"       // "condition codes" (flag bits)
5329     );
5330 #else     
5331     _mmx_supported = 0;
5332 #endif /* PNG_MMX_CODE_SUPPORTED */
5333
5334     return _mmx_supported;
5335 }
5336
5337
5338 #endif /* PNG_USE_PNGGCCRD */