1 /* pnggccrd.c - mixed C/assembler version of utilities to read a PNG file
3 * For Intel x86 CPU (Pentium-MMX or later) and GNU C compiler.
5 * See http://www.intel.com/drg/pentiumII/appnotes/916/916.htm
6 * and http://www.intel.com/drg/pentiumII/appnotes/923/923.htm
7 * for Intel's performance analysis of the MMX vs. non-MMX code.
9 * libpng 1.0.9 - January 31, 2001
10 * For conditions of distribution and use, see copyright notice in png.h
11 * Copyright (c) 1998-2001 Glenn Randers-Pehrson
12 * Copyright (c) 1998, Intel Corporation
14 * Based on MSVC code contributed by Nirav Chhatrapati, Intel Corp., 1998.
15 * Interface to libpng contributed by Gilles Vollant, 1999.
16 * GNU C port by Greg Roelofs, 1999-2001.
18 * Lines 2350-4300 converted in place with intel2gas 1.3.1:
20 * intel2gas -mdI pnggccrd.c.partially-msvc -o pnggccrd.c
22 * and then cleaned up by hand. See http://hermes.terminal.at/intel2gas/ .
24 * NOTE: A sufficiently recent version of GNU as (or as.exe under DOS/Windows)
25 * is required to assemble the newer MMX instructions such as movq.
28 * ftp://ftp.simtel.net/pub/simtelnet/gnu/djgpp/v2gnu/bnu281b.zip
30 * (or a later version in the same directory). For Linux, check your
31 * distribution's web site(s) or try these links:
33 * http://rufus.w3.org/linux/RPM/binutils.html
34 * http://www.debian.org/Packages/stable/devel/binutils.html
35 * ftp://ftp.slackware.com/pub/linux/slackware/slackware/slakware/d1/
38 * For other platforms, see the main GNU site:
40 * ftp://ftp.gnu.org/pub/gnu/binutils/
42 * Version 2.5.2l.15 is definitely too old...
46 * TEMPORARY PORTING NOTES AND CHANGELOG (mostly by Greg Roelofs)
47 * =====================================
50 * - fixed sign error in post-MMX cleanup code (16- & 32-bit cases)
53 * - additional optimizations (possible or definite):
54 * x [DONE] write MMX code for 64-bit case (pixel_bytes == 8) [not tested]
55 * - write MMX code for 48-bit case (pixel_bytes == 6)
56 * - figure out what's up with 24-bit case (pixel_bytes == 3):
57 * why subtract 8 from width_mmx in the pass 4/5 case?
58 * (only width_mmx case) (near line 1606)
59 * x [DONE] replace pixel_bytes within each block with the true
60 * constant value (or are compilers smart enough to do that?)
61 * - rewrite all MMX interlacing code so it's aligned with
62 * the *beginning* of the row buffer, not the end. This
63 * would not only allow one to eliminate half of the memory
64 * writes for odd passes (that is, pass == odd), it may also
65 * eliminate some unaligned-data-access exceptions (assuming
66 * there's a penalty for not aligning 64-bit accesses on
67 * 64-bit boundaries). The only catch is that the "leftover"
68 * pixel(s) at the end of the row would have to be saved,
69 * but there are enough unused MMX registers in every case,
70 * so this is not a problem. A further benefit is that the
71 * post-MMX cleanup code (C code) in at least some of the
72 * cases could be done within the assembler block.
73 * x [DONE] the "v3 v2 v1 v0 v7 v6 v5 v4" comments are confusing,
74 * inconsistent, and don't match the MMX Programmer's Reference
75 * Manual conventions anyway. They should be changed to
76 * "b7 b6 b5 b4 b3 b2 b1 b0," where b0 indicates the byte that
77 * was lowest in memory (e.g., corresponding to a left pixel)
78 * and b7 is the byte that was highest (e.g., a right pixel).
81 * - Brennan's Guide notwithstanding, gcc under Linux does *not*
82 * want globals prefixed by underscores when referencing them--
83 * i.e., if the variable is const4, then refer to it as const4,
84 * not _const4. This seems to be a djgpp-specific requirement.
85 * Also, such variables apparently *must* be declared outside
86 * of functions; neither static nor automatic variables work if
87 * defined within the scope of a single function, but both
88 * static and truly global (multi-module) variables work fine.
91 * - fixed png_combine_row() non-MMX replication bug (odd passes only?)
92 * - switched from string-concatenation-with-macros to cleaner method of
93 * renaming global variables for djgpp--i.e., always use prefixes in
94 * inlined assembler code (== strings) and conditionally rename the
95 * variables, not the other way around. Hence _const4, _mask8_0, etc.
98 * - fixed mmxsupport()/png_do_interlace() first-row bug
99 * This one was severely weird: even though mmxsupport() doesn't touch
100 * ebx (where "row" pointer was stored), it nevertheless managed to zero
101 * the register (even in static/non-fPIC code--see below), which in turn
102 * caused png_do_interlace() to return prematurely on the first row of
103 * interlaced images (i.e., without expanding the interlaced pixels).
104 * Inspection of the generated assembly code didn't turn up any clues,
105 * although it did point at a minor optimization (i.e., get rid of
106 * mmx_supported_local variable and just use eax). Possibly the CPUID
107 * instruction is more destructive than it looks? (Not yet checked.)
108 * - "info gcc" was next to useless, so compared fPIC and non-fPIC assembly
109 * listings... Apparently register spillage has to do with ebx, since
110 * it's used to index the global offset table. Commenting it out of the
111 * input-reg lists in png_combine_row() eliminated compiler barfage, so
112 * ifdef'd with __PIC__ macro: if defined, use a global for unmask
115 * - verified CPUID clobberage: 12-char string constant ("GenuineIntel",
116 * "AuthenticAMD", etc.) placed in ebx:ecx:edx. Still need to polish.
119 * - made "diff" variable (now "_dif") global to simplify conversion of
120 * filtering routines (running out of regs, sigh). "diff" is still used
121 * in interlacing routines, however.
122 * - fixed up both versions of mmxsupport() (ORIG_THAT_USED_TO_CLOBBER_EBX
123 * macro determines which is used); original not yet tested.
126 * - when compiling with gcc, be sure to use -fomit-frame-pointer
129 * - fixed a register-name typo in png_do_read_interlace(), default (MMX) case,
130 * pass == 4 or 5, that caused visible corruption of interlaced images
133 * - Various problems were reported with gcc 2.95.2 in the Cygwin environment,
134 * many of the form "forbidden register 0 (ax) was spilled for class AREG."
135 * This is explained at http://gcc.gnu.org/fom_serv/cache/23.html, and
136 * Chuck Wilson supplied a patch involving dummy output registers. See
137 * http://sourceforge.net/bugs/?func=detailbug&bug_id=108741&group_id=5624
138 * for the original (anonymous) SourceForge bug report.
141 * - Chuck Wilson passed along these remaining gcc 2.95.2 errors:
142 * pnggccrd.c: In function `png_combine_row':
143 * pnggccrd.c:525: more than 10 operands in `asm'
144 * pnggccrd.c:669: more than 10 operands in `asm'
145 * pnggccrd.c:828: more than 10 operands in `asm'
146 * pnggccrd.c:994: more than 10 operands in `asm'
147 * pnggccrd.c:1177: more than 10 operands in `asm'
148 * They are all the same problem and can be worked around by using the
149 * global _unmask variable unconditionally, not just in the -fPIC case.
150 * Reportedly earlier versions of gcc also have the problem with more than
151 * 10 operands; they just don't report it. Much strangeness ensues, etc.
154 * - enabled png_read_filter_row_mmx_up() (shortest remaining unconverted
155 * MMX routine); began converting png_read_filter_row_mmx_sub()
156 * - to finish remaining sections:
157 * - clean up indentation and comments
158 * - preload local variables
159 * - add output and input regs (order of former determines numerical
161 * - avoid all usage of ebx (including bx, bh, bl) register [20000823]
162 * - remove "$" from addressing of Shift and Mask variables [20000823]
165 * - global union vars causing segfaults in png_read_filter_row_mmx_sub()?
168 * - ARGH, stupid png_read_filter_row_mmx_sub() segfault only happens with
169 * shared-library (-fPIC) version! Code works just fine as part of static
170 * library. Damn damn damn damn damn, should have tested that sooner.
171 * ebx is getting clobbered again (explicitly this time); need to save it
172 * on stack or rewrite asm code to avoid using it altogether. Blargh!
175 * - first section was trickiest; all remaining sections have ebx -> edx now.
176 * (-fPIC works again.) Also added missing underscores to various Shift*
177 * and *Mask* globals and got rid of leading "$" signs.
180 * - added visual separators to help navigate microscopic printed copies
181 * (http://pobox.com/~newt/code/gpr-latest.zip, mode 10); started working
182 * on png_read_filter_row_mmx_avg()
185 * - finished png_read_filter_row_mmx_avg(): only Paeth left! (930 lines...)
186 * What the hell, did png_read_filter_row_mmx_paeth(), too. Comments not
187 * cleaned up/shortened in either routine, but functionality is complete
188 * and seems to be working fine.
191 * - ahhh, figured out last(?) bit of gcc/gas asm-fu: if register is listed
192 * as an input reg (with dummy output variables, etc.), then it *cannot*
193 * also appear in the clobber list or gcc 2.95.2 will barf. The solution
194 * is simple enough...
197 * - bug in png_read_filter_row_mmx_avg(): 16-bit grayscale not handled
198 * correctly (but 48-bit RGB just fine)
201 * - fixed bug in png_read_filter_row_mmx_avg(), bpp == 2 case; three errors:
202 * - "_ShiftBpp.use = 24;" should have been "_ShiftBpp.use = 16;"
203 * - "_ShiftRem.use = 40;" should have been "_ShiftRem.use = 48;"
204 * - "psllq _ShiftRem, %%mm2" should have been "psrlq _ShiftRem, %%mm2"
207 * - renamed mmxsupport() to png_mmx_support(), with auto-set of mmx_supported,
211 * - removed dependency on png_read_filter_row_c() (C code already duplicated
212 * within MMX version of png_read_filter_row()) so no longer necessary to
213 * compile it into pngrutil.o
216 * - test png_do_read_interlace() 64-bit case (pixel_bytes == 8)
217 * - write MMX code for 48-bit case (pixel_bytes == 6)
218 * - figure out what's up with 24-bit case (pixel_bytes == 3):
219 * why subtract 8 from width_mmx in the pass 4/5 case?
220 * (only width_mmx case) (near line 1606)
221 * - rewrite all MMX interlacing code so it's aligned with beginning
222 * of the row buffer, not the end (see 19991007 for details)
223 * x pick one version of mmxsupport() and get rid of the other
224 * - add error messages to any remaining bogus default cases
225 * - enable pixel_depth == 8 cases in png_read_filter_row()? (test speed)
226 * - add support for runtime enable/disable/query of various MMX routines
229 //#define PNG_DEBUG 2 // GRR
234 #if defined(PNG_ASSEMBLER_CODE_SUPPORTED) && defined(PNG_USE_PNGGCCRD)
236 #ifdef PNG_USE_LOCAL_ARRAYS
/* Local copies of the Adam7 interlace tables (duplicated from png.c, as the
 * "png.c:" notes quoted inside png_combine_row() below confirm).  The combine
 * routine uses them as: initial byte offset into the row, pixel stride per
 * iteration, and number of bytes replicated at each position, respectively. */
static const int FARDATA png_pass_start[7] = {0, 4, 0, 2, 0, 1, 0};
static const int FARDATA png_pass_inc[7] = {8, 8, 4, 4, 2, 2, 1};
static const int FARDATA png_pass_width[7] = {8, 4, 4, 2, 2, 1, 1};
// djgpp, Win32, and Cygwin add their own underscores to global variables,
// so define them without:
// (These names appear literally inside the asm strings below, so the rename
// must happen at the C level; see the 19991123/20000706 notes in the header.)
#if defined(__DJGPP__) || defined(WIN32) || defined(__CYGWIN__)
// CPU-capability flag and the inverted combine mask:
#  define _mmx_supported mmx_supported
#  define _unmask unmask
// 64-bit constants referenced by the MMX code:
#  define _const4 const4
#  define _const6 const6
// per-pixel-depth byte-selection masks for png_combine_row():
#  define _mask8_0 mask8_0
#  define _mask16_1 mask16_1
#  define _mask16_0 mask16_0
#  define _mask24_2 mask24_2
#  define _mask24_1 mask24_1
#  define _mask24_0 mask24_0
#  define _mask32_3 mask32_3
#  define _mask32_2 mask32_2
#  define _mask32_1 mask32_1
#  define _mask32_0 mask32_0
#  define _mask48_5 mask48_5
#  define _mask48_4 mask48_4
#  define _mask48_3 mask48_3
#  define _mask48_2 mask48_2
#  define _mask48_1 mask48_1
#  define _mask48_0 mask48_0
// globals used by the row-filter routines (see comment above their definitions):
#  define _FullLength FullLength
#  define _MMXLength MMXLength
#  define _LBCarryMask LBCarryMask
#  define _HBClearMask HBClearMask
#  define _ActiveMask ActiveMask
#  define _ActiveMask2 ActiveMask2
#  define _ActiveMaskEnd ActiveMaskEnd
#  define _ShiftBpp ShiftBpp
#  define _ShiftRem ShiftRem
// temporaries for the Paeth filter routine:
#  define _patemp patemp
#  define _pbtemp pbtemp
#  define _pctemp pctemp
280 static int _mmx_supported = 2;
282 /* These constants are used in the inlined MMX assembly code.
283 Ignore gcc's "At top level: defined but not used" warnings. */
285 /* GRR 20000706: originally _unmask was needed only when compiling with -fPIC,
286 * since that case uses the %ebx register for indexing the Global Offset Table
287 * and there were no other registers available. But gcc 2.95 and later emit
288 * "more than 10 operands in `asm'" errors when %ebx is used to preload unmask
289 * in the non-PIC case, so we'll just use the global unconditionally now.
/* Combine-row selection masks, one set per pixel depth (suffix digit = which
 * 8-byte chunk of the row the mask covers).  In the asm below, the 8-bit
 * interlace mask is first replicated into all 8 bytes of %mm7; "pand" with
 * one of these constants then isolates, in each byte, the single mask bit
 * belonging to that byte's pixel, and "pcmpeqb" against zero turns the result
 * into a per-byte keep(0xff)/skip(0x00) mask.  The exact bit patterns are
 * therefore load-bearing -- do not reformat or reorder. */
static unsigned long long _mask8_0 = 0x0102040810204080LL;
static unsigned long long _mask16_1 = 0x0101020204040808LL;
static unsigned long long _mask16_0 = 0x1010202040408080LL;
static unsigned long long _mask24_2 = 0x0101010202020404LL;
static unsigned long long _mask24_1 = 0x0408080810101020LL;
static unsigned long long _mask24_0 = 0x2020404040808080LL;
static unsigned long long _mask32_3 = 0x0101010102020202LL;
static unsigned long long _mask32_2 = 0x0404040408080808LL;
static unsigned long long _mask32_1 = 0x1010101020202020LL;
static unsigned long long _mask32_0 = 0x4040404080808080LL;
static unsigned long long _mask48_5 = 0x0101010101010202LL;
static unsigned long long _mask48_4 = 0x0202020204040404LL;
static unsigned long long _mask48_3 = 0x0404080808080808LL;
static unsigned long long _mask48_2 = 0x1010101010102020LL;
static unsigned long long _mask48_1 = 0x2020202040404040LL;
static unsigned long long _mask48_0 = 0x4040808080808080LL;
/* Miscellaneous 64-bit constants for the MMX routines (usage not visible in
 * this chunk -- presumably the interlace/filter code; confirm against the
 * full file). */
static unsigned long long _const4 = 0x0000000000FFFFFFLL;
//static unsigned long long _const5 = 0x000000FFFFFF0000LL; // NOT USED
static unsigned long long _const6 = 0x00000000000000FFLL;
// These are used in the row-filter routines and should/would be local
// variables if not for gcc addressing limitations.
// NOTE(review): the filter routines that set these are outside this chunk;
// the roles below are inferred from the names -- confirm against the full file.
static png_uint_32 _FullLength;  // presumably total row length in bytes
static png_uint_32 _MMXLength;   // presumably bytes handled by the MMX loop
                                 // (remainder done in C cleanup code)
static int _patemp; // temp variables for Paeth routine
331 //===========================================================================//
333 // P N G _ C O M B I N E _ R O W //
335 //===========================================================================//
337 #if defined(PNG_HAVE_ASSEMBLER_COMBINE_ROW)
339 /* Combines the row recently read in with the previous row.
340 This routine takes care of alpha and transparency if requested.
341 This routine also handles the two methods of progressive display
342 of interlaced images, depending on the mask value.
343 The mask value describes which pixels are to be combined with
344 the row. The pattern always repeats every 8 pixels, so just 8
345 bits are needed. A one indicates the pixel is to be combined; a
346 zero indicates the pixel is to be skipped. This is in addition
347 to any alpha or transparency value associated with the pixel.
348 If you want all pixels to be combined, pass 0xff (255) in mask. */
350 /* Use this routine for the x86 platform - it uses a faster MMX routine
351 if the machine supports MMX. */
354 png_combine_row(png_structp png_ptr, png_bytep row, int mask)
356 png_debug(1,"in png_combine_row_asm\n");
358 if (_mmx_supported == 2) {
364 png_memcpy(row, png_ptr->row_buf + 1,
365 (png_size_t)((png_ptr->width * png_ptr->row_info.pixel_depth + 7) >> 3));
367 /* GRR: png_combine_row() never called with mask == 0 */
370 switch (png_ptr->row_info.pixel_depth)
372 case 1: // png_ptr->row_info.pixel_depth
376 int s_inc, s_start, s_end;
381 sp = png_ptr->row_buf + 1;
384 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
385 if (png_ptr->transformations & PNG_PACKSWAP)
401 for (i = 0; i < png_ptr->width; i++)
407 value = (*sp >> shift) & 0x1;
408 *dp &= (png_byte)((0x7f7f >> (7 - shift)) & 0xff);
409 *dp |= (png_byte)(value << shift);
429 case 2: // png_ptr->row_info.pixel_depth
433 int s_start, s_end, s_inc;
439 sp = png_ptr->row_buf + 1;
442 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
443 if (png_ptr->transformations & PNG_PACKSWAP)
459 for (i = 0; i < png_ptr->width; i++)
463 value = (*sp >> shift) & 0x3;
464 *dp &= (png_byte)((0x3f3f >> (6 - shift)) & 0xff);
465 *dp |= (png_byte)(value << shift);
484 case 4: // png_ptr->row_info.pixel_depth
488 int s_start, s_end, s_inc;
494 sp = png_ptr->row_buf + 1;
497 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
498 if (png_ptr->transformations & PNG_PACKSWAP)
513 for (i = 0; i < png_ptr->width; i++)
517 value = (*sp >> shift) & 0xf;
518 *dp &= (png_byte)((0xf0f >> (4 - shift)) & 0xff);
519 *dp |= (png_byte)(value << shift);
538 case 8: // png_ptr->row_info.pixel_depth
543 if ( _mmx_supported )
547 int dummy_value_a; // fix 'forbidden register spilled' error
552 _unmask = ~mask; // global variable for -fPIC version
553 srcptr = png_ptr->row_buf + 1;
555 len = png_ptr->width &~7; // reduce to multiple of 8
556 diff = png_ptr->width & 7; // amount lost
558 __asm__ __volatile__ (
559 "movd _unmask, %%mm7 \n\t" // load bit pattern
560 "psubb %%mm6, %%mm6 \n\t" // zero mm6
561 "punpcklbw %%mm7, %%mm7 \n\t"
562 "punpcklwd %%mm7, %%mm7 \n\t"
563 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
565 "movq _mask8_0, %%mm0 \n\t"
566 "pand %%mm7, %%mm0 \n\t" // nonzero if keep byte
567 "pcmpeqb %%mm6, %%mm0 \n\t" // zeros->1s, v versa
569 // preload "movl len, %%ecx \n\t" // load length of line
570 // preload "movl srcptr, %%esi \n\t" // load source
571 // preload "movl dstptr, %%edi \n\t" // load dest
573 "cmpl $0, %%ecx \n\t" // len == 0 ?
574 "je mainloop8end \n\t"
577 "movq (%%esi), %%mm4 \n\t" // *srcptr
578 "pand %%mm0, %%mm4 \n\t"
579 "movq %%mm0, %%mm6 \n\t"
580 "pandn (%%edi), %%mm6 \n\t" // *dstptr
581 "por %%mm6, %%mm4 \n\t"
582 "movq %%mm4, (%%edi) \n\t"
583 "addl $8, %%esi \n\t" // inc by 8 bytes processed
584 "addl $8, %%edi \n\t"
585 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
589 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
590 "movl %%eax, %%ecx \n\t"
591 "cmpl $0, %%ecx \n\t"
593 // preload "movl mask, %%edx \n\t"
594 "sall $24, %%edx \n\t" // make low byte, high byte
597 "sall %%edx \n\t" // move high bit to CF
598 "jnc skip8 \n\t" // if CF = 0
599 "movb (%%esi), %%al \n\t"
600 "movb %%al, (%%edi) \n\t"
606 "jnz secondloop8 \n\t"
611 : "=a" (dummy_value_a), // output regs (dummy)
612 "=d" (dummy_value_d),
613 "=c" (dummy_value_c),
614 "=S" (dummy_value_S),
617 : "3" (srcptr), // esi // input regs
620 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
624 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
625 : "%mm0", "%mm4", "%mm6", "%mm7" // clobber list
629 else /* mmx _not supported - Use modified C routine */
631 register png_uint_32 i;
632 png_uint_32 initial_val = png_pass_start[png_ptr->pass];
633 // png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
634 register int stride = png_pass_inc[png_ptr->pass];
635 // png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
636 register int rep_bytes = png_pass_width[png_ptr->pass];
637 // png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
638 register png_uint_32 final_val = png_ptr->width;
640 srcptr = png_ptr->row_buf + 1 + initial_val;
641 dstptr = row + initial_val;
643 for (i = initial_val; i < final_val; i += stride)
645 png_memcpy(dstptr, srcptr, rep_bytes);
654 case 16: // png_ptr->row_info.pixel_depth
659 if ( _mmx_supported )
663 int dummy_value_a; // fix 'forbidden register spilled' error
668 _unmask = ~mask; // global variable for -fPIC version
669 srcptr = png_ptr->row_buf + 1;
671 len = png_ptr->width &~7; // reduce to multiple of 8
672 diff = png_ptr->width & 7; // amount lost
674 __asm__ __volatile__ (
675 "movd _unmask, %%mm7 \n\t" // load bit pattern
676 "psubb %%mm6, %%mm6 \n\t" // zero mm6
677 "punpcklbw %%mm7, %%mm7 \n\t"
678 "punpcklwd %%mm7, %%mm7 \n\t"
679 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
681 "movq _mask16_0, %%mm0 \n\t"
682 "movq _mask16_1, %%mm1 \n\t"
684 "pand %%mm7, %%mm0 \n\t"
685 "pand %%mm7, %%mm1 \n\t"
687 "pcmpeqb %%mm6, %%mm0 \n\t"
688 "pcmpeqb %%mm6, %%mm1 \n\t"
690 // preload "movl len, %%ecx \n\t" // load length of line
691 // preload "movl srcptr, %%esi \n\t" // load source
692 // preload "movl dstptr, %%edi \n\t" // load dest
694 "cmpl $0, %%ecx \n\t"
695 "jz mainloop16end \n\t"
698 "movq (%%esi), %%mm4 \n\t"
699 "pand %%mm0, %%mm4 \n\t"
700 "movq %%mm0, %%mm6 \n\t"
701 "movq (%%edi), %%mm7 \n\t"
702 "pandn %%mm7, %%mm6 \n\t"
703 "por %%mm6, %%mm4 \n\t"
704 "movq %%mm4, (%%edi) \n\t"
706 "movq 8(%%esi), %%mm5 \n\t"
707 "pand %%mm1, %%mm5 \n\t"
708 "movq %%mm1, %%mm7 \n\t"
709 "movq 8(%%edi), %%mm6 \n\t"
710 "pandn %%mm6, %%mm7 \n\t"
711 "por %%mm7, %%mm5 \n\t"
712 "movq %%mm5, 8(%%edi) \n\t"
714 "addl $16, %%esi \n\t" // inc by 16 bytes processed
715 "addl $16, %%edi \n\t"
716 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
719 "mainloop16end: \n\t"
720 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
721 "movl %%eax, %%ecx \n\t"
722 "cmpl $0, %%ecx \n\t"
724 // preload "movl mask, %%edx \n\t"
725 "sall $24, %%edx \n\t" // make low byte, high byte
728 "sall %%edx \n\t" // move high bit to CF
729 "jnc skip16 \n\t" // if CF = 0
730 "movw (%%esi), %%ax \n\t"
731 "movw %%ax, (%%edi) \n\t"
734 "addl $2, %%esi \n\t"
735 "addl $2, %%edi \n\t"
737 "jnz secondloop16 \n\t"
742 : "=a" (dummy_value_a), // output regs (dummy)
743 "=c" (dummy_value_c),
744 "=d" (dummy_value_d),
745 "=S" (dummy_value_S),
748 : "0" (diff), // eax // input regs
749 // was (unmask) " " RESERVED // ebx // Global Offset Table idx
755 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
756 : "%mm0", "%mm1", "%mm4" // clobber list
757 , "%mm5", "%mm6", "%mm7"
761 else /* mmx _not supported - Use modified C routine */
763 register png_uint_32 i;
764 png_uint_32 initial_val = 2 * png_pass_start[png_ptr->pass];
765 // png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
766 register int stride = 2 * png_pass_inc[png_ptr->pass];
767 // png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
768 register int rep_bytes = 2 * png_pass_width[png_ptr->pass];
769 // png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
770 register png_uint_32 final_val = 2 * png_ptr->width;
772 srcptr = png_ptr->row_buf + 1 + initial_val;
773 dstptr = row + initial_val;
775 for (i = initial_val; i < final_val; i += stride)
777 png_memcpy(dstptr, srcptr, rep_bytes);
786 case 24: // png_ptr->row_info.pixel_depth
791 if ( _mmx_supported )
795 int dummy_value_a; // fix 'forbidden register spilled' error
800 _unmask = ~mask; // global variable for -fPIC version
801 srcptr = png_ptr->row_buf + 1;
803 len = png_ptr->width &~7; // reduce to multiple of 8
804 diff = png_ptr->width & 7; // amount lost
806 __asm__ __volatile__ (
807 "movd _unmask, %%mm7 \n\t" // load bit pattern
808 "psubb %%mm6, %%mm6 \n\t" // zero mm6
809 "punpcklbw %%mm7, %%mm7 \n\t"
810 "punpcklwd %%mm7, %%mm7 \n\t"
811 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
813 "movq _mask24_0, %%mm0 \n\t"
814 "movq _mask24_1, %%mm1 \n\t"
815 "movq _mask24_2, %%mm2 \n\t"
817 "pand %%mm7, %%mm0 \n\t"
818 "pand %%mm7, %%mm1 \n\t"
819 "pand %%mm7, %%mm2 \n\t"
821 "pcmpeqb %%mm6, %%mm0 \n\t"
822 "pcmpeqb %%mm6, %%mm1 \n\t"
823 "pcmpeqb %%mm6, %%mm2 \n\t"
825 // preload "movl len, %%ecx \n\t" // load length of line
826 // preload "movl srcptr, %%esi \n\t" // load source
827 // preload "movl dstptr, %%edi \n\t" // load dest
829 "cmpl $0, %%ecx \n\t"
830 "jz mainloop24end \n\t"
833 "movq (%%esi), %%mm4 \n\t"
834 "pand %%mm0, %%mm4 \n\t"
835 "movq %%mm0, %%mm6 \n\t"
836 "movq (%%edi), %%mm7 \n\t"
837 "pandn %%mm7, %%mm6 \n\t"
838 "por %%mm6, %%mm4 \n\t"
839 "movq %%mm4, (%%edi) \n\t"
841 "movq 8(%%esi), %%mm5 \n\t"
842 "pand %%mm1, %%mm5 \n\t"
843 "movq %%mm1, %%mm7 \n\t"
844 "movq 8(%%edi), %%mm6 \n\t"
845 "pandn %%mm6, %%mm7 \n\t"
846 "por %%mm7, %%mm5 \n\t"
847 "movq %%mm5, 8(%%edi) \n\t"
849 "movq 16(%%esi), %%mm6 \n\t"
850 "pand %%mm2, %%mm6 \n\t"
851 "movq %%mm2, %%mm4 \n\t"
852 "movq 16(%%edi), %%mm7 \n\t"
853 "pandn %%mm7, %%mm4 \n\t"
854 "por %%mm4, %%mm6 \n\t"
855 "movq %%mm6, 16(%%edi) \n\t"
857 "addl $24, %%esi \n\t" // inc by 24 bytes processed
858 "addl $24, %%edi \n\t"
859 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
863 "mainloop24end: \n\t"
864 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
865 "movl %%eax, %%ecx \n\t"
866 "cmpl $0, %%ecx \n\t"
868 // preload "movl mask, %%edx \n\t"
869 "sall $24, %%edx \n\t" // make low byte, high byte
872 "sall %%edx \n\t" // move high bit to CF
873 "jnc skip24 \n\t" // if CF = 0
874 "movw (%%esi), %%ax \n\t"
875 "movw %%ax, (%%edi) \n\t"
876 "xorl %%eax, %%eax \n\t"
877 "movb 2(%%esi), %%al \n\t"
878 "movb %%al, 2(%%edi) \n\t"
881 "addl $3, %%esi \n\t"
882 "addl $3, %%edi \n\t"
884 "jnz secondloop24 \n\t"
889 : "=a" (dummy_value_a), // output regs (dummy)
890 "=d" (dummy_value_d),
891 "=c" (dummy_value_c),
892 "=S" (dummy_value_S),
895 : "3" (srcptr), // esi // input regs
898 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
902 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
903 : "%mm0", "%mm1", "%mm2" // clobber list
904 , "%mm4", "%mm5", "%mm6", "%mm7"
908 else /* mmx _not supported - Use modified C routine */
910 register png_uint_32 i;
911 png_uint_32 initial_val = 3 * png_pass_start[png_ptr->pass];
912 // png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
913 register int stride = 3 * png_pass_inc[png_ptr->pass];
914 // png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
915 register int rep_bytes = 3 * png_pass_width[png_ptr->pass];
916 // png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
917 register png_uint_32 final_val = 3 * png_ptr->width;
919 srcptr = png_ptr->row_buf + 1 + initial_val;
920 dstptr = row + initial_val;
922 for (i = initial_val; i < final_val; i += stride)
924 png_memcpy(dstptr, srcptr, rep_bytes);
933 case 32: // png_ptr->row_info.pixel_depth
938 if ( _mmx_supported )
942 int dummy_value_a; // fix 'forbidden register spilled' error
947 _unmask = ~mask; // global variable for -fPIC version
948 srcptr = png_ptr->row_buf + 1;
950 len = png_ptr->width &~7; // reduce to multiple of 8
951 diff = png_ptr->width & 7; // amount lost
953 __asm__ __volatile__ (
954 "movd _unmask, %%mm7 \n\t" // load bit pattern
955 "psubb %%mm6, %%mm6 \n\t" // zero mm6
956 "punpcklbw %%mm7, %%mm7 \n\t"
957 "punpcklwd %%mm7, %%mm7 \n\t"
958 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
960 "movq _mask32_0, %%mm0 \n\t"
961 "movq _mask32_1, %%mm1 \n\t"
962 "movq _mask32_2, %%mm2 \n\t"
963 "movq _mask32_3, %%mm3 \n\t"
965 "pand %%mm7, %%mm0 \n\t"
966 "pand %%mm7, %%mm1 \n\t"
967 "pand %%mm7, %%mm2 \n\t"
968 "pand %%mm7, %%mm3 \n\t"
970 "pcmpeqb %%mm6, %%mm0 \n\t"
971 "pcmpeqb %%mm6, %%mm1 \n\t"
972 "pcmpeqb %%mm6, %%mm2 \n\t"
973 "pcmpeqb %%mm6, %%mm3 \n\t"
975 // preload "movl len, %%ecx \n\t" // load length of line
976 // preload "movl srcptr, %%esi \n\t" // load source
977 // preload "movl dstptr, %%edi \n\t" // load dest
979 "cmpl $0, %%ecx \n\t" // lcr
980 "jz mainloop32end \n\t"
983 "movq (%%esi), %%mm4 \n\t"
984 "pand %%mm0, %%mm4 \n\t"
985 "movq %%mm0, %%mm6 \n\t"
986 "movq (%%edi), %%mm7 \n\t"
987 "pandn %%mm7, %%mm6 \n\t"
988 "por %%mm6, %%mm4 \n\t"
989 "movq %%mm4, (%%edi) \n\t"
991 "movq 8(%%esi), %%mm5 \n\t"
992 "pand %%mm1, %%mm5 \n\t"
993 "movq %%mm1, %%mm7 \n\t"
994 "movq 8(%%edi), %%mm6 \n\t"
995 "pandn %%mm6, %%mm7 \n\t"
996 "por %%mm7, %%mm5 \n\t"
997 "movq %%mm5, 8(%%edi) \n\t"
999 "movq 16(%%esi), %%mm6 \n\t"
1000 "pand %%mm2, %%mm6 \n\t"
1001 "movq %%mm2, %%mm4 \n\t"
1002 "movq 16(%%edi), %%mm7 \n\t"
1003 "pandn %%mm7, %%mm4 \n\t"
1004 "por %%mm4, %%mm6 \n\t"
1005 "movq %%mm6, 16(%%edi) \n\t"
1007 "movq 24(%%esi), %%mm7 \n\t"
1008 "pand %%mm3, %%mm7 \n\t"
1009 "movq %%mm3, %%mm5 \n\t"
1010 "movq 24(%%edi), %%mm4 \n\t"
1011 "pandn %%mm4, %%mm5 \n\t"
1012 "por %%mm5, %%mm7 \n\t"
1013 "movq %%mm7, 24(%%edi) \n\t"
1015 "addl $32, %%esi \n\t" // inc by 32 bytes processed
1016 "addl $32, %%edi \n\t"
1017 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1018 "ja mainloop32 \n\t"
1020 "mainloop32end: \n\t"
1021 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
1022 "movl %%eax, %%ecx \n\t"
1023 "cmpl $0, %%ecx \n\t"
1025 // preload "movl mask, %%edx \n\t"
1026 "sall $24, %%edx \n\t" // low byte => high byte
1028 "secondloop32: \n\t"
1029 "sall %%edx \n\t" // move high bit to CF
1030 "jnc skip32 \n\t" // if CF = 0
1031 "movl (%%esi), %%eax \n\t"
1032 "movl %%eax, (%%edi) \n\t"
1035 "addl $4, %%esi \n\t"
1036 "addl $4, %%edi \n\t"
1038 "jnz secondloop32 \n\t"
1043 : "=a" (dummy_value_a), // output regs (dummy)
1044 "=d" (dummy_value_d),
1045 "=c" (dummy_value_c),
1046 "=S" (dummy_value_S),
1047 "=D" (dummy_value_D)
1049 : "3" (srcptr), // esi // input regs
1050 "4" (dstptr), // edi
1052 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1056 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1057 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1058 , "%mm4", "%mm5", "%mm6", "%mm7"
1062 else /* mmx _not supported - Use modified C routine */
1064 register png_uint_32 i;
1065 png_uint_32 initial_val = 4 * png_pass_start[png_ptr->pass];
1066 // png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
1067 register int stride = 4 * png_pass_inc[png_ptr->pass];
1068 // png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1069 register int rep_bytes = 4 * png_pass_width[png_ptr->pass];
1070 // png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
1071 register png_uint_32 final_val = 4 * png_ptr->width;
1073 srcptr = png_ptr->row_buf + 1 + initial_val;
1074 dstptr = row + initial_val;
1076 for (i = initial_val; i < final_val; i += stride)
1078 png_memcpy(dstptr, srcptr, rep_bytes);
1087 case 48: // png_ptr->row_info.pixel_depth
1092 if ( _mmx_supported )
1096 int dummy_value_a; // fix 'forbidden register spilled' error
1101 _unmask = ~mask; // global variable for -fPIC version
1102 srcptr = png_ptr->row_buf + 1;
1104 len = png_ptr->width &~7; // reduce to multiple of 8
1105 diff = png_ptr->width & 7; // amount lost
1107 __asm__ __volatile__ (
1108 "movd _unmask, %%mm7 \n\t" // load bit pattern
1109 "psubb %%mm6, %%mm6 \n\t" // zero mm6
1110 "punpcklbw %%mm7, %%mm7 \n\t"
1111 "punpcklwd %%mm7, %%mm7 \n\t"
1112 "punpckldq %%mm7, %%mm7 \n\t" // fill reg with 8 masks
1114 "movq _mask48_0, %%mm0 \n\t"
1115 "movq _mask48_1, %%mm1 \n\t"
1116 "movq _mask48_2, %%mm2 \n\t"
1117 "movq _mask48_3, %%mm3 \n\t"
1118 "movq _mask48_4, %%mm4 \n\t"
1119 "movq _mask48_5, %%mm5 \n\t"
1121 "pand %%mm7, %%mm0 \n\t"
1122 "pand %%mm7, %%mm1 \n\t"
1123 "pand %%mm7, %%mm2 \n\t"
1124 "pand %%mm7, %%mm3 \n\t"
1125 "pand %%mm7, %%mm4 \n\t"
1126 "pand %%mm7, %%mm5 \n\t"
1128 "pcmpeqb %%mm6, %%mm0 \n\t"
1129 "pcmpeqb %%mm6, %%mm1 \n\t"
1130 "pcmpeqb %%mm6, %%mm2 \n\t"
1131 "pcmpeqb %%mm6, %%mm3 \n\t"
1132 "pcmpeqb %%mm6, %%mm4 \n\t"
1133 "pcmpeqb %%mm6, %%mm5 \n\t"
1135 // preload "movl len, %%ecx \n\t" // load length of line
1136 // preload "movl srcptr, %%esi \n\t" // load source
1137 // preload "movl dstptr, %%edi \n\t" // load dest
1139 "cmpl $0, %%ecx \n\t"
1140 "jz mainloop48end \n\t"
1143 "movq (%%esi), %%mm7 \n\t"
1144 "pand %%mm0, %%mm7 \n\t"
1145 "movq %%mm0, %%mm6 \n\t"
1146 "pandn (%%edi), %%mm6 \n\t"
1147 "por %%mm6, %%mm7 \n\t"
1148 "movq %%mm7, (%%edi) \n\t"
1150 "movq 8(%%esi), %%mm6 \n\t"
1151 "pand %%mm1, %%mm6 \n\t"
1152 "movq %%mm1, %%mm7 \n\t"
1153 "pandn 8(%%edi), %%mm7 \n\t"
1154 "por %%mm7, %%mm6 \n\t"
1155 "movq %%mm6, 8(%%edi) \n\t"
1157 "movq 16(%%esi), %%mm6 \n\t"
1158 "pand %%mm2, %%mm6 \n\t"
1159 "movq %%mm2, %%mm7 \n\t"
1160 "pandn 16(%%edi), %%mm7 \n\t"
1161 "por %%mm7, %%mm6 \n\t"
1162 "movq %%mm6, 16(%%edi) \n\t"
1164 "movq 24(%%esi), %%mm7 \n\t"
1165 "pand %%mm3, %%mm7 \n\t"
1166 "movq %%mm3, %%mm6 \n\t"
1167 "pandn 24(%%edi), %%mm6 \n\t"
1168 "por %%mm6, %%mm7 \n\t"
1169 "movq %%mm7, 24(%%edi) \n\t"
1171 "movq 32(%%esi), %%mm6 \n\t"
1172 "pand %%mm4, %%mm6 \n\t"
1173 "movq %%mm4, %%mm7 \n\t"
1174 "pandn 32(%%edi), %%mm7 \n\t"
1175 "por %%mm7, %%mm6 \n\t"
1176 "movq %%mm6, 32(%%edi) \n\t"
1178 "movq 40(%%esi), %%mm7 \n\t"
1179 "pand %%mm5, %%mm7 \n\t"
1180 "movq %%mm5, %%mm6 \n\t"
1181 "pandn 40(%%edi), %%mm6 \n\t"
1182 "por %%mm6, %%mm7 \n\t"
1183 "movq %%mm7, 40(%%edi) \n\t"
1185 "addl $48, %%esi \n\t" // inc by 48 bytes processed
1186 "addl $48, %%edi \n\t"
1187 "subl $8, %%ecx \n\t" // dec by 8 pixels processed
1189 "ja mainloop48 \n\t"
1191 "mainloop48end: \n\t"
1192 // preload "movl diff, %%ecx \n\t" // (diff is in eax)
1193 "movl %%eax, %%ecx \n\t"
1194 "cmpl $0, %%ecx \n\t"
1196 // preload "movl mask, %%edx \n\t"
1197 "sall $24, %%edx \n\t" // make low byte, high byte
1199 "secondloop48: \n\t"
1200 "sall %%edx \n\t" // move high bit to CF
1201 "jnc skip48 \n\t" // if CF = 0
1202 "movl (%%esi), %%eax \n\t"
1203 "movl %%eax, (%%edi) \n\t"
1206 "addl $4, %%esi \n\t"
1207 "addl $4, %%edi \n\t"
1209 "jnz secondloop48 \n\t"
1214 : "=a" (dummy_value_a), // output regs (dummy)
1215 "=d" (dummy_value_d),
1216 "=c" (dummy_value_c),
1217 "=S" (dummy_value_S),
1218 "=D" (dummy_value_D)
1220 : "3" (srcptr), // esi // input regs
1221 "4" (dstptr), // edi
1223 // was (unmask) "b" RESERVED // ebx // Global Offset Table idx
1227 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
1228 : "%mm0", "%mm1", "%mm2", "%mm3" // clobber list
1229 , "%mm4", "%mm5", "%mm6", "%mm7"
1233 else /* mmx _not supported - Use modified C routine */
1235 register png_uint_32 i;
1236 png_uint_32 initial_val = 6 * png_pass_start[png_ptr->pass];
1237 // png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
1238 register int stride = 6 * png_pass_inc[png_ptr->pass];
1239 // png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1240 register int rep_bytes = 6 * png_pass_width[png_ptr->pass];
1241 // png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
1242 register png_uint_32 final_val = 6 * png_ptr->width;
1244 srcptr = png_ptr->row_buf + 1 + initial_val;
1245 dstptr = row + initial_val;
1247 for (i = initial_val; i < final_val; i += stride)
1249 png_memcpy(dstptr, srcptr, rep_bytes);
1258 case 64: // png_ptr->row_info.pixel_depth
1262 register png_uint_32 i;
1263 png_uint_32 initial_val = 8 * png_pass_start[png_ptr->pass];
1264 // png.c: png_pass_start[] = {0, 4, 0, 2, 0, 1, 0};
1265 register int stride = 8 * png_pass_inc[png_ptr->pass];
1266 // png.c: png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1267 register int rep_bytes = 8 * png_pass_width[png_ptr->pass];
1268 // png.c: png_pass_width[] = {8, 4, 4, 2, 2, 1, 1};
1269 register png_uint_32 final_val = 8 * png_ptr->width;
1271 srcptr = png_ptr->row_buf + 1 + initial_val;
1272 dstptr = row + initial_val;
1274 for (i = initial_val; i < final_val; i += stride)
1276 png_memcpy(dstptr, srcptr, rep_bytes);
1283 default: // png_ptr->row_info.pixel_depth != 1,2,4,8,16,24,32,48,64
1285 // this should never happen
1287 "libpng internal error: png_ptr->row_info.pixel_depth = %d\n",
1288 png_ptr->row_info.pixel_depth);
1292 } /* end switch (png_ptr->row_info.pixel_depth) */
1294 } /* end if (non-trivial mask) */
1296 } /* end png_combine_row() */
1298 #endif /* PNG_HAVE_ASSEMBLER_COMBINE_ROW */
1303 //===========================================================================//
1305 // P N G _ D O _ R E A D _ I N T E R L A C E //
1307 //===========================================================================//
1309 #if defined(PNG_READ_INTERLACING_SUPPORTED)
1310 #if defined(PNG_HAVE_ASSEMBLER_READ_INTERLACE)
1312 /* png_do_read_interlace() is called after any 16-bit to 8-bit conversion
1313 * has taken place. [GRR: what other steps come before and/or after?]
// png_do_read_interlace(): expand one Adam7 interlace-pass row in place
// inside png_ptr->row_buf.  final_width = row width * png_pass_inc[pass];
// each source pixel is replicated png_pass_inc[pass] times.  Expansion runs
// right-to-left (sp/sptr and dp start at the LAST pixel) so the source and
// destination can share the same buffer without overwriting unread input.
// An MMX inline-assembly fast path is used when _mmx_supported is nonzero;
// otherwise C replication loops (png_memcpy per pixel) do the work.
// NOTE(review): this listing is elided — each line carries its original
// source line number and some lines (braces, case labels) are missing —
// so the comments below describe only what is visible here.
1317 png_do_read_interlace(png_structp png_ptr)
1319 png_row_infop row_info = &(png_ptr->row_info);
1320 png_bytep row = png_ptr->row_buf + 1;
1321 int pass = png_ptr->pass;
1322 png_uint_32 transformations = png_ptr->transformations;
1324 png_debug(1,"in png_do_read_interlace\n");
// NOTE(review): _mmx_supported == 2 appears to be a "not yet determined"
// sentinel; the handling code is elided here — confirm against the CPU
// detection routine (png_mmx_support) elsewhere in this file.
1326 if (_mmx_supported == 2) {
1330 if (row != NULL && row_info != NULL)
1332 png_uint_32 final_width;
// number of output pixels this pass contributes per input pixel
1334 final_width = row_info->width * png_pass_inc[pass];
// switch on bit depth: sub-byte depths (1/2/4 bits) replicate packed
// samples bit-by-bit; the default case handles byte-aligned depths.
1336 switch (row_info->pixel_depth)
// --- 1-bit packed pixels (mask 0x1, byte index = width >> 3) ---
1342 int s_start, s_end, s_inc;
1347 sp = row + (png_size_t)((row_info->width - 1) >> 3);
1348 dp = row + (png_size_t)((final_width - 1) >> 3);
1349 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
// PNG_PACKSWAP reverses bit order within bytes, so shifts run the other way
1350 if (transformations & PNG_PACKSWAP)
1352 sshift = (int)((row_info->width + 7) & 7);
1353 dshift = (int)((final_width + 7) & 7);
1361 sshift = 7 - (int)((row_info->width + 7) & 7);
1362 dshift = 7 - (int)((final_width + 7) & 7);
1368 for (i = row_info->width; i; i--)
1370 v = (png_byte)((*sp >> sshift) & 0x1);
// write the same 1-bit sample png_pass_inc[pass] times
1371 for (j = 0; j < png_pass_inc[pass]; j++)
1373 *dp &= (png_byte)((0x7f7f >> (7 - dshift)) & 0xff);
1374 *dp |= (png_byte)(v << dshift);
1375 if (dshift == s_end)
1383 if (sshift == s_end)
// --- 2-bit packed pixels (mask 0x3, byte index = width >> 2) ---
1398 int s_start, s_end, s_inc;
1401 sp = row + (png_size_t)((row_info->width - 1) >> 2);
1402 dp = row + (png_size_t)((final_width - 1) >> 2);
1403 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1404 if (transformations & PNG_PACKSWAP)
1406 sshift = (png_size_t)(((row_info->width + 3) & 3) << 1);
1407 dshift = (png_size_t)(((final_width + 3) & 3) << 1);
1415 sshift = (png_size_t)((3 - ((row_info->width + 3) & 3)) << 1);
1416 dshift = (png_size_t)((3 - ((final_width + 3) & 3)) << 1);
1422 for (i = row_info->width; i; i--)
1427 v = (png_byte)((*sp >> sshift) & 0x3);
1428 for (j = 0; j < png_pass_inc[pass]; j++)
1430 *dp &= (png_byte)((0x3f3f >> (6 - dshift)) & 0xff);
1431 *dp |= (png_byte)(v << dshift);
1432 if (dshift == s_end)
1440 if (sshift == s_end)
// --- 4-bit packed pixels (mask 0xf, byte index = width >> 1) ---
1455 int s_start, s_end, s_inc;
1458 sp = row + (png_size_t)((row_info->width - 1) >> 1);
1459 dp = row + (png_size_t)((final_width - 1) >> 1);
1460 #if defined(PNG_READ_PACKSWAP_SUPPORTED)
1461 if (transformations & PNG_PACKSWAP)
1463 sshift = (png_size_t)(((row_info->width + 1) & 1) << 2);
1464 dshift = (png_size_t)(((final_width + 1) & 1) << 2);
1472 sshift = (png_size_t)((1 - ((row_info->width + 1) & 1)) << 2);
1473 dshift = (png_size_t)((1 - ((final_width + 1) & 1)) << 2);
1479 for (i = row_info->width; i; i--)
1484 v = (png_byte)((*sp >> sshift) & 0xf);
1485 for (j = 0; j < png_pass_inc[pass]; j++)
1487 *dp &= (png_byte)((0xf0f >> (4 - dshift)) & 0xff);
1488 *dp |= (png_byte)(v << dshift);
1489 if (dshift == s_end)
1497 if (sshift == s_end)
1508 //====================================================================
1510 default: // 8-bit or larger (this is where the routine is modified)
1512 // static unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1513 // static unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1514 // unsigned long long _const4 = 0x0000000000FFFFFFLL; no good
1515 // unsigned long long const4 = 0x0000000000FFFFFFLL; no good
1518 png_size_t pixel_bytes;
1519 int width = row_info->width;
1521 pixel_bytes = (row_info->pixel_depth >> 3);
1523 // point sptr at the last pixel in the pre-expanded row:
1524 sptr = row + (width - 1) * pixel_bytes;
1526 // point dp at the last pixel position in the expanded row:
1527 dp = row + (final_width - 1) * pixel_bytes;
1529 // New code by Nirav Chhatrapati - Intel Corporation
// MMX path: a hand-tuned loop is selected by pixel size (3, 1, 2, 4, 8
// bytes) and by pass pair, since png_pass_inc[] = {8,8,4,4,2,2,1} gives
// the same replication factor for passes 0/1, 2/3, and 4/5.
1531 if ( _mmx_supported )
1533 //--------------------------------------------------------------
// 24-bit (3-byte) pixels: replicate x8 (passes 0/1), x4 (2/3), x2 (4/5)
1534 if (pixel_bytes == 3)
1536 if (((pass == 0) || (pass == 1)) && width)
1538 int dummy_value_c; // fix 'forbidden register spilled'
1542 __asm__ __volatile__ (
1543 "subl $21, %%edi \n\t"
1544 // (png_pass_inc[pass] - 1)*pixel_bytes
1546 ".loop3_pass0: \n\t"
1547 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1548 "pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
1549 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1550 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1551 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1552 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1553 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1554 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1555 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1556 "movq %%mm0, %%mm3 \n\t" // 2 1 0 2 1 0 2 1
1557 "psllq $16, %%mm0 \n\t" // 0 2 1 0 2 1 z z
1558 "movq %%mm3, %%mm4 \n\t" // 2 1 0 2 1 0 2 1
1559 "punpckhdq %%mm0, %%mm3 \n\t" // 0 2 1 0 2 1 0 2
1560 "movq %%mm4, 16(%%edi) \n\t"
1561 "psrlq $32, %%mm0 \n\t" // z z z z 0 2 1 0
1562 "movq %%mm3, 8(%%edi) \n\t"
1563 "punpckldq %%mm4, %%mm0 \n\t" // 1 0 2 1 0 2 1 0
1564 "subl $3, %%esi \n\t"
1565 "movq %%mm0, (%%edi) \n\t"
1566 "subl $24, %%edi \n\t"
1568 "jnz .loop3_pass0 \n\t"
1571 : "=c" (dummy_value_c), // output regs (dummy)
1572 "=S" (dummy_value_S),
1573 "=D" (dummy_value_D)
1575 : "1" (sptr), // esi // input regs
1578 // doesn't work "i" (0x0000000000FFFFFFLL) // %1 (a.k.a. _const4)
1580 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1581 : "%mm0", "%mm1", "%mm2" // clobber list
1586 else if (((pass == 2) || (pass == 3)) && width)
1588 int dummy_value_c; // fix 'forbidden register spilled'
1592 __asm__ __volatile__ (
1593 "subl $9, %%edi \n\t"
1594 // (png_pass_inc[pass] - 1)*pixel_bytes
1596 ".loop3_pass2: \n\t"
1597 "movd (%%esi), %%mm0 \n\t" // x x x x x 2 1 0
1598 "pand _const4, %%mm0 \n\t" // z z z z z 2 1 0
1599 "movq %%mm0, %%mm1 \n\t" // z z z z z 2 1 0
1600 "psllq $16, %%mm0 \n\t" // z z z 2 1 0 z z
1601 "movq %%mm0, %%mm2 \n\t" // z z z 2 1 0 z z
1602 "psllq $24, %%mm0 \n\t" // 2 1 0 z z z z z
1603 "psrlq $8, %%mm1 \n\t" // z z z z z z 2 1
1604 "por %%mm2, %%mm0 \n\t" // 2 1 0 2 1 0 z z
1605 "por %%mm1, %%mm0 \n\t" // 2 1 0 2 1 0 2 1
1606 "movq %%mm0, 4(%%edi) \n\t"
1607 "psrlq $16, %%mm0 \n\t" // z z 2 1 0 2 1 0
1608 "subl $3, %%esi \n\t"
1609 "movd %%mm0, (%%edi) \n\t"
1610 "subl $12, %%edi \n\t"
1612 "jnz .loop3_pass2 \n\t"
1615 : "=c" (dummy_value_c), // output regs (dummy)
1616 "=S" (dummy_value_S),
1617 "=D" (dummy_value_D)
1619 : "1" (sptr), // esi // input regs
1623 #if 0 /* %mm0, ..., %mm2 not supported by gcc 2.7.2.3 or egcs 1.1 */
1624 : "%mm0", "%mm1", "%mm2" // clobber list
1628 else if (width) /* && ((pass == 4) || (pass == 5)) */
// processes two pixels per iteration; remaining pixels fall through
// to the C loop below
1630 int width_mmx = ((width >> 1) << 1) - 8; // GRR: huh?
1633 width -= width_mmx; // 8 or 9 pix, 24 or 27 bytes
1636 // png_pass_inc[] = {8, 8, 4, 4, 2, 2, 1};
1637 // sptr points at last pixel in pre-expanded row
1638 // dp points at last pixel position in expanded row
1639 int dummy_value_c; // fix 'forbidden register spilled'
1643 __asm__ __volatile__ (
1644 "subl $3, %%esi \n\t"
1645 "subl $9, %%edi \n\t"
1646 // (png_pass_inc[pass] + 1)*pixel_bytes
1648 ".loop3_pass4: \n\t"
1649 "movq (%%esi), %%mm0 \n\t" // x x 5 4 3 2 1 0
1650 "movq %%mm0, %%mm1 \n\t" // x x 5 4 3 2 1 0
1651 "movq %%mm0, %%mm2 \n\t" // x x 5 4 3 2 1 0
1652 "psllq $24, %%mm0 \n\t" // 4 3 2 1 0 z z z
1653 "pand _const4, %%mm1 \n\t" // z z z z z 2 1 0
1654 "psrlq $24, %%mm2 \n\t" // z z z x x 5 4 3
1655 "por %%mm1, %%mm0 \n\t" // 4 3 2 1 0 2 1 0
1656 "movq %%mm2, %%mm3 \n\t" // z z z x x 5 4 3
1657 "psllq $8, %%mm2 \n\t" // z z x x 5 4 3 z
1658 "movq %%mm0, (%%edi) \n\t"
1659 "psrlq $16, %%mm3 \n\t" // z z z z z x x 5
1660 "pand _const6, %%mm3 \n\t" // z z z z z z z 5
1661 "por %%mm3, %%mm2 \n\t" // z z x x 5 4 3 5
1662 "subl $6, %%esi \n\t"
1663 "movd %%mm2, 8(%%edi) \n\t"
1664 "subl $12, %%edi \n\t"
1665 "subl $2, %%ecx \n\t"
1666 "jnz .loop3_pass4 \n\t"
1669 : "=c" (dummy_value_c), // output regs (dummy)
1670 "=S" (dummy_value_S),
1671 "=D" (dummy_value_D)
1673 : "1" (sptr), // esi // input regs
1675 "0" (width_mmx) // ecx
1677 #if 0 /* %mm0, ..., %mm3 not supported by gcc 2.7.2.3 or egcs 1.1 */
1678 : "%mm0", "%mm1" // clobber list
// C cleanup loop for the leftover (non-MMX) pixels of this pass
1684 sptr -= width_mmx*3;
1686 for (i = width; i; i--)
1691 png_memcpy(v, sptr, 3);
1692 for (j = 0; j < png_pass_inc[pass]; j++)
1694 png_memcpy(dp, v, 3);
1700 } /* end of pixel_bytes == 3 */
1702 //--------------------------------------------------------------
// 8-bit (1-byte) pixels: byte replication via punpck* unpacking
1703 else if (pixel_bytes == 1)
1705 if (((pass == 0) || (pass == 1)) && width)
// four source bytes per iteration; remainder handled by C loop
1707 int width_mmx = ((width >> 2) << 2);
1708 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1711 int dummy_value_c; // fix 'forbidden register spilled'
1715 __asm__ __volatile__ (
1716 "subl $3, %%esi \n\t"
1717 "subl $31, %%edi \n\t"
1719 ".loop1_pass0: \n\t"
1720 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1721 "movq %%mm0, %%mm1 \n\t" // x x x x 3 2 1 0
1722 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1723 "movq %%mm0, %%mm2 \n\t" // 3 3 2 2 1 1 0 0
1724 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1725 "movq %%mm0, %%mm3 \n\t" // 1 1 1 1 0 0 0 0
1726 "punpckldq %%mm0, %%mm0 \n\t" // 0 0 0 0 0 0 0 0
1727 "punpckhdq %%mm3, %%mm3 \n\t" // 1 1 1 1 1 1 1 1
1728 "movq %%mm0, (%%edi) \n\t"
1729 "punpckhwd %%mm2, %%mm2 \n\t" // 3 3 3 3 2 2 2 2
1730 "movq %%mm3, 8(%%edi) \n\t"
1731 "movq %%mm2, %%mm4 \n\t" // 3 3 3 3 2 2 2 2
1732 "punpckldq %%mm2, %%mm2 \n\t" // 2 2 2 2 2 2 2 2
1733 "punpckhdq %%mm4, %%mm4 \n\t" // 3 3 3 3 3 3 3 3
1734 "movq %%mm2, 16(%%edi) \n\t"
1735 "subl $4, %%esi \n\t"
1736 "movq %%mm4, 24(%%edi) \n\t"
1737 "subl $32, %%edi \n\t"
1738 "subl $4, %%ecx \n\t"
1739 "jnz .loop1_pass0 \n\t"
1742 : "=c" (dummy_value_c), // output regs (dummy)
1743 "=S" (dummy_value_S),
1744 "=D" (dummy_value_D)
1746 : "1" (sptr), // esi // input regs
1748 "0" (width_mmx) // ecx
1750 #if 0 /* %mm0, ..., %mm4 not supported by gcc 2.7.2.3 or egcs 1.1 */
1751 : "%mm0", "%mm1", "%mm2" // clobber list
1759 for (i = width; i; i--)
1763 /* I simplified this part in version 1.0.4e
1764 * here and in several other instances where
1765 * pixel_bytes == 1 -- GR-P
1770 * png_memcpy(v, sptr, pixel_bytes);
1771 * for (j = 0; j < png_pass_inc[pass]; j++)
1773 * png_memcpy(dp, v, pixel_bytes);
1774 * dp -= pixel_bytes;
1776 * sptr -= pixel_bytes;
1778 * Replacement code is in the next three lines:
1781 for (j = 0; j < png_pass_inc[pass]; j++)
1786 else if (((pass == 2) || (pass == 3)) && width)
1788 int width_mmx = ((width >> 2) << 2);
1789 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1792 int dummy_value_c; // fix 'forbidden register spilled'
1796 __asm__ __volatile__ (
1797 "subl $3, %%esi \n\t"
1798 "subl $15, %%edi \n\t"
1800 ".loop1_pass2: \n\t"
1801 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1802 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1803 "movq %%mm0, %%mm1 \n\t" // 3 3 2 2 1 1 0 0
1804 "punpcklwd %%mm0, %%mm0 \n\t" // 1 1 1 1 0 0 0 0
1805 "punpckhwd %%mm1, %%mm1 \n\t" // 3 3 3 3 2 2 2 2
1806 "movq %%mm0, (%%edi) \n\t"
1807 "subl $4, %%esi \n\t"
1808 "movq %%mm1, 8(%%edi) \n\t"
1809 "subl $16, %%edi \n\t"
1810 "subl $4, %%ecx \n\t"
1811 "jnz .loop1_pass2 \n\t"
1814 : "=c" (dummy_value_c), // output regs (dummy)
1815 "=S" (dummy_value_S),
1816 "=D" (dummy_value_D)
1818 : "1" (sptr), // esi // input regs
1820 "0" (width_mmx) // ecx
1822 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1823 : "%mm0", "%mm1" // clobber list
1830 for (i = width; i; i--)
1834 for (j = 0; j < png_pass_inc[pass]; j++)
1839 else if (width) /* && ((pass == 4) || (pass == 5)) */
// eight source bytes per iteration for the x2 replication passes
1841 int width_mmx = ((width >> 3) << 3);
1842 width -= width_mmx; // 0-3 pixels => 0-3 bytes
1845 int dummy_value_c; // fix 'forbidden register spilled'
1849 __asm__ __volatile__ (
1850 "subl $7, %%esi \n\t"
1851 "subl $15, %%edi \n\t"
1853 ".loop1_pass4: \n\t"
1854 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
1855 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
1856 "punpcklbw %%mm0, %%mm0 \n\t" // 3 3 2 2 1 1 0 0
1857 "punpckhbw %%mm1, %%mm1 \n\t" // 7 7 6 6 5 5 4 4
1858 "movq %%mm1, 8(%%edi) \n\t"
1859 "subl $8, %%esi \n\t"
1860 "movq %%mm0, (%%edi) \n\t"
1861 "subl $16, %%edi \n\t"
1862 "subl $8, %%ecx \n\t"
1863 "jnz .loop1_pass4 \n\t"
1866 : "=c" (dummy_value_c), // output regs (none)
1867 "=S" (dummy_value_S),
1868 "=D" (dummy_value_D)
1870 : "1" (sptr), // esi // input regs
1872 "0" (width_mmx) // ecx
1874 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1875 : "%mm0", "%mm1" // clobber list
1882 for (i = width; i; i--)
1886 for (j = 0; j < png_pass_inc[pass]; j++)
1891 } /* end of pixel_bytes == 1 */
1893 //--------------------------------------------------------------
// 16-bit (2-byte) pixels: word replication via punpcklwd/punpck*dq
1894 else if (pixel_bytes == 2)
1896 if (((pass == 0) || (pass == 1)) && width)
1898 int width_mmx = ((width >> 1) << 1);
1899 width -= width_mmx; // 0,1 pixels => 0,2 bytes
1902 int dummy_value_c; // fix 'forbidden register spilled'
1906 __asm__ __volatile__ (
1907 "subl $2, %%esi \n\t"
1908 "subl $30, %%edi \n\t"
1910 ".loop2_pass0: \n\t"
1911 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1912 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
1913 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
1914 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
1915 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
1916 "movq %%mm0, (%%edi) \n\t"
1917 "movq %%mm0, 8(%%edi) \n\t"
1918 "movq %%mm1, 16(%%edi) \n\t"
1919 "subl $4, %%esi \n\t"
1920 "movq %%mm1, 24(%%edi) \n\t"
1921 "subl $32, %%edi \n\t"
1922 "subl $2, %%ecx \n\t"
1923 "jnz .loop2_pass0 \n\t"
1926 : "=c" (dummy_value_c), // output regs (dummy)
1927 "=S" (dummy_value_S),
1928 "=D" (dummy_value_D)
1930 : "1" (sptr), // esi // input regs
1932 "0" (width_mmx) // ecx
1934 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1935 : "%mm0", "%mm1" // clobber list
1940 sptr -= (width_mmx*2 - 2); // sign fixed
1941 dp -= (width_mmx*16 - 2); // sign fixed
1942 for (i = width; i; i--)
1947 png_memcpy(v, sptr, 2);
1948 for (j = 0; j < png_pass_inc[pass]; j++)
1951 png_memcpy(dp, v, 2);
1955 else if (((pass == 2) || (pass == 3)) && width)
1957 int width_mmx = ((width >> 1) << 1) ;
1958 width -= width_mmx; // 0,1 pixels => 0,2 bytes
1961 int dummy_value_c; // fix 'forbidden register spilled'
1965 __asm__ __volatile__ (
1966 "subl $2, %%esi \n\t"
1967 "subl $14, %%edi \n\t"
1969 ".loop2_pass2: \n\t"
1970 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
1971 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
1972 "movq %%mm0, %%mm1 \n\t" // 3 2 3 2 1 0 1 0
1973 "punpckldq %%mm0, %%mm0 \n\t" // 1 0 1 0 1 0 1 0
1974 "punpckhdq %%mm1, %%mm1 \n\t" // 3 2 3 2 3 2 3 2
1975 "movq %%mm0, (%%edi) \n\t"
1976 "subl $4, %%esi \n\t"
1977 "movq %%mm1, 8(%%edi) \n\t"
1978 "subl $16, %%edi \n\t"
1979 "subl $2, %%ecx \n\t"
1980 "jnz .loop2_pass2 \n\t"
1983 : "=c" (dummy_value_c), // output regs (dummy)
1984 "=S" (dummy_value_S),
1985 "=D" (dummy_value_D)
1987 : "1" (sptr), // esi // input regs
1989 "0" (width_mmx) // ecx
1991 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
1992 : "%mm0", "%mm1" // clobber list
1997 sptr -= (width_mmx*2 - 2); // sign fixed
1998 dp -= (width_mmx*8 - 2); // sign fixed
1999 for (i = width; i; i--)
2004 png_memcpy(v, sptr, 2);
2005 for (j = 0; j < png_pass_inc[pass]; j++)
2008 png_memcpy(dp, v, 2);
2012 else if (width) // pass == 4 or 5
2014 int width_mmx = ((width >> 1) << 1) ;
2015 width -= width_mmx; // 0,1 pixels => 0,2 bytes
2018 int dummy_value_c; // fix 'forbidden register spilled'
2022 __asm__ __volatile__ (
2023 "subl $2, %%esi \n\t"
2024 "subl $6, %%edi \n\t"
2026 ".loop2_pass4: \n\t"
2027 "movd (%%esi), %%mm0 \n\t" // x x x x 3 2 1 0
2028 "punpcklwd %%mm0, %%mm0 \n\t" // 3 2 3 2 1 0 1 0
2029 "subl $4, %%esi \n\t"
2030 "movq %%mm0, (%%edi) \n\t"
2031 "subl $8, %%edi \n\t"
2032 "subl $2, %%ecx \n\t"
2033 "jnz .loop2_pass4 \n\t"
2036 : "=c" (dummy_value_c), // output regs (dummy)
2037 "=S" (dummy_value_S),
2038 "=D" (dummy_value_D)
2040 : "1" (sptr), // esi // input regs
2042 "0" (width_mmx) // ecx
2044 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2045 : "%mm0" // clobber list
2050 sptr -= (width_mmx*2 - 2); // sign fixed
2051 dp -= (width_mmx*4 - 2); // sign fixed
2052 for (i = width; i; i--)
2057 png_memcpy(v, sptr, 2);
2058 for (j = 0; j < png_pass_inc[pass]; j++)
2061 png_memcpy(dp, v, 2);
2065 } /* end of pixel_bytes == 2 */
2067 //--------------------------------------------------------------
// 32-bit (4-byte) pixels: dword replication via punpckldq/punpckhdq
2068 else if (pixel_bytes == 4)
2070 if (((pass == 0) || (pass == 1)) && width)
2072 int width_mmx = ((width >> 1) << 1);
2073 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2076 int dummy_value_c; // fix 'forbidden register spilled'
2080 __asm__ __volatile__ (
2081 "subl $4, %%esi \n\t"
2082 "subl $60, %%edi \n\t"
2084 ".loop4_pass0: \n\t"
2085 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2086 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2087 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2088 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2089 "movq %%mm0, (%%edi) \n\t"
2090 "movq %%mm0, 8(%%edi) \n\t"
2091 "movq %%mm0, 16(%%edi) \n\t"
2092 "movq %%mm0, 24(%%edi) \n\t"
2093 "movq %%mm1, 32(%%edi) \n\t"
2094 "movq %%mm1, 40(%%edi) \n\t"
2095 "movq %%mm1, 48(%%edi) \n\t"
2096 "subl $8, %%esi \n\t"
2097 "movq %%mm1, 56(%%edi) \n\t"
2098 "subl $64, %%edi \n\t"
2099 "subl $2, %%ecx \n\t"
2100 "jnz .loop4_pass0 \n\t"
2103 : "=c" (dummy_value_c), // output regs (dummy)
2104 "=S" (dummy_value_S),
2105 "=D" (dummy_value_D)
2107 : "1" (sptr), // esi // input regs
2109 "0" (width_mmx) // ecx
2111 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2112 : "%mm0", "%mm1" // clobber list
2117 sptr -= (width_mmx*4 - 4); // sign fixed
2118 dp -= (width_mmx*32 - 4); // sign fixed
2119 for (i = width; i; i--)
2124 png_memcpy(v, sptr, 4);
2125 for (j = 0; j < png_pass_inc[pass]; j++)
2128 png_memcpy(dp, v, 4);
2132 else if (((pass == 2) || (pass == 3)) && width)
2134 int width_mmx = ((width >> 1) << 1);
2135 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2138 int dummy_value_c; // fix 'forbidden register spilled'
2142 __asm__ __volatile__ (
2143 "subl $4, %%esi \n\t"
2144 "subl $28, %%edi \n\t"
2146 ".loop4_pass2: \n\t"
2147 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2148 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2149 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2150 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2151 "movq %%mm0, (%%edi) \n\t"
2152 "movq %%mm0, 8(%%edi) \n\t"
2153 "movq %%mm1, 16(%%edi) \n\t"
2154 "movq %%mm1, 24(%%edi) \n\t"
2155 "subl $8, %%esi \n\t"
2156 "subl $32, %%edi \n\t"
2157 "subl $2, %%ecx \n\t"
2158 "jnz .loop4_pass2 \n\t"
2161 : "=c" (dummy_value_c), // output regs (dummy)
2162 "=S" (dummy_value_S),
2163 "=D" (dummy_value_D)
2165 : "1" (sptr), // esi // input regs
2167 "0" (width_mmx) // ecx
2169 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2170 : "%mm0", "%mm1" // clobber list
2175 sptr -= (width_mmx*4 - 4); // sign fixed
2176 dp -= (width_mmx*16 - 4); // sign fixed
2177 for (i = width; i; i--)
2182 png_memcpy(v, sptr, 4);
2183 for (j = 0; j < png_pass_inc[pass]; j++)
2186 png_memcpy(dp, v, 4);
2190 else if (width) // pass == 4 or 5
2192 int width_mmx = ((width >> 1) << 1) ;
2193 width -= width_mmx; // 0,1 pixels => 0,4 bytes
2196 int dummy_value_c; // fix 'forbidden register spilled'
2200 __asm__ __volatile__ (
2201 "subl $4, %%esi \n\t"
2202 "subl $12, %%edi \n\t"
2204 ".loop4_pass4: \n\t"
2205 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2206 "movq %%mm0, %%mm1 \n\t" // 7 6 5 4 3 2 1 0
2207 "punpckldq %%mm0, %%mm0 \n\t" // 3 2 1 0 3 2 1 0
2208 "punpckhdq %%mm1, %%mm1 \n\t" // 7 6 5 4 7 6 5 4
2209 "movq %%mm0, (%%edi) \n\t"
2210 "subl $8, %%esi \n\t"
2211 "movq %%mm1, 8(%%edi) \n\t"
2212 "subl $16, %%edi \n\t"
2213 "subl $2, %%ecx \n\t"
2214 "jnz .loop4_pass4 \n\t"
2217 : "=c" (dummy_value_c), // output regs (dummy)
2218 "=S" (dummy_value_S),
2219 "=D" (dummy_value_D)
2221 : "1" (sptr), // esi // input regs
2223 "0" (width_mmx) // ecx
2225 #if 0 /* %mm0, %mm1 not supported by gcc 2.7.2.3 or egcs 1.1 */
2226 : "%mm0", "%mm1" // clobber list
2231 sptr -= (width_mmx*4 - 4); // sign fixed
2232 dp -= (width_mmx*8 - 4); // sign fixed
2233 for (i = width; i; i--)
2238 png_memcpy(v, sptr, 4);
2239 for (j = 0; j < png_pass_inc[pass]; j++)
2242 png_memcpy(dp, v, 4);
2246 } /* end of pixel_bytes == 4 */
2248 //--------------------------------------------------------------
// 64-bit (8-byte) pixels: whole-qword copies, one movq per output pixel
2249 else if (pixel_bytes == 8)
2251 // GRR TEST: should work, but needs testing (special 64-bit version of rpng2?)
2252 // GRR NOTE: no need to combine passes here!
2253 if (((pass == 0) || (pass == 1)) && width)
2255 int dummy_value_c; // fix 'forbidden register spilled'
2259 // source is 8-byte RRGGBBAA
2260 // dest is 64-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA ...
2261 __asm__ __volatile__ (
2262 "subl $56, %%edi \n\t" // start of last block
2264 ".loop8_pass0: \n\t"
2265 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2266 "movq %%mm0, (%%edi) \n\t"
2267 "movq %%mm0, 8(%%edi) \n\t"
2268 "movq %%mm0, 16(%%edi) \n\t"
2269 "movq %%mm0, 24(%%edi) \n\t"
2270 "movq %%mm0, 32(%%edi) \n\t"
2271 "movq %%mm0, 40(%%edi) \n\t"
2272 "movq %%mm0, 48(%%edi) \n\t"
2273 "subl $8, %%esi \n\t"
2274 "movq %%mm0, 56(%%edi) \n\t"
2275 "subl $64, %%edi \n\t"
2277 "jnz .loop8_pass0 \n\t"
2280 : "=c" (dummy_value_c), // output regs (dummy)
2281 "=S" (dummy_value_S),
2282 "=D" (dummy_value_D)
2284 : "1" (sptr), // esi // input regs
2288 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2289 : "%mm0" // clobber list
2293 else if (((pass == 2) || (pass == 3)) && width)
2295 // source is 8-byte RRGGBBAA
2296 // dest is 32-byte RRGGBBAA RRGGBBAA RRGGBBAA RRGGBBAA
2297 int width_mmx = ((width >> 1) << 1) ;
2301 int dummy_value_c; // fix 'forbidden register spilled'
2305 __asm__ __volatile__ (
2306 "subl $24, %%edi \n\t" // start of last block
2308 ".loop8_pass2: \n\t"
2309 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2310 "movq %%mm0, (%%edi) \n\t"
2311 "movq %%mm0, 8(%%edi) \n\t"
2312 "movq %%mm0, 16(%%edi) \n\t"
2313 "subl $8, %%esi \n\t"
2314 "movq %%mm0, 24(%%edi) \n\t"
2315 "subl $32, %%edi \n\t"
2317 "jnz .loop8_pass2 \n\t"
2320 : "=c" (dummy_value_c), // output regs (dummy)
2321 "=S" (dummy_value_S),
2322 "=D" (dummy_value_D)
2324 : "1" (sptr), // esi // input regs
2328 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2329 : "%mm0" // clobber list
2334 else if (width) // pass == 4 or 5
2336 // source is 8-byte RRGGBBAA
2337 // dest is 16-byte RRGGBBAA RRGGBBAA
2338 int width_mmx = ((width >> 1) << 1) ;
2342 int dummy_value_c; // fix 'forbidden register spilled'
2346 __asm__ __volatile__ (
2347 "subl $8, %%edi \n\t" // start of last block
2349 ".loop8_pass4: \n\t"
2350 "movq (%%esi), %%mm0 \n\t" // 7 6 5 4 3 2 1 0
2351 "movq %%mm0, (%%edi) \n\t"
2352 "subl $8, %%esi \n\t"
2353 "movq %%mm0, 8(%%edi) \n\t"
2354 "subl $16, %%edi \n\t"
2356 "jnz .loop8_pass4 \n\t"
2359 : "=c" (dummy_value_c), // output regs (dummy)
2360 "=S" (dummy_value_S),
2361 "=D" (dummy_value_D)
2363 : "1" (sptr), // esi // input regs
2367 #if 0 /* %mm0 not supported by gcc 2.7.2.3 or egcs 1.1 */
2368 : "%mm0" // clobber list
2374 } /* end of pixel_bytes == 8 */
2376 //--------------------------------------------------------------
// 48-bit (6-byte) pixels: no MMX loop yet (see TODO in file header);
// plain C replication is used even on the MMX path
2377 else if (pixel_bytes == 6)
2379 for (i = width; i; i--)
2383 png_memcpy(v, sptr, 6);
2384 for (j = 0; j < png_pass_inc[pass]; j++)
2386 png_memcpy(dp, v, 6);
2391 } /* end of pixel_bytes == 6 */
2393 //--------------------------------------------------------------
// generic fallback within the MMX branch for any other pixel size
2396 for (i = width; i; i--)
2400 png_memcpy(v, sptr, pixel_bytes);
2401 for (j = 0; j < png_pass_inc[pass]; j++)
2403 png_memcpy(dp, v, pixel_bytes);
2409 } // end of _mmx_supported ========================================
2411 else /* MMX not supported: use modified C code - takes advantage
2412 * of inlining of memcpy for a constant */
2413 /* GRR 19991007: does it? or should pixel_bytes in each
2414 * block be replaced with immediate value (e.g., 1)? */
2415 /* GRR 19991017: replaced with constants in each case */
// pure-C path: same right-to-left replication, with the copy length
// spelled as a literal constant per branch so memcpy can be inlined
2417 if (pixel_bytes == 1)
2419 for (i = width; i; i--)
2422 for (j = 0; j < png_pass_inc[pass]; j++)
2427 else if (pixel_bytes == 3)
2429 for (i = width; i; i--)
2433 png_memcpy(v, sptr, 3);
2434 for (j = 0; j < png_pass_inc[pass]; j++)
2436 png_memcpy(dp, v, 3);
2442 else if (pixel_bytes == 2)
2444 for (i = width; i; i--)
2448 png_memcpy(v, sptr, 2);
2449 for (j = 0; j < png_pass_inc[pass]; j++)
2451 png_memcpy(dp, v, 2);
2457 else if (pixel_bytes == 4)
2459 for (i = width; i; i--)
2463 png_memcpy(v, sptr, 4);
2464 for (j = 0; j < png_pass_inc[pass]; j++)
2466 png_memcpy(dp, v, 4);
2472 else if (pixel_bytes == 6)
2474 for (i = width; i; i--)
2478 png_memcpy(v, sptr, 6);
2479 for (j = 0; j < png_pass_inc[pass]; j++)
2481 png_memcpy(dp, v, 6);
2487 else if (pixel_bytes == 8)
2489 for (i = width; i; i--)
2493 png_memcpy(v, sptr, 8);
2494 for (j = 0; j < png_pass_inc[pass]; j++)
2496 png_memcpy(dp, v, 8);
2502 else // GRR: should never be reached
2504 for (i = width; i; i--)
2508 png_memcpy(v, sptr, pixel_bytes);
2509 for (j = 0; j < png_pass_inc[pass]; j++)
2511 png_memcpy(dp, v, pixel_bytes);
2514 sptr -= pixel_bytes;
2518 } /* end if (MMX not supported) */
2521 } /* end switch (row_info->pixel_depth) */
// finally, record the expanded geometry so later stages see the full row
2523 row_info->width = final_width;
2524 row_info->rowbytes = ((final_width *
2525 (png_uint_32)row_info->pixel_depth + 7) >> 3);
2528 } /* end png_do_read_interlace() */
2530 #endif /* PNG_HAVE_ASSEMBLER_READ_INTERLACE */
2531 #endif /* PNG_READ_INTERLACING_SUPPORTED */
2536 // These variables are utilized in the functions below. They are declared
2537 // globally here to ensure alignment on 8-byte boundaries.
2542 } _LBCarryMask = {0x0101010101010101LL},
2543 _HBClearMask = {0x7f7f7f7f7f7f7f7fLL},
2544 _ActiveMask, _ActiveMask2, _ActiveMaskEnd, _ShiftBpp, _ShiftRem;
2549 //===========================================================================//
2551 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ A V G //
2553 //===========================================================================//
2555 // Optimized code for PNG Average filter decoder
2557 static void /* PRIVATE */
2558 png_read_filter_row_mmx_avg(png_row_infop row_info, png_bytep row,
2562 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
2566 bpp = (row_info->pixel_depth + 7) >> 3; // get # bytes per pixel
2567 _FullLength = row_info->rowbytes; // # of bytes to filter
2569 __asm__ __volatile__ (
2570 // initialize address pointers and offset
2572 "pushl %%ebx \n\t" // save index to Global Offset Table
2574 //pre "movl row, %%edi \n\t" // edi: Avg(x)
2575 "xorl %%ebx, %%ebx \n\t" // ebx: x
2576 "movl %%edi, %%edx \n\t"
2577 //pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
2578 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
2579 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
2581 "xorl %%eax,%%eax \n\t"
2583 // Compute the Raw value for the first bpp bytes
2584 // Raw(x) = Avg(x) + (Prior(x)/2)
2586 "movb (%%esi,%%ebx,),%%al \n\t" // load al with Prior(x)
2588 "shrb %%al \n\t" // divide by 2
2589 "addb -1(%%edi,%%ebx,),%%al \n\t" // add Avg(x); -1 to offset inc ebx
2590 //pre "cmpl bpp, %%ebx \n\t" // (bpp is preloaded into ecx)
2591 "cmpl %%ecx, %%ebx \n\t"
2592 "movb %%al,-1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2593 "jb avg_rlp \n\t" // mov does not affect flags
2595 // get # of bytes to alignment
2596 "movl %%edi, _dif \n\t" // take start of row
2597 "addl %%ebx, _dif \n\t" // add bpp
2598 "addl $0xf, _dif \n\t" // add 7+8 to incr past alignment bdry
2599 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
2600 "subl %%edi, _dif \n\t" // subtract from start => value ebx at
2601 "jz avg_go \n\t" // alignment
2604 // Compute the Raw value for the bytes up to the alignment boundary
2605 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2606 "xorl %%ecx, %%ecx \n\t"
2609 "xorl %%eax, %%eax \n\t"
2610 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2611 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2612 "addw %%cx, %%ax \n\t"
2614 "shrw %%ax \n\t" // divide by 2
2615 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2616 "cmpl _dif, %%ebx \n\t" // check if at alignment boundary
2617 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write Raw(x); -1 to offset inc ebx
2618 "jb avg_lp1 \n\t" // repeat until at alignment boundary
2621 "movl _FullLength, %%eax \n\t"
2622 "movl %%eax, %%ecx \n\t"
2623 "subl %%ebx, %%eax \n\t" // subtract alignment fix
2624 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
2625 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
2626 "movl %%ecx, _MMXLength \n\t"
2628 "popl %%ebx \n\t" // restore index to Global Offset Table
2631 : "=c" (dummy_value_c), // output regs (dummy)
2632 "=S" (dummy_value_S),
2633 "=D" (dummy_value_D)
2635 : "0" (bpp), // ecx // input regs
2636 "1" (prev_row), // esi
2639 : "%eax", "%edx" // clobber list
2643 // GRR: INCLUDE "memory" as clobbered? (_dif, _MMXLength)
2644 // (seems to work fine without...)
2647 // now do the math for the rest of the row
2652 _ActiveMask.use = 0x0000000000ffffffLL;
2653 _ShiftBpp.use = 24; // == 3 * 8
2654 _ShiftRem.use = 40; // == 64 - 24
2656 __asm__ __volatile__ (
2657 // re-init address pointers and offset
2658 "movq _ActiveMask, %%mm7 \n\t"
2659 "movl _dif, %%ecx \n\t" // ecx: x = offset to
2660 "movq _LBCarryMask, %%mm5 \n\t" // alignment boundary
2661 // preload "movl row, %%edi \n\t" // edi: Avg(x)
2662 "movq _HBClearMask, %%mm4 \n\t"
2663 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2665 // prime the pump: load the first Raw(x-bpp) data set
2666 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2667 // (correct pos. in loop below)
2669 "movq (%%edi,%%ecx,), %%mm0 \n\t" // load mm0 with Avg(x)
2670 "movq %%mm5, %%mm3 \n\t"
2671 "psrlq _ShiftRem, %%mm2 \n\t" // correct position Raw(x-bpp) data
2672 "movq (%%esi,%%ecx,), %%mm1 \n\t" // load mm1 with Prior(x)
2673 "movq %%mm7, %%mm6 \n\t"
2674 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2675 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2676 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
2677 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
2678 // add 1st active group (Raw(x-bpp)/2) to average with LBCarry
2679 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2680 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2681 // lsb's were == 1 (only valid for active group)
2682 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2683 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2684 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2685 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1 bytes to add to Avg
2686 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2688 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2689 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 3-5
2690 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2691 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2692 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2693 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2694 // lsb's were == 1 (only valid for active group)
2695 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2696 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2697 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2698 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2 bytes to add to Avg
2699 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2702 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2703 "psllq _ShiftBpp, %%mm6 \n\t" // shift mm6 mask to cover last two
2705 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2706 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2707 // Data only needs to be shifted once here to
2708 // get the correct x-bpp offset.
2709 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2710 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2711 // lsb's were == 1 (only valid for active group)
2712 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2713 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2714 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2715 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 3 bytes to add to Avg
2716 "addl $8, %%ecx \n\t"
2717 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2719 // now ready to write back to memory
2720 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2721 // move updated Raw(x) to use as Raw(x-bpp) for next loop
2722 "cmpl _MMXLength, %%ecx \n\t"
2723 "movq %%mm0, %%mm2 \n\t" // mov updated Raw(x) to mm2
2726 : "=S" (dummy_value_S), // output regs (dummy)
2727 "=D" (dummy_value_D)
2729 : "0" (prev_row), // esi // input regs
2732 : "%ecx" // clobber list
2733 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2734 , "%mm0", "%mm1", "%mm2", "%mm3"
2735 , "%mm4", "%mm5", "%mm6", "%mm7"
2743 //case 7: // who wrote this? PNG doesn't support 5 or 7 bytes/pixel
2744 //case 5: // GRR BOGUS
2746 _ActiveMask.use = 0xffffffffffffffffLL; // use shift below to clear
2747 // appropriate inactive bytes
2748 _ShiftBpp.use = bpp << 3;
2749 _ShiftRem.use = 64 - _ShiftBpp.use;
2751 __asm__ __volatile__ (
2752 "movq _HBClearMask, %%mm4 \n\t"
2754 // re-init address pointers and offset
2755 "movl _dif, %%ecx \n\t" // ecx: x = offset to alignment boundary
2757 // load _ActiveMask and clear all bytes except for 1st active group
2758 "movq _ActiveMask, %%mm7 \n\t"
2759 // preload "movl row, %%edi \n\t" // edi: Avg(x)
2760 "psrlq _ShiftRem, %%mm7 \n\t"
2761 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2762 "movq %%mm7, %%mm6 \n\t"
2763 "movq _LBCarryMask, %%mm5 \n\t"
2764 "psllq _ShiftBpp, %%mm6 \n\t" // create mask for 2nd active group
2766 // prime the pump: load the first Raw(x-bpp) data set
2767 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2768 // (we correct pos. in loop below)
2770 "movq (%%edi,%%ecx,), %%mm0 \n\t"
2771 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
2772 "movq (%%esi,%%ecx,), %%mm1 \n\t"
2773 // add (Prev_row/2) to average
2774 "movq %%mm5, %%mm3 \n\t"
2775 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2776 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2777 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
2778 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
2779 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
2780 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2781 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2782 // lsb's were == 1 (only valid for active group)
2783 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2784 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2785 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2786 "pand %%mm7, %%mm2 \n\t" // leave only Active Group 1 bytes to add to Avg
2787 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2789 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2790 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2791 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2792 "addl $8, %%ecx \n\t"
2793 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2794 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2795 // lsb's were == 1 (only valid for active group)
2796 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2797 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2798 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2799 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2 bytes to add to Avg
2800 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active
2802 "cmpl _MMXLength, %%ecx \n\t"
2803 // now ready to write back to memory
2804 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2805 // prep Raw(x-bpp) for next loop
2806 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2809 : "=S" (dummy_value_S), // output regs (dummy)
2810 "=D" (dummy_value_D)
2812 : "0" (prev_row), // esi // input regs
2815 : "%ecx" // clobber list
2816 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2817 , "%mm0", "%mm1", "%mm2", "%mm3"
2818 , "%mm4", "%mm5", "%mm6", "%mm7"
2822 break; // end 4,6 bpp
2826 _ActiveMask.use = 0x000000000000ffffLL;
2827 _ShiftBpp.use = 16; // == 2 * 8
2828 _ShiftRem.use = 48; // == 64 - 16
2830 __asm__ __volatile__ (
2832 "movq _ActiveMask, %%mm7 \n\t"
2833 // re-init address pointers and offset
2834 "movl _dif, %%ecx \n\t" // ecx: x = offset to alignment boundary
2835 "movq _LBCarryMask, %%mm5 \n\t"
2836 // preload "movl row, %%edi \n\t" // edi: Avg(x)
2837 "movq _HBClearMask, %%mm4 \n\t"
2838 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2840 // prime the pump: load the first Raw(x-bpp) data set
2841 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2842 // (we correct pos. in loop below)
2844 "movq (%%edi,%%ecx,), %%mm0 \n\t"
2845 "psrlq _ShiftRem, %%mm2 \n\t" // shift data to pos. correctly
2846 "movq (%%esi,%%ecx,), %%mm1 \n\t" // (GRR BUGFIX: was psllq)
2847 // add (Prev_row/2) to average
2848 "movq %%mm5, %%mm3 \n\t"
2849 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
2850 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
2851 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
2852 "movq %%mm7, %%mm6 \n\t"
2853 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
2855 // add 1st active group (Raw(x-bpp)/2) to average with _LBCarry
2856 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2857 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2858 // lsb's were == 1 (only valid for active group)
2859 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2860 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2861 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2862 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 1 bytes to add to Avg
2863 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2865 // add 2nd active group (Raw(x-bpp)/2) to average with _LBCarry
2866 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 2 & 3
2867 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2868 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2869 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2870 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2871 // lsb's were == 1 (only valid for active group)
2872 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2873 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2874 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2875 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 2 bytes to add to Avg
2876 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2878 // add 3rd active group (Raw(x-bpp)/2) to average with _LBCarry
2879 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 4 & 5
2880 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2881 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2882 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2883 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2884 // lsb's were == 1 (only valid for active group)
2885 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2886 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2887 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2888 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 3 bytes to add to Avg
2889 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2891 // add 4th active group (Raw(x-bpp)/2) to average with _LBCarry
2892 "psllq _ShiftBpp, %%mm6 \n\t" // shift the mm6 mask to cover bytes 6 & 7
2893 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2894 "psllq _ShiftBpp, %%mm2 \n\t" // shift data to pos. correctly
2895 "addl $8, %%ecx \n\t"
2896 "movq %%mm3, %%mm1 \n\t" // now use mm1 for getting LBCarrys
2897 "pand %%mm2, %%mm1 \n\t" // get LBCarrys for each byte where both
2898 // lsb's were == 1 (only valid for active group)
2899 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
2900 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
2901 "paddb %%mm1, %%mm2 \n\t" // add LBCarrys to (Raw(x-bpp)/2) for each byte
2902 "pand %%mm6, %%mm2 \n\t" // leave only Active Group 4 bytes to add to Avg
2903 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) + LBCarrys to Avg for each Active byte
2905 "cmpl _MMXLength, %%ecx \n\t"
2906 // now ready to write back to memory
2907 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
2908 // prep Raw(x-bpp) for next loop
2909 "movq %%mm0, %%mm2 \n\t" // mov updated Raws to mm2
2912 : "=S" (dummy_value_S), // output regs (dummy)
2913 "=D" (dummy_value_D)
2915 : "0" (prev_row), // esi // input regs
2918 : "%ecx" // clobber list
2919 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
2920 , "%mm0", "%mm1", "%mm2", "%mm3"
2921 , "%mm4", "%mm5", "%mm6", "%mm7"
2929 __asm__ __volatile__ (
2930 // re-init address pointers and offset
2932 "pushl %%ebx \n\t" // save Global Offset Table index
2934 "movl _dif, %%ebx \n\t" // ebx: x = offset to alignment boundary
2935 // preload "movl row, %%edi \n\t" // edi: Avg(x)
2936 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
2938 // do Paeth decode for remaining bytes
2939 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2940 "movl %%edi, %%edx \n\t"
2941 // preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
2942 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
2943 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
2946 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
2947 "xorl %%eax, %%eax \n\t"
2948 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
2949 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
2950 "addw %%cx, %%ax \n\t"
2952 "shrw %%ax \n\t" // divide by 2
2953 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
2954 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
2955 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x);
2956 // mov does not affect flags; -1 to offset inc ebx
2961 "popl %%ebx \n\t" // Global Offset Table index
2964 : "=c" (dummy_value_c), // output regs (dummy)
2965 "=S" (dummy_value_S),
2966 "=D" (dummy_value_D)
2968 : "0" (bpp), // ecx // input regs
2969 "1" (prev_row), // esi
2972 : "%eax", "%edx" // clobber list
2978 return; // end 1 bpp
2982 __asm__ __volatile__ (
2983 // re-init address pointers and offset
2984 "movl _dif, %%ecx \n\t" // ecx: x == offset to alignment
2985 "movq _LBCarryMask, %%mm5 \n\t" // boundary
2986 // preload "movl row, %%edi \n\t" // edi: Avg(x)
2987 "movq _HBClearMask, %%mm4 \n\t"
2988 // preload "movl prev_row, %%esi \n\t" // esi: Prior(x)
2990 // prime the pump: load the first Raw(x-bpp) data set
2991 "movq -8(%%edi,%%ecx,), %%mm2 \n\t" // load previous aligned 8 bytes
2992 // (NO NEED to correct pos. in loop below)
2995 "movq (%%edi,%%ecx,), %%mm0 \n\t"
2996 "movq %%mm5, %%mm3 \n\t"
2997 "movq (%%esi,%%ecx,), %%mm1 \n\t"
2998 "addl $8, %%ecx \n\t"
2999 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3000 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3001 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte
3002 // where both lsb's were == 1
3003 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3004 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7, each byte
3005 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg, each byte
3006 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7, each byte
3007 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg, each
3008 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each
3009 "cmpl _MMXLength, %%ecx \n\t"
3010 "movq %%mm0, -8(%%edi,%%ecx,) \n\t"
3011 "movq %%mm0, %%mm2 \n\t" // reuse as Raw(x-bpp)
3014 : "=S" (dummy_value_S), // output regs (dummy)
3015 "=D" (dummy_value_D)
3017 : "0" (prev_row), // esi // input regs
3020 : "%ecx" // clobber list
3021 #if 0 /* %mm0, ..., %mm5 not supported by gcc 2.7.2.3 or egcs 1.1 */
3022 , "%mm0", "%mm1", "%mm2"
3023 , "%mm3", "%mm4", "%mm5"
3029 default: // bpp greater than 8 (!= 1,2,3,4,[5],6,[7],8)
3032 // GRR: PRINT ERROR HERE: SHOULD NEVER BE REACHED
3034 "libpng: internal logic error (png_read_filter_row_mmx_avg())\n");
3037 __asm__ __volatile__ (
3038 "movq _LBCarryMask, %%mm5 \n\t"
3039 // re-init address pointers and offset
3040 "movl _dif, %%ebx \n\t" // ebx: x = offset to alignment boundary
3041 "movl row, %%edi \n\t" // edi: Avg(x)
3042 "movq _HBClearMask, %%mm4 \n\t"
3043 "movl %%edi, %%edx \n\t"
3044 "movl prev_row, %%esi \n\t" // esi: Prior(x)
3045 "subl bpp, %%edx \n\t" // edx: Raw(x-bpp)
3047 "movq (%%edi,%%ebx,), %%mm0 \n\t"
3048 "movq %%mm5, %%mm3 \n\t"
3049 "movq (%%esi,%%ebx,), %%mm1 \n\t"
3050 "pand %%mm1, %%mm3 \n\t" // get lsb for each prev_row byte
3051 "movq (%%edx,%%ebx,), %%mm2 \n\t"
3052 "psrlq $1, %%mm1 \n\t" // divide prev_row bytes by 2
3053 "pand %%mm2, %%mm3 \n\t" // get LBCarrys for each byte where both
3055 "psrlq $1, %%mm2 \n\t" // divide raw bytes by 2
3056 "pand %%mm4, %%mm1 \n\t" // clear invalid bit 7 of each byte
3057 "paddb %%mm3, %%mm0 \n\t" // add LBCarrys to Avg for each byte
3058 "pand %%mm4, %%mm2 \n\t" // clear invalid bit 7 of each byte
3059 "paddb %%mm1, %%mm0 \n\t" // add (Prev_row/2) to Avg for each byte
3060 "addl $8, %%ebx \n\t"
3061 "paddb %%mm2, %%mm0 \n\t" // add (Raw/2) to Avg for each byte
3062 "cmpl _MMXLength, %%ebx \n\t"
3063 "movq %%mm0, -8(%%edi,%%ebx,) \n\t"
3066 : // FIXASM: output regs/vars go here, e.g.: "=m" (memory_var)
3068 : // FIXASM: input regs, e.g.: "c" (count), "S" (src), "D" (dest)
3070 : "%ebx", "%edx", "%edi", "%esi" // CHECKASM: clobber list
3072 #endif /* 0 - NEVER REACHED */
3076 } // end switch (bpp)
3078 __asm__ __volatile__ (
3079 // MMX acceleration complete; now do clean-up
3080 // check if any remaining bytes left to decode
3082 "pushl %%ebx \n\t" // save index to Global Offset Table
3084 "movl _MMXLength, %%ebx \n\t" // ebx: x == offset bytes after MMX
3085 //pre "movl row, %%edi \n\t" // edi: Avg(x)
3086 "cmpl _FullLength, %%ebx \n\t" // test if offset at end of array
3089 // do Avg decode for remaining bytes
3090 //pre "movl prev_row, %%esi \n\t" // esi: Prior(x)
3091 "movl %%edi, %%edx \n\t"
3092 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3093 "subl %%ecx, %%edx \n\t" // edx: Raw(x-bpp)
3094 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
3097 // Raw(x) = Avg(x) + ((Raw(x-bpp) + Prior(x))/2)
3098 "xorl %%eax, %%eax \n\t"
3099 "movb (%%esi,%%ebx,), %%cl \n\t" // load cl with Prior(x)
3100 "movb (%%edx,%%ebx,), %%al \n\t" // load al with Raw(x-bpp)
3101 "addw %%cx, %%ax \n\t"
3103 "shrw %%ax \n\t" // divide by 2
3104 "addb -1(%%edi,%%ebx,), %%al \n\t" // add Avg(x); -1 to offset inc ebx
3105 "cmpl _FullLength, %%ebx \n\t" // check if at end of array
3106 "movb %%al, -1(%%edi,%%ebx,) \n\t" // write back Raw(x) [mov does not
3107 "jb avg_lp2 \n\t" // affect flags; -1 to offset inc ebx]
3110 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
3112 "popl %%ebx \n\t" // restore index to Global Offset Table
3115 : "=c" (dummy_value_c), // output regs (dummy)
3116 "=S" (dummy_value_S),
3117 "=D" (dummy_value_D)
3119 : "0" (bpp), // ecx // input regs
3120 "1" (prev_row), // esi
3123 : "%eax", "%edx" // clobber list
3129 } /* end png_read_filter_row_mmx_avg() */
3134 //===========================================================================//
3136 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ P A E T H //
3138 //===========================================================================//
3140 // Optimized code for PNG Paeth filter decoder
3142 static void /* PRIVATE */
3143 png_read_filter_row_mmx_paeth(png_row_infop row_info, png_bytep row,
3147 int dummy_value_c; // fix 'forbidden register 2 (cx) was spilled' error
3151 bpp = (row_info->pixel_depth + 7) >> 3; // Get # bytes per pixel
3152 _FullLength = row_info->rowbytes; // # of bytes to filter
3154 __asm__ __volatile__ (
3156 "pushl %%ebx \n\t" // save index to Global Offset Table
3158 "xorl %%ebx, %%ebx \n\t" // ebx: x offset
3159 //pre "movl row, %%edi \n\t"
3160 "xorl %%edx, %%edx \n\t" // edx: x-bpp offset
3161 //pre "movl prev_row, %%esi \n\t"
3162 "xorl %%eax, %%eax \n\t"
3164 // Compute the Raw value for the first bpp bytes
3165 // Note: the formula works out to be always
3166 // Paeth(x) = Raw(x) + Prior(x) where x < bpp
3168 "movb (%%edi,%%ebx,), %%al \n\t"
3169 "addb (%%esi,%%ebx,), %%al \n\t"
3171 //pre "cmpl bpp, %%ebx \n\t" (bpp is preloaded into ecx)
3172 "cmpl %%ecx, %%ebx \n\t"
3173 "movb %%al, -1(%%edi,%%ebx,) \n\t"
3175 // get # of bytes to alignment
3176 "movl %%edi, _dif \n\t" // take start of row
3177 "addl %%ebx, _dif \n\t" // add bpp
3178 "xorl %%ecx, %%ecx \n\t"
3179 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past alignment boundary
3180 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
3181 "subl %%edi, _dif \n\t" // subtract from start ==> value ebx at alignment
3186 "xorl %%eax, %%eax \n\t"
3187 // pav = p - a = (a + b - c) - a = b - c
3188 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3189 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3190 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3191 "movl %%eax, _patemp \n\t" // Save pav for later use
3192 "xorl %%eax, %%eax \n\t"
3193 // pbv = p - b = (a + b - c) - b = a - c
3194 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3195 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3196 "movl %%eax, %%ecx \n\t"
3197 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3198 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
3200 "testl $0x80000000, %%eax \n\t"
3202 "negl %%eax \n\t" // reverse sign of neg values
3205 "movl %%eax, _pctemp \n\t" // save pc for later use
3207 "testl $0x80000000, %%ecx \n\t"
3209 "negl %%ecx \n\t" // reverse sign of neg values
3212 "movl %%ecx, _pbtemp \n\t" // save pb for later use
3214 "movl _patemp, %%eax \n\t"
3215 "testl $0x80000000, %%eax \n\t"
3217 "negl %%eax \n\t" // reverse sign of neg values
3220 "movl %%eax, _patemp \n\t" // save pa for later use
3222 "cmpl %%ecx, %%eax \n\t"
3223 "jna paeth_abb \n\t"
3224 // pa > pb; now test if pb <= pc
3225 "cmpl _pctemp, %%ecx \n\t"
3226 "jna paeth_bbc \n\t"
3227 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3228 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3229 "jmp paeth_paeth \n\t"
3232 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
3233 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
3234 "jmp paeth_paeth \n\t"
3237 // pa <= pb; now test if pa <= pc
3238 "cmpl _pctemp, %%eax \n\t"
3239 "jna paeth_abc \n\t"
3240 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
3241 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3242 "jmp paeth_paeth \n\t"
3245 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
3246 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
3251 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
3252 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
3253 "cmpl _dif, %%ebx \n\t"
3257 "movl _FullLength, %%ecx \n\t"
3258 "movl %%ecx, %%eax \n\t"
3259 "subl %%ebx, %%eax \n\t" // subtract alignment fix
3260 "andl $0x00000007, %%eax \n\t" // calc bytes over mult of 8
3261 "subl %%eax, %%ecx \n\t" // drop over bytes from original length
3262 "movl %%ecx, _MMXLength \n\t"
3264 "popl %%ebx \n\t" // restore index to Global Offset Table
3267 : "=c" (dummy_value_c), // output regs (dummy)
3268 "=S" (dummy_value_S),
3269 "=D" (dummy_value_D)
3271 : "0" (bpp), // ecx // input regs
3272 "1" (prev_row), // esi
3275 : "%eax", "%edx" // clobber list
3281 // now do the math for the rest of the row
3286 _ActiveMask.use = 0x0000000000ffffffLL;
3287 _ActiveMaskEnd.use = 0xffff000000000000LL;
3288 _ShiftBpp.use = 24; // == bpp(3) * 8
3289 _ShiftRem.use = 40; // == 64 - 24
3291 __asm__ __volatile__ (
3292 "movl _dif, %%ecx \n\t"
3293 // preload "movl row, %%edi \n\t"
3294 // preload "movl prev_row, %%esi \n\t"
3295 "pxor %%mm0, %%mm0 \n\t"
3296 // prime the pump: load the first Raw(x-bpp) data set
3297 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3299 "psrlq _ShiftRem, %%mm1 \n\t" // shift last 3 bytes to 1st 3 bytes
3300 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3301 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3302 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // prep c=Prior(x-bpp) bytes
3303 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3304 "psrlq _ShiftRem, %%mm3 \n\t" // shift last 3 bytes to 1st 3 bytes
3305 // pav = p - a = (a + b - c) - a = b - c
3306 "movq %%mm2, %%mm4 \n\t"
3307 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3308 // pbv = p - b = (a + b - c) - b = a - c
3309 "movq %%mm1, %%mm5 \n\t"
3310 "psubw %%mm3, %%mm4 \n\t"
3311 "pxor %%mm7, %%mm7 \n\t"
3312 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3313 "movq %%mm4, %%mm6 \n\t"
3314 "psubw %%mm3, %%mm5 \n\t"
3316 // pa = abs(p-a) = abs(pav)
3317 // pb = abs(p-b) = abs(pbv)
3318 // pc = abs(p-c) = abs(pcv)
3319 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3320 "paddw %%mm5, %%mm6 \n\t"
3321 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3322 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3323 "psubw %%mm0, %%mm4 \n\t"
3324 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3325 "psubw %%mm0, %%mm4 \n\t"
3326 "psubw %%mm7, %%mm5 \n\t"
3327 "pxor %%mm0, %%mm0 \n\t"
3328 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3329 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3330 "psubw %%mm7, %%mm5 \n\t"
3331 "psubw %%mm0, %%mm6 \n\t"
3333 "movq %%mm4, %%mm7 \n\t"
3334 "psubw %%mm0, %%mm6 \n\t"
3335 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3336 "movq %%mm7, %%mm0 \n\t"
3337 // use mm7 mask to merge pa & pb
3338 "pand %%mm7, %%mm5 \n\t"
3339 // use mm0 mask copy to merge a & b
3340 "pand %%mm0, %%mm2 \n\t"
3341 "pandn %%mm4, %%mm7 \n\t"
3342 "pandn %%mm1, %%mm0 \n\t"
3343 "paddw %%mm5, %%mm7 \n\t"
3344 "paddw %%mm2, %%mm0 \n\t"
3345 // test ((pa <= pb)? pa:pb) <= pc
3346 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3347 "pxor %%mm1, %%mm1 \n\t"
3348 "pand %%mm7, %%mm3 \n\t"
3349 "pandn %%mm0, %%mm7 \n\t"
3350 "paddw %%mm3, %%mm7 \n\t"
3351 "pxor %%mm0, %%mm0 \n\t"
3352 "packuswb %%mm1, %%mm7 \n\t"
3353 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3354 "pand _ActiveMask, %%mm7 \n\t"
3355 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3356 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3357 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3358 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3359 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
3360 // now do Paeth for 2nd set of bytes (3-5)
3361 "psrlq _ShiftBpp, %%mm2 \n\t" // load b=Prior(x) step 2
3362 "punpcklbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3363 "pxor %%mm7, %%mm7 \n\t"
3364 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3365 // pbv = p - b = (a + b - c) - b = a - c
3366 "movq %%mm1, %%mm5 \n\t"
3367 // pav = p - a = (a + b - c) - a = b - c
3368 "movq %%mm2, %%mm4 \n\t"
3369 "psubw %%mm3, %%mm5 \n\t"
3370 "psubw %%mm3, %%mm4 \n\t"
3371 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) =
3372 // pav + pbv = pbv + pav
3373 "movq %%mm5, %%mm6 \n\t"
3374 "paddw %%mm4, %%mm6 \n\t"
3376 // pa = abs(p-a) = abs(pav)
3377 // pb = abs(p-b) = abs(pbv)
3378 // pc = abs(p-c) = abs(pcv)
3379 "pcmpgtw %%mm5, %%mm0 \n\t" // create mask pbv bytes < 0
3380 "pcmpgtw %%mm4, %%mm7 \n\t" // create mask pav bytes < 0
3381 "pand %%mm5, %%mm0 \n\t" // only pbv bytes < 0 in mm0
3382 "pand %%mm4, %%mm7 \n\t" // only pav bytes < 0 in mm7
3383 "psubw %%mm0, %%mm5 \n\t"
3384 "psubw %%mm7, %%mm4 \n\t"
3385 "psubw %%mm0, %%mm5 \n\t"
3386 "psubw %%mm7, %%mm4 \n\t"
3387 "pxor %%mm0, %%mm0 \n\t"
3388 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3389 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3390 "psubw %%mm0, %%mm6 \n\t"
3392 "movq %%mm4, %%mm7 \n\t"
3393 "psubw %%mm0, %%mm6 \n\t"
3394 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3395 "movq %%mm7, %%mm0 \n\t"
3396 // use mm7 mask to merge pa & pb
3397 "pand %%mm7, %%mm5 \n\t"
3398 // use mm0 mask copy to merge a & b
3399 "pand %%mm0, %%mm2 \n\t"
3400 "pandn %%mm4, %%mm7 \n\t"
3401 "pandn %%mm1, %%mm0 \n\t"
3402 "paddw %%mm5, %%mm7 \n\t"
3403 "paddw %%mm2, %%mm0 \n\t"
3404 // test ((pa <= pb)? pa:pb) <= pc
3405 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3406 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3407 "pand %%mm7, %%mm3 \n\t"
3408 "pandn %%mm0, %%mm7 \n\t"
3409 "pxor %%mm1, %%mm1 \n\t"
3410 "paddw %%mm3, %%mm7 \n\t"
3411 "pxor %%mm0, %%mm0 \n\t"
3412 "packuswb %%mm1, %%mm7 \n\t"
3413 "movq %%mm2, %%mm3 \n\t" // load c=Prior(x-bpp) step 1
3414 "pand _ActiveMask, %%mm7 \n\t"
3415 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3416 "psllq _ShiftBpp, %%mm7 \n\t" // shift bytes to 2nd group of 3 bytes
3417 // pav = p - a = (a + b - c) - a = b - c
3418 "movq %%mm2, %%mm4 \n\t"
3419 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3420 "psllq _ShiftBpp, %%mm3 \n\t" // load c=Prior(x-bpp) step 2
3421 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3422 "movq %%mm7, %%mm1 \n\t"
3423 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3424 "psllq _ShiftBpp, %%mm1 \n\t" // shift bytes
3425 // now mm1 will be used as Raw(x-bpp)
3426 // now do Paeth for 3rd, and final, set of bytes (6-7)
3427 "pxor %%mm7, %%mm7 \n\t"
3428 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3429 "psubw %%mm3, %%mm4 \n\t"
3430 // pbv = p - b = (a + b - c) - b = a - c
3431 "movq %%mm1, %%mm5 \n\t"
3432 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3433 "movq %%mm4, %%mm6 \n\t"
3434 "psubw %%mm3, %%mm5 \n\t"
3435 "pxor %%mm0, %%mm0 \n\t"
3436 "paddw %%mm5, %%mm6 \n\t"
3438 // pa = abs(p-a) = abs(pav)
3439 // pb = abs(p-b) = abs(pbv)
3440 // pc = abs(p-c) = abs(pcv)
3441 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3442 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3443 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm0
3444 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm7
3445 "psubw %%mm0, %%mm4 \n\t"
3446 "psubw %%mm7, %%mm5 \n\t"
3447 "psubw %%mm0, %%mm4 \n\t"
3448 "psubw %%mm7, %%mm5 \n\t"
3449 "pxor %%mm0, %%mm0 \n\t"
3450 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3451 "pand %%mm6, %%mm0 \n\t" // only pcv bytes < 0 in mm0
3452 "psubw %%mm0, %%mm6 \n\t"
3454 "movq %%mm4, %%mm7 \n\t"
3455 "psubw %%mm0, %%mm6 \n\t"
3456 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3457 "movq %%mm7, %%mm0 \n\t"
3458 // use mm0 mask copy to merge a & b
3459 "pand %%mm0, %%mm2 \n\t"
3460 // use mm7 mask to merge pa & pb
3461 "pand %%mm7, %%mm5 \n\t"
3462 "pandn %%mm1, %%mm0 \n\t"
3463 "pandn %%mm4, %%mm7 \n\t"
3464 "paddw %%mm2, %%mm0 \n\t"
3465 "paddw %%mm5, %%mm7 \n\t"
3466 // test ((pa <= pb)? pa:pb) <= pc
3467 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3468 "pand %%mm7, %%mm3 \n\t"
3469 "pandn %%mm0, %%mm7 \n\t"
3470 "paddw %%mm3, %%mm7 \n\t"
3471 "pxor %%mm1, %%mm1 \n\t"
3472 "packuswb %%mm7, %%mm1 \n\t"
3473 // step ecx to next set of 8 bytes and repeat loop til done
3474 "addl $8, %%ecx \n\t"
3475 "pand _ActiveMaskEnd, %%mm1 \n\t"
3476 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3478 "cmpl _MMXLength, %%ecx \n\t"
3479 "pxor %%mm0, %%mm0 \n\t" // pxor does not affect flags
3480 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3481 // mm1 will be used as Raw(x-bpp) next loop
3482 // mm3 ready to be used as Prior(x-bpp) next loop
3485 : "=S" (dummy_value_S), // output regs (dummy)
3486 "=D" (dummy_value_D)
3488 : "0" (prev_row), // esi // input regs
3491 : "%ecx" // clobber list
3492 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3493 , "%mm0", "%mm1", "%mm2", "%mm3"
3494 , "%mm4", "%mm5", "%mm6", "%mm7"
3501 //case 7: // GRR BOGUS
3502 //case 5: // GRR BOGUS
3504 _ActiveMask.use = 0x00000000ffffffffLL;
3505 _ActiveMask2.use = 0xffffffff00000000LL;
3506 _ShiftBpp.use = bpp << 3; // == bpp * 8
3507 _ShiftRem.use = 64 - _ShiftBpp.use;
3509 __asm__ __volatile__ (
3510 "movl _dif, %%ecx \n\t"
3511 // preload "movl row, %%edi \n\t"
3512 // preload "movl prev_row, %%esi \n\t"
3513 // prime the pump: load the first Raw(x-bpp) data set
3514 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3515 "pxor %%mm0, %%mm0 \n\t"
3518 // must shift to position Raw(x-bpp) data
3519 "psrlq _ShiftRem, %%mm1 \n\t"
3520 // do first set of 4 bytes
3521 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3522 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3523 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3524 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3525 // must shift to position Prior(x-bpp) data
3526 "psrlq _ShiftRem, %%mm3 \n\t"
3527 // pav = p - a = (a + b - c) - a = b - c
3528 "movq %%mm2, %%mm4 \n\t"
3529 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3530 // pbv = p - b = (a + b - c) - b = a - c
3531 "movq %%mm1, %%mm5 \n\t"
3532 "psubw %%mm3, %%mm4 \n\t"
3533 "pxor %%mm7, %%mm7 \n\t"
3534 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3535 "movq %%mm4, %%mm6 \n\t"
3536 "psubw %%mm3, %%mm5 \n\t"
3537 // pa = abs(p-a) = abs(pav)
3538 // pb = abs(p-b) = abs(pbv)
3539 // pc = abs(p-c) = abs(pcv)
3540 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3541 "paddw %%mm5, %%mm6 \n\t"
3542 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3543 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3544 "psubw %%mm0, %%mm4 \n\t"
3545 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3546 "psubw %%mm0, %%mm4 \n\t"
3547 "psubw %%mm7, %%mm5 \n\t"
3548 "pxor %%mm0, %%mm0 \n\t"
3549 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3550 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3551 "psubw %%mm7, %%mm5 \n\t"
3552 "psubw %%mm0, %%mm6 \n\t"
3554 "movq %%mm4, %%mm7 \n\t"
3555 "psubw %%mm0, %%mm6 \n\t"
3556 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3557 "movq %%mm7, %%mm0 \n\t"
3558 // use mm7 mask to merge pa & pb
3559 "pand %%mm7, %%mm5 \n\t"
3560 // use mm0 mask copy to merge a & b
3561 "pand %%mm0, %%mm2 \n\t"
3562 "pandn %%mm4, %%mm7 \n\t"
3563 "pandn %%mm1, %%mm0 \n\t"
3564 "paddw %%mm5, %%mm7 \n\t"
3565 "paddw %%mm2, %%mm0 \n\t"
3566 // test ((pa <= pb)? pa:pb) <= pc
3567 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3568 "pxor %%mm1, %%mm1 \n\t"
3569 "pand %%mm7, %%mm3 \n\t"
3570 "pandn %%mm0, %%mm7 \n\t"
3571 "paddw %%mm3, %%mm7 \n\t"
3572 "pxor %%mm0, %%mm0 \n\t"
3573 "packuswb %%mm1, %%mm7 \n\t"
3574 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3575 "pand _ActiveMask, %%mm7 \n\t"
3576 "psrlq _ShiftRem, %%mm3 \n\t"
3577 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x) step 1
3578 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor and Raw(x)
3579 "movq %%mm2, %%mm6 \n\t"
3580 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3581 "movq -8(%%edi,%%ecx,), %%mm1 \n\t"
3582 "psllq _ShiftBpp, %%mm6 \n\t"
3583 "movq %%mm7, %%mm5 \n\t"
3584 "psrlq _ShiftRem, %%mm1 \n\t"
3585 "por %%mm6, %%mm3 \n\t"
3586 "psllq _ShiftBpp, %%mm5 \n\t"
3587 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3588 "por %%mm5, %%mm1 \n\t"
3589 // do second set of 4 bytes
3590 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3591 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3592 // pav = p - a = (a + b - c) - a = b - c
3593 "movq %%mm2, %%mm4 \n\t"
3594 // pbv = p - b = (a + b - c) - b = a - c
3595 "movq %%mm1, %%mm5 \n\t"
3596 "psubw %%mm3, %%mm4 \n\t"
3597 "pxor %%mm7, %%mm7 \n\t"
3598 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3599 "movq %%mm4, %%mm6 \n\t"
3600 "psubw %%mm3, %%mm5 \n\t"
3601 // pa = abs(p-a) = abs(pav)
3602 // pb = abs(p-b) = abs(pbv)
3603 // pc = abs(p-c) = abs(pcv)
3604 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3605 "paddw %%mm5, %%mm6 \n\t"
3606 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3607 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3608 "psubw %%mm0, %%mm4 \n\t"
3609 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3610 "psubw %%mm0, %%mm4 \n\t"
3611 "psubw %%mm7, %%mm5 \n\t"
3612 "pxor %%mm0, %%mm0 \n\t"
3613 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3614 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3615 "psubw %%mm7, %%mm5 \n\t"
3616 "psubw %%mm0, %%mm6 \n\t"
3618 "movq %%mm4, %%mm7 \n\t"
3619 "psubw %%mm0, %%mm6 \n\t"
3620 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3621 "movq %%mm7, %%mm0 \n\t"
3622 // use mm7 mask to merge pa & pb
3623 "pand %%mm7, %%mm5 \n\t"
3624 // use mm0 mask copy to merge a & b
3625 "pand %%mm0, %%mm2 \n\t"
3626 "pandn %%mm4, %%mm7 \n\t"
3627 "pandn %%mm1, %%mm0 \n\t"
3628 "paddw %%mm5, %%mm7 \n\t"
3629 "paddw %%mm2, %%mm0 \n\t"
3630 // test ((pa <= pb)? pa:pb) <= pc
3631 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3632 "pxor %%mm1, %%mm1 \n\t"
3633 "pand %%mm7, %%mm3 \n\t"
3634 "pandn %%mm0, %%mm7 \n\t"
3635 "pxor %%mm1, %%mm1 \n\t"
3636 "paddw %%mm3, %%mm7 \n\t"
3637 "pxor %%mm0, %%mm0 \n\t"
3638 // step ecx to next set of 8 bytes and repeat loop til done
3639 "addl $8, %%ecx \n\t"
3640 "packuswb %%mm7, %%mm1 \n\t"
3641 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3642 "cmpl _MMXLength, %%ecx \n\t"
3643 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3644 // mm1 will be used as Raw(x-bpp) next loop
3647 : "=S" (dummy_value_S), // output regs (dummy)
3648 "=D" (dummy_value_D)
3650 : "0" (prev_row), // esi // input regs
3653 : "%ecx" // clobber list
3654 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3655 , "%mm0", "%mm1", "%mm2", "%mm3"
3656 , "%mm4", "%mm5", "%mm6", "%mm7"
3664 _ActiveMask.use = 0x00000000ffffffffLL;
3666 __asm__ __volatile__ (
3667 "movl _dif, %%ecx \n\t"
3668 // preload "movl row, %%edi \n\t"
3669 // preload "movl prev_row, %%esi \n\t"
3670 "pxor %%mm0, %%mm0 \n\t"
3671 // prime the pump: load the first Raw(x-bpp) data set
3672 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3673 // a=Raw(x-bpp) bytes
3675 // do first set of 4 bytes
3676 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3677 "punpckhbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3678 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3679 "punpcklbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3680 // pav = p - a = (a + b - c) - a = b - c
3681 "movq %%mm2, %%mm4 \n\t"
3682 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3683 // pbv = p - b = (a + b - c) - b = a - c
3684 "movq %%mm1, %%mm5 \n\t"
3685 "psubw %%mm3, %%mm4 \n\t"
3686 "pxor %%mm7, %%mm7 \n\t"
3687 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3688 "movq %%mm4, %%mm6 \n\t"
3689 "psubw %%mm3, %%mm5 \n\t"
3690 // pa = abs(p-a) = abs(pav)
3691 // pb = abs(p-b) = abs(pbv)
3692 // pc = abs(p-c) = abs(pcv)
3693 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3694 "paddw %%mm5, %%mm6 \n\t"
3695 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3696 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3697 "psubw %%mm0, %%mm4 \n\t"
3698 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3699 "psubw %%mm0, %%mm4 \n\t"
3700 "psubw %%mm7, %%mm5 \n\t"
3701 "pxor %%mm0, %%mm0 \n\t"
3702 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3703 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3704 "psubw %%mm7, %%mm5 \n\t"
3705 "psubw %%mm0, %%mm6 \n\t"
3707 "movq %%mm4, %%mm7 \n\t"
3708 "psubw %%mm0, %%mm6 \n\t"
3709 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3710 "movq %%mm7, %%mm0 \n\t"
3711 // use mm7 mask to merge pa & pb
3712 "pand %%mm7, %%mm5 \n\t"
3713 // use mm0 mask copy to merge a & b
3714 "pand %%mm0, %%mm2 \n\t"
3715 "pandn %%mm4, %%mm7 \n\t"
3716 "pandn %%mm1, %%mm0 \n\t"
3717 "paddw %%mm5, %%mm7 \n\t"
3718 "paddw %%mm2, %%mm0 \n\t"
3719 // test ((pa <= pb)? pa:pb) <= pc
3720 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3721 "pxor %%mm1, %%mm1 \n\t"
3722 "pand %%mm7, %%mm3 \n\t"
3723 "pandn %%mm0, %%mm7 \n\t"
3724 "paddw %%mm3, %%mm7 \n\t"
3725 "pxor %%mm0, %%mm0 \n\t"
3726 "packuswb %%mm1, %%mm7 \n\t"
3727 "movq (%%esi,%%ecx,), %%mm3 \n\t" // load c=Prior(x-bpp)
3728 "pand _ActiveMask, %%mm7 \n\t"
3729 "movq %%mm3, %%mm2 \n\t" // load b=Prior(x) step 1
3730 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3731 "punpcklbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3732 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3733 "movq %%mm7, %%mm1 \n\t" // now mm1 will be used as Raw(x-bpp)
3734 // do second set of 4 bytes
3735 "punpckhbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3736 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3737 // pav = p - a = (a + b - c) - a = b - c
3738 "movq %%mm2, %%mm4 \n\t"
3739 // pbv = p - b = (a + b - c) - b = a - c
3740 "movq %%mm1, %%mm5 \n\t"
3741 "psubw %%mm3, %%mm4 \n\t"
3742 "pxor %%mm7, %%mm7 \n\t"
3743 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3744 "movq %%mm4, %%mm6 \n\t"
3745 "psubw %%mm3, %%mm5 \n\t"
3746 // pa = abs(p-a) = abs(pav)
3747 // pb = abs(p-b) = abs(pbv)
3748 // pc = abs(p-c) = abs(pcv)
3749 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3750 "paddw %%mm5, %%mm6 \n\t"
3751 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3752 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3753 "psubw %%mm0, %%mm4 \n\t"
3754 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3755 "psubw %%mm0, %%mm4 \n\t"
3756 "psubw %%mm7, %%mm5 \n\t"
3757 "pxor %%mm0, %%mm0 \n\t"
3758 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3759 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3760 "psubw %%mm7, %%mm5 \n\t"
3761 "psubw %%mm0, %%mm6 \n\t"
3763 "movq %%mm4, %%mm7 \n\t"
3764 "psubw %%mm0, %%mm6 \n\t"
3765 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3766 "movq %%mm7, %%mm0 \n\t"
3767 // use mm7 mask to merge pa & pb
3768 "pand %%mm7, %%mm5 \n\t"
3769 // use mm0 mask copy to merge a & b
3770 "pand %%mm0, %%mm2 \n\t"
3771 "pandn %%mm4, %%mm7 \n\t"
3772 "pandn %%mm1, %%mm0 \n\t"
3773 "paddw %%mm5, %%mm7 \n\t"
3774 "paddw %%mm2, %%mm0 \n\t"
3775 // test ((pa <= pb)? pa:pb) <= pc
3776 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3777 "pxor %%mm1, %%mm1 \n\t"
3778 "pand %%mm7, %%mm3 \n\t"
3779 "pandn %%mm0, %%mm7 \n\t"
3780 "pxor %%mm1, %%mm1 \n\t"
3781 "paddw %%mm3, %%mm7 \n\t"
3782 "pxor %%mm0, %%mm0 \n\t"
3783 // step ecx to next set of 8 bytes and repeat loop til done
3784 "addl $8, %%ecx \n\t"
3785 "packuswb %%mm7, %%mm1 \n\t"
3786 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add predictor with Raw(x)
3787 "cmpl _MMXLength, %%ecx \n\t"
3788 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3789 // mm1 will be used as Raw(x-bpp) next loop
3792 : "=S" (dummy_value_S), // output regs (dummy)
3793 "=D" (dummy_value_D)
3795 : "0" (prev_row), // esi // input regs
3798 : "%ecx" // clobber list
3799 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3800 , "%mm0", "%mm1", "%mm2", "%mm3"
3801 , "%mm4", "%mm5", "%mm6", "%mm7"
3809 _ActiveMask.use = 0x00000000ffffffffLL;
3811 __asm__ __volatile__ (
3812 "movl _dif, %%ecx \n\t"
3813 // preload "movl row, %%edi \n\t"
3814 // preload "movl prev_row, %%esi \n\t"
3815 "pxor %%mm0, %%mm0 \n\t"
3816 // prime the pump: load the first Raw(x-bpp) data set
3817 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // only time should need to read
3818 // a=Raw(x-bpp) bytes
3820 // do first set of 4 bytes
3821 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3822 "punpcklbw %%mm0, %%mm1 \n\t" // unpack Low bytes of a
3823 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3824 "punpcklbw %%mm0, %%mm2 \n\t" // unpack Low bytes of b
3825 // pav = p - a = (a + b - c) - a = b - c
3826 "movq %%mm2, %%mm4 \n\t"
3827 "punpcklbw %%mm0, %%mm3 \n\t" // unpack Low bytes of c
3828 // pbv = p - b = (a + b - c) - b = a - c
3829 "movq %%mm1, %%mm5 \n\t"
3830 "psubw %%mm3, %%mm4 \n\t"
3831 "pxor %%mm7, %%mm7 \n\t"
3832 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3833 "movq %%mm4, %%mm6 \n\t"
3834 "psubw %%mm3, %%mm5 \n\t"
3835 // pa = abs(p-a) = abs(pav)
3836 // pb = abs(p-b) = abs(pbv)
3837 // pc = abs(p-c) = abs(pcv)
3838 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3839 "paddw %%mm5, %%mm6 \n\t"
3840 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3841 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3842 "psubw %%mm0, %%mm4 \n\t"
3843 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3844 "psubw %%mm0, %%mm4 \n\t"
3845 "psubw %%mm7, %%mm5 \n\t"
3846 "pxor %%mm0, %%mm0 \n\t"
3847 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3848 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3849 "psubw %%mm7, %%mm5 \n\t"
3850 "psubw %%mm0, %%mm6 \n\t"
3852 "movq %%mm4, %%mm7 \n\t"
3853 "psubw %%mm0, %%mm6 \n\t"
3854 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3855 "movq %%mm7, %%mm0 \n\t"
3856 // use mm7 mask to merge pa & pb
3857 "pand %%mm7, %%mm5 \n\t"
3858 // use mm0 mask copy to merge a & b
3859 "pand %%mm0, %%mm2 \n\t"
3860 "pandn %%mm4, %%mm7 \n\t"
3861 "pandn %%mm1, %%mm0 \n\t"
3862 "paddw %%mm5, %%mm7 \n\t"
3863 "paddw %%mm2, %%mm0 \n\t"
3864 // test ((pa <= pb)? pa:pb) <= pc
3865 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3866 "pxor %%mm1, %%mm1 \n\t"
3867 "pand %%mm7, %%mm3 \n\t"
3868 "pandn %%mm0, %%mm7 \n\t"
3869 "paddw %%mm3, %%mm7 \n\t"
3870 "pxor %%mm0, %%mm0 \n\t"
3871 "packuswb %%mm1, %%mm7 \n\t"
3872 "movq -8(%%esi,%%ecx,), %%mm3 \n\t" // read c=Prior(x-bpp) bytes
3873 "pand _ActiveMask, %%mm7 \n\t"
3874 "movq (%%esi,%%ecx,), %%mm2 \n\t" // load b=Prior(x)
3875 "paddb (%%edi,%%ecx,), %%mm7 \n\t" // add Paeth predictor with Raw(x)
3876 "punpckhbw %%mm0, %%mm3 \n\t" // unpack High bytes of c
3877 "movq %%mm7, (%%edi,%%ecx,) \n\t" // write back updated value
3878 "movq -8(%%edi,%%ecx,), %%mm1 \n\t" // read a=Raw(x-bpp) bytes
3880 // do second set of 4 bytes
3881 "punpckhbw %%mm0, %%mm2 \n\t" // unpack High bytes of b
3882 "punpckhbw %%mm0, %%mm1 \n\t" // unpack High bytes of a
3883 // pav = p - a = (a + b - c) - a = b - c
3884 "movq %%mm2, %%mm4 \n\t"
3885 // pbv = p - b = (a + b - c) - b = a - c
3886 "movq %%mm1, %%mm5 \n\t"
3887 "psubw %%mm3, %%mm4 \n\t"
3888 "pxor %%mm7, %%mm7 \n\t"
3889 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3890 "movq %%mm4, %%mm6 \n\t"
3891 "psubw %%mm3, %%mm5 \n\t"
3892 // pa = abs(p-a) = abs(pav)
3893 // pb = abs(p-b) = abs(pbv)
3894 // pc = abs(p-c) = abs(pcv)
3895 "pcmpgtw %%mm4, %%mm0 \n\t" // create mask pav bytes < 0
3896 "paddw %%mm5, %%mm6 \n\t"
3897 "pand %%mm4, %%mm0 \n\t" // only pav bytes < 0 in mm7
3898 "pcmpgtw %%mm5, %%mm7 \n\t" // create mask pbv bytes < 0
3899 "psubw %%mm0, %%mm4 \n\t"
3900 "pand %%mm5, %%mm7 \n\t" // only pbv bytes < 0 in mm0
3901 "psubw %%mm0, %%mm4 \n\t"
3902 "psubw %%mm7, %%mm5 \n\t"
3903 "pxor %%mm0, %%mm0 \n\t"
3904 "pcmpgtw %%mm6, %%mm0 \n\t" // create mask pcv bytes < 0
3905 "pand %%mm6, %%mm0 \n\t" // only pav bytes < 0 in mm7
3906 "psubw %%mm7, %%mm5 \n\t"
3907 "psubw %%mm0, %%mm6 \n\t"
3909 "movq %%mm4, %%mm7 \n\t"
3910 "psubw %%mm0, %%mm6 \n\t"
3911 "pcmpgtw %%mm5, %%mm7 \n\t" // pa > pb?
3912 "movq %%mm7, %%mm0 \n\t"
3913 // use mm7 mask to merge pa & pb
3914 "pand %%mm7, %%mm5 \n\t"
3915 // use mm0 mask copy to merge a & b
3916 "pand %%mm0, %%mm2 \n\t"
3917 "pandn %%mm4, %%mm7 \n\t"
3918 "pandn %%mm1, %%mm0 \n\t"
3919 "paddw %%mm5, %%mm7 \n\t"
3920 "paddw %%mm2, %%mm0 \n\t"
3921 // test ((pa <= pb)? pa:pb) <= pc
3922 "pcmpgtw %%mm6, %%mm7 \n\t" // pab > pc?
3923 "pxor %%mm1, %%mm1 \n\t"
3924 "pand %%mm7, %%mm3 \n\t"
3925 "pandn %%mm0, %%mm7 \n\t"
3926 "pxor %%mm1, %%mm1 \n\t"
3927 "paddw %%mm3, %%mm7 \n\t"
3928 "pxor %%mm0, %%mm0 \n\t"
3929 // step ecx to next set of 8 bytes and repeat loop til done
3930 "addl $8, %%ecx \n\t"
3931 "packuswb %%mm7, %%mm1 \n\t"
3932 "paddb -8(%%edi,%%ecx,), %%mm1 \n\t" // add Paeth predictor with Raw(x)
3933 "cmpl _MMXLength, %%ecx \n\t"
3934 "movq %%mm1, -8(%%edi,%%ecx,) \n\t" // write back updated value
3935 // mm1 will be used as Raw(x-bpp) next loop
3938 : "=S" (dummy_value_S), // output regs (dummy)
3939 "=D" (dummy_value_D)
3941 : "0" (prev_row), // esi // input regs
3944 : "%ecx" // clobber list
3945 #if 0 /* %mm0, ..., %mm7 not supported by gcc 2.7.2.3 or egcs 1.1 */
3946 , "%mm0", "%mm1", "%mm2", "%mm3"
3947 , "%mm4", "%mm5", "%mm6", "%mm7"
3957 __asm__ __volatile__ (
3959 "pushl %%ebx \n\t" // save Global Offset Table index
3961 "movl _dif, %%ebx \n\t"
3962 "cmpl _FullLength, %%ebx \n\t"
3963 "jnb paeth_dend \n\t"
3965 // preload "movl row, %%edi \n\t"
3966 // preload "movl prev_row, %%esi \n\t"
3967 // do Paeth decode for remaining bytes
3968 "movl %%ebx, %%edx \n\t"
3969 // preload "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
3970 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
3971 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx
3974 "xorl %%eax, %%eax \n\t"
3975 // pav = p - a = (a + b - c) - a = b - c
3976 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
3977 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
3978 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3979 "movl %%eax, _patemp \n\t" // Save pav for later use
3980 "xorl %%eax, %%eax \n\t"
3981 // pbv = p - b = (a + b - c) - b = a - c
3982 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
3983 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
3984 "movl %%eax, %%ecx \n\t"
3985 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
3986 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
3988 "testl $0x80000000, %%eax \n\t"
3989 "jz paeth_dpca \n\t"
3990 "negl %%eax \n\t" // reverse sign of neg values
3993 "movl %%eax, _pctemp \n\t" // save pc for later use
3995 "testl $0x80000000, %%ecx \n\t"
3996 "jz paeth_dpba \n\t"
3997 "negl %%ecx \n\t" // reverse sign of neg values
4000 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4002 "movl _patemp, %%eax \n\t"
4003 "testl $0x80000000, %%eax \n\t"
4004 "jz paeth_dpaa \n\t"
4005 "negl %%eax \n\t" // reverse sign of neg values
4008 "movl %%eax, _patemp \n\t" // save pa for later use
4010 "cmpl %%ecx, %%eax \n\t"
4011 "jna paeth_dabb \n\t"
4012 // pa > pb; now test if pb <= pc
4013 "cmpl _pctemp, %%ecx \n\t"
4014 "jna paeth_dbbc \n\t"
4015 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4016 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4017 "jmp paeth_dpaeth \n\t"
4020 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4021 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4022 "jmp paeth_dpaeth \n\t"
4025 // pa <= pb; now test if pa <= pc
4026 "cmpl _pctemp, %%eax \n\t"
4027 "jna paeth_dabc \n\t"
4028 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4029 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4030 "jmp paeth_dpaeth \n\t"
4033 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4034 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4036 "paeth_dpaeth: \n\t"
4039 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4040 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4041 "cmpl _FullLength, %%ebx \n\t"
4046 "popl %%ebx \n\t" // index to Global Offset Table
4049 : "=c" (dummy_value_c), // output regs (dummy)
4050 "=S" (dummy_value_S),
4051 "=D" (dummy_value_D)
4053 : "0" (bpp), // ecx // input regs
4054 "1" (prev_row), // esi
4057 : "%eax", "%edx" // clobber list
4063 return; // No need to go further with this one
4065 } // end switch (bpp)
4067 __asm__ __volatile__ (
4068 // MMX acceleration complete; now do clean-up
4069 // check if any remaining bytes left to decode
4071 "pushl %%ebx \n\t" // save index to Global Offset Table
4073 "movl _MMXLength, %%ebx \n\t"
4074 "cmpl _FullLength, %%ebx \n\t"
4075 "jnb paeth_end \n\t"
4076 //pre "movl row, %%edi \n\t"
4077 //pre "movl prev_row, %%esi \n\t"
4078 // do Paeth decode for remaining bytes
4079 "movl %%ebx, %%edx \n\t"
4080 //pre "subl bpp, %%edx \n\t" // (bpp is preloaded into ecx)
4081 "subl %%ecx, %%edx \n\t" // edx = ebx - bpp
4082 "xorl %%ecx, %%ecx \n\t" // zero ecx before using cl & cx below
4085 "xorl %%eax, %%eax \n\t"
4086 // pav = p - a = (a + b - c) - a = b - c
4087 "movb (%%esi,%%ebx,), %%al \n\t" // load Prior(x) into al
4088 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4089 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4090 "movl %%eax, _patemp \n\t" // Save pav for later use
4091 "xorl %%eax, %%eax \n\t"
4092 // pbv = p - b = (a + b - c) - b = a - c
4093 "movb (%%edi,%%edx,), %%al \n\t" // load Raw(x-bpp) into al
4094 "subl %%ecx, %%eax \n\t" // subtract Prior(x-bpp)
4095 "movl %%eax, %%ecx \n\t"
4096 // pcv = p - c = (a + b - c) -c = (a - c) + (b - c) = pav + pbv
4097 "addl _patemp, %%eax \n\t" // pcv = pav + pbv
4099 "testl $0x80000000, %%eax \n\t"
4100 "jz paeth_pca2 \n\t"
4101 "negl %%eax \n\t" // reverse sign of neg values
4104 "movl %%eax, _pctemp \n\t" // save pc for later use
4106 "testl $0x80000000, %%ecx \n\t"
4107 "jz paeth_pba2 \n\t"
4108 "negl %%ecx \n\t" // reverse sign of neg values
4111 "movl %%ecx, _pbtemp \n\t" // save pb for later use
4113 "movl _patemp, %%eax \n\t"
4114 "testl $0x80000000, %%eax \n\t"
4115 "jz paeth_paa2 \n\t"
4116 "negl %%eax \n\t" // reverse sign of neg values
4119 "movl %%eax, _patemp \n\t" // save pa for later use
4121 "cmpl %%ecx, %%eax \n\t"
4122 "jna paeth_abb2 \n\t"
4123 // pa > pb; now test if pb <= pc
4124 "cmpl _pctemp, %%ecx \n\t"
4125 "jna paeth_bbc2 \n\t"
4126 // pb > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4127 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4128 "jmp paeth_paeth2 \n\t"
4131 // pb <= pc; Raw(x) = Paeth(x) + Prior(x)
4132 "movb (%%esi,%%ebx,), %%cl \n\t" // load Prior(x) into cl
4133 "jmp paeth_paeth2 \n\t"
4136 // pa <= pb; now test if pa <= pc
4137 "cmpl _pctemp, %%eax \n\t"
4138 "jna paeth_abc2 \n\t"
4139 // pa > pc; Raw(x) = Paeth(x) + Prior(x-bpp)
4140 "movb (%%esi,%%edx,), %%cl \n\t" // load Prior(x-bpp) into cl
4141 "jmp paeth_paeth2 \n\t"
4144 // pa <= pc; Raw(x) = Paeth(x) + Raw(x-bpp)
4145 "movb (%%edi,%%edx,), %%cl \n\t" // load Raw(x-bpp) into cl
4147 "paeth_paeth2: \n\t"
4150 // Raw(x) = (Paeth(x) + Paeth_Predictor( a, b, c )) mod 256
4151 "addb %%cl, -1(%%edi,%%ebx,) \n\t"
4152 "cmpl _FullLength, %%ebx \n\t"
4156 "EMMS \n\t" // end MMX; prep for poss. FP instrs.
4158 "popl %%ebx \n\t" // restore index to Global Offset Table
4161 : "=c" (dummy_value_c), // output regs (dummy)
4162 "=S" (dummy_value_S),
4163 "=D" (dummy_value_D)
4165 : "0" (bpp), // ecx // input regs
4166 "1" (prev_row), // esi
4169 : "%eax", "%edx" // clobber list (no input regs!)
4175 } /* end png_read_filter_row_mmx_paeth() */
4180 //===========================================================================//
4182 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ S U B //
4184 //===========================================================================//
4186 // Optimized MMX code for decoding the PNG "Sub" filter (filter type 1)
4188 static void /* PRIVATE */
4189 png_read_filter_row_mmx_sub(png_row_infop row_info, png_bytep row)
4195 bpp = (row_info->pixel_depth + 7) >> 3; // calc number of bytes per pixel
4196 _FullLength = row_info->rowbytes - bpp; // number of bytes to filter
4198 __asm__ __volatile__ (
4199 //pre "movl row, %%edi \n\t"
4200 "movl %%edi, %%esi \n\t" // lp = row
4201 //pre "movl bpp, %%eax \n\t"
4202 "addl %%eax, %%edi \n\t" // rp = row + bpp
4203 //irr "xorl %%eax, %%eax \n\t"
4204 // get # of bytes to alignment
4205 "movl %%edi, _dif \n\t" // take start of row
4206 "addl $0xf, _dif \n\t" // add 7 + 8 to incr past
4207 // alignment boundary
4208 "xorl %%ecx, %%ecx \n\t"
4209 "andl $0xfffffff8, _dif \n\t" // mask to alignment boundary
4210 "subl %%edi, _dif \n\t" // subtract from start ==> value
4211 "jz sub_go \n\t" // ecx at alignment
4213 "sub_lp1: \n\t" // fix alignment
4214 "movb (%%esi,%%ecx,), %%al \n\t"
4215 "addb %%al, (%%edi,%%ecx,) \n\t"
4217 "cmpl _dif, %%ecx \n\t"
4221 "movl _FullLength, %%eax \n\t"
4222 "movl %%eax, %%edx \n\t"
4223 "subl %%ecx, %%edx \n\t" // subtract alignment fix
4224 "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4225 "subl %%edx, %%eax \n\t" // drop over bytes from length
4226 "movl %%eax, _MMXLength \n\t"
4228 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4229 "=D" (dummy_value_D) // 1
4231 : "0" (bpp), // eax // input regs
4234 : "%ebx", "%ecx", "%edx" // clobber list
4237 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4238 , "%mm0", "%mm1", "%mm2", "%mm3"
4239 , "%mm4", "%mm5", "%mm6", "%mm7"
4243 // now do the math for the rest of the row
4248 _ActiveMask.use = 0x0000ffffff000000LL;
4249 _ShiftBpp.use = 24; // == 3 * 8
4250 _ShiftRem.use = 40; // == 64 - 24
4252 __asm__ __volatile__ (
4253 // preload "movl row, %%edi \n\t"
4254 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4255 // active byte group
4256 "movl %%edi, %%esi \n\t" // lp = row
4257 // preload "movl bpp, %%eax \n\t"
4258 "addl %%eax, %%edi \n\t" // rp = row + bpp
4259 "movq %%mm7, %%mm6 \n\t"
4260 "movl _dif, %%edx \n\t"
4261 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4262 // 3rd active byte group
4263 // prime the pump: load the first Raw(x-bpp) data set
4264 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4266 "sub_3lp: \n\t" // shift data for adding first
4267 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4268 // shift clears inactive bytes)
4269 // add 1st active group
4270 "movq (%%edi,%%edx,), %%mm0 \n\t"
4271 "paddb %%mm1, %%mm0 \n\t"
4273 // add 2nd active group
4274 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4275 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4276 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4277 "paddb %%mm1, %%mm0 \n\t"
4279 // add 3rd active group
4280 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4281 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4282 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4283 "addl $8, %%edx \n\t"
4284 "paddb %%mm1, %%mm0 \n\t"
4286 "cmpl _MMXLength, %%edx \n\t"
4287 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4288 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4291 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4292 "=D" (dummy_value_D) // 1
4294 : "0" (bpp), // eax // input regs
4297 : "%edx", "%esi" // clobber list
4298 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4299 , "%mm0", "%mm1", "%mm6", "%mm7"
4307 __asm__ __volatile__ (
4308 "movl _dif, %%edx \n\t"
4309 // preload "movl row, %%edi \n\t"
4310 "cmpl _FullLength, %%edx \n\t"
4312 "movl %%edi, %%esi \n\t" // lp = row
4313 "xorl %%eax, %%eax \n\t"
4314 // preload "movl bpp, %%eax \n\t"
4315 "addl %%eax, %%edi \n\t" // rp = row + bpp
4318 "movb (%%esi,%%edx,), %%al \n\t"
4319 "addb %%al, (%%edi,%%edx,) \n\t"
4321 "cmpl _FullLength, %%edx \n\t"
4326 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4327 "=D" (dummy_value_D) // 1
4329 : "0" (bpp), // eax // input regs
4332 : "%edx", "%esi" // clobber list
4339 //case 7: // GRR BOGUS
4340 //case 5: // GRR BOGUS
4342 _ShiftBpp.use = bpp << 3;
4343 _ShiftRem.use = 64 - _ShiftBpp.use;
4345 __asm__ __volatile__ (
4346 // preload "movl row, %%edi \n\t"
4347 "movl _dif, %%edx \n\t"
4348 "movl %%edi, %%esi \n\t" // lp = row
4349 // preload "movl bpp, %%eax \n\t"
4350 "addl %%eax, %%edi \n\t" // rp = row + bpp
4352 // prime the pump: load the first Raw(x-bpp) data set
4353 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4355 "sub_4lp: \n\t" // shift data for adding first
4356 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4357 // shift clears inactive bytes)
4358 "movq (%%edi,%%edx,), %%mm0 \n\t"
4359 "paddb %%mm1, %%mm0 \n\t"
4361 // add 2nd active group
4362 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4363 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4364 "addl $8, %%edx \n\t"
4365 "paddb %%mm1, %%mm0 \n\t"
4367 "cmpl _MMXLength, %%edx \n\t"
4368 "movq %%mm0, -8(%%edi,%%edx,) \n\t"
4369 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4372 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4373 "=D" (dummy_value_D) // 1
4375 : "0" (bpp), // eax // input regs
4378 : "%edx", "%esi" // clobber list
4379 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4388 _ActiveMask.use = 0x00000000ffff0000LL;
4389 _ShiftBpp.use = 16; // == 2 * 8
4390 _ShiftRem.use = 48; // == 64 - 16
4392 __asm__ __volatile__ (
4393 "movq _ActiveMask, %%mm7 \n\t" // load _ActiveMask for 2nd
4394 // active byte group
4395 "movl _dif, %%edx \n\t"
4396 "movq %%mm7, %%mm6 \n\t"
4397 // preload "movl row, %%edi \n\t"
4398 "psllq _ShiftBpp, %%mm6 \n\t" // move mask in mm6 to cover
4399 // 3rd active byte group
4400 "movl %%edi, %%esi \n\t" // lp = row
4401 "movq %%mm6, %%mm5 \n\t"
4402 // preload "movl bpp, %%eax \n\t"
4403 "addl %%eax, %%edi \n\t" // rp = row + bpp
4404 "psllq _ShiftBpp, %%mm5 \n\t" // move mask in mm5 to cover
4405 // 4th active byte group
4406 // prime the pump: load the first Raw(x-bpp) data set
4407 "movq -8(%%edi,%%edx,), %%mm1 \n\t"
4409 "sub_2lp: \n\t" // shift data for adding first
4410 "psrlq _ShiftRem, %%mm1 \n\t" // bpp bytes (no need for mask;
4411 // shift clears inactive bytes)
4412 // add 1st active group
4413 "movq (%%edi,%%edx,), %%mm0 \n\t"
4414 "paddb %%mm1, %%mm0 \n\t"
4416 // add 2nd active group
4417 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4418 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4419 "pand %%mm7, %%mm1 \n\t" // mask to use 2nd active group
4420 "paddb %%mm1, %%mm0 \n\t"
4422 // add 3rd active group
4423 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4424 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4425 "pand %%mm6, %%mm1 \n\t" // mask to use 3rd active group
4426 "paddb %%mm1, %%mm0 \n\t"
4428 // add 4th active group
4429 "movq %%mm0, %%mm1 \n\t" // mov updated Raws to mm1
4430 "psllq _ShiftBpp, %%mm1 \n\t" // shift data to pos. correctly
4431 "pand %%mm5, %%mm1 \n\t" // mask to use 4th active group
4432 "addl $8, %%edx \n\t"
4433 "paddb %%mm1, %%mm0 \n\t"
4434 "cmpl _MMXLength, %%edx \n\t"
4435 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // write updated Raws to array
4436 "movq %%mm0, %%mm1 \n\t" // prep 1st add at top of loop
4439 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4440 "=D" (dummy_value_D) // 1
4442 : "0" (bpp), // eax // input regs
4445 : "%edx", "%esi" // clobber list
4446 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4447 , "%mm0", "%mm1", "%mm5", "%mm6", "%mm7"
4455 __asm__ __volatile__ (
4456 // preload "movl row, %%edi \n\t"
4457 "movl _dif, %%edx \n\t"
4458 "movl %%edi, %%esi \n\t" // lp = row
4459 // preload "movl bpp, %%eax \n\t"
4460 "addl %%eax, %%edi \n\t" // rp = row + bpp
4461 "movl _MMXLength, %%ecx \n\t"
4463 // prime the pump: load the first Raw(x-bpp) data set
4464 "movq -8(%%edi,%%edx,), %%mm7 \n\t"
4465 "andl $0x0000003f, %%ecx \n\t" // calc bytes over mult of 64
4468 "movq (%%edi,%%edx,), %%mm0 \n\t" // load Sub(x) for 1st 8 bytes
4469 "paddb %%mm7, %%mm0 \n\t"
4470 "movq 8(%%edi,%%edx,), %%mm1 \n\t" // load Sub(x) for 2nd 8 bytes
4471 "movq %%mm0, (%%edi,%%edx,) \n\t" // write Raw(x) for 1st 8 bytes
4473 // Now mm0 will be used as Raw(x-bpp) for the 2nd group of 8 bytes.
4474 // This will be repeated for each group of 8 bytes with the 8th
4475 // group being used as the Raw(x-bpp) for the 1st group of the
4478 "paddb %%mm0, %%mm1 \n\t"
4479 "movq 16(%%edi,%%edx,), %%mm2 \n\t" // load Sub(x) for 3rd 8 bytes
4480 "movq %%mm1, 8(%%edi,%%edx,) \n\t" // write Raw(x) for 2nd 8 bytes
4481 "paddb %%mm1, %%mm2 \n\t"
4482 "movq 24(%%edi,%%edx,), %%mm3 \n\t" // load Sub(x) for 4th 8 bytes
4483 "movq %%mm2, 16(%%edi,%%edx,) \n\t" // write Raw(x) for 3rd 8 bytes
4484 "paddb %%mm2, %%mm3 \n\t"
4485 "movq 32(%%edi,%%edx,), %%mm4 \n\t" // load Sub(x) for 5th 8 bytes
4486 "movq %%mm3, 24(%%edi,%%edx,) \n\t" // write Raw(x) for 4th 8 bytes
4487 "paddb %%mm3, %%mm4 \n\t"
4488 "movq 40(%%edi,%%edx,), %%mm5 \n\t" // load Sub(x) for 6th 8 bytes
4489 "movq %%mm4, 32(%%edi,%%edx,) \n\t" // write Raw(x) for 5th 8 bytes
4490 "paddb %%mm4, %%mm5 \n\t"
4491 "movq 48(%%edi,%%edx,), %%mm6 \n\t" // load Sub(x) for 7th 8 bytes
4492 "movq %%mm5, 40(%%edi,%%edx,) \n\t" // write Raw(x) for 6th 8 bytes
4493 "paddb %%mm5, %%mm6 \n\t"
4494 "movq 56(%%edi,%%edx,), %%mm7 \n\t" // load Sub(x) for 8th 8 bytes
4495 "movq %%mm6, 48(%%edi,%%edx,) \n\t" // write Raw(x) for 7th 8 bytes
4496 "addl $64, %%edx \n\t"
4497 "paddb %%mm6, %%mm7 \n\t"
4498 "cmpl %%ecx, %%edx \n\t"
4499 "movq %%mm7, -8(%%edi,%%edx,) \n\t" // write Raw(x) for 8th 8 bytes
4502 "cmpl _MMXLength, %%edx \n\t"
4506 "movq (%%edi,%%edx,), %%mm0 \n\t"
4507 "addl $8, %%edx \n\t"
4508 "paddb %%mm7, %%mm0 \n\t"
4509 "cmpl _MMXLength, %%edx \n\t"
4510 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // -8 to offset early addl edx
4511 "movq %%mm0, %%mm7 \n\t" // move calculated Raw(x) data
4512 // to mm1 to be new Raw(x-bpp)
4518 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4519 "=D" (dummy_value_D) // 1
4521 : "0" (bpp), // eax // input regs
4524 : "%ecx", "%edx", "%esi" // clobber list
4525 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4526 , "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7"
4532 default: // bpp greater than 8 bytes GRR BOGUS
4534 __asm__ __volatile__ (
4535 "movl _dif, %%edx \n\t"
4536 // preload "movl row, %%edi \n\t"
4537 "movl %%edi, %%esi \n\t" // lp = row
4538 // preload "movl bpp, %%eax \n\t"
4539 "addl %%eax, %%edi \n\t" // rp = row + bpp
4542 "movq (%%edi,%%edx,), %%mm0 \n\t"
4543 "movq (%%esi,%%edx,), %%mm1 \n\t"
4544 "addl $8, %%edx \n\t"
4545 "paddb %%mm1, %%mm0 \n\t"
4546 "cmpl _MMXLength, %%edx \n\t"
4547 "movq %%mm0, -8(%%edi,%%edx,) \n\t" // mov does not affect flags;
4548 // -8 to offset addl edx
4551 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4552 "=D" (dummy_value_D) // 1
4554 : "0" (bpp), // eax // input regs
4557 : "%edx", "%esi" // clobber list
4558 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4565 } // end switch (bpp)
4567 __asm__ __volatile__ (
4568 "movl _MMXLength, %%edx \n\t"
4569 //pre "movl row, %%edi \n\t"
4570 "cmpl _FullLength, %%edx \n\t"
4573 "movl %%edi, %%esi \n\t" // lp = row
4574 //pre "movl bpp, %%eax \n\t"
4575 "addl %%eax, %%edi \n\t" // rp = row + bpp
4576 "xorl %%eax, %%eax \n\t"
4579 "movb (%%esi,%%edx,), %%al \n\t"
4580 "addb %%al, (%%edi,%%edx,) \n\t"
4582 "cmpl _FullLength, %%edx \n\t"
4586 "EMMS \n\t" // end MMX instructions
4588 : "=a" (dummy_value_a), // 0 // output regs (dummy)
4589 "=D" (dummy_value_D) // 1
4591 : "0" (bpp), // eax // input regs
4594 : "%edx", "%esi" // clobber list
4597 } // end of png_read_filter_row_mmx_sub()
4602 //===========================================================================//
4604 // P N G _ R E A D _ F I L T E R _ R O W _ M M X _ U P //
4606 //===========================================================================//
4608 // Optimized code for PNG Up filter decoder
// png_read_filter_row_mmx_up():
//    Reverses the PNG "Up" filter for one row: each byte becomes
//    (row[i] + prev_row[i]) & 0xff, processed 8 bytes at a time with MMX.
//    Strategy: scalar byte loop (up_lp1) until edi reaches an 8-byte-aligned
//    address, a 64-byte-unrolled MMX loop for the bulk, an 8-byte MMX loop
//    (up_lpA) for the remaining multiple-of-8 chunk, and a final scalar loop
//    (up_lp2) for the last 0-7 bytes.
// NOTE(review): several original lines (the up_loop label, some jz/branch
//    instructions, local declarations, and the signature continuation) are
//    not visible in this excerpt; comments describe only what is shown.
4610 static void /* PRIVATE */
4611 png_read_filter_row_mmx_up(png_row_infop row_info, png_bytep row,
4615 int dummy_value_d; // fix 'forbidden register 3 (dx) was spilled' error
4619 len = row_info->rowbytes; // number of bytes to filter
4621 __asm__ __volatile__ (
4622 //pre "movl row, %%edi \n\t"
4623 // get # of bytes to alignment
// ecx = number of bytes from row start up to the next 8-byte boundary
4624 "movl %%edi, %%ecx \n\t"
4625 "xorl %%ebx, %%ebx \n\t"
4626 "addl $0x7, %%ecx \n\t"
4627 "xorl %%eax, %%eax \n\t"
4628 "andl $0xfffffff8, %%ecx \n\t" // round (edi+7) down to 8-byte boundary
4629 //pre "movl prev_row, %%esi \n\t"
4630 "subl %%edi, %%ecx \n\t"
// --- scalar pre-loop: one byte at a time until 8-byte alignment ---
4633 "up_lp1: \n\t" // fix alignment
4634 "movb (%%edi,%%ebx,), %%al \n\t"
4635 "addb (%%esi,%%ebx,), %%al \n\t" // al = row[ebx] + prev_row[ebx]
4637 "cmpl %%ecx, %%ebx \n\t"
4638 "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
4639 "jb up_lp1 \n\t" // offset incl ebx
4642 //pre "movl len, %%edx \n\t"
4643 "movl %%edx, %%ecx \n\t"
4644 "subl %%ebx, %%edx \n\t" // subtract alignment fix
4645 "andl $0x0000003f, %%edx \n\t" // calc bytes over mult of 64
4646 "subl %%edx, %%ecx \n\t" // drop over bytes from length
4648 // unrolled loop - use all MMX registers and interleave to reduce
4649 // number of branch instructions (loops) and reduce partial stalls
// --- main loop: 64 bytes per iteration, loads/adds/stores interleaved ---
4651 "movq (%%esi,%%ebx,), %%mm1 \n\t"
4652 "movq (%%edi,%%ebx,), %%mm0 \n\t"
4653 "movq 8(%%esi,%%ebx,), %%mm3 \n\t"
4654 "paddb %%mm1, %%mm0 \n\t" // byte-wise add, wraps mod 256
4655 "movq 8(%%edi,%%ebx,), %%mm2 \n\t"
4656 "movq %%mm0, (%%edi,%%ebx,) \n\t"
4657 "paddb %%mm3, %%mm2 \n\t"
4658 "movq 16(%%esi,%%ebx,), %%mm5 \n\t"
4659 "movq %%mm2, 8(%%edi,%%ebx,) \n\t"
4660 "movq 16(%%edi,%%ebx,), %%mm4 \n\t"
4661 "movq 24(%%esi,%%ebx,), %%mm7 \n\t"
4662 "paddb %%mm5, %%mm4 \n\t"
4663 "movq 24(%%edi,%%ebx,), %%mm6 \n\t"
4664 "movq %%mm4, 16(%%edi,%%ebx,) \n\t"
4665 "paddb %%mm7, %%mm6 \n\t"
4666 "movq 32(%%esi,%%ebx,), %%mm1 \n\t"
4667 "movq %%mm6, 24(%%edi,%%ebx,) \n\t"
4668 "movq 32(%%edi,%%ebx,), %%mm0 \n\t"
4669 "movq 40(%%esi,%%ebx,), %%mm3 \n\t"
4670 "paddb %%mm1, %%mm0 \n\t"
4671 "movq 40(%%edi,%%ebx,), %%mm2 \n\t"
4672 "movq %%mm0, 32(%%edi,%%ebx,) \n\t"
4673 "paddb %%mm3, %%mm2 \n\t"
4674 "movq 48(%%esi,%%ebx,), %%mm5 \n\t"
4675 "movq %%mm2, 40(%%edi,%%ebx,) \n\t"
4676 "movq 48(%%edi,%%ebx,), %%mm4 \n\t"
4677 "movq 56(%%esi,%%ebx,), %%mm7 \n\t"
4678 "paddb %%mm5, %%mm4 \n\t"
4679 "movq 56(%%edi,%%ebx,), %%mm6 \n\t"
4680 "movq %%mm4, 48(%%edi,%%ebx,) \n\t"
4681 "addl $64, %%ebx \n\t"
4682 "paddb %%mm7, %%mm6 \n\t"
4683 "cmpl %%ecx, %%ebx \n\t"
4684 "movq %%mm6, -8(%%edi,%%ebx,) \n\t" // (+56)movq does not affect flags;
4685 "jb up_loop \n\t" // -8 to offset addl ebx
4687 "cmpl $0, %%edx \n\t" // test for bytes over mult of 64
4690 "cmpl $8, %%edx \n\t" // test for less than 8 bytes
4691 "jb up_lt8 \n\t" // [added by lcreeve@netins.net]
4693 "addl %%edx, %%ecx \n\t"
4694 "andl $0x00000007, %%edx \n\t" // calc bytes over mult of 8
4695 "subl %%edx, %%ecx \n\t" // drop over bytes from length
// --- secondary loop: 8 bytes per iteration for the leftover chunk ---
4698 "up_lpA: \n\t" // use MMX regs to update 8 bytes sim.
4699 "movq (%%esi,%%ebx,), %%mm1 \n\t"
4700 "movq (%%edi,%%ebx,), %%mm0 \n\t"
4701 "addl $8, %%ebx \n\t"
4702 "paddb %%mm1, %%mm0 \n\t"
4703 "cmpl %%ecx, %%ebx \n\t"
4704 "movq %%mm0, -8(%%edi,%%ebx,) \n\t" // movq does not affect flags; -8 to
4705 "jb up_lpA \n\t" // offset add ebx
4706 "cmpl $0, %%edx \n\t" // test for bytes over mult of 8
4710 "xorl %%eax, %%eax \n\t"
4711 "addl %%edx, %%ecx \n\t" // move over byte count into counter
// --- scalar post-loop: the final 0-7 bytes ---
4713 "up_lp2: \n\t" // use x86 regs for remaining bytes
4714 "movb (%%edi,%%ebx,), %%al \n\t"
4715 "addb (%%esi,%%ebx,), %%al \n\t"
4717 "cmpl %%ecx, %%ebx \n\t"
4718 "movb %%al, -1(%%edi,%%ebx,) \n\t" // mov does not affect flags; -1 to
4719 "jb up_lp2 \n\t" // offset inc ebx
4722 "EMMS \n\t" // conversion of filtered row complete
4724 : "=d" (dummy_value_d), // 0 // output regs (dummy)
4725 "=S" (dummy_value_S), // 1
4726 "=D" (dummy_value_D) // 2
4728 : "0" (len), // edx // input regs
4729 "1" (prev_row), // esi
4732 : "%eax", "%ebx", "%ecx" // clobber list (no input regs!)
// NOTE(review): "memory" is not in the clobber list even though the asm
// writes through edi — presumably safe only because of __volatile__;
// confirm against the compiler versions this file targets.
4734 #if 0 /* MMX regs (%mm0, etc.) not supported by gcc 2.7.2.3 or egcs 1.1 */
4735 , "%mm0", "%mm1", "%mm2", "%mm3"
4736 , "%mm4", "%mm5", "%mm6", "%mm7"
4740 } // end of png_read_filter_row_mmx_up()
4745 //===========================================================================//
4747 // P N G _ R E A D _ F I L T E R _ R O W //
4749 //===========================================================================//
4751 #if defined(PNG_HAVE_ASSEMBLER_READ_FILTER_ROW)
4753 // Optimized png_read_filter_row routines
// png_read_filter_row():
//    Reverses the row filter (None / Sub / Up / Avg / Paeth) for one
//    decoded PNG row.  For each filtered case, the MMX implementation is
//    used when the row meets the bit-depth and rowbytes thresholds;
//    otherwise a portable C fallback loop performs the same arithmetic.
// NOTE(review): opening braces, switch headers, and break statements from
//    the original are not visible in this excerpt.
4756 png_read_filter_row(png_structp png_ptr, png_row_infop row_info, png_bytep
4757 row, png_bytep prev_row, int filter)
4763 /* GRR: these are superseded by png_ptr->asm_flags: */
4764 #define UseMMX_sub 1 // GRR: converted 20000730
4765 #define UseMMX_up 1 // GRR: converted 20000729
4766 #define UseMMX_avg 1 // GRR: converted 20000828 (+ 16-bit bugfix 20000916)
4767 #define UseMMX_paeth 1 // GRR: converted 20000828
4769 if (_mmx_supported == 2) {
// Debug-only block: build a short name for the active filter/implementation
// and trace the row geometry.
4774 png_debug(1, "in png_read_filter_row\n");
4777 case 0: sprintf(filnm, "none");
4779 case 1: sprintf(filnm, "sub-%s", "MMX");
4781 case 2: sprintf(filnm, "up-%s", "MMX");
4783 case 3: sprintf(filnm, "avg-%s", "MMX");
4785 case 4: sprintf(filnm, "Paeth-%s", "MMX");
4787 default: sprintf(filnm, "unknw");
4790 png_debug2(0, "row_number=%5ld, %5s, ", png_ptr->row_number, filnm);
4791 png_debug1(0, "row=0x%08lx, ", (unsigned long)row);
4792 png_debug2(0, "pixdepth=%2d, bytes=%d, ", (int)row_info->pixel_depth,
4793 (int)((row_info->pixel_depth + 7) >> 3));
4794 png_debug1(0,"rowbytes=%8ld\n", row_info->rowbytes);
4795 #endif /* PNG_DEBUG */
4799 case PNG_FILTER_VALUE_NONE:
// Sub filter: Raw(x) = Sub(x) + Raw(x-bpp)
4802 case PNG_FILTER_VALUE_SUB:
4804 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4805 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4807 png_read_filter_row_mmx_sub(row_info, row);
// C fallback: add the byte bpp positions earlier, masked to a byte
4812 png_uint_32 istop = row_info->rowbytes;
4813 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3; // bytes per pixel
4814 png_bytep rp = row + bpp;
4817 for (i = bpp; i < istop; i++)
4819 *rp = (png_byte)(((int)(*rp) + (int)(*lp++)) & 0xff);
// Up filter: Raw(x) = Up(x) + Prior(x)
4825 case PNG_FILTER_VALUE_UP:
4827 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4828 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4830 png_read_filter_row_mmx_up(row_info, row, prev_row);
4835 png_uint_32 istop = row_info->rowbytes;
4837 png_bytep pp = prev_row;
4839 for (i = 0; i < istop; ++i)
4841 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
// Average filter: Raw(x) = Avg(x) + floor((Raw(x-bpp) + Prior(x)) / 2)
4847 case PNG_FILTER_VALUE_AVG:
4849 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4850 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4852 png_read_filter_row_mmx_avg(row_info, row, prev_row);
4858 png_bytep pp = prev_row;
4860 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
4861 png_uint_32 istop = row_info->rowbytes - bpp;
// first bpp bytes have no left neighbor: use prev_row/2 only
4863 for (i = 0; i < bpp; i++)
4865 *rp = (png_byte)(((int)(*rp) +
4866 ((int)(*pp++) >> 1)) & 0xff);
4870 for (i = 0; i < istop; i++)
4872 *rp = (png_byte)(((int)(*rp) +
4873 ((int)(*pp++ + *lp++) >> 1)) & 0xff);
// Paeth filter: predictor is whichever of left/up/upper-left is closest
// to p = left + up - upper-left
4879 case PNG_FILTER_VALUE_PAETH:
4881 (row_info->pixel_depth >= PNG_MMX_BITDEPTH_THRESHOLD_DEFAULT) &&
4882 (row_info->rowbytes >= PNG_MMX_ROWBYTES_THRESHOLD_DEFAULT))
4884 png_read_filter_row_mmx_paeth(row_info, row, prev_row);
4890 png_bytep pp = prev_row;
4892 png_bytep cp = prev_row;
4893 png_uint_32 bpp = (row_info->pixel_depth + 7) >> 3;
4894 png_uint_32 istop = row_info->rowbytes - bpp;
// first bpp bytes: no left neighbor, Paeth degenerates to Up
4896 for (i = 0; i < bpp; i++)
4898 *rp = (png_byte)(((int)(*rp) + (int)(*pp++)) & 0xff);
4902 for (i = 0; i < istop; i++) /* use leftover rp,pp */
4904 int a, b, c, pa, pb, pc, p;
// pa/pb/pc are absolute distances of a (left), b (up), c (corner)
// from the linear predictor
4918 pa = p < 0 ? -p : p;
4919 pb = pc < 0 ? -pc : pc;
4920 pc = (p + pc) < 0 ? -(p + pc) : p + pc;
4924 if (pa <= pb && pa <= pc)
// tie-break order mandated by the PNG spec: a, then b, then c
4932 p = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
4934 *rp = (png_byte)(((int)(*rp) + p) & 0xff);
4937 } //end !UseMMX_paeth
4941 png_warning(png_ptr, "Ignoring bad row-filter type");
4947 #endif /* PNG_HAVE_ASSEMBLER_READ_FILTER_ROW */
4952 //===========================================================================//
4954 // P N G _ M M X _ S U P P O R T //
4956 //===========================================================================//
4958 // GRR NOTES: (1) the following code assumes 386 or better (pushfl/popfl)
4959 // (2) all instructions compile with gcc 2.7.2.3 and later
4960 // (3) the function is moved down here to prevent gcc from
4961 // inlining it in multiple places and then barfing be-
4962 // cause the ".NOT_SUPPORTED" label is multiply defined
4963 // [is there a way to signal that a *single* function should
4964 // not be inlined? is there a way to modify the label for
4965 // each inlined instance, e.g., by appending _1, _2, etc.?
4966 // maybe if don't use leading "." in label name? (nope...sigh)]
4968 // GRR TO DO: make sure PNGAPI doesn't do/require anything screwy here
4969 // [looks OK for everybody except possibly Cygwin (__cdecl)]
// png_mmx_support():
//    Runtime probe for MMX capability.  First verifies that the CPUID
//    instruction exists by toggling the ID bit (21) in EFLAGS and checking
//    that the change sticks; then executes CPUID function 1 and tests the
//    MMX feature bit (23) in edx.  On success it sets eax (the return
//    value) and the global _mmx_supported to 1; otherwise to 0.
// NOTE(review): the bare "ret" in the supported path returns from the
//    middle of inline asm, bypassing any compiler-generated epilogue —
//    fragile; presumably tolerated by the specific gcc versions this file
//    targets (confirm before reuse).
4972 png_mmx_support(void)
4974 __asm__ __volatile__ (
4975 "pushl %%ebx \n\t" // ebx gets clobbered by CPUID instruction
4976 "pushl %%ecx \n\t" // so does ecx...
4977 "pushl %%edx \n\t" // ...and edx (but ecx & edx safe on Linux)
4978 // ".byte 0x66 \n\t" // convert 16-bit pushf to 32-bit pushfd
4979 // "pushf \n\t" // 16-bit pushf
4980 "pushfl \n\t" // save Eflag to stack
4981 "popl %%eax \n\t" // get Eflag from stack into eax
4982 "movl %%eax, %%ecx \n\t" // make another copy of Eflag in ecx
4983 "xorl $0x200000, %%eax \n\t" // toggle ID bit in Eflag (i.e., bit 21)
4984 "pushl %%eax \n\t" // save modified Eflag back to stack
4985 // ".byte 0x66 \n\t" // convert 16-bit popf to 32-bit popfd
4986 // "popf \n\t" // 16-bit popf
4987 "popfl \n\t" // restore modified value to Eflag reg
4988 "pushfl \n\t" // save Eflag to stack
4989 "popl %%eax \n\t" // get Eflag from stack
4990 "xorl %%ecx, %%eax \n\t" // compare new Eflag with original Eflag
4991 "jz .NOT_SUPPORTED \n\t" // if same, CPUID instr. is not supported
// CPUID function 0: eax returns the highest supported CPUID function
4993 "xorl %%eax, %%eax \n\t" // set eax to zero
4994 // ".byte 0x0f, 0xa2 \n\t" // CPUID instruction (two-byte opcode)
4995 "cpuid \n\t" // get the CPU identification info
4996 "cmpl $1, %%eax \n\t" // make sure eax return non-zero value
4997 "jl .NOT_SUPPORTED \n\t" // if eax is zero, MMX is not supported
// CPUID function 1: feature flags returned in edx
4999 "xorl %%eax, %%eax \n\t" // set eax to zero and...
5000 "incl %%eax \n\t" // ...increment eax to 1. This pair is
5001 // faster than the instruction "mov eax, 1"
5002 "cpuid \n\t" // get the CPU identification info again
5003 "andl $0x800000, %%edx \n\t" // mask out all bits but MMX bit (23)
5004 "cmpl $0, %%edx \n\t" // 0 = MMX not supported
5005 "jz .NOT_SUPPORTED \n\t" // non-zero = yes, MMX IS supported
5007 "movl $1, %%eax \n\t" // set return value to 1
5008 "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
5009 "popl %%edx \n\t" // restore edx
5010 "popl %%ecx \n\t" // restore ecx
5011 "popl %%ebx \n\t" // restore ebx ("row" in png_do_interlace)
5012 "ret \n\t" // DONE: have MMX support
5014 ".NOT_SUPPORTED: \n\t" // target label for jump instructions
5015 "movl $0, %%eax \n\t" // set return value to 0
5016 "movl %%eax, _mmx_supported \n\t" // save in global static variable, too
5017 "popl %%edx \n\t" // restore edx
5018 "popl %%ecx \n\t" // restore ecx
5019 "popl %%ebx \n\t" // restore ebx ("row" in png_do_interlace)
5020 // "ret \n\t" // DONE: no MMX support
5021 // (fall through to standard C "ret")
5023 : // output list (none)
5025 : // any variables used on input (none)
5027 : "%eax" // clobber list
5028 // , "%ebx", "%ecx", "%edx" // GRR: we handle these manually
5029 // , "memory" // if write to a variable gcc thought was in a reg
5030 // , "cc" // "condition codes" (flag bits)
5036 #endif /* PNG_ASSEMBLER_CODE_SUPPORTED && PNG_USE_PNGGCCRD */