Merge pull request #39 from notaz/asm_fixes

Asm fixes
This commit is contained in:
Ryan C. Gordon 2016-04-21 22:46:07 -04:00
commit 10395909e8
12 changed files with 332 additions and 289 deletions

View File

@ -161,6 +161,11 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*));
#endif #endif
#endif #endif
#if defined(__GNU_INLINE__) && defined(__i386__)
#define FPU_REGS "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
#define MMX_REGS "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"
#endif
#ifndef PAGESIZE #ifndef PAGESIZE
#define PAGESIZE 4096 #define PAGESIZE 4096
#endif #endif

View File

@ -426,7 +426,7 @@ COLOR MulColors( COLOR col1, COLOR col2)
"orl %%eax, %%ebx \n\t" "orl %%eax, %%ebx \n\t"
"movl %%ebx, %%ecx \n\t" "movl %%ebx, %%ecx \n\t"
"popl %%ebx \n\t" "popl %%ebx \n\t"
: "=c" (colRet) : "=&c" (colRet)
: "S" (col1), "D" (col2) : "S" (col1), "D" (col2)
: "eax", "edx", "cc", "memory" : "eax", "edx", "cc", "memory"
); );
@ -536,18 +536,18 @@ COLOR AddColors( COLOR col1, COLOR col2)
} }
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
ULONG tmp;
__asm__ __volatile__ ( __asm__ __volatile__ (
"pushl %%ebx \n\t" // if xbx is "r", gcc runs out of regs in -fPIC + -fno-omit-fp :(
"pushl %%edi \n\t" //"xorl %[xbx], %[xbx] \n\t"
"pushl %%esi \n\t" "movl $0, %[xbx] \n\t"
"xorl %%ebx, %%ebx \n\t"
"mov $255, %%esi \n\t" "mov $255, %%esi \n\t"
// red // red
"movl (%%esp), %%eax \n\t" "movl %[col1], %%eax \n\t"
"andl $0xFF000000, %%eax \n\t" "andl $0xFF000000, %%eax \n\t"
"shrl $24, %%eax \n\t" "shrl $24, %%eax \n\t"
"movl 4(%%esp), %%edx \n\t" "movl %[col2], %%edx \n\t"
"andl $0xFF000000, %%edx \n\t" "andl $0xFF000000, %%edx \n\t"
"shrl $24, %%edx \n\t" "shrl $24, %%edx \n\t"
"addl %%edx, %%eax \n\t" "addl %%edx, %%eax \n\t"
@ -556,13 +556,13 @@ COLOR AddColors( COLOR col1, COLOR col2)
"orl %%ecx, %%eax \n\t" "orl %%ecx, %%eax \n\t"
"shll $24, %%eax \n\t" "shll $24, %%eax \n\t"
"andl $0xFF000000, %%eax \n\t" "andl $0xFF000000, %%eax \n\t"
"orl %%eax, %%ebx \n\t" "orl %%eax, %[xbx] \n\t"
// green // green
"movl (%%esp), %%eax \n\t" "movl %[col1], %%eax \n\t"
"andl $0x00FF0000, %%eax \n\t" "andl $0x00FF0000, %%eax \n\t"
"shrl $16, %%eax \n\t" "shrl $16, %%eax \n\t"
"movl 4(%%esp), %%edx \n\t" "movl %[col2], %%edx \n\t"
"andl $0x00FF0000, %%edx \n\t" "andl $0x00FF0000, %%edx \n\t"
"shrl $16, %%edx \n\t" "shrl $16, %%edx \n\t"
"addl %%edx, %%eax \n\t" "addl %%edx, %%eax \n\t"
@ -571,13 +571,13 @@ COLOR AddColors( COLOR col1, COLOR col2)
"orl %%ecx, %%eax \n\t" "orl %%ecx, %%eax \n\t"
"shll $16, %%eax \n\t" "shll $16, %%eax \n\t"
"andl $0x00FF0000, %%eax \n\t" "andl $0x00FF0000, %%eax \n\t"
"orl %%eax, %%ebx \n\t" "orl %%eax, %[xbx] \n\t"
// blue // blue
"movl (%%esp), %%eax \n\t" "movl %[col1], %%eax \n\t"
"andl $0x0000FF00, %%eax \n\t" "andl $0x0000FF00, %%eax \n\t"
"shrl $8, %%eax \n\t" "shrl $8, %%eax \n\t"
"movl 4(%%esp), %%edx \n\t" "movl %[col2], %%edx \n\t"
"andl $0x0000FF00, %%edx \n\t" "andl $0x0000FF00, %%edx \n\t"
"shrl $8, %%edx \n\t" "shrl $8, %%edx \n\t"
"addl %%edx, %%eax \n\t" "addl %%edx, %%eax \n\t"
@ -586,13 +586,13 @@ COLOR AddColors( COLOR col1, COLOR col2)
"orl %%ecx, %%eax \n\t" "orl %%ecx, %%eax \n\t"
"shll $8, %%eax \n\t" "shll $8, %%eax \n\t"
"andl $0x0000FF00, %%eax \n\t" "andl $0x0000FF00, %%eax \n\t"
"orl %%eax, %%ebx \n\t" "orl %%eax, %[xbx] \n\t"
// alpha // alpha
"movl (%%esp), %%eax \n\t" "movl %[col1], %%eax \n\t"
"andl $0x000000FF, %%eax \n\t" "andl $0x000000FF, %%eax \n\t"
"shrl $0, %%eax \n\t" "shrl $0, %%eax \n\t"
"movl 4(%%esp), %%edx \n\t" "movl %[col2], %%edx \n\t"
"andl $0x000000FF, %%edx \n\t" "andl $0x000000FF, %%edx \n\t"
"shrl $0, %%edx \n\t" "shrl $0, %%edx \n\t"
"addl %%edx, %%eax \n\t" "addl %%edx, %%eax \n\t"
@ -601,15 +601,10 @@ COLOR AddColors( COLOR col1, COLOR col2)
"orl %%ecx, %%eax \n\t" "orl %%ecx, %%eax \n\t"
"shll $0, %%eax \n\t" "shll $0, %%eax \n\t"
"andl $0x000000FF, %%eax \n\t" "andl $0x000000FF, %%eax \n\t"
"orl %%eax, %%ebx \n\t" "orl %[xbx], %%eax \n\t"
"movl %%ebx, %%ecx \n\t" : "=&a" (colRet), [xbx] "=&g" (tmp)
: [col1] "g" (col1), [col2] "g" (col2)
// done. : "ecx", "edx", "esi", "cc", "memory"
"addl $8, %%esp \n\t"
"popl %%ebx \n\t"
: "=c" (colRet)
: "S" (col1), "D" (col2)
: "eax", "edx", "cc", "memory"
); );
#else #else

View File

@ -332,7 +332,7 @@ inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs)
"cld \n\t" "cld \n\t"
"rep \n\t" "rep \n\t"
"movsd \n\t" "movsd \n\t"
: // no outputs. : "=S" (pulSrc), "=D" (pulDst), "=c" (ctLongs)
: "S" (pulSrc), "D" (pulDst), "c" (ctLongs) : "S" (pulSrc), "D" (pulDst), "c" (ctLongs)
: "cc", "memory" : "cc", "memory"
); );
@ -364,7 +364,7 @@ inline void StoreLongs( ULONG ulVal, ULONG *pulDst, INDEX ctLongs)
"cld \n\t" "cld \n\t"
"rep \n\t" "rep \n\t"
"stosd \n\t" "stosd \n\t"
: // no outputs. : "=D" (pulDst), "=c" (ctLongs)
: "a" (ulVal), "D" (pulDst), "c" (ctLongs) : "a" (ulVal), "D" (pulDst), "c" (ctLongs)
: "cc", "memory" : "cc", "memory"
); );

View File

@ -186,10 +186,10 @@ elemDone:
} }
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
__asm__ __volatile__ ( __asm__ __volatile__ (
"pushl %%ebx \n\t" // Save GCC's register. "movl %[ctElems], %%ecx \n\t"
"movl %%eax, %%ebx \n\t" "movl %[piDst], %%edi \n\t"
"movl %[piElements], %%esi \n\t"
"movd %%ebx, %%mm1 \n\t" "movd %[iVtx0Pass], %%mm1 \n\t"
"movq %%mm1, %%mm0 \n\t" "movq %%mm1, %%mm0 \n\t"
"psllq $32, %%mm1 \n\t" "psllq $32, %%mm1 \n\t"
"por %%mm0, %%mm1 \n\t" "por %%mm0, %%mm1 \n\t"
@ -205,17 +205,18 @@ elemDone:
"jnz 0b \n\t" // elemLoop "jnz 0b \n\t" // elemLoop
"1: \n\t" // elemRest "1: \n\t" // elemRest
"emms \n\t" "emms \n\t"
"testl $1, %%edx \n\t" "testl $1, %[ctElems] \n\t"
"jz 2f \n\t" // elemDone "jz 2f \n\t" // elemDone
"movl (%%esi), %%eax \n\t" "movl (%%esi), %%eax \n\t"
"addl %%ebx, %%eax \n\t" "addl %[iVtx0Pass], %%eax \n\t"
"movl %%eax, (%%edi) \n\t" "movl %%eax, (%%edi) \n\t"
"2: \n\t" // elemDone "2: \n\t" // elemDone
"popl %%ebx \n\t" // restore GCC's register.
: // no outputs. : // no outputs.
: "c" (ctElems), "d" (ctElems), "D" (piDst), : [ctElems] "g" (ctElems), [piDst] "g" (piDst),
"S" (pspo->spo_piElements), "a" (pspo->spo_iVtx0Pass) [piElements] "g" (pspo->spo_piElements),
: "cc", "memory" [iVtx0Pass] "g" (pspo->spo_iVtx0Pass)
: FPU_REGS, "mm0", "mm1", "eax", "ecx", "esi", "edi",
"cc", "memory"
); );
#else #else
@ -506,12 +507,13 @@ static void RSBinToGroups( ScenePolygon *pspoFirst)
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
__asm__ __volatile__ ( __asm__ __volatile__ (
"movl $2, %%eax \n\t"
"bsrl (%%esi), %%ecx \n\t" "bsrl (%%esi), %%ecx \n\t"
"shll %%cl, %%eax \n\t" "shll %%cl, %%eax \n\t"
"movl %%eax, (%%esi) \n\t" "movl %%eax, (%%esi) \n\t"
: // no outputs. : // no outputs.
: "a" (2), "S" (&_ctGroupsCount) : "S" (&_ctGroupsCount)
: "ecx", "cc", "memory" : "eax", "ecx", "cc", "memory"
); );
#else #else

View File

@ -97,6 +97,8 @@ pixLoop:
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
__asm__ __volatile__ ( __asm__ __volatile__ (
"movl %[pubTexture], %%esi \n\t"
"movl %[pixTextureSize], %%ecx \n\t"
"leal 0(%%esi, %%ecx), %%edi \n\t" "leal 0(%%esi, %%ecx), %%edi \n\t"
"0: \n\t" // pixLoop "0: \n\t" // pixLoop
"movzbl (%%esi), %%eax \n\t" "movzbl (%%esi), %%eax \n\t"
@ -108,8 +110,9 @@ pixLoop:
"decl %%ecx \n\t" "decl %%ecx \n\t"
"jnz 0b \n\t" // pixLoop "jnz 0b \n\t" // pixLoop
: // no outputs. : // no outputs.
: "S" (pubTexture), "D" (pubTexture), "c" (pixTextureSize) : [pubTexture] "g" (pubTexture),
: "eax", "cc", "memory" [pixTextureSize] "g" (pixTextureSize)
: "eax", "ecx", "esi", "edi", "cc", "memory"
); );
#else #else

View File

@ -219,6 +219,9 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV,
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
__asm__ __volatile__ ( __asm__ __volatile__ (
"pxor %%mm0,%%mm0 \n\t" "pxor %%mm0,%%mm0 \n\t"
"movl %[pulSrc],%%esi \n\t"
"movl %[pulDst],%%edi \n\t"
"movl %[pixSize],%%ecx \n\t"
"0: \n\t" // pixLoop "0: \n\t" // pixLoop
"movd 0(%%esi), %%mm1 \n\t" "movd 0(%%esi), %%mm1 \n\t"
"movd 4(%%esi), %%mm2 \n\t" "movd 4(%%esi), %%mm2 \n\t"
@ -234,8 +237,10 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV,
"jnz 0b \n\t" // pixLoop "jnz 0b \n\t" // pixLoop
"emms \n\t" "emms \n\t"
: :
: "S" (pulSrc), "D" (pulDst), "c" (pixSize) : [pulSrc] "g" (pulSrc), [pulDst] "g" (pulDst),
: "memory", "cc" [pixSize] "g" (pixSize)
: FPU_REGS, "mm0", "mm1", "mm2",
"ecx", "esi", "edi", "memory", "cc"
); );
#else #else

View File

@ -13,12 +13,6 @@ You should have received a copy of the GNU General Public License along
with this program; if not, write to the Free Software Foundation, Inc., with this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. */
// !!! FIXME: One of the GNU inline asm blocks has a bug that causes the
// !!! FIXME: title on the main menu to render incorrectly. (Generating an
// !!! FIXME: incorrect mipmap?) The intel compiler works fine with the
// !!! FIXME: MSVC inline asm, but GCC and Intel both have the problem when
// !!! FIXME: using the GNU inline asm.
#include "Engine/StdH.h" #include "Engine/StdH.h"
#include <Engine/Base/Statistics_Internal.h> #include <Engine/Base/Statistics_Internal.h>
@ -198,9 +192,9 @@ void FlipBitmap( UBYTE *pubSrc, UBYTE *pubDst, PIX pixWidth, PIX pixHeight, INDE
// makes one level lower mipmap (bilinear or nearest-neighbour with border preservation) // makes one level lower mipmap (bilinear or nearest-neighbour with border preservation)
#if (defined __GNUC__) #if (defined __GNUC__)
static __int64 mmRounder = 0x0002000200020002ll; __int64 mmRounder = 0x0002000200020002ll;
#else #else
static __int64 mmRounder = 0x0002000200020002; __int64 mmRounder = 0x0002000200020002;
#endif #endif
static void MakeOneMipmap( ULONG *pulSrcMipmap, ULONG *pulDstMipmap, PIX pixWidth, PIX pixHeight, BOOL bBilinear) static void MakeOneMipmap( ULONG *pulSrcMipmap, ULONG *pulDstMipmap, PIX pixWidth, PIX pixHeight, BOOL bBilinear)
@ -305,19 +299,19 @@ pixLoopN:
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
__asm__ __volatile__ ( __asm__ __volatile__ (
"pushl %%ebx \n\t" // Save GCC's register.
"movl %%ecx, %%ebx \n\t"
"pxor %%mm0, %%mm0 \n\t" "pxor %%mm0, %%mm0 \n\t"
"movl %[pulSrcMipmap], %%esi \n\t"
"movl %[pulDstMipmap], %%edi \n\t"
"movl %[pixHeight], %%edx \n\t"
"0: \n\t" // rowLoop "0: \n\t" // rowLoop
"movl %%ebx, %%ecx \n\t" "movl %[pixWidth], %%ecx \n\t"
"1: \n\t" // pixLoopN "1: \n\t" // pixLoopN
"movd 0(%%esi), %%mm1 \n\t" // up-left "movd 0(%%esi), %%mm1 \n\t" // up-left
"movd 4(%%esi), %%mm2 \n\t" // up-right "movd 4(%%esi), %%mm2 \n\t" // up-right
"movd 0(%%esi, %%ebx, 8), %%mm3 \n\t" // down-left "movd 0(%%esi, %[pixWidth], 8), %%mm3 \n\t" // down-left
"movd 4(%%esi, %%ebx, 8), %%mm4 \n\t" // down-right "movd 4(%%esi, %[pixWidth], 8), %%mm4 \n\t" // down-right
"punpcklbw %%mm0, %%mm1 \n\t" "punpcklbw %%mm0, %%mm1 \n\t"
"punpcklbw %%mm0, %%mm2 \n\t" "punpcklbw %%mm0, %%mm2 \n\t"
"punpcklbw %%mm0, %%mm3 \n\t" "punpcklbw %%mm0, %%mm3 \n\t"
@ -325,7 +319,7 @@ pixLoopN:
"paddw %%mm2, %%mm1 \n\t" "paddw %%mm2, %%mm1 \n\t"
"paddw %%mm3, %%mm1 \n\t" "paddw %%mm3, %%mm1 \n\t"
"paddw %%mm4, %%mm1 \n\t" "paddw %%mm4, %%mm1 \n\t"
"paddw (%%eax), %%mm1 \n\t" "paddw (" ASMSYM(mmRounder) "), %%mm1 \n\t"
"psrlw $2, %%mm1 \n\t" "psrlw $2, %%mm1 \n\t"
"packuswb %%mm0, %%mm1 \n\t" "packuswb %%mm0, %%mm1 \n\t"
"movd %%mm1, (%%edi) \n\t" "movd %%mm1, (%%edi) \n\t"
@ -338,15 +332,17 @@ pixLoopN:
// advance to next row // advance to next row
// skip one row in source mip-map // skip one row in source mip-map
"leal 0(%%esi, %%ebx, 8), %%esi \n\t" "leal 0(%%esi, %[pixWidth], 8), %%esi \n\t"
"decl %%edx \n\t" "decl %%edx \n\t"
"jnz 0b \n\t" // rowLoop "jnz 0b \n\t" // rowLoop
"popl %%ebx \n\t" // restore GCC's register.
"emms \n\t" "emms \n\t"
: // no outputs. : // no outputs.
: "a" (&mmRounder), "c" (pixWidth), "S" (pulSrcMipmap), : [pixWidth] "r" (pixWidth),
"D" (pulDstMipmap), "d" (pixHeight) [pulSrcMipmap] "g" (pulSrcMipmap),
: "cc", "memory" [pulDstMipmap] "g" (pulDstMipmap),
[pixHeight] "g" (pixHeight)
: FPU_REGS, MMX_REGS, "ecx", "edx", "esi", "edi",
"cc", "memory"
); );
#else #else
@ -433,23 +429,22 @@ fullEnd:
} }
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
ULONG tmp, tmp2;
__asm__ __volatile__ ( __asm__ __volatile__ (
"pushl %%ebx \n\t" // Save GCC's register. "xorl %[xbx], %[xbx] \n\t"
"movl %%ecx, %%ebx \n\t" "movl %[pulSrcMipmap], %%esi \n\t"
"movl %[pulDstMipmap], %%edi \n\t"
// setup upper half // setup upper half
"pushl %%edx \n\t" // pixHeight "movl %[pixHeight], %%eax \n\t"
"pushl %%eax \n\t" // ulRowModulo "movl %%eax, %[xdx] \n\t"
"pushl %%ebx \n\t" // pixWidth "shrl $1, %[xdx] \n\t"
"xorl %%ebx, %%ebx \n\t"
"shrl $1, %%edx \n\t"
"0: \n\t" // halfLoop "0: \n\t" // halfLoop
"movl (%%esp), %%ecx \n\t" "movl %[pixWidth], %%ecx \n\t"
"shrl $1, %%ecx \n\t" "shrl $1, %%ecx \n\t"
"1: \n\t" // leftLoop "1: \n\t" // leftLoop
"movl 0(%%esi, %%ebx, 8), %%eax \n\t" // upper-left (or lower-left) "movl 0(%%esi, %[xbx], 8), %%eax \n\t" // upper-left (or lower-left)
"movl %%eax, (%%edi) \n\t" "movl %%eax, (%%edi) \n\t"
// advance to next pixel // advance to next pixel
@ -459,12 +454,12 @@ fullEnd:
"jg 1b \n\t" // leftLoop "jg 1b \n\t" // leftLoop
// do right row half // do right row half
"movl (%%esp), %%ecx \n\t" "movl %[pixWidth], %%ecx \n\t"
"shrl $1, %%ecx \n\t" "shrl $1, %%ecx \n\t"
"jz 3f \n\t" // halfEnd "jz 3f \n\t" // halfEnd
"2: \n\t" // rightLoop "2: \n\t" // rightLoop
"movl 4(%%esi, %%ebx, 8), %%eax \n\t" // upper-right (or lower-right) "movl 4(%%esi, %[xbx], 8), %%eax \n\t" // upper-right (or lower-right)
"movl %%eax, (%%edi) \n\t" "movl %%eax, (%%edi) \n\t"
// advance to next pixel // advance to next pixel
@ -475,25 +470,26 @@ fullEnd:
"3: \n\t" // halfEnd "3: \n\t" // halfEnd
// advance to next row // advance to next row
"addl 4(%%esp), %%esi \n\t" // skip one row in source mip-map "addl %[ulRowModulo], %%esi \n\t" // skip one row in source mip-map
"subl $1, %%edx \n\t" "subl $1, %[xdx] \n\t"
"jg 0b \n\t" // halfLoop "jg 0b \n\t" // halfLoop
// do eventual lower half loop (if not yet done) // do eventual lower half loop (if not yet done)
"movl 8(%%esp), %%edx \n\t" "movl %[pixHeight], %%eax \n\t"
"shrl $1, %%edx \n\t" "movl %%eax, %[xdx] \n\t"
"shrl $1, %[xdx] \n\t"
"jz 4f \n\t" // fullEnd "jz 4f \n\t" // fullEnd
"cmpl (%%esp), %%ebx \n\t" "cmpl %[pixWidth], %[xbx] \n\t"
"movl (%%esp), %%ebx \n\t" "movl %[pixWidth], %[xbx] \n\t"
"jne 0b \n\t" // halfLoop "jne 0b \n\t" // halfLoop
"4: \n\t" // fullEnd "4: \n\t" // fullEnd
"addl $12, %%esp \n\t" : [xbx] "=&r" (tmp), [xdx] "=&g" (tmp2)
"popl %%ebx \n\t" // restore GCC's register. : [pulSrcMipmap] "g" (pulSrcMipmap),
: // no outputs. [pulDstMipmap] "g" (pulDstMipmap),
: "S" (pulSrcMipmap), "D" (pulDstMipmap), "d" (pixHeight), [pixHeight] "g" (pixHeight), [pixWidth] "g" (pixWidth),
"c" (pixWidth), "a" (ulRowModulo) [ulRowModulo] "g" (ulRowModulo)
: "cc", "memory" : "eax", "ecx", "esi", "edi", "cc", "memory"
); );
#else #else
@ -663,9 +659,6 @@ static inline void IncrementByteWithClip( UBYTE &ub, SLONG slAdd)
#endif #endif
// performs dithering of a 32-bit bitmap (can be in-place) // performs dithering of a 32-bit bitmap (can be in-place)
#ifdef __GNUC__
__attribute__((noinline)) // because of asm labels
#endif
void DitherBitmap( INDEX iDitherType, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth, PIX pixHeight, void DitherBitmap( INDEX iDitherType, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth, PIX pixHeight,
PIX pixCanvasWidth, PIX pixCanvasHeight) PIX pixCanvasWidth, PIX pixCanvasHeight)
{ {
@ -860,26 +853,27 @@ nextRowO:
} }
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
ULONG tmp;
__asm__ __volatile__ ( __asm__ __volatile__ (
"movl %[pulSrc], %%esi \n\t"
"movl %[pulDst], %%edi \n\t"
// reset dither line offset // reset dither line offset
"pushl %%ebx \n\t" // save GCC's register. "movl %[pixHeight], %%eax \n\t"
"movl (" ASMSYM(pulDitherTable) "), %%ebx \n\t" "movl %%eax, %[xdx] \n\t"
"pushl %%ecx \n\t" // slModulo
"pushl %%eax \n\t" // pixWidth
"xorl %%eax, %%eax \n\t" "xorl %%eax, %%eax \n\t"
"rowLoopO: \n\t" "0: \n\t" // rowLoopO
// get horizontal dither patterns // get horizontal dither patterns
"movq 0(%%ebx, %%eax, 4), %%mm4 \n\t" "movq 0(%[pulDitherTable], %%eax, 4), %%mm4 \n\t"
"movq 8(%%ebx, %%eax, 4), %%mm5 \n\t" "movq 8(%[pulDitherTable], %%eax, 4), %%mm5 \n\t"
"psrlw (" ASMSYM(mmShifter) "), %%mm4 \n\t" "psrlw (" ASMSYM(mmShifter) "), %%mm4 \n\t"
"psrlw (" ASMSYM(mmShifter) "), %%mm5 \n\t" "psrlw (" ASMSYM(mmShifter) "), %%mm5 \n\t"
"pand (" ASMSYM(mmMask) "), %%mm4 \n\t" "pand (" ASMSYM(mmMask) "), %%mm4 \n\t"
"pand (" ASMSYM(mmMask) "), %%mm5 \n\t" "pand (" ASMSYM(mmMask) "), %%mm5 \n\t"
// process row // process row
"movl (%%esp), %%ecx \n\t" "movl %[pixWidth], %%ecx \n\t"
"pixLoopO: \n\t" "1: \n\t" // pixLoopO
"movq 0(%%esi), %%mm1 \n\t" "movq 0(%%esi), %%mm1 \n\t"
"movq 8(%%esi), %%mm2 \n\t" "movq 8(%%esi), %%mm2 \n\t"
"paddusb %%mm4, %%mm1 \n\t" "paddusb %%mm4, %%mm1 \n\t"
@ -891,30 +885,30 @@ nextRowO:
"addl $16, %%esi \n\t" "addl $16, %%esi \n\t"
"addl $16, %%edi \n\t" "addl $16, %%edi \n\t"
"subl $4, %%ecx \n\t" "subl $4, %%ecx \n\t"
"jg pixLoopO \n\t" // !!!! possible memory leak? "jg 1b \n\t" // !!!! possible memory leak?
"je nextRowO \n\t" "je 2f \n\t" // nextRowO
// backup couple of pixels // backup couple of pixels
"leal 0(%%esi, %%ecx, 4), %%esi \n\t" "leal 0(%%esi, %%ecx, 4), %%esi \n\t"
"leal 0(%%edi, %%ecx, 4), %%edi \n\t" "leal 0(%%edi, %%ecx, 4), %%edi \n\t"
"nextRowO: \n\t" "2: \n\t" // nextRowO
// get next dither line patterns // get next dither line patterns
"addl 4(%%esp), %%esi \n\t" "addl %[slModulo], %%esi \n\t"
"addl 4(%%esp), %%edi \n\t" "addl %[slModulo], %%edi \n\t"
"addl $4, %%eax \n\t" "addl $4, %%eax \n\t"
"andl $15, %%eax \n\t" "andl $15, %%eax \n\t"
// advance to next row // advance to next row
"decl %%edx \n\t" "decl %[xdx] \n\t"
"jnz rowLoopO \n\t" "jnz 0b \n\t" // rowLoopO
"emms \n\t" "emms \n\t"
"addl $8, %%esp \n\t" : [xdx] "=&g" (tmp)
"popl %%ebx \n\t" // restore GCC's register. : [pulSrc] "g" (pulSrc), [pulDst] "g" (pulDst),
: // no outputs. [pixHeight] "g" (pixHeight), [pixWidth] "g" (pixWidth),
: "S" (pulSrc), "D" (pulDst), "d" (pixHeight), [slModulo] "g" (slModulo), [pulDitherTable] "r" (pulDitherTable)
"a" (pixWidth), "c" (slModulo) : FPU_REGS, MMX_REGS, "eax", "ecx", "esi", "edi",
: "cc", "memory" "cc", "memory"
); );
#else #else
@ -1054,17 +1048,17 @@ allDoneE:
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
__asm__ __volatile__ ( __asm__ __volatile__ (
"pushl %%ebx \n\t" // Save GCC's register.
"movl %%ecx, %%ebx \n\t"
"pxor %%mm0, %%mm0 \n\t" "pxor %%mm0, %%mm0 \n\t"
"movl %[pulDst], %%esi \n\t"
"movl %[pixHeight], %%edx \n\t"
"decl %%edx \n\t" // need not to dither last row "decl %%edx \n\t" // need not to dither last row
"rowLoopE: \n\t" "0: \n\t" // rowLoopE
// left to right // left to right
"movl %%eax, %%ecx \n\t" "movl %[pixWidth], %%ecx \n\t"
"decl %%ecx \n\t" "decl %%ecx \n\t"
"pixLoopEL: \n\t" "1: \n\t" // pixLoopEL
"movd (%%esi), %%mm1 \n\t" "movd (%%esi), %%mm1 \n\t"
"punpcklbw %%mm0, %%mm1 \n\t" "punpcklbw %%mm0, %%mm1 \n\t"
"pand (" ASMSYM(mmErrDiffMask) "), %%mm1 \n\t" "pand (" ASMSYM(mmErrDiffMask) "), %%mm1 \n\t"
@ -1089,29 +1083,29 @@ allDoneE:
// spread errors // spread errors
"paddusb 4(%%esi), %%mm7 \n\t" "paddusb 4(%%esi), %%mm7 \n\t"
"paddusb -4(%%esi, %%ebx, 4), %%mm3 \n\t" "paddusb -4(%%esi, %[pixCanvasWidth], 4), %%mm3 \n\t"
"paddusb 0(%%esi, %%ebx, 4), %%mm5 \n\t" "paddusb 0(%%esi, %[pixCanvasWidth], 4), %%mm5 \n\t"
"paddusb 4(%%esi, %%ebx, 4), %%mm1 \n\t" // !!!! possible memory leak? "paddusb 4(%%esi, %[pixCanvasWidth], 4), %%mm1 \n\t" // !!!! possible memory leak?
"movd %%mm7, 4(%%esi) \n\t" "movd %%mm7, 4(%%esi) \n\t"
"movd %%mm3, -4(%%esi, %%ebx, 4) \n\t" "movd %%mm3, -4(%%esi, %[pixCanvasWidth], 4) \n\t"
"movd %%mm5, 0(%%esi, %%ebx, 4) \n\t" "movd %%mm5, 0(%%esi, %[pixCanvasWidth], 4) \n\t"
"movd %%mm1, 4(%%esi, %%ebx, 4) \n\t" "movd %%mm1, 4(%%esi, %[pixCanvasWidth], 4) \n\t"
// advance to next pixel // advance to next pixel
"addl $4, %%esi \n\t" "addl $4, %%esi \n\t"
"decl %%ecx \n\t" "decl %%ecx \n\t"
"jnz pixLoopEL \n\t" "jnz 1b \n\t" // pixLoopEL
// advance to next row // advance to next row
"addl %%edi, %%esi \n\t" "addl %[slWidthModulo], %%esi \n\t"
"decl %%edx \n\t" "decl %%edx \n\t"
"jz allDoneE \n\t" "jz 3f \n\t" // allDoneE
// right to left // right to left
"movl %%eax, %%ecx \n\t" "movl %[pixWidth], %%ecx \n\t"
"decl %%ecx \n\t" "decl %%ecx \n\t"
"pixLoopER: \n\t" "2: \n\t" // pixLoopER
"movd (%%esi), %%mm1 \n\t" "movd (%%esi), %%mm1 \n\t"
"punpcklbw %%mm0, %%mm1 \n\t" "punpcklbw %%mm0, %%mm1 \n\t"
"pand (" ASMSYM(mmErrDiffMask) "), %%mm1 \n\t" "pand (" ASMSYM(mmErrDiffMask) "), %%mm1 \n\t"
@ -1136,30 +1130,30 @@ allDoneE:
// spread errors // spread errors
"paddusb -4(%%esi), %%mm7 \n\t" "paddusb -4(%%esi), %%mm7 \n\t"
"paddusb -4(%%esi, %%ebx, 4), %%mm1 \n\t" "paddusb -4(%%esi, %[pixCanvasWidth], 4), %%mm1 \n\t"
"paddusb 0(%%esi, %%ebx, 4), %%mm5 \n\t" "paddusb 0(%%esi, %[pixCanvasWidth], 4), %%mm5 \n\t"
"paddusb 4(%%esi, %%ebx, 4), %%mm3 \n\t" // !!!! possible memory leak? "paddusb 4(%%esi, %[pixCanvasWidth], 4), %%mm3 \n\t" // !!!! possible memory leak?
"movd %%mm7, -4(%%esi) \n\t" "movd %%mm7, -4(%%esi) \n\t"
"movd %%mm1, -4(%%esi, %%ebx, 4) \n\t" "movd %%mm1, -4(%%esi, %[pixCanvasWidth], 4) \n\t"
"movd %%mm5, 0(%%esi, %%ebx, 4) \n\t" "movd %%mm5, 0(%%esi, %[pixCanvasWidth], 4) \n\t"
"movd %%mm3, 4(%%esi, %%ebx, 4) \n\t" "movd %%mm3, 4(%%esi, %[pixCanvasWidth], 4) \n\t"
// revert to previous pixel // revert to previous pixel
"subl $4, %%esi \n\t" "subl $4, %%esi \n\t"
"decl %%ecx \n\t" "decl %%ecx \n\t"
"jnz pixLoopER \n\t" "jnz 2b \n\t" // pixLoopER
// advance to next row // advance to next row
"leal 0(%%esi, %%ebx, 4), %%esi \n\t" "leal 0(%%esi, %[pixCanvasWidth], 4), %%esi \n\t"
"decl %%edx \n\t" "decl %%edx \n\t"
"jnz rowLoopE \n\t" "jnz 0b \n\t" // rowLoopE
"allDoneE: \n\t" "3: \n\t" // allDoneE
"popl %%ebx \n\t"
"emms \n\t" "emms \n\t"
: // no outputs. : // no outputs.
: "S" (pulDst), "c" (pixCanvasWidth), "d" (pixHeight), "a" (pixWidth), : [pulDst] "g" (pulDst), [pixCanvasWidth] "r" (pixCanvasWidth),
"D" (slWidthModulo) [pixHeight] "g" (pixHeight), [pixWidth] "g" (pixWidth),
: "cc", "memory" [slWidthModulo] "g" (slWidthModulo)
: FPU_REGS, MMX_REGS, "ecx", "edx", "esi", "cc", "memory"
); );
#else #else
@ -1271,7 +1265,7 @@ extern "C" {
} }
#if USE_PORTABLE_C #ifdef USE_PORTABLE_C
typedef SWORD ExtPix[4]; typedef SWORD ExtPix[4];
static inline void extpix_fromi64(ExtPix &pix, const __int64 i64) static inline void extpix_fromi64(ExtPix &pix, const __int64 i64)
@ -2538,7 +2532,8 @@ lowerLoop:
"popl %%ebx \n\t" "popl %%ebx \n\t"
: // no outputs. : // no outputs.
: // inputs are all globals. : // inputs are all globals.
: "eax", "ecx", "edx", "edi", "esi", "cc", "memory" : FPU_REGS, MMX_REGS, "eax", "ecx", "edx", "esi", "edi",
"cc", "memory"
); );
#else #else

View File

@ -1363,6 +1363,13 @@ pixLoop:
_pixBaseWidth_renderWater = pixBaseWidth; _pixBaseWidth_renderWater = pixBaseWidth;
__asm__ __volatile__ ( __asm__ __volatile__ (
// this sucks :(
"movl %[pixBaseHeight], %%eax \n\t"
"movl %[pswHeightMap], %%ecx \n\t"
"movl %[pulTexture], %%edx \n\t"
"movl %[pulTextureBase], %%esi \n\t"
"movl %[slHeightRowStep], %%edi \n\t"
"pushl %%ebx \n\t" // GCC needs this. "pushl %%ebx \n\t" // GCC needs this.
"movl (" ASMSYM(_pixBaseWidth_renderWater) "),%%ebx \n\t" "movl (" ASMSYM(_pixBaseWidth_renderWater) "),%%ebx \n\t"
@ -1444,9 +1451,13 @@ pixLoop:
"popl %%ebx \n\t" // restore GCC's register. "popl %%ebx \n\t" // restore GCC's register.
"emms \n\t" "emms \n\t"
: // no outputs. : // no outputs.
: "a" (pixBaseHeight), "c" (pswHeightMap), : [pixBaseHeight] "g" (pixBaseHeight),
"d" (pulTexture), "S" (pulTextureBase), "D" (slHeightRowStep) [pswHeightMap] "g" (pswHeightMap),
: "cc", "memory" [pulTexture] "g" (pulTexture),
[pulTextureBase] "g" (pulTextureBase),
[slHeightRowStep] "g" (slHeightRowStep)
: FPU_REGS, MMX_REGS, "eax", "ecx", "edx", "esi", "edi",
"cc", "memory"
); );
#else #else
@ -1617,9 +1628,7 @@ pixLoop2:
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
__asm__ __volatile__ ( __asm__ __volatile__ (
"pushl %%ebx \n\t" // GCC's register. "bsfl %[pixBaseWidth], %%eax \n\t"
"movl %%ecx, %%ebx \n\t"
"bsfl %%eax, %%eax \n\t" // pixBaseWidth
"movl $32, %%edx \n\t" "movl $32, %%edx \n\t"
"subl %%eax, %%edx \n\t" "subl %%eax, %%edx \n\t"
"movl %%edx, (" ASMSYM(mmBaseWidthShift) ") \n\t" "movl %%edx, (" ASMSYM(mmBaseWidthShift) ") \n\t"
@ -1631,11 +1640,11 @@ pixLoop2:
"pxor %%mm6, %%mm6 \n\t" // MM6 = pixV|pixU "pxor %%mm6, %%mm6 \n\t" // MM6 = pixV|pixU
// (These registers were loaded here in the original version...) "movl %[pswHeightMap], %%edx \n\t"
//"movl (pswHeightMap), %%ebx \n\t" "movl %[pulTextureBase], %%esi \n\t"
//"movl (pulTextureBase), %%esi \n\t" "movl %[pulTexture], %%edi \n\t"
//"movl (pulTexture), %%edi \n\t" "pushl %%ebx \n\t" // GCC's register.
"movl %%edx, %%ebx \n\t"
"movl (" ASMSYM(_pixBufferHeight) "), %%edx \n\t" "movl (" ASMSYM(_pixBufferHeight) "), %%edx \n\t"
"0: \n\t" // rowLoop2 "0: \n\t" // rowLoop2
@ -1753,9 +1762,12 @@ pixLoop2:
"popl %%ebx \n\t" // GCC's value. "popl %%ebx \n\t" // GCC's value.
"emms \n\t" "emms \n\t"
: // no outputs. : // no outputs.
: "a" (pixBaseWidth), "c" (pswHeightMap), : [pixBaseWidth] "g" (pixBaseWidth),
"S" (pulTextureBase), "D" (pulTexture) [pswHeightMap] "g" (pswHeightMap),
: "edx", "cc", "memory" [pulTextureBase] "g" (pulTextureBase),
[pulTexture] "g" (pulTexture)
: FPU_REGS, MMX_REGS, "eax", "ecx", "edx", "esi", "edi",
"cc", "memory"
); );
#else #else
@ -2136,9 +2148,7 @@ pixLoop4:
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
__asm__ __volatile__ ( __asm__ __volatile__ (
"pushl %%ebx \n\t" // GCC's register. "bsfl %[pixBaseWidth], %%eax \n\t"
"movl %%ecx, %%ebx \n\t"
"bsfl %%eax, %%eax \n\t"
"movl $32, %%edx \n\t" "movl $32, %%edx \n\t"
"subl %%eax, %%edx \n\t" "subl %%eax, %%edx \n\t"
"movl %%edx, (" ASMSYM(mmBaseWidthShift) ") \n\t" "movl %%edx, (" ASMSYM(mmBaseWidthShift) ") \n\t"
@ -2150,11 +2160,11 @@ pixLoop4:
"pxor %%mm6, %%mm6 \n\t" // MM6 = pixV|pixU "pxor %%mm6, %%mm6 \n\t" // MM6 = pixV|pixU
// (These registers were loaded here in the original version...) "movl %[pswHeightMap], %%edx \n\t"
//"movl (pswHeightMap), %%ebx \n\t" "movl %[pulTextureBase], %%esi \n\t"
//"movl (pulTextureBase), %%esi \n\t" "movl %[pulTexture], %%edi \n\t"
//"movl (pulTexture), %%edi \n\t" "pushl %%ebx \n\t" // GCC's register.
"movl %%edx, %%ebx \n\t"
"movl (" ASMSYM(_pixBufferHeight) "), %%edx \n\t" "movl (" ASMSYM(_pixBufferHeight) "), %%edx \n\t"
"0: \n\t" // rowLoop4 "0: \n\t" // rowLoop4
"pushl %%edx \n\t" "pushl %%edx \n\t"
@ -2485,9 +2495,12 @@ pixLoop4:
"popl %%ebx \n\t" // Restore GCC's value. "popl %%ebx \n\t" // Restore GCC's value.
"emms \n\t" "emms \n\t"
: // no outputs. : // no outputs.
: "a" (pixBaseWidth), "c" (pswHeightMap), : [pixBaseWidth] "g" (pixBaseWidth),
"S" (pulTextureBase), "D" (pulTexture) [pswHeightMap] "g" (pswHeightMap),
: "edx", "cc", "memory" [pulTextureBase] "g" (pulTextureBase),
[pulTexture] "g" (pulTexture)
: FPU_REGS, MMX_REGS, "eax", "ecx", "edx", "esi", "edi",
"cc", "memory"
); );
@ -2965,6 +2978,11 @@ pixDone:
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
__asm__ __volatile__ ( __asm__ __volatile__ (
"movl %[slColumnModulo], %%edx \n\t"
"movl %[slBufferMask], %%ecx \n\t"
"movl %[slDensity], %%eax \n\t"
"movl (" ASMSYM(ulRNDSeed) "), %%edi \n\t"
"pushl %%ebx \n\t" // GCC's register. "pushl %%ebx \n\t" // GCC's register.
"xorl %%ebx, %%ebx \n\t" "xorl %%ebx, %%ebx \n\t"
"pushl %%edx \n\t" // slColumnModulo "pushl %%edx \n\t" // slColumnModulo
@ -2977,7 +2995,7 @@ pixDone:
"1: \n\t" // rowLoopFM "1: \n\t" // rowLoopFM
"movl (" ASMSYM(_pixBufferWidth) "), %%edx \n\t" "movl (" ASMSYM(_pixBufferWidth) "), %%edx \n\t"
"addl %%esi, %%edx \n\t" "addl %[pubNew], %%edx \n\t"
"movzbl (%%ebx, %%edx), %%eax \n\t" "movzbl (%%ebx, %%edx), %%eax \n\t"
"addl (" ASMSYM(_pixBufferWidth) "), %%edx \n\t" "addl (" ASMSYM(_pixBufferWidth) "), %%edx \n\t"
"movzbl (%%ebx, %%edx), %%edx \n\t" "movzbl (%%ebx, %%edx), %%edx \n\t"
@ -2985,7 +3003,7 @@ pixDone:
"shrl $1, %%eax \n\t" "shrl $1, %%eax \n\t"
"cmpl (%%esp), %%eax \n\t" "cmpl (%%esp), %%eax \n\t"
"jg doCalc_animateFire \n\t" "jg doCalc_animateFire \n\t"
"movb $0, (%%esi, %%ebx) \n\t" "movb $0, (%[pubNew], %%ebx) \n\t"
"jmp pixDone_animateFire \n\t" "jmp pixDone_animateFire \n\t"
"doCalc_animateFire: \n\t" "doCalc_animateFire: \n\t"
@ -2996,7 +3014,7 @@ pixDone:
"movsbl " ASMSYM(asbMod3Sub1Table) "(%%edx), %%edx \n\t" "movsbl " ASMSYM(asbMod3Sub1Table) "(%%edx), %%edx \n\t"
"addl %%ebx, %%edx \n\t" "addl %%ebx, %%edx \n\t"
"andl 4(%%esp), %%edx \n\t" // slBufferMask "andl 4(%%esp), %%edx \n\t" // slBufferMask
"movb %%al, (%%esi, %%edx) \n\t" "movb %%al, (%[pubNew], %%edx) \n\t"
"imull $262147, %%edi \n\t" "imull $262147, %%edi \n\t"
"pixDone_animateFire: \n\t" "pixDone_animateFire: \n\t"
@ -3015,9 +3033,10 @@ pixDone:
"addl $12, %%esp \n\t" // lose our locals. "addl $12, %%esp \n\t" // lose our locals.
"popl %%ebx \n\t" // Restore GCC's var. "popl %%ebx \n\t" // Restore GCC's var.
: // no outputs. : // no outputs.
: "a" (slDensity), "c" (slBufferMask), : [slBufferMask] "g" (slBufferMask),
"d" (slColumnModulo), "D" (ulRNDSeed), "S" (pubNew) [slColumnModulo] "g" (slColumnModulo),
: "cc", "memory" [pubNew] "r" (pubNew), [slDensity] "g" (slDensity)
: "eax", "ecx", "edx", "edi", "cc", "memory"
); );
#else #else
@ -3103,6 +3122,12 @@ pixLoopF:
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
_pubHeat_RenderPlasmaFire = pubHeat; // ran out of registers. :/ _pubHeat_RenderPlasmaFire = pubHeat; // ran out of registers. :/
__asm__ __volatile__ ( __asm__ __volatile__ (
"movl %[slHeatRowStep], %%eax \n\t"
"movl %[slHeatMapStep], %%edx \n\t"
"movl %[slBaseMipShift], %%ecx \n\t"
"movl %[pulTextureBase], %%esi \n\t"
"movl %[pulTexture], %%edi \n\t"
"pushl %%ebx \n\t" "pushl %%ebx \n\t"
"movl (" ASMSYM(_pubHeat_RenderPlasmaFire) "),%%ebx \n\t" "movl (" ASMSYM(_pubHeat_RenderPlasmaFire) "),%%ebx \n\t"
"pushl %%eax \n\t" // slHeatRowStep "pushl %%eax \n\t" // slHeatRowStep
@ -3131,9 +3156,12 @@ pixLoopF:
"addl $12, %%esp \n\t" // lose our locals. "addl $12, %%esp \n\t" // lose our locals.
"popl %%ebx \n\t" // restore GCC's register. "popl %%ebx \n\t" // restore GCC's register.
: // no outputs. : // no outputs.
: "S" (pulTextureBase), "D" (pulTexture), : [pulTextureBase] "g" (pulTextureBase),
"c" (slBaseMipShift), "a" (slHeatRowStep), "d" (slHeatMapStep) [pulTexture] "g" (pulTexture),
: "cc", "memory" [slBaseMipShift] "g" (slBaseMipShift),
[slHeatRowStep] "g" (slHeatRowStep),
[slHeatMapStep] "g" (slHeatMapStep)
: "eax", "ecx", "edx", "esi", "edi", "cc", "memory"
); );
#else #else

View File

@ -365,9 +365,9 @@ skipPixel:
} }
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
ULONG tmp1, tmp2;
__asm__ __volatile__ ( __asm__ __volatile__ (
// prepare interpolants // prepare interpolants
"pushl %%ebx \n\t"
"movd (" ASMSYM(_slL2Row) "), %%mm0 \n\t" "movd (" ASMSYM(_slL2Row) "), %%mm0 \n\t"
"movd (" ASMSYM(_slDL2oDURow) "), %%mm1 \n\t" "movd (" ASMSYM(_slDL2oDURow) "), %%mm1 \n\t"
"psllq $32, %%mm1 \n\t" "psllq $32, %%mm1 \n\t"
@ -378,26 +378,25 @@ skipPixel:
"por %%mm0, %%mm2 \n\t" // MM2 = slDDL2oDUoDV | slDL2oDV "por %%mm0, %%mm2 \n\t" // MM2 = slDDL2oDUoDV | slDL2oDV
// prepare color // prepare color
"pxor %%mm0, %%mm0 \n\t" "pxor %%mm0, %%mm0 \n\t"
"movd %%eax, %%mm7 \n\t" "movd %[ulLightRGB], %%mm7 \n\t"
"punpcklbw %%mm0, %%mm7 \n\t" "punpcklbw %%mm0, %%mm7 \n\t"
"psllw $1, %%mm7 \n\t" "psllw $1, %%mm7 \n\t"
// loop thru rows // loop thru rows
"movl (" ASMSYM(_pulLayer) "), %%edi \n\t" "movl (" ASMSYM(_pulLayer) "), %%edi \n\t"
"movl (" ASMSYM(_iRowCt) "), %%ebx \n\t" "movl (" ASMSYM(_iRowCt) "), %[xbx] \n\t"
"0: \n\t" // rowLoop "0: \n\t" // rowLoop
"pushl %%ebx \n\t" "movd %%mm1, %[slL2Point] \n\t"
"movd %%mm1, %%ebx \n\t" // EBX = slL2Point
"movq %%mm1, %%mm3 \n\t" "movq %%mm1, %%mm3 \n\t"
"psrlq $32, %%mm3 \n\t" // MM3 = 0 | slDL2oDU "psrlq $32, %%mm3 \n\t" // MM3 = 0 | slDL2oDU
// loop thru pixels in current row // loop thru pixels in current row
"movl (" ASMSYM(_iPixCt) "), %%ecx \n\t" "movl (" ASMSYM(_iPixCt) "), %%ecx \n\t"
"1: \n\t" // pixLoop "1: \n\t" // pixLoop
// check if pixel need to be drawn // check if pixel need to be drawn
"cmpl $0x10000000, %%ebx \n\t" "cmpl $0x10000000, %[slL2Point] \n\t"
"jge 3f \n\t" // skipPixel "jge 3f \n\t" // skipPixel
// calculate intensities and do actual drawing of shadow pixel ARGB // calculate intensities and do actual drawing of shadow pixel ARGB
"movd %%ecx, %%mm4 \n\t" "movd %%ecx, %%mm4 \n\t"
"movl %%ebx, %%eax \n\t" "movl %[slL2Point], %%eax \n\t"
"sarl $15, %%eax \n\t" "sarl $15, %%eax \n\t"
"andl $8191, %%eax \n\t" "andl $8191, %%eax \n\t"
"movzbl " ASMSYM(aubSqrt) "(%%eax), %%eax \n\t" "movzbl " ASMSYM(aubSqrt) "(%%eax), %%eax \n\t"
@ -424,22 +423,20 @@ skipPixel:
// advance to next pixel // advance to next pixel
"addl $4, %%edi \n\t" "addl $4, %%edi \n\t"
"movd %%mm3, %%eax \n\t" "movd %%mm3, %%eax \n\t"
"addl %%eax, %%ebx \n\t" "addl %%eax, %[slL2Point] \n\t"
"paddd (" ASMSYM(mmDDL2oDU_AddAmbientPoint) "), %%mm3 \n\t" "paddd (" ASMSYM(mmDDL2oDU_AddAmbientPoint) "), %%mm3 \n\t"
"decl %%ecx \n\t" "decl %%ecx \n\t"
"jnz 1b \n\t" // pixLoop "jnz 1b \n\t" // pixLoop
// advance to the next row // advance to the next row
"popl %%ebx \n\t"
"addl (" ASMSYM(_slModulo) "), %%edi \n\t" "addl (" ASMSYM(_slModulo) "), %%edi \n\t"
"paddd %%mm2, %%mm1 \n\t" "paddd %%mm2, %%mm1 \n\t"
"paddd (" ASMSYM(mmDDL2oDV_AddAmbientPoint) "), %%mm2 \n\t" "paddd (" ASMSYM(mmDDL2oDV_AddAmbientPoint) "), %%mm2 \n\t"
"decl %%ebx \n\t" "decl %[xbx] \n\t"
"jnz 0b \n\t" // rowLoop "jnz 0b \n\t" // rowLoop
"popl %%ebx \n\t"
"emms \n\t" "emms \n\t"
: // no outputs. : [xbx] "=&r" (tmp1), [slL2Point] "=&g" (tmp2)
: "a" (ulLightRGB) : [ulLightRGB] "g" (ulLightRGB)
: "ecx", "edx", "edi", "esi", "cc", "memory" : FPU_REGS, MMX_REGS, "eax", "ecx", "edi", "cc", "memory"
); );
#else #else
@ -580,10 +577,9 @@ skipPixel:
} }
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
ULONG tmp1, tmp2;
__asm__ __volatile__ ( __asm__ __volatile__ (
// prepare interpolants // prepare interpolants
"pushl %%ebx \n\t"
"movl %%ecx, %%ebx \n\t"
"movd (" ASMSYM(_slL2Row) "), %%mm0 \n\t" "movd (" ASMSYM(_slL2Row) "), %%mm0 \n\t"
"movd (" ASMSYM(_slDL2oDURow) "), %%mm1 \n\t" "movd (" ASMSYM(_slDL2oDURow) "), %%mm1 \n\t"
"psllq $32, %%mm1 \n\t" "psllq $32, %%mm1 \n\t"
@ -594,29 +590,30 @@ skipPixel:
"por %%mm0, %%mm2 \n\t" // MM2 = slDDL2oDUoDV | slDL2oDV "por %%mm0, %%mm2 \n\t" // MM2 = slDDL2oDUoDV | slDL2oDV
// prepare color // prepare color
"pxor %%mm0, %%mm0 \n\t" // MM0 = 0 | 0 (for unpacking purposes) "pxor %%mm0, %%mm0 \n\t" // MM0 = 0 | 0 (for unpacking purposes)
"movd %%eax, %%mm7 \n\t" // eax == ulLightRGB "movd %[ulLightRGB], %%mm7 \n\t"
"punpcklbw %%mm0, %%mm7 \n\t" "punpcklbw %%mm0, %%mm7 \n\t"
"psllw $1, %%mm7 \n\t" "psllw $1, %%mm7 \n\t"
// loop thru rows // loop thru rows
"movl %[pubMask], %%esi \n\t"
"movl (" ASMSYM(_pulLayer) "), %%edi \n\t" "movl (" ASMSYM(_pulLayer) "), %%edi \n\t"
"movzbl (%%ebx), %%edx \n\t" // ebx == &ubMask "movzbl %[ubMask], %%edx \n\t"
"movl (" ASMSYM(_iRowCt) "), %%ebx \n\t" "movl (" ASMSYM(_iRowCt) "), %%eax \n\t"
"movl %%eax, %[xbx] \n\t"
"0: \n\t" // rowLoop "0: \n\t" // rowLoop
"pushl %%ebx \n\t" "movd %%mm1, %[slL2Point] \n\t"
"movd %%mm1, %%ebx \n\t" // EBX = slL2Point
"movq %%mm1, %%mm3 \n\t" "movq %%mm1, %%mm3 \n\t"
"psrlq $32, %%mm3 \n\t" // MM3 = 0 | slDL2oDU "psrlq $32, %%mm3 \n\t" // MM3 = 0 | slDL2oDU
// loop thru pixels in current row // loop thru pixels in current row
"movl (" ASMSYM(_iPixCt) "), %%ecx \n\t" "movl (" ASMSYM(_iPixCt) "), %%ecx \n\t"
"1: \n\t" // pixLoop "1: \n\t" // pixLoop
// check if pixel need to be drawn; i.e. draw if( [esi] & ubMask && (slL2Point<FTOX)) // check if pixel need to be drawn; i.e. draw if( [esi] & ubMask && (slL2Point<FTOX))
"cmpl $0x10000000, %%ebx \n\t" "cmpl $0x10000000, %[slL2Point] \n\t"
"jge 3f \n\t" // skipPixel "jge 3f \n\t" // skipPixel
"testb (%%esi), %%dl \n\t" "testb (%%esi), %%dl \n\t"
"je 3f \n\t" // skipPixel "je 3f \n\t" // skipPixel
// calculate intensities and do actual drawing of shadow pixel ARGB // calculate intensities and do actual drawing of shadow pixel ARGB
"movd %%ecx, %%mm4 \n\t" "movd %%ecx, %%mm4 \n\t"
"movl %%ebx, %%eax \n\t" "movl %[slL2Point], %%eax \n\t"
"sarl $15, %%eax \n\t" "sarl $15, %%eax \n\t"
"andl $8191, %%eax \n\t" "andl $8191, %%eax \n\t"
"movzbl " ASMSYM(aubSqrt) "(%%eax), %%eax \n\t" "movzbl " ASMSYM(aubSqrt) "(%%eax), %%eax \n\t"
@ -643,24 +640,24 @@ skipPixel:
// advance to next pixel // advance to next pixel
"addl $4, %%edi \n\t" "addl $4, %%edi \n\t"
"movd %%mm3, %%eax \n\t" "movd %%mm3, %%eax \n\t"
"addl %%eax, %%ebx \n\t" "addl %%eax, %[slL2Point] \n\t"
"paddd (" ASMSYM(mmDDL2oDU_addAmbientMaskPoint) "), %%mm3 \n\t" "paddd (" ASMSYM(mmDDL2oDU_addAmbientMaskPoint) "), %%mm3 \n\t"
"rolb $1, %%dl \n\t" "rolb $1, %%dl \n\t"
"adcl $0, %%esi \n\t" "adcl $0, %%esi \n\t"
"decl %%ecx \n\t" "decl %%ecx \n\t"
"jnz 1b \n\t" // pixLoop "jnz 1b \n\t" // pixLoop
// advance to the next row // advance to the next row
"popl %%ebx \n\t"
"addl (" ASMSYM(_slModulo) "), %%edi \n\t" "addl (" ASMSYM(_slModulo) "), %%edi \n\t"
"paddd %%mm2, %%mm1 \n\t" "paddd %%mm2, %%mm1 \n\t"
"paddd (" ASMSYM(mmDDL2oDV_addAmbientMaskPoint) "), %%mm2 \n\t" "paddd (" ASMSYM(mmDDL2oDV_addAmbientMaskPoint) "), %%mm2 \n\t"
"decl %%ebx \n\t" "decl %[xbx] \n\t"
"jnz 0b \n\t" // rowLoop "jnz 0b \n\t" // rowLoop
"popl %%ebx \n\t"
"emms \n\t" "emms \n\t"
: // no outputs. : [xbx] "=&g" (tmp1), [slL2Point] "=&g" (tmp2)
: "a" (ulLightRGB), "S" (pubMask), "c" (&ubMask) : [ulLightRGB] "g" (ulLightRGB), [pubMask] "g" (pubMask),
: "edx", "edi", "cc", "memory" [ubMask] "m" (ubMask)
: FPU_REGS, MMX_REGS, "eax", "ecx", "edx", "esi", "edi",
"cc", "memory"
); );
#else #else
@ -800,10 +797,8 @@ skipPixel:
} }
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
ULONG tmp1, tmp2;
__asm__ __volatile__ ( __asm__ __volatile__ (
"pushl %%ebx \n\t"
"movl %%ecx, %%ebx \n\t"
"pushl %%ebx \n\t"
// prepare interpolants // prepare interpolants
"movd (" ASMSYM(_slL2Row) "), %%mm0 \n\t" "movd (" ASMSYM(_slL2Row) "), %%mm0 \n\t"
"movd (" ASMSYM(_slDL2oDURow) "), %%mm1 \n\t" "movd (" ASMSYM(_slDL2oDURow) "), %%mm1 \n\t"
@ -816,31 +811,30 @@ skipPixel:
// prepare color // prepare color
"pxor %%mm0, %%mm0 \n\t" "pxor %%mm0, %%mm0 \n\t"
"movd %%eax, %%mm7 \n\t" "movd %[ulLightRGB], %%mm7 \n\t"
"punpcklbw %%mm0, %%mm7 \n\t" "punpcklbw %%mm0, %%mm7 \n\t"
"psllw $1, %%mm7 \n\t" "psllw $1, %%mm7 \n\t"
// loop thru rows // loop thru rows
"movl (" ASMSYM(_pulLayer) "), %%edi \n\t" "movl (" ASMSYM(_pulLayer) "), %%edi \n\t"
"movl (" ASMSYM(_iRowCt) "), %%ebx \n\t" "movl (" ASMSYM(_iRowCt) "), %[xbx] \n\t"
"0: \n\t" // rowLoop "0: \n\t" // rowLoop
"pushl %%ebx \n\t" "movd %%mm1, %[slL2Point] \n\t"
"movd %%mm1, %%ebx \n\t" // EBX = slL2Point
"movq %%mm1, %%mm3 \n\t" "movq %%mm1, %%mm3 \n\t"
"psrlq $32, %%mm3 \n\t" // MM3 = 0 | slDL2oDU "psrlq $32, %%mm3 \n\t" // MM3 = 0 | slDL2oDU
// loop thru pixels in current row // loop thru pixels in current row
"movl (" ASMSYM(_iPixCt) "), %%ecx \n\t" "movl (" ASMSYM(_iPixCt) "), %%ecx \n\t"
"1: \n\t" // pixLoop "1: \n\t" // pixLoop
// check if pixel need to be drawn // check if pixel need to be drawn
"cmpl $0x10000000, %%ebx \n\t" "cmpl $0x10000000, %[slL2Point] \n\t"
"jge 3f \n\t" // skipPixel "jge 3f \n\t" // skipPixel
// calculate intensities and do actual drawing of shadow pixel ARGB // calculate intensities and do actual drawing of shadow pixel ARGB
"movd %%ecx, %%mm4 \n\t" "movd %%ecx, %%mm4 \n\t"
"movl %%ebx, %%eax \n\t" "movl %[slL2Point], %%eax \n\t"
"sarl $15, %%eax \n\t" "sarl $15, %%eax \n\t"
"andl $8191, %%eax \n\t" "andl $8191, %%eax \n\t"
"movzwl " ASMSYM(auw1oSqrt) "(, %%eax, 2), %%eax \n\t" "movzwl " ASMSYM(auw1oSqrt) "(, %%eax, 2), %%eax \n\t"
"movl (" ASMSYM(_slLightMax) "), %%ecx \n\t" "movl (" ASMSYM(_slLightMax) "), %%ecx \n\t"
"cmpl 4(%%esp), %%eax \n\t" "cmpl %[slMax1oL], %%eax \n\t"
"jge 2f \n\t" // skipInterpolation "jge 2f \n\t" // skipInterpolation
"leal -256(%%eax), %%ecx \n\t" "leal -256(%%eax), %%ecx \n\t"
"imull (" ASMSYM(_slLightStep) "), %%ecx \n\t" "imull (" ASMSYM(_slLightStep) "), %%ecx \n\t"
@ -861,23 +855,20 @@ skipPixel:
// advance to next pixel // advance to next pixel
"addl $4, %%edi \n\t" "addl $4, %%edi \n\t"
"movd %%mm3, %%eax \n\t" "movd %%mm3, %%eax \n\t"
"addl %%eax, %%ebx \n\t" "addl %%eax, %[slL2Point] \n\t"
"paddd (" ASMSYM(mmDDL2oDU_AddDiffusionPoint) "), %%mm3 \n\t" "paddd (" ASMSYM(mmDDL2oDU_AddDiffusionPoint) "), %%mm3 \n\t"
"decl %%ecx \n\t" "decl %%ecx \n\t"
"jnz 1b \n\t" // pixLoop "jnz 1b \n\t" // pixLoop
// advance to the next row // advance to the next row
"popl %%ebx \n\t"
"addl (" ASMSYM(_slModulo) "), %%edi \n\t" "addl (" ASMSYM(_slModulo) "), %%edi \n\t"
"paddd %%mm2, %%mm1 \n\t" "paddd %%mm2, %%mm1 \n\t"
"paddd (" ASMSYM(mmDDL2oDV_AddDiffusionPoint) "), %%mm2 \n\t" "paddd (" ASMSYM(mmDDL2oDV_AddDiffusionPoint) "), %%mm2 \n\t"
"decl %%ebx \n\t" "decl %[xbx] \n\t"
"jnz 0b \n\t" // rowLoop "jnz 0b \n\t" // rowLoop
"addl $4, %%esp \n\t"
"popl %%ebx \n\t"
"emms \n\t" "emms \n\t"
: // no outputs. : [xbx] "=&r" (tmp1), [slL2Point] "=&g" (tmp2)
: "a" (ulLightRGB), "c" (slMax1oL) : [ulLightRGB] "g" (ulLightRGB), [slMax1oL] "g" (slMax1oL)
: "edx", "edi", "esi", "cc", "memory" : FPU_REGS, MMX_REGS, "eax", "ecx", "edi", "cc", "memory"
); );
#else #else
@ -1018,9 +1009,9 @@ skipPixel:
} }
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
ULONG tmp1, tmp2;
__asm__ __volatile__ ( __asm__ __volatile__ (
// prepare interpolants // prepare interpolants
"pushl %%edx \n\t" // slMax1oL
"movd (" ASMSYM(_slL2Row) "), %%mm0 \n\t" "movd (" ASMSYM(_slL2Row) "), %%mm0 \n\t"
"movd (" ASMSYM(_slDL2oDURow) "), %%mm1 \n\t" "movd (" ASMSYM(_slDL2oDURow) "), %%mm1 \n\t"
"psllq $32, %%mm1 \n\t" "psllq $32, %%mm1 \n\t"
@ -1031,34 +1022,35 @@ skipPixel:
"por %%mm0, %%mm2 \n\t" // MM2 = slDDL2oDUoDV | slDL2oDV "por %%mm0, %%mm2 \n\t" // MM2 = slDDL2oDUoDV | slDL2oDV
// prepare color // prepare color
"pxor %%mm0, %%mm0 \n\t" // MM0 = 0 | 0 (for unpacking purposes) "pxor %%mm0, %%mm0 \n\t" // MM0 = 0 | 0 (for unpacking purposes)
"movd %%eax, %%mm7 \n\t" // eax == ulLightRGB "movd %[ulLightRGB], %%mm7 \n\t"
"punpcklbw %%mm0, %%mm7 \n\t" "punpcklbw %%mm0, %%mm7 \n\t"
"psllw $1, %%mm7 \n\t" "psllw $1, %%mm7 \n\t"
// loop thru rows // loop thru rows
"movl %[pubMask], %%esi \n\t"
"movl (" ASMSYM(_pulLayer) "), %%edi \n\t" "movl (" ASMSYM(_pulLayer) "), %%edi \n\t"
"movzbl (%%ecx), %%edx \n\t" // ecx == &ubMask "movzbl %[ubMask], %%edx \n\t"
"movl (" ASMSYM(_iRowCt) "), %%ebx \n\t" "movl (" ASMSYM(_iRowCt) "), %%eax \n\t"
"movl %%eax, %[xbx] \n\t"
"0: \n\t" // rowLoop "0: \n\t" // rowLoop
"pushl %%ebx \n\t" "movd %%mm1, %[slL2Point] \n\t"
"movd %%mm1, %%ebx \n\t" // EBX = slL2Point
"movq %%mm1, %%mm3 \n\t" "movq %%mm1, %%mm3 \n\t"
"psrlq $32, %%mm3 \n\t" // MM3 = 0 | slDL2oDU "psrlq $32, %%mm3 \n\t" // MM3 = 0 | slDL2oDU
// loop thru pixels in current row // loop thru pixels in current row
"movl (" ASMSYM(_iPixCt) "), %%ecx \n\t" "movl (" ASMSYM(_iPixCt) "), %%ecx \n\t"
"1: \n\t" // pixLoop "1: \n\t" // pixLoop
// check if pixel need to be drawn; i.e. draw if( [esi] & ubMask && (slL2Point<FTOX)) // check if pixel need to be drawn; i.e. draw if( [esi] & ubMask && (slL2Point<FTOX))
"cmpl $0x10000000, %%ebx \n\t" "cmpl $0x10000000, %[slL2Point] \n\t"
"jge 3f \n\t" // skipPixel "jge 3f \n\t" // skipPixel
"testb (%%esi), %%dl \n\t" "testb (%%esi), %%dl \n\t"
"je 3f \n\t" // skipPixel "je 3f \n\t" // skipPixel
// calculate intensities and do actual drawing of shadow pixel ARGB // calculate intensities and do actual drawing of shadow pixel ARGB
"movd %%ecx, %%mm4 \n\t" "movd %%ecx, %%mm4 \n\t"
"movl %%ebx, %%eax \n\t" "movl %[slL2Point], %%eax \n\t"
"sarl $15, %%eax \n\t" "sarl $15, %%eax \n\t"
"andl $8191, %%eax \n\t" "andl $8191, %%eax \n\t"
"movzwl " ASMSYM(auw1oSqrt) "(, %%eax, 2), %%eax \n\t" "movzwl " ASMSYM(auw1oSqrt) "(, %%eax, 2), %%eax \n\t"
"movl (" ASMSYM(_slLightMax) "), %%ecx \n\t" "movl (" ASMSYM(_slLightMax) "), %%ecx \n\t"
"cmpl 4(%%esp), %%eax \n\t" // slMax1oL "cmpl %[slMax1oL], %%eax \n\t"
"jge 2f \n\t" // skipInterpolation "jge 2f \n\t" // skipInterpolation
"leal -256(%%eax), %%ecx \n\t" "leal -256(%%eax), %%ecx \n\t"
"imull (" ASMSYM(_slLightStep) "), %%ecx \n\t" "imull (" ASMSYM(_slLightStep) "), %%ecx \n\t"
@ -1079,24 +1071,24 @@ skipPixel:
// advance to next pixel // advance to next pixel
"addl $4, %%edi \n\t" "addl $4, %%edi \n\t"
"movd %%mm3, %%eax \n\t" "movd %%mm3, %%eax \n\t"
"addl %%eax, %%ebx \n\t" "addl %%eax, %[slL2Point] \n\t"
"paddd (" ASMSYM(mmDDL2oDU_AddDiffusionMaskPoint) "), %%mm3 \n\t" "paddd (" ASMSYM(mmDDL2oDU_AddDiffusionMaskPoint) "), %%mm3 \n\t"
"rolb $1, %%dl \n\t" "rolb $1, %%dl \n\t"
"adcl $0, %%esi \n\t" "adcl $0, %%esi \n\t"
"decl %%ecx \n\t" "decl %%ecx \n\t"
"jnz 1b \n\t" // pixLoop "jnz 1b \n\t" // pixLoop
// advance to the next row // advance to the next row
"popl %%ebx \n\t"
"addl (" ASMSYM(_slModulo) "), %%edi \n\t" "addl (" ASMSYM(_slModulo) "), %%edi \n\t"
"paddd %%mm2, %%mm1 \n\t" "paddd %%mm2, %%mm1 \n\t"
"paddd (" ASMSYM(mmDDL2oDV_AddDiffusionMaskPoint) "), %%mm2 \n\t" "paddd (" ASMSYM(mmDDL2oDV_AddDiffusionMaskPoint) "), %%mm2 \n\t"
"decl %%ebx \n\t" "decl %[xbx] \n\t"
"jnz 0b \n\t" // rowLoop "jnz 0b \n\t" // rowLoop
"addl $4, %%esp \n\t" // ditch our temporaries.
"emms \n\t" "emms \n\t"
: // no outputs. : [xbx] "=&g" (tmp1), [slL2Point] "=&g" (tmp2)
: "a" (ulLightRGB), "S" (pubMask), "c" (&ubMask), "d" (slMax1oL) : [ulLightRGB] "g" (ulLightRGB), [pubMask] "g" (pubMask),
: "cc", "memory" [ubMask] "m" (ubMask), [slMax1oL] "g" (slMax1oL)
: FPU_REGS, MMX_REGS, "eax", "ecx", "edx", "esi", "edi",
"cc", "memory"
); );
#else #else
@ -1574,16 +1566,16 @@ rowNext:
} }
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
ULONG tmp;
__asm__ __volatile__ ( __asm__ __volatile__ (
// prepare pointers and variables // prepare pointers and variables
"pushl %%ebx \n\t" "movl (" ASMSYM(_pulLayer) "), %%edi \n\t"
"movl %%ecx,%%ebx \n\t" "movl (" ASMSYM(_iRowCt) "), %[xbx] \n\t"
"movd %[ulLight], %%mm6 \n\t"
"movd (%%eax), %%mm6 \n\t"
"punpckldq %%mm6, %%mm6 \n\t" "punpckldq %%mm6, %%mm6 \n\t"
"0: \n\t" // rowLoop "0: \n\t" // rowLoop
"movl %%edx, %%ecx \n\t" "movl (" ASMSYM(_iPixCt) "), %%ecx \n\t"
"shrl $1, %%ecx \n\t" "shrl $1, %%ecx \n\t"
"jz 2f \n\t" // pixRest "jz 2f \n\t" // pixRest
@ -1598,7 +1590,7 @@ rowNext:
"decl %%ecx \n\t" "decl %%ecx \n\t"
"jnz 1b \n\t" // pixLoop "jnz 1b \n\t" // pixLoop
"2: \n\t" // pixRest "2: \n\t" // pixRest
"testl $1, %%edx \n\t" "testl $1, (" ASMSYM(_iPixCt) ") \n\t"
"jz 3f \n\t" // rowNext "jz 3f \n\t" // rowNext
"movd (%%edi), %%mm5 \n\t" "movd (%%edi), %%mm5 \n\t"
"paddusb %%mm6, %%mm5 \n\t" "paddusb %%mm6, %%mm5 \n\t"
@ -1607,15 +1599,13 @@ rowNext:
"3: \n\t" // rowNext "3: \n\t" // rowNext
// advance to the next row // advance to the next row
"addl %%esi, %%edi \n\t" "addl (" ASMSYM(_slModulo) "), %%edi \n\t"
"decl %%ebx \n\t" "decl %[xbx] \n\t"
"jnz 0b \n\t" // rowLoop "jnz 0b \n\t" // rowLoop
"popl %%ebx \n\t"
"emms \n\t" "emms \n\t"
: // no outputs. : [xbx] "=&r" (tmp)
: "S" (_slModulo), "D" (_pulLayer), "a" (&ulLight), "c" (_iRowCt), : [ulLight] "g" (ulLight)
"d" (_iPixCt) : FPU_REGS, "mm5", "mm6", "ecx", "edi", "cc", "memory"
: "cc", "memory"
); );
#else #else
@ -1676,16 +1666,17 @@ skipLight:
} }
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
ULONG tmp;
__asm__ __volatile__ ( __asm__ __volatile__ (
// prepare pointers and variables // prepare pointers and variables
"pushl %%ebx \n\t" // save GCC's register. "movzbl %[ubMask], %%edx \n\t"
"movl (" ASMSYM(_iRowCt) "), %%ebx \n\t" "movl %[pubMask], %%esi \n\t"
"pushl %%ecx \n\t" "movl (" ASMSYM(_pulLayer) "), %%edi \n\t"
"movzbl (%%edx), %%edx \n\t" "movl (" ASMSYM(_iRowCt) "), %[xbx]\n\t"
"movd (%%eax), %%mm6 \n\t" "movd %[ulLight], %%mm6 \n\t"
"0: \n\t" // rowLoop "0: \n\t" // rowLoop
"movl (%%esp), %%ecx \n\t" "movl (" ASMSYM(_iPixCt) "), %%ecx \n\t"
"1: \n\t" // pixLoop "1: \n\t" // pixLoop
// mix underlaying pixels with the constant light color if not shaded // mix underlaying pixels with the constant light color if not shaded
@ -1705,15 +1696,14 @@ skipLight:
// advance to the next row // advance to the next row
"addl (" ASMSYM(_slModulo) "), %%edi \n\t" "addl (" ASMSYM(_slModulo) "), %%edi \n\t"
"decl %%ebx \n\t" "decl %[xbx] \n\t"
"jnz 0b \n\t" // rowLoop "jnz 0b \n\t" // rowLoop
"emms \n\t" "emms \n\t"
"popl %%ebx \n\t" // lose _iPixCt we pushed. : [xbx] "=&r" (tmp)
"popl %%ebx \n\t" // restore GCC's register. : [ubMask] "m" (ubMask), [pubMask] "g" (pubMask),
: [ulLight] "g" (ulLight)
: "d" (&ubMask), "S" (pubMask), "D" (_pulLayer), : FPU_REGS, "mm5", "mm6", "ecx", "edx", "esi", "edi",
"a" (&ulLight), "c" (_iPixCt) "cc", "memory"
: "cc", "memory"
); );
#else #else
@ -1873,13 +1863,14 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap)
} }
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
ULONG clob1, clob2, clob3;
__asm__ __volatile__ ( __asm__ __volatile__ (
"cld \n\t" "cld \n\t"
"imull %%esi, %%ecx \n\t" "imull %%esi, %%ecx \n\t"
"bswapl %%eax \n\t" "bswapl %%eax \n\t"
"rep \n\t" "rep \n\t"
"stosl \n\t" "stosl \n\t"
: // no outputs. : "=a" (clob1), "=c" (clob2), "=D" (clob3)
: "c" (this->lm_pixCanvasSizeU), "S" (this->lm_pixCanvasSizeV), : "c" (this->lm_pixCanvasSizeU), "S" (this->lm_pixCanvasSizeV),
"a" (colAmbient), "D" (this->lm_pulShadowMap) "a" (colAmbient), "D" (this->lm_pulShadowMap)
: "cc", "memory" : "cc", "memory"
@ -1977,12 +1968,13 @@ __forceinline void CLayerMixer::CopyShadowLayer(void)
rep movsd rep movsd
} }
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
ULONG clob1, clob2, clob3;
__asm__ __volatile__ ( __asm__ __volatile__ (
"cld \n\t" "cld \n\t"
"imull %%eax, %%ecx \n\t" "imull %%eax, %%ecx \n\t"
"rep \n\t" "rep \n\t"
"movsl \n\t" "movsl \n\t"
: // no outputs. : "=c" (clob1), "=S" (clob2), "=D" (clob3)
: "c" (this->lm_pixCanvasSizeU), "a" (this->lm_pixCanvasSizeV), : "c" (this->lm_pixCanvasSizeU), "a" (this->lm_pixCanvasSizeV),
"S" (this->lm_pulStaticShadowMap), "D" (this->lm_pulShadowMap) "S" (this->lm_pulStaticShadowMap), "D" (this->lm_pulShadowMap)
: "cc", "memory" : "cc", "memory"
@ -2015,13 +2007,14 @@ __forceinline void CLayerMixer::FillShadowLayer( COLOR col)
} }
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
ULONG clob1, clob2, clob3;
__asm__ __volatile__ ( __asm__ __volatile__ (
"cld \n\t" "cld \n\t"
"imull %%edx, %%ecx \n\t" "imull %%edx, %%ecx \n\t"
"bswapl %%eax \n\t" // convert to R,G,B,A memory format! "bswapl %%eax \n\t" // convert to R,G,B,A memory format!
"rep \n\t" "rep \n\t"
"stosl \n\t" "stosl \n\t"
: // no outputs. : "=a" (clob1), "=c" (clob2), "=D" (clob3)
: "c" (this->lm_pixCanvasSizeU), "d" (this->lm_pixCanvasSizeV), : "c" (this->lm_pixCanvasSizeU), "d" (this->lm_pixCanvasSizeV),
"a" (col), "D" (this->lm_pulShadowMap) "a" (col), "D" (this->lm_pulShadowMap)
: "cc", "memory" : "cc", "memory"

View File

@ -125,6 +125,7 @@ static inline PIX PIXCoord(FLOAT f) // (f+0.9999f) or (ceil(f))
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
PIX pixRet; PIX pixRet;
SLONG clobber;
__asm__ __volatile__ ( __asm__ __volatile__ (
"flds (%%eax) \n\t" "flds (%%eax) \n\t"
"fistl (%%edx) \n\t" "fistl (%%edx) \n\t"
@ -134,7 +135,7 @@ static inline PIX PIXCoord(FLOAT f) // (f+0.9999f) or (ceil(f))
"movl (%%ecx), %%edx \n\t" "movl (%%ecx), %%edx \n\t"
"addl $0x7FFFFFFF, %%edx \n\t" "addl $0x7FFFFFFF, %%edx \n\t"
"adcl $0, %%eax \n\t" "adcl $0, %%eax \n\t"
: "=a" (pixRet) : "=a" (pixRet), "=d" (clobber)
: "a" (&f), "d" (&slTmp), "c" (&fDiff) : "a" (&f), "d" (&slTmp), "c" (&fDiff)
: "cc", "memory" : "cc", "memory"
); );

View File

@ -96,11 +96,12 @@ void ResetMixer( const SLONG *pslBuffer, const SLONG slBufferSize)
} }
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
// !!! FIXME : rcg12172001 Is this REALLY any faster than memset()? // !!! FIXME : rcg12172001 Is this REALLY any faster than memset()?
ULONG clob1, clob2;
__asm__ __volatile__ ( __asm__ __volatile__ (
"cld \n\t" "cld \n\t"
"rep \n\t" "rep \n\t"
"stosl \n\t" "stosl \n\t"
: // no outputs. : "=D" (clob1), "=c" (clob2)
: "a" (0), "D" (pvMixerBuffer), "c" (slMixerBufferSize*2) : "a" (0), "D" (pvMixerBuffer), "c" (slMixerBufferSize*2)
: "cc", "memory" : "cc", "memory"
); );
@ -132,11 +133,12 @@ void CopyMixerBuffer_stereo( const SLONG slSrcOffset, void *pDstBuffer, const SL
} }
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
// !!! FIXME : rcg12172001 Is this REALLY any faster than memcpy()? // !!! FIXME : rcg12172001 Is this REALLY any faster than memcpy()?
ULONG clob1, clob2, clob3;
__asm__ __volatile__ ( __asm__ __volatile__ (
"cld \n\t" "cld \n\t"
"rep \n\t" "rep \n\t"
"movsl \n\t" "movsl \n\t"
: // no outputs. : "=S" (clob1), "=D" (clob2), "=c" (clob3)
: "S" (((char *)pvMixerBuffer) + slSrcOffset), : "S" (((char *)pvMixerBuffer) + slSrcOffset),
"D" (pDstBuffer), "D" (pDstBuffer),
"c" (slBytes >> 2) "c" (slBytes >> 2)
@ -184,6 +186,9 @@ copyLoop:
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
__asm__ __volatile__ ( __asm__ __volatile__ (
"movl %[pvMixerBuffer], %%esi \n\t"
"movl %[pDstBuffer], %%edi \n\t"
"movl %[slDW], %%ecx \n\t"
"0: \n\t" // copyLoop "0: \n\t" // copyLoop
"movzwl (%%esi), %%eax \n\t" "movzwl (%%esi), %%eax \n\t"
"movw %%ax, (%%edi) \n\t" "movw %%ax, (%%edi) \n\t"
@ -192,10 +197,10 @@ copyLoop:
"decl %%ecx \n\t" "decl %%ecx \n\t"
"jnz 0b \n\t" // copyLoop "jnz 0b \n\t" // copyLoop
: // no outputs. : // no outputs.
: "S" (((char *)pvMixerBuffer) + slSrcOffset), : [pvMixerBuffer] "g" (((char *)pvMixerBuffer) + slSrcOffset),
"D" (pDstBuffer), [pDstBuffer] "g" (pDstBuffer),
"c" (slBytes >> 2) [slDW] "g" (slBytes >> 2)
: "cc", "memory", "eax" : "eax", "ecx", "esi", "edi", "cc", "memory"
); );
#else #else
@ -247,6 +252,9 @@ copyLoop:
#elif (defined __GNU_INLINE__) #elif (defined __GNU_INLINE__)
__asm__ __volatile__ ( __asm__ __volatile__ (
"movl %[pvMixerBuffer], %%esi \n\t"
"movl %[pvMixerBuffer], %%edi \n\t"
"movl %[slDW], %%ecx \n\t"
"cld \n\t" "cld \n\t"
"0: \n\t" // copyLoop "0: \n\t" // copyLoop
"movq (%%esi), %%mm0 \n\t" "movq (%%esi), %%mm0 \n\t"
@ -258,8 +266,8 @@ copyLoop:
"jnz 0b \n\t" // copyLoop "jnz 0b \n\t" // copyLoop
"emms \n\t" "emms \n\t"
: // no outputs. : // no outputs.
: "S" (pvMixerBuffer), "D" (pvMixerBuffer), "c" (slBytes >> 2) : [pvMixerBuffer] "g" (pvMixerBuffer), [slDW] "g" (slBytes >> 2)
: "cc", "memory" : FPU_REGS, "mm0", "ecx", "esi", "edi", "cc", "memory"
); );
#else #else

View File

@ -96,6 +96,8 @@ SEGMENT .text
global MixMono_asm global MixMono_asm
MixMono_asm: MixMono_asm:
push ebx ; Save GCC register. push ebx ; Save GCC register.
push esi
push edi
; convert from floats to fixints 32:16 ; convert from floats to fixints 32:16
fld D [fLeftOfs] fld D [fLeftOfs]
fmul D [f65536] fmul D [f65536]
@ -224,6 +226,8 @@ loopEnd_MixMono:
shr edx,16 shr edx,16
mov D [slLastLeftSample],eax mov D [slLastLeftSample],eax
mov D [slLastRightSample],edx mov D [slLastRightSample],edx
pop edi
pop esi
pop ebx ; Restore GCC register. pop ebx ; Restore GCC register.
emms emms
ret ret
@ -232,6 +236,8 @@ loopEnd_MixMono:
global MixStereo_asm global MixStereo_asm
MixStereo_asm: MixStereo_asm:
push ebx ; Save GCC register. push ebx ; Save GCC register.
push esi
push edi
; convert from floats to fixints 32:16 ; convert from floats to fixints 32:16
fld D [fLeftOfs] fld D [fLeftOfs]
fmul D [f65536] fmul D [f65536]
@ -363,6 +369,8 @@ loopEnd_MixStereo:
mov D [slLastLeftSample],eax mov D [slLastLeftSample],eax
mov D [slLastRightSample],edx mov D [slLastRightSample],edx
emms emms
pop edi
pop esi
pop ebx ; Restore GCC register. pop ebx ; Restore GCC register.
ret ret