diff --git a/Sources/CMakeLists.txt b/Sources/CMakeLists.txt index c43ced8..39ab5f7 100644 --- a/Sources/CMakeLists.txt +++ b/Sources/CMakeLists.txt @@ -188,13 +188,20 @@ else() set(DEBUGSUFFIX "") endif() -# This should not be needed anymore, but might be faster on 32bit x86 -option(USE_I386_ASM "Use X86 ASM" FALSE) +option(USE_ASM "Use ASM code" TRUE) +if (USE_ASM) + MESSAGE(STATUS "Using assembler code (when available)") +else() + add_definitions(-DUSE_PORTABLE_C=1) + MESSAGE(STATUS "Using portable C instead of all ASM") +endif() -if (USE_I386_ASM) +option(USE_I386_NASM_ASM "Use i386 nasm ASM code" FALSE) + +if (USE_ASM AND USE_I386_NASM_ASM) # You need the Netwide Assembler (NASM) to build this on Intel systems. # http://nasm.sf.net/ - add_definitions(-DUSE_I386_ASM=1) + add_definitions(-DUSE_I386_NASM_ASM=1) if (MACOSX) set(ASMOBJFMT "macho") list(APPEND ASMFLAGS --prefix _) @@ -203,10 +210,9 @@ if (USE_I386_ASM) else() set(ASMOBJFMT "elf") endif() - MESSAGE(STATUS "Using i386 assembler") + MESSAGE(STATUS "Using i386 nasm ASM") else() - add_definitions(-DUSE_PORTABLE_C=1) - MESSAGE(STATUS "Using portable C instead of ASM") + MESSAGE(STATUS "Not using i386 nasm ASM") endif() option(PANDORA "Compile for Pandora" FALSE) @@ -655,7 +661,7 @@ add_dependencies(${SHADERSLIB} ParseEntities) add_parser_and_scanner("Engine/Base/Parser" "Engine/Base/Scanner") add_parser_and_scanner("Engine/Ska/smcPars" "Engine/Ska/smcScan") -if (USE_I386_ASM) +if (USE_I386_NASM_ASM) add_custom_command( OUTPUT "SoundMixer386.o" MAIN_DEPENDENCY "${CMAKE_CURRENT_SOURCE_DIR}/Engine/Sound/SoundMixer386.asm" diff --git a/Sources/Engine/Base/Base.h b/Sources/Engine/Base/Base.h index fd032f3..12151f1 100644 --- a/Sources/Engine/Base/Base.h +++ b/Sources/Engine/Base/Base.h @@ -65,9 +65,7 @@ with this program; if not, write to the Free Software Foundation, Inc., #else #warning "UNKNOWN PLATFORM IDENTIFIED!!!!" #define PLATFORM_UNKNOWN 1 - #warning "USING PORTABLE C!!!" - #define USE_PORTABLE_C -#endif +#endif #if PLATFORM_LINUX || PLATFORM_MACOSX #ifndef PLATFORM_UNIX diff --git a/Sources/Engine/Base/Profiling.cpp b/Sources/Engine/Base/Profiling.cpp index bd23ba2..87bd3b7 100644 --- a/Sources/Engine/Base/Profiling.cpp +++ b/Sources/Engine/Base/Profiling.cpp @@ -21,24 +21,13 @@ with this program; if not, write to the Free Software Foundation, Inc., template class CStaticArray; template class CStaticArray; -#if (defined USE_PORTABLE_C) +#if (defined PLATFORM_UNIX) && !defined(__GNU_INLINE_X86_32__) #include #endif static inline __int64 ReadTSC_profile(void) { -#if (defined USE_PORTABLE_C) - #ifdef __arm__ - struct timespec tv; - clock_gettime(CLOCK_MONOTONIC, &tv); - return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_nsec) / 1000000) ); - #else - struct timeval tv; - gettimeofday(&tv, NULL); - return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_usec) / 1000) ); - #endif - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __int64 mmRet; __asm { rdtsc @@ -47,7 +36,7 @@ static inline __int64 ReadTSC_profile(void) } return mmRet; -#elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) __int64 mmRet; __asm__ __volatile__ ( "rdtsc \n\t" @@ -60,7 +49,16 @@ static inline __int64 ReadTSC_profile(void) return(mmRet); #else - #error Please implement for your platform/compiler. 
+ #ifdef __arm__ + struct timespec tv; + clock_gettime(CLOCK_MONOTONIC, &tv); + return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_nsec) / 1000000) ); + #else + struct timeval tv; + gettimeofday(&tv, NULL); + return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_usec) / 1000) ); + #endif + #endif } diff --git a/Sources/Engine/Base/Timer.cpp b/Sources/Engine/Base/Timer.cpp index 0bfd970..8c02846 100755 --- a/Sources/Engine/Base/Timer.cpp +++ b/Sources/Engine/Base/Timer.cpp @@ -29,7 +29,7 @@ with this program; if not, write to the Free Software Foundation, Inc., #include // !!! FIXME: use SDL timer code instead and rdtsc never? -#if (USE_PORTABLE_C) +#if (defined PLATFORM_UNIX) && !defined(__GNU_INLINE_X86_32__) #define USE_GETTIMEOFDAY 1 #endif @@ -64,7 +64,7 @@ static inline __int64 ReadTSC(void) } return mmRet; -#elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) __int64 mmRet; __asm__ __volatile__ ( "rdtsc \n\t" diff --git a/Sources/Engine/Base/Types.h b/Sources/Engine/Base/Types.h index 84de328..555bc81 100644 --- a/Sources/Engine/Base/Types.h +++ b/Sources/Engine/Base/Types.h @@ -109,6 +109,30 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*)); #define ASMSYM(x) #x #endif +/* should we enable inline asm? */ +#ifndef USE_PORTABLE_C + #if defined(__MSVC_INLINE__) + /* the build system selected __MSVC_INLINE__ */ + #elif defined(__GNU_INLINE_X86_32__) + /* the build system selected __GNU_INLINE_X86_32__ */ + #elif defined(_MSC_VER) && defined(_M_IX86) + #define __MSVC_INLINE__ + #elif defined (__GNUC__) && defined(__i386) + #define __GNU_INLINE_X86_32__ + #elif defined (__GNUC__) && defined(__x86_64__) + #define __GNU_INLINE_X86_64__ + #endif + + #if defined(__GNU_INLINE_X86_32__) || defined(__GNU_INLINE_X86_64__) + #define __GNU_INLINE_X86__ + #endif + + #if defined(__GNU_INLINE_X86__) + #define FPU_REGS "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" + #define MMX_REGS "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" + #endif +#endif + #ifdef PLATFORM_UNIX /* rcg10042001 */ #include #include @@ -134,25 +158,6 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*)); #endif #endif - #if ((defined __GNUC__) && (!defined __GNU_INLINE__)) - #define __GNU_INLINE__ - #endif - - #if (defined __INTEL_COMPILER) - #if ((!defined __GNU_INLINE__) && (!defined __MSVC_INLINE__)) - #error Please define __GNU_INLINE__ or __MSVC_INLINE__ with Intel C++. - #endif - - #if ((defined __GNU_INLINE__) && (defined __MSVC_INLINE__)) - #error Define either __GNU_INLINE__ or __MSVC_INLINE__ with Intel C++. - #endif - #endif - - #if defined(__GNU_INLINE__) && defined(__i386__) - #define FPU_REGS "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)" - #define MMX_REGS "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7" - #endif - #ifndef PAGESIZE #define PAGESIZE 4096 #endif @@ -230,10 +235,7 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*)); inline ULONG _rotl(ULONG ul, int bits) { - #if (defined USE_PORTABLE_C) - // DG: according to http://blog.regehr.org/archives/1063 this is fast - return (ul<>(-bits&31)); - #elif (defined __GNU_INLINE__) + #if (defined __GNU_INLINE_X86_32__) // This, on the other hand, is wicked fast. :) __asm__ __volatile__ ( "roll %%cl, %%eax \n\t" @@ -255,7 +257,8 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*)); return(ul); #else - #error need inline asm for your platform. 
+ // DG: according to http://blog.regehr.org/archives/1063 this is fast + return (ul<>(-bits&31)); #endif } diff --git a/Sources/Engine/Engine.cpp b/Sources/Engine/Engine.cpp index 15021ca..452d281 100644 --- a/Sources/Engine/Engine.cpp +++ b/Sources/Engine/Engine.cpp @@ -125,14 +125,10 @@ BOOL APIENTRY DllMain( HANDLE hModule, DWORD ul_reason_for_call, LPVOID lpReser static void DetectCPU(void) { -#if (defined USE_PORTABLE_C) // rcg10072001 - CPrintF(TRANSV(" (No CPU detection in this binary.)\n")); - -#else - char strVendor[12+1]; + char strVendor[12+1] = { 0 }; strVendor[12] = 0; - ULONG ulTFMS; - ULONG ulFeatures; + ULONG ulTFMS = 0; + ULONG ulFeatures = 0; #if (defined __MSVC_INLINE__) // test MMX presence and update flag @@ -148,43 +144,47 @@ static void DetectCPU(void) mov dword ptr [ulFeatures], edx } - #elif (defined __GNU_INLINE__) + #elif (defined __GNU_INLINE_X86__) + ULONG eax, ebx, ecx, edx; // test MMX presence and update flag __asm__ __volatile__ ( - "pushl %%ebx \n\t" - "xorl %%eax,%%eax \n\t" // request for basic id + #if (defined __GNU_INLINE_X86_64__) "cpuid \n\t" - "movl %%ebx, (%%esi) \n\t" - "movl %%edx, 4(%%esi) \n\t" - "movl %%ecx, 8(%%esi) \n\t" - "popl %%ebx \n\t" - : // no specific outputs. - : "S" (strVendor) - : "eax", "ecx", "edx", "memory" + : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + #else + "movl %%ebx, %%esi \n\t" + "cpuid \n\t" + "xchgl %%ebx, %%esi \n\t" + : "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx) + #endif + : "a" (0) // request for basic id ); - - // need to break this into a separate asm block, since I'm clobbering - // too many registers. There's something to be said for letting MSVC - // figure out where on the stack your locals are resting, but yeah, - // I know, that's x86-specific anyhow... - // !!! FIXME: can probably do this right with modern GCC. + memcpy(strVendor + 0, &ebx, 4); + memcpy(strVendor + 4, &edx, 4); + memcpy(strVendor + 8, &ecx, 4); __asm__ __volatile__ ( - "pushl %%ebx \n\t" - "movl $1, %%eax \n\t" // request for TFMS feature flags - "cpuid \n\t" - "mov %%eax, (%%esi) \n\t" // remember type, family, model and stepping - "mov %%edx, (%%edi) \n\t" - "popl %%ebx \n\t" - : // no specific outputs. - : "S" (&ulTFMS), "D" (&ulFeatures) - : "eax", "ecx", "edx", "memory" + #if (defined __GNU_INLINE_X86_64__) + "cpuid \n\t" + : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx) + #else + "movl %%ebx, %%esi \n\t" + "cpuid \n\t" + "xchgl %%ebx, %%esi \n\t" + : "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx) + #endif + : "a" (1) // request for TFMS feature flags ); + ulTFMS = eax; + ulFeatures = edx; - #else - #error Please implement for your platform or define USE_PORTABLE_C. #endif + if (ulTFMS == 0) { + CPrintF(TRANSV(" (No CPU detection in this binary.)\n")); + return; + } + INDEX iType = (ulTFMS>>12)&0x3; INDEX iFamily = (ulTFMS>> 8)&0xF; INDEX iModel = (ulTFMS>> 4)&0xF; @@ -215,8 +215,6 @@ static void DetectCPU(void) sys_iCPUMHz = INDEX(_pTimer->tm_llCPUSpeedHZ/1E6); if( !bMMX) FatalError( TRANS("MMX support required but not present!")); - -#endif // defined USE_PORTABLE_C } static void DetectCPUWrapper(void) diff --git a/Sources/Engine/Graphics/Color.cpp b/Sources/Engine/Graphics/Color.cpp index 50e1bf6..6a6519c 100644 --- a/Sources/Engine/Graphics/Color.cpp +++ b/Sources/Engine/Graphics/Color.cpp @@ -247,30 +247,7 @@ COLOR MulColors( COLOR col1, COLOR col2) if( col2==0xFFFFFFFF) return col1; if( col1==0 || col2==0) return 0; -#if (defined USE_PORTABLE_C) - // !!! FIXME: This...is not fast. 
- union - { - COLOR col; - UBYTE bytes[4]; - } conv1; - - union - { - COLOR col; - UBYTE bytes[4]; - } conv2; - - conv1.col = col1; - conv2.col = col2; - conv1.bytes[0] = (UBYTE) ((((DWORD) conv1.bytes[0]) * ((DWORD) conv2.bytes[0])) / 255); - conv1.bytes[1] = (UBYTE) ((((DWORD) conv1.bytes[1]) * ((DWORD) conv2.bytes[1])) / 255); - conv1.bytes[2] = (UBYTE) ((((DWORD) conv1.bytes[2]) * ((DWORD) conv2.bytes[2])) / 255); - conv1.bytes[3] = (UBYTE) ((((DWORD) conv1.bytes[3]) * ((DWORD) conv2.bytes[3])) / 255); - - return(conv1.col); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) COLOR colRet; __asm { xor ebx,ebx @@ -347,7 +324,7 @@ COLOR MulColors( COLOR col1, COLOR col2) } return colRet; -#elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) COLOR colRet; __asm__ __volatile__ ( "pushl %%ebx \n\t" @@ -433,20 +410,6 @@ COLOR MulColors( COLOR col1, COLOR col2) return colRet; #else - #error please fill in inline assembly for your platform. -#endif -} - - -// fast color additon function - RES = clamp (1ST + 2ND) -COLOR AddColors( COLOR col1, COLOR col2) -{ - if( col1==0) return col2; - if( col2==0) return col1; - if( col1==0xFFFFFFFF || col2==0xFFFFFFFF) return 0xFFFFFFFF; - COLOR colRet; - -#if (defined USE_PORTABLE_C) // !!! FIXME: This...is not fast. union { @@ -459,19 +422,28 @@ COLOR AddColors( COLOR col1, COLOR col2) COLOR col; UBYTE bytes[4]; } conv2; - #define MINVAL(a, b) ((a)>(b))?(b):(a) conv1.col = col1; conv2.col = col2; - conv1.bytes[0] = (UBYTE) MINVAL((((WORD) conv1.bytes[0]) + ((WORD) conv2.bytes[0])) , 255); - conv1.bytes[1] = (UBYTE) MINVAL((((WORD) conv1.bytes[1]) + ((WORD) conv2.bytes[1])) , 255); - conv1.bytes[2] = (UBYTE) MINVAL((((WORD) conv1.bytes[2]) + ((WORD) conv2.bytes[2])) , 255); - conv1.bytes[3] = (UBYTE) MINVAL((((WORD) conv1.bytes[3]) + ((WORD) conv2.bytes[3])) , 255); - #undef MINVAL + conv1.bytes[0] = (UBYTE) ((((DWORD) conv1.bytes[0]) * ((DWORD) conv2.bytes[0])) / 255); + conv1.bytes[1] = (UBYTE) ((((DWORD) conv1.bytes[1]) * ((DWORD) conv2.bytes[1])) / 255); + conv1.bytes[2] = (UBYTE) ((((DWORD) conv1.bytes[2]) * ((DWORD) conv2.bytes[2])) / 255); + conv1.bytes[3] = (UBYTE) ((((DWORD) conv1.bytes[3]) * ((DWORD) conv2.bytes[3])) / 255); - colRet = conv1.col; + return(conv1.col); +#endif +} -#elif (defined __MSVC_INLINE__) + +// fast color additon function - RES = clamp (1ST + 2ND) +COLOR AddColors( COLOR col1, COLOR col2) +{ + if( col1==0) return col2; + if( col2==0) return col1; + if( col1==0xFFFFFFFF || col2==0xFFFFFFFF) return 0xFFFFFFFF; + COLOR colRet; + +#if (defined __MSVC_INLINE__) __asm { xor ebx,ebx mov esi,255 @@ -535,7 +507,7 @@ COLOR AddColors( COLOR col1, COLOR col2) mov D [colRet],ebx } -#elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) ULONG tmp; __asm__ __volatile__ ( // if xbx is "r", gcc runs out of regs in -fPIC + -fno-omit-fp :( @@ -608,7 +580,29 @@ COLOR AddColors( COLOR col1, COLOR col2) ); #else - #error please fill in inline assembly for your platform. + // !!! FIXME: This...is not fast. 
+ union + { + COLOR col; + UBYTE bytes[4]; + } conv1; + + union + { + COLOR col; + UBYTE bytes[4]; + } conv2; + #define MINVAL(a, b) ((a)>(b))?(b):(a) + + conv1.col = col1; + conv2.col = col2; + conv1.bytes[0] = (UBYTE) MINVAL((((WORD) conv1.bytes[0]) + ((WORD) conv2.bytes[0])) , 255); + conv1.bytes[1] = (UBYTE) MINVAL((((WORD) conv1.bytes[1]) + ((WORD) conv2.bytes[1])) , 255); + conv1.bytes[2] = (UBYTE) MINVAL((((WORD) conv1.bytes[2]) + ((WORD) conv2.bytes[2])) , 255); + conv1.bytes[3] = (UBYTE) MINVAL((((WORD) conv1.bytes[3]) + ((WORD) conv2.bytes[3])) , 255); + #undef MINVAL + + colRet = conv1.col; #endif return colRet; @@ -619,14 +613,7 @@ COLOR AddColors( COLOR col1, COLOR col2) // multiple conversion from OpenGL color to DirectX color extern void abgr2argb( ULONG *pulSrc, ULONG *pulDst, INDEX ct) { -#if (defined USE_PORTABLE_C) - //#error write me. - for (int i=0; i>16) | ((tmp&0x000000ff)<<16); - } - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { mov esi,dword ptr [pulSrc] mov edi,dword ptr [pulDst] @@ -678,12 +665,12 @@ colSkip2: mov dword ptr [edi],eax colSkip1: } - -#elif (defined __GNU_INLINE__) - STUBBED("convert to inline asm."); - #else - #error please fill in inline assembly for your platform. + for (int i=0; i>16) | ((tmp&0x000000ff)<<16); + } + #endif } diff --git a/Sources/Engine/Graphics/Color.h b/Sources/Engine/Graphics/Color.h index 4a0318b..de0c666 100644 --- a/Sources/Engine/Graphics/Color.h +++ b/Sources/Engine/Graphics/Color.h @@ -204,19 +204,7 @@ ENGINE_API extern COLOR AddColors( COLOR col1, COLOR col2); // fast color addito __forceinline ULONG ByteSwap( ULONG ul) { /* rcg10052001 Platform-wrappers. */ -#if (defined USE_PORTABLE_C) - ul = ( ((ul << 24) ) | - ((ul << 8) & 0x00FF0000) | - ((ul >> 8) & 0x0000FF00) | - ((ul >> 24) ) ); - - #if (defined PLATFORM_BIGENDIAN) - BYTESWAP(ul); // !!! FIXME: May not be right! - #endif - - return(ul); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) ULONG ulRet; __asm { mov eax,dword ptr [ul] @@ -225,7 +213,7 @@ __forceinline ULONG ByteSwap( ULONG ul) } return ulRet; -#elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) __asm__ __volatile__ ( "bswapl %%eax \n\t" : "=a" (ul) @@ -234,16 +222,22 @@ __forceinline ULONG ByteSwap( ULONG ul) return(ul); #else - #error please define for your platform. + ul = ( ((ul << 24) ) | + ((ul << 8) & 0x00FF0000) | + ((ul >> 8) & 0x0000FF00) | + ((ul >> 24) ) ); + + #if (defined PLATFORM_BIGENDIAN) + BYTESWAP(ul); // !!! FIXME: May not be right! + #endif + + return(ul); #endif } __forceinline ULONG rgba2argb( ULONG ul) { -#if (defined USE_PORTABLE_C) - return( (ul << 24) | (ul >> 8) ); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) ULONG ulRet; __asm { mov eax,dword ptr [ul] @@ -252,7 +246,7 @@ __forceinline ULONG rgba2argb( ULONG ul) } return ulRet; -#elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) ULONG ulRet; __asm__ __volatile__ ( "rorl $8, %%eax \n\t" @@ -263,21 +257,14 @@ __forceinline ULONG rgba2argb( ULONG ul) return ulRet; #else - #error please define for your platform. 
+ return (ul << 24) | (ul >> 8); + #endif } __forceinline ULONG abgr2argb( COLOR col) { -#if (defined USE_PORTABLE_C) - // this could be simplified, this is just a safe conversion from asm code - col = ( ((col << 24) ) | - ((col << 8) & 0x00FF0000) | - ((col >> 8) & 0x0000FF00) | - ((col >> 24) ) ); - return( (col << 24) | (col >> 8) ); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) ULONG ulRet; __asm { mov eax,dword ptr [col] @@ -287,7 +274,7 @@ __forceinline ULONG abgr2argb( COLOR col) } return ulRet; -#elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) ULONG ulRet; __asm__ __volatile__ ( "bswapl %%eax \n\t" @@ -299,7 +286,13 @@ __forceinline ULONG abgr2argb( COLOR col) return ulRet; #else - #error please define for your platform. + // this could be simplified, this is just a safe conversion from asm code + col = ( ((col << 24) ) | + ((col << 8) & 0x00FF0000) | + ((col >> 8) & 0x0000FF00) | + ((col >> 24) ) ); + return( (col << 24) | (col >> 8) ); + #endif } @@ -311,10 +304,7 @@ extern void abgr2argb( ULONG *pulSrc, ULONG *pulDst, INDEX ct); // fast memory copy of ULONGs inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs) { -#if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX)) - memcpy( pulDst, pulSrc, ctLongs*4); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { cld mov esi,dword ptr [pulSrc] @@ -322,23 +312,8 @@ inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs) mov ecx,dword ptr [ctLongs] rep movsd } - -#elif (defined __GNU_INLINE__) - // I haven't benchmarked it, but in many cases, memcpy() becomes an - // inline (asm?) macro on GNU platforms, so this might not be a - // speed gain at all over the USE_PORTABLE_C version. - // You Have Been Warned. --ryan. - __asm__ __volatile__ ( - "cld \n\t" - "rep \n\t" - "movsd \n\t" - : "=S" (pulSrc), "=D" (pulDst), "=c" (ctLongs) - : "S" (pulSrc), "D" (pulDst), "c" (ctLongs) - : "cc", "memory" - ); - #else -# error Please fill this in for your platform. + memcpy( pulDst, pulSrc, ctLongs*4); #endif } @@ -346,11 +321,7 @@ inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs) // fast memory set of ULONGs inline void StoreLongs( ULONG ulVal, ULONG *pulDst, INDEX ctLongs) { -#if (defined USE_PORTABLE_C) - for( INDEX i=0; ispo_ctElements; INDEX *piDst = _aiElements.Push(ctElems); -#if (ASMOPT == 1) - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { mov eax,D [pspo] mov ecx,D [ctElems] @@ -184,7 +173,7 @@ elemRest: mov D [edi],eax elemDone: } - #elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) __asm__ __volatile__ ( "movl %[ctElems], %%ecx \n\t" "movl %[piDst], %%edi \n\t" @@ -219,11 +208,6 @@ elemDone: "cc", "memory" ); - #else - #error Please write inline ASM for your platform. 
- - #endif - #else const INDEX iVtx0Pass = pspo->spo_iVtx0Pass; const INDEX *piSrc = pspo->spo_piElements; @@ -495,9 +479,7 @@ static void RSBinToGroups( ScenePolygon *pspoFirst) // determine maximum used groups ASSERT( _ctGroupsCount); -#if ASMOPT == 1 - - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { mov eax,2 bsr ecx,D [_ctGroupsCount] @@ -505,7 +487,7 @@ static void RSBinToGroups( ScenePolygon *pspoFirst) mov D [_ctGroupsCount],eax } - #elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) __asm__ __volatile__ ( "movl $2, %%eax \n\t" "bsrl (%%esi), %%ecx \n\t" @@ -516,11 +498,6 @@ static void RSBinToGroups( ScenePolygon *pspoFirst) : "eax", "ecx", "cc", "memory" ); - #else - #error Please write inline ASM for your platform. - - #endif - #else // emulate x86's bsr opcode...not fast. :/ register DWORD val = _ctGroupsCount; @@ -858,10 +835,7 @@ static void RSSetTextureCoords( ScenePolygon *pspoGroup, INDEX iLayer, INDEX iUn continue; } -// !!! FIXME: rcg11232001 This inline conversion is broken. Use the -// !!! FIXME: rcg11232001 C version for now with GCC. -#if ((ASMOPT == 1) && (!defined __GNU_INLINE__) && (!defined __INTEL_COMPILER)) - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { mov esi,D [pspo] mov edi,D [iMappingOffset] @@ -915,7 +889,7 @@ vtxLoop: /* // !!! FIXME: rcg11232001 This inline conversion is broken. Use the // !!! FIXME: rcg11232001 C version for now on Linux. - #elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) STUBBED("debug this"); __asm__ __volatile__ ( "0: \n\t" // vtxLoop @@ -956,11 +930,6 @@ vtxLoop: ); */ - #else - #error Please write inline ASM for your platform. - - #endif - #else // diffuse mapping diff --git a/Sources/Engine/Graphics/Fog.cpp b/Sources/Engine/Graphics/Fog.cpp index 2d4b259..2945cb4 100644 --- a/Sources/Engine/Graphics/Fog.cpp +++ b/Sources/Engine/Graphics/Fog.cpp @@ -67,18 +67,7 @@ ULONG PrepareTexture( UBYTE *pubTexture, PIX pixSizeI, PIX pixSizeJ) // need to upload from RGBA format const PIX pixTextureSize = pixSizeI*pixSizeJ; - #if (defined USE_PORTABLE_C) - const UBYTE* src = pubTexture; - DWORD* dst = (DWORD*)(pubTexture+pixTextureSize); - for (int i=0; i> 8) & 0x0000ff00 ) | ((tmp >> 24) & 0x000000ff ); - src++; - dst++; - } - - #elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { mov esi,D [pubTexture] mov edi,D [pubTexture] @@ -95,7 +84,7 @@ pixLoop: jnz pixLoop } - #elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) __asm__ __volatile__ ( "movl %[pubTexture], %%esi \n\t" "movl %[pixTextureSize], %%ecx \n\t" @@ -115,10 +104,18 @@ pixLoop: : "eax", "ecx", "esi", "edi", "cc", "memory" ); - #else - #error Write inline ASM for your platform. +#else + const UBYTE* src = pubTexture; + DWORD* dst = (DWORD*)(pubTexture+pixTextureSize); + for (int i=0; i> 8) & 0x0000ff00 ) | ((tmp >> 24) & 0x000000ff ); + src++; + dst++; + } - #endif +#endif // determine internal format extern INDEX gap_bAllowGrayTextures; diff --git a/Sources/Engine/Graphics/Gfx_OpenGL_Textures.cpp b/Sources/Engine/Graphics/Gfx_OpenGL_Textures.cpp index d20f55b..4ad76ab 100644 --- a/Sources/Engine/Graphics/Gfx_OpenGL_Textures.cpp +++ b/Sources/Engine/Graphics/Gfx_OpenGL_Textures.cpp @@ -169,32 +169,7 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV, if( pixSizeV==0) pixSizeV=1; pixSize = pixSizeU*pixSizeV; - #if (defined USE_PORTABLE_C) - // Basically average every other pixel... 
- UWORD w = 0; - UBYTE *dptr = (UBYTE *) pulDst; - UBYTE *sptr = (UBYTE *) pulSrc; - #if 0 - pixSize *= 4; - for (PIX i = 0; i < pixSize; i++) - { - *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[1])) >> 1 ); - dptr++; - sptr += 2; - } - #else - for (PIX i = 0; i < pixSize; i++) - { - for (PIX j = 0; j < 4; j++) - { - *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[4])) >> 1 ); - dptr++; - sptr++; - } - sptr += 4; - } - #endif - #elif (defined __MSVC_INLINE__) + #if (defined __MSVC_INLINE__) __asm { pxor mm0,mm0 mov esi,D [pulSrc] @@ -216,7 +191,7 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV, emms } - #elif (defined __GNU_INLINE__) + #elif (defined __GNU_INLINE_X86_32__) __asm__ __volatile__ ( "pxor %%mm0,%%mm0 \n\t" "movl %[pulSrc],%%esi \n\t" @@ -244,7 +219,30 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV, ); #else - #error Please write inline ASM for your platform. + // Basically average every other pixel... + UWORD w = 0; + UBYTE *dptr = (UBYTE *) pulDst; + UBYTE *sptr = (UBYTE *) pulSrc; + #if 0 + pixSize *= 4; + for (PIX i = 0; i < pixSize; i++) + { + *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[1])) >> 1 ); + dptr++; + sptr += 2; + } + #else + for (PIX i = 0; i < pixSize; i++) + { + for (PIX j = 0; j < 4; j++) + { + *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[4])) >> 1 ); + dptr++; + sptr++; + } + sptr += 4; + } + #endif #endif // upload mipmap diff --git a/Sources/Engine/Graphics/Graphics.cpp b/Sources/Engine/Graphics/Graphics.cpp index 1373482..3d4be59 100644 --- a/Sources/Engine/Graphics/Graphics.cpp +++ b/Sources/Engine/Graphics/Graphics.cpp @@ -209,7 +209,92 @@ static void MakeOneMipmap( ULONG *pulSrcMipmap, ULONG *pulDstMipmap, PIX pixWidt if( bBilinear) // type of filtering? 
{ // BILINEAR - #if (defined USE_PORTABLE_C) + #if (defined __MSVC_INLINE__) + __asm { + pxor mm0,mm0 + mov ebx,D [pixWidth] + mov esi,D [pulSrcMipmap] + mov edi,D [pulDstMipmap] + mov edx,D [pixHeight] +rowLoop: + mov ecx,D [pixWidth] +pixLoopN: + movd mm1,D [esi+ 0] // up-left + movd mm2,D [esi+ 4] // up-right + movd mm3,D [esi+ ebx*8 +0] // down-left + movd mm4,D [esi+ ebx*8 +4] // down-right + punpcklbw mm1,mm0 + punpcklbw mm2,mm0 + punpcklbw mm3,mm0 + punpcklbw mm4,mm0 + paddw mm1,mm2 + paddw mm1,mm3 + paddw mm1,mm4 + paddw mm1,Q [mmRounder] + psrlw mm1,2 + packuswb mm1,mm0 + movd D [edi],mm1 + // advance to next pixel + add esi,4*2 + add edi,4 + dec ecx + jnz pixLoopN + // advance to next row + lea esi,[esi+ ebx*8] // skip one row in source mip-map + dec edx + jnz rowLoop + emms + } + + #elif (defined __GNU_INLINE_X86_32__) + __asm__ __volatile__ ( + "pxor %%mm0, %%mm0 \n\t" + "movl %[pulSrcMipmap], %%esi \n\t" + "movl %[pulDstMipmap], %%edi \n\t" + "movl %[pixHeight], %%edx \n\t" + + "0: \n\t" // rowLoop + "movl %[pixWidth], %%ecx \n\t" + + "1: \n\t" // pixLoopN + "movd 0(%%esi), %%mm1 \n\t" // up-left + "movd 4(%%esi), %%mm2 \n\t" // up-right + "movd 0(%%esi, %[pixWidth], 8), %%mm3 \n\t" // down-left + "movd 4(%%esi, %[pixWidth], 8), %%mm4 \n\t" // down-right + "punpcklbw %%mm0, %%mm1 \n\t" + "punpcklbw %%mm0, %%mm2 \n\t" + "punpcklbw %%mm0, %%mm3 \n\t" + "punpcklbw %%mm0, %%mm4 \n\t" + "paddw %%mm2, %%mm1 \n\t" + "paddw %%mm3, %%mm1 \n\t" + "paddw %%mm4, %%mm1 \n\t" + "paddw (" ASMSYM(mmRounder) "), %%mm1 \n\t" + "psrlw $2, %%mm1 \n\t" + "packuswb %%mm0, %%mm1 \n\t" + "movd %%mm1, (%%edi) \n\t" + + // advance to next pixel + "addl $8, %%esi \n\t" + "addl $4, %%edi \n\t" + "decl %%ecx \n\t" + "jnz 1b \n\t" // pixLoopN + + // advance to next row + // skip one row in source mip-map + "leal 0(%%esi, %[pixWidth], 8), %%esi \n\t" + "decl %%edx \n\t" + "jnz 0b \n\t" // rowLoop + "emms \n\t" + : // no outputs. 
+ : [pixWidth] "r" (pixWidth), + [pulSrcMipmap] "g" (pulSrcMipmap), + [pulDstMipmap] "g" (pulDstMipmap), + [pixHeight] "g" (pixHeight) + : FPU_REGS, MMX_REGS, "ecx", "edx", "esi", "edi", + "cc", "memory" + ); + + #else UBYTE *src = (UBYTE *) pulSrcMipmap; UBYTE *dest = (UBYTE *) pulDstMipmap; for (int i = 0 ; i < pixHeight; i++) @@ -260,129 +345,13 @@ static void MakeOneMipmap( ULONG *pulSrcMipmap, ULONG *pulDstMipmap, PIX pixWidt src += 8*pixWidth; } - #elif (defined __MSVC_INLINE__) - __asm { - pxor mm0,mm0 - mov ebx,D [pixWidth] - mov esi,D [pulSrcMipmap] - mov edi,D [pulDstMipmap] - mov edx,D [pixHeight] -rowLoop: - mov ecx,D [pixWidth] -pixLoopN: - movd mm1,D [esi+ 0] // up-left - movd mm2,D [esi+ 4] // up-right - movd mm3,D [esi+ ebx*8 +0] // down-left - movd mm4,D [esi+ ebx*8 +4] // down-right - punpcklbw mm1,mm0 - punpcklbw mm2,mm0 - punpcklbw mm3,mm0 - punpcklbw mm4,mm0 - paddw mm1,mm2 - paddw mm1,mm3 - paddw mm1,mm4 - paddw mm1,Q [mmRounder] - psrlw mm1,2 - packuswb mm1,mm0 - movd D [edi],mm1 - // advance to next pixel - add esi,4*2 - add edi,4 - dec ecx - jnz pixLoopN - // advance to next row - lea esi,[esi+ ebx*8] // skip one row in source mip-map - dec edx - jnz rowLoop - emms - } - - #elif (defined __GNU_INLINE__) - __asm__ __volatile__ ( - "pxor %%mm0, %%mm0 \n\t" - "movl %[pulSrcMipmap], %%esi \n\t" - "movl %[pulDstMipmap], %%edi \n\t" - "movl %[pixHeight], %%edx \n\t" - - "0: \n\t" // rowLoop - "movl %[pixWidth], %%ecx \n\t" - - "1: \n\t" // pixLoopN - "movd 0(%%esi), %%mm1 \n\t" // up-left - "movd 4(%%esi), %%mm2 \n\t" // up-right - "movd 0(%%esi, %[pixWidth], 8), %%mm3 \n\t" // down-left - "movd 4(%%esi, %[pixWidth], 8), %%mm4 \n\t" // down-right - "punpcklbw %%mm0, %%mm1 \n\t" - "punpcklbw %%mm0, %%mm2 \n\t" - "punpcklbw %%mm0, %%mm3 \n\t" - "punpcklbw %%mm0, %%mm4 \n\t" - "paddw %%mm2, %%mm1 \n\t" - "paddw %%mm3, %%mm1 \n\t" - "paddw %%mm4, %%mm1 \n\t" - "paddw (" ASMSYM(mmRounder) "), %%mm1 \n\t" - "psrlw $2, %%mm1 \n\t" - "packuswb %%mm0, %%mm1 \n\t" - "movd %%mm1, (%%edi) \n\t" - - // advance to next pixel - "addl $8, %%esi \n\t" - "addl $4, %%edi \n\t" - "decl %%ecx \n\t" - "jnz 1b \n\t" // pixLoopN - - // advance to next row - // skip one row in source mip-map - "leal 0(%%esi, %[pixWidth], 8), %%esi \n\t" - "decl %%edx \n\t" - "jnz 0b \n\t" // rowLoop - "emms \n\t" - : // no outputs. - : [pixWidth] "r" (pixWidth), - [pulSrcMipmap] "g" (pulSrcMipmap), - [pulDstMipmap] "g" (pulDstMipmap), - [pixHeight] "g" (pixHeight) - : FPU_REGS, MMX_REGS, "ecx", "edx", "esi", "edi", - "cc", "memory" - ); - - #else - #error Write inline asm for your platform. 
#endif } else { // NEAREST-NEIGHBOUR but with border preserving ULONG ulRowModulo = pixWidth*2 *BYTES_PER_TEXEL; - #if (defined USE_PORTABLE_C) - - PIX offset = 0; - ulRowModulo /= 4; - - for (int q = 0; q < 2; q++) - { - for (PIX i = pixHeight / 2; i > 0; i--) - { - for (PIX j = pixWidth / 2; j > 0; j--) - { - *pulDstMipmap = *(pulSrcMipmap + offset); - pulSrcMipmap += 2; - pulDstMipmap++; - } - - for (PIX j = pixWidth / 2; j > 0; j--) - { - *pulDstMipmap = *(pulSrcMipmap + offset + 1); - pulSrcMipmap += 2; - pulDstMipmap++; - } - - pulSrcMipmap += ulRowModulo; - } - - offset = pixWidth * 2; - } - - #elif (defined __MSVC_INLINE__) + #if (defined __MSVC_INLINE__) __asm { xor ebx,ebx mov esi,D [pulSrcMipmap] @@ -428,7 +397,7 @@ halfEnd: fullEnd: } - #elif (defined __GNU_INLINE__) + #elif (defined __GNU_INLINE_X86_32__) ULONG tmp, tmp2; __asm__ __volatile__ ( "xorl %[xbx], %[xbx] \n\t" @@ -493,7 +462,33 @@ fullEnd: ); #else - #error Write inline asm for your platform. + PIX offset = 0; + ulRowModulo /= 4; + + for (int q = 0; q < 2; q++) + { + for (PIX i = pixHeight / 2; i > 0; i--) + { + for (PIX j = pixWidth / 2; j > 0; j--) + { + *pulDstMipmap = *(pulSrcMipmap + offset); + pulSrcMipmap += 2; + pulDstMipmap++; + } + + for (PIX j = pixWidth / 2; j > 0; j--) + { + *pulDstMipmap = *(pulSrcMipmap + offset + 1); + pulSrcMipmap += 2; + pulDstMipmap++; + } + + pulSrcMipmap += ulRowModulo; + } + + offset = pixWidth * 2; + } + #endif } } @@ -649,7 +644,7 @@ __int64 mmShifter = 0; __int64 mmMask = 0; ULONG *pulDitherTable; -#ifdef USE_PORTABLE_C +#if !(defined __MSVC_INLINE__) && !(defined __GNU_INLINE_X86_32__) extern const UBYTE *pubClipByte; // increment a byte without overflowing it static inline void IncrementByteWithClip( UBYTE &ub, SLONG slAdd) @@ -778,35 +773,7 @@ void DitherBitmap( INDEX iDitherType, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth // ------------------------------- ordered matrix dithering routine ditherOrder: -#if (defined USE_PORTABLE_C) - union uConv - { - ULONG val; - DWORD dwords[2]; - UWORD words[4]; - WORD iwords[4]; - UBYTE bytes[8]; - }; - for (int i=0; i>= mmShifter; } - dith.val &= mmMask; - uConv* src = (uConv*)(pulSrc+i*pixWidth); - uConv* dst = (uConv*)(pulDst+i*pixWidth); - for (int j=0; j>= mmShifter; } + dith.val &= mmMask; + uConv* src = (uConv*)(pulSrc+i*pixWidth); + uConv* dst = (uConv*)(pulDst+i*pixWidth); + for (int j=0; j>4; - p5.words[k] = (p1.words[k]*5)>>4; - p7.words[k] = (p1.words[k]*7)>>4; } - for (int k=0; k<4; k++) { p1.words[k] -= (p3.words[k] + p5.words[k] + p7.words[k]);} - for (int k=0; k<4; k++) { - IncrementByteWithClip( src[k + step] , p7.words[k]); - IncrementByteWithClip( src[pixCanvasWidth*4 -step +k], p5.words[k]); - IncrementByteWithClip( src[pixCanvasWidth*4 +0 +k], p3.words[k]); - IncrementByteWithClip( src[pixCanvasWidth*4 +step +k], p1.words[k]); - } - } - } - #endif - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { pxor mm0,mm0 mov esi,D [pulDst] @@ -1046,7 +1012,7 @@ allDoneE: emms; } -#elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) __asm__ __volatile__ ( "pxor %%mm0, %%mm0 \n\t" "movl %[pulDst], %%esi \n\t" @@ -1157,7 +1123,32 @@ allDoneE: ); #else - #error Write inline asm for your platform. + #if 1 //SEB doesn't works.... 
+ for (int i=0; i>4; + p5.words[k] = (p1.words[k]*5)>>4; + p7.words[k] = (p1.words[k]*7)>>4; } + for (int k=0; k<4; k++) { p1.words[k] -= (p3.words[k] + p5.words[k] + p7.words[k]);} + for (int k=0; k<4; k++) { + IncrementByteWithClip( src[k + step] , p7.words[k]); + IncrementByteWithClip( src[pixCanvasWidth*4 -step +k], p5.words[k]); + IncrementByteWithClip( src[pixCanvasWidth*4 +0 +k], p3.words[k]); + IncrementByteWithClip( src[pixCanvasWidth*4 +step +k], p1.words[k]); + } + } + } + #endif + #endif goto theEnd; @@ -1265,7 +1256,7 @@ extern "C" { } -#ifdef USE_PORTABLE_C +#if !(defined USE_MMX_INTRINSICS) && !(defined __MSVC_INLINE__) && !(defined __GNU_INLINE_X86_32__) typedef SWORD ExtPix[4]; static inline void extpix_fromi64(ExtPix &pix, const __int64 i64) @@ -1632,265 +1623,6 @@ void FilterBitmap( INDEX iFilter, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth, PI _mm_empty(); // we're done, clear out the MMX registers! -#elif (defined USE_PORTABLE_C) - slModulo1 /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type - slCanvasWidth /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type - - ULONG *src = pulSrc; - ULONG *dst = pulDst; - ULONG *rowptr = aulRows; - - ExtPix rmm1={0}, rmm2={0}, rmm3={0}, rmm4={0}, rmm5={0}, rmm6={0}, rmm7={0}; - #define EXTPIXFROMINT64(x) ExtPix r##x; extpix_fromi64(r##x, x); - EXTPIXFROMINT64(mmCm); - EXTPIXFROMINT64(mmCe); - EXTPIXFROMINT64(mmCc); - EXTPIXFROMINT64(mmEch); - EXTPIXFROMINT64(mmEcl); - EXTPIXFROMINT64(mmEe); - EXTPIXFROMINT64(mmEm); - EXTPIXFROMINT64(mmMm); - EXTPIXFROMINT64(mmMe); - EXTPIXFROMINT64(mmMc); - EXTPIXFROMINT64(mmAdd); - EXTPIXFROMINT64(mmInvDiv); - #undef EXTPIXFROMINT64 - - // ----------------------- process upper left corner - extend_pixel(src[0], rmm1); - extend_pixel(src[1], rmm2); - extend_pixel(src[pixCanvasWidth], rmm3); - extend_pixel(src[pixCanvasWidth+1], rmm4); - - extpix_add(rmm2, rmm3); - extpix_mul(rmm1, rmmCm); - extpix_mul(rmm2, rmmCe); - extpix_mul(rmm4, rmmCc); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm4); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - *(rowptr++) = unextend_pixel(rmm1); - - src++; - - // ----------------------- process upper edge pixels - for (PIX i = pixWidth - 2; i != 0; i--) - { - extend_pixel(src[-1], rmm1); - extend_pixel(src[0], rmm2); - extend_pixel(src[1], rmm3); - extend_pixel(src[pixCanvasWidth-1], rmm4); - extend_pixel(src[pixCanvasWidth], rmm5); - extend_pixel(src[pixCanvasWidth+1], rmm6); - - extpix_add(rmm1, rmm3); - extpix_add(rmm4, rmm6); - extpix_mul(rmm1, rmmEch); - extpix_mul(rmm2, rmmEm); - extpix_mul(rmm4, rmmEcl); - extpix_mul(rmm5, rmmEe); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm4); - extpix_add(rmm1, rmm5); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - *(rowptr++) = unextend_pixel(rmm1); - src++; - } - - // ----------------------- process upper right corner - - extend_pixel(src[-1], rmm1); - extend_pixel(src[0], rmm2); - extend_pixel(src[pixCanvasWidth-1], rmm3); - extend_pixel(src[pixCanvasWidth], rmm4); - - extpix_add(rmm1, rmm4); - extpix_mul(rmm1, rmmCe); - extpix_mul(rmm2, rmmCm); - extpix_mul(rmm3, rmmCc); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm3); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - *rowptr = unextend_pixel(rmm1); - -// ----------------------- process bitmap middle pixels - - dst += slCanvasWidth; - src += slModulo1; - - // for each row - for (size_t i = pixHeight-2; i != 0; i--) // rowLoop - { - rowptr = aulRows; - - // process left edge pixel - 
extend_pixel(src[-pixCanvasWidth], rmm1); - extend_pixel(src[(-pixCanvasWidth)+1], rmm2); - extend_pixel(src[0], rmm3); - extend_pixel(src[1], rmm4); - extend_pixel(src[pixCanvasWidth], rmm5); - extend_pixel(src[pixCanvasWidth+1], rmm6); - - extpix_add(rmm1, rmm5); - extpix_add(rmm2, rmm6); - extpix_mul(rmm1, rmmEch); - extpix_mul(rmm2, rmmEcl); - extpix_mul(rmm3, rmmEm); - extpix_mul(rmm4, rmmEe); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm3); - extpix_add(rmm1, rmm4); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - dst[-pixCanvasWidth] = *rowptr; - *(rowptr++) = unextend_pixel(rmm1); - src++; - dst++; - - // for each pixel in current row - for (size_t j = pixWidth-2; j != 0; j--) // pixLoop - { - // prepare upper convolution row - extend_pixel(src[(-pixCanvasWidth)-1], rmm1); - extend_pixel(src[-pixCanvasWidth], rmm2); - extend_pixel(src[(-pixCanvasWidth)+1], rmm3); - - // prepare middle convolution row - extend_pixel(src[-1], rmm4); - extend_pixel(src[0], rmm5); - extend_pixel(src[1], rmm6); - - // free some registers - extpix_add(rmm1, rmm3); - extpix_add(rmm2, rmm4); - extpix_mul(rmm5, rmmMm); - - // prepare lower convolution row - extend_pixel(src[pixCanvasWidth-1], rmm3); - extend_pixel(src[pixCanvasWidth], rmm4); - extend_pixel(src[pixCanvasWidth+1], rmm7); - - // calc weightened value - extpix_add(rmm2, rmm6); - extpix_add(rmm1, rmm3); - extpix_add(rmm2, rmm4); - extpix_add(rmm1, rmm7); - extpix_mul(rmm2, rmmMe); - extpix_mul(rmm1, rmmMc); - extpix_add(rmm2, rmm5); - extpix_add(rmm1, rmm2); - - // calc and store wightened value - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - dst[-pixCanvasWidth] = *rowptr; - *(rowptr++) = unextend_pixel(rmm1); - - // advance to next pixel - src++; - dst++; - } - - // process right edge pixel - extend_pixel(src[(-pixCanvasWidth)-1], rmm1); - extend_pixel(src[-pixCanvasWidth], rmm2); - extend_pixel(src[-1], rmm3); - extend_pixel(src[0], rmm4); - extend_pixel(src[pixCanvasWidth-1], rmm5); - extend_pixel(src[pixCanvasWidth], rmm6); - - extpix_add(rmm1, rmm5); - extpix_add(rmm2, rmm6); - extpix_mul(rmm1, rmmEcl); - extpix_mul(rmm2, rmmEch); - extpix_mul(rmm3, rmmEe); - extpix_mul(rmm4, rmmEm); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm3); - extpix_add(rmm1, rmm4); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - dst[-pixCanvasWidth] = *rowptr; - *rowptr = unextend_pixel(rmm1); - - // advance to next row - src += slModulo1; - dst += slModulo1; - } - - // ----------------------- process lower left corner - rowptr = aulRows; - extend_pixel(src[-pixCanvasWidth], rmm1); - extend_pixel(src[(-pixCanvasWidth)+1], rmm2); - extend_pixel(src[0], rmm3); - extend_pixel(src[1], rmm4); - - extpix_add(rmm1, rmm4); - extpix_mul(rmm1, rmmCe); - extpix_mul(rmm2, rmmCc); - extpix_mul(rmm3, rmmCm); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm3); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - dst[-pixCanvasWidth] = *rowptr; - dst[0] = unextend_pixel(rmm1); - - src++; - dst++; - rowptr++; - - // ----------------------- process lower edge pixels - for (size_t i = pixWidth-2; i != 0; i--) // lowerLoop - { - // for each pixel - extend_pixel(src[(-pixCanvasWidth)-1], rmm1); - extend_pixel(src[-pixCanvasWidth], rmm2); - extend_pixel(src[(-pixCanvasWidth)+1], rmm3); - extend_pixel(src[-1], rmm4); - extend_pixel(src[0], rmm5); - extend_pixel(src[1], rmm6); - - extpix_add(rmm1, rmm3); - extpix_add(rmm4, rmm6); - extpix_mul(rmm1, rmmEcl); - extpix_mul(rmm2, rmmEe); - extpix_mul(rmm4, rmmEch); - 
extpix_mul(rmm5, rmmEm); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm4); - extpix_add(rmm1, rmm5); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - dst[-pixCanvasWidth] = *rowptr; - dst[0] = unextend_pixel(rmm1); - - // advance to next pixel - src++; - dst++; - rowptr++; - } - - // ----------------------- lower right corners - extend_pixel(src[(-pixCanvasWidth)-1], rmm1); - extend_pixel(src[-pixCanvasWidth], rmm2); - extend_pixel(src[-1], rmm3); - extend_pixel(src[0], rmm4); - - extpix_add(rmm2, rmm3); - extpix_mul(rmm1, rmmCc); - extpix_mul(rmm2, rmmCe); - extpix_mul(rmm4, rmmCm); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm4); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - dst[-pixCanvasWidth] = *rowptr; - dst[0] = unextend_pixel(rmm1); - #elif (defined __MSVC_INLINE__) __asm { cld @@ -2204,7 +1936,7 @@ lowerLoop: emms } -#elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) FB_pulSrc = pulSrc; FB_pulDst = pulDst; @@ -2537,7 +2269,264 @@ lowerLoop: ); #else - #error Write inline asm for your platform. + slModulo1 /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type + slCanvasWidth /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type + + ULONG *src = pulSrc; + ULONG *dst = pulDst; + ULONG *rowptr = aulRows; + + ExtPix rmm1={0}, rmm2={0}, rmm3={0}, rmm4={0}, rmm5={0}, rmm6={0}, rmm7={0}; + #define EXTPIXFROMINT64(x) ExtPix r##x; extpix_fromi64(r##x, x); + EXTPIXFROMINT64(mmCm); + EXTPIXFROMINT64(mmCe); + EXTPIXFROMINT64(mmCc); + EXTPIXFROMINT64(mmEch); + EXTPIXFROMINT64(mmEcl); + EXTPIXFROMINT64(mmEe); + EXTPIXFROMINT64(mmEm); + EXTPIXFROMINT64(mmMm); + EXTPIXFROMINT64(mmMe); + EXTPIXFROMINT64(mmMc); + EXTPIXFROMINT64(mmAdd); + EXTPIXFROMINT64(mmInvDiv); + #undef EXTPIXFROMINT64 + + // ----------------------- process upper left corner + extend_pixel(src[0], rmm1); + extend_pixel(src[1], rmm2); + extend_pixel(src[pixCanvasWidth], rmm3); + extend_pixel(src[pixCanvasWidth+1], rmm4); + + extpix_add(rmm2, rmm3); + extpix_mul(rmm1, rmmCm); + extpix_mul(rmm2, rmmCe); + extpix_mul(rmm4, rmmCc); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm4); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + *(rowptr++) = unextend_pixel(rmm1); + + src++; + + // ----------------------- process upper edge pixels + for (PIX i = pixWidth - 2; i != 0; i--) + { + extend_pixel(src[-1], rmm1); + extend_pixel(src[0], rmm2); + extend_pixel(src[1], rmm3); + extend_pixel(src[pixCanvasWidth-1], rmm4); + extend_pixel(src[pixCanvasWidth], rmm5); + extend_pixel(src[pixCanvasWidth+1], rmm6); + + extpix_add(rmm1, rmm3); + extpix_add(rmm4, rmm6); + extpix_mul(rmm1, rmmEch); + extpix_mul(rmm2, rmmEm); + extpix_mul(rmm4, rmmEcl); + extpix_mul(rmm5, rmmEe); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm4); + extpix_add(rmm1, rmm5); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + *(rowptr++) = unextend_pixel(rmm1); + src++; + } + + // ----------------------- process upper right corner + + extend_pixel(src[-1], rmm1); + extend_pixel(src[0], rmm2); + extend_pixel(src[pixCanvasWidth-1], rmm3); + extend_pixel(src[pixCanvasWidth], rmm4); + + extpix_add(rmm1, rmm4); + extpix_mul(rmm1, rmmCe); + extpix_mul(rmm2, rmmCm); + extpix_mul(rmm3, rmmCc); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm3); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + *rowptr = unextend_pixel(rmm1); + +// ----------------------- process bitmap middle pixels + + dst += slCanvasWidth; + src += slModulo1; + + // for each row + for (size_t i = 
pixHeight-2; i != 0; i--) // rowLoop + { + rowptr = aulRows; + + // process left edge pixel + extend_pixel(src[-pixCanvasWidth], rmm1); + extend_pixel(src[(-pixCanvasWidth)+1], rmm2); + extend_pixel(src[0], rmm3); + extend_pixel(src[1], rmm4); + extend_pixel(src[pixCanvasWidth], rmm5); + extend_pixel(src[pixCanvasWidth+1], rmm6); + + extpix_add(rmm1, rmm5); + extpix_add(rmm2, rmm6); + extpix_mul(rmm1, rmmEch); + extpix_mul(rmm2, rmmEcl); + extpix_mul(rmm3, rmmEm); + extpix_mul(rmm4, rmmEe); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm3); + extpix_add(rmm1, rmm4); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + dst[-pixCanvasWidth] = *rowptr; + *(rowptr++) = unextend_pixel(rmm1); + src++; + dst++; + + // for each pixel in current row + for (size_t j = pixWidth-2; j != 0; j--) // pixLoop + { + // prepare upper convolution row + extend_pixel(src[(-pixCanvasWidth)-1], rmm1); + extend_pixel(src[-pixCanvasWidth], rmm2); + extend_pixel(src[(-pixCanvasWidth)+1], rmm3); + + // prepare middle convolution row + extend_pixel(src[-1], rmm4); + extend_pixel(src[0], rmm5); + extend_pixel(src[1], rmm6); + + // free some registers + extpix_add(rmm1, rmm3); + extpix_add(rmm2, rmm4); + extpix_mul(rmm5, rmmMm); + + // prepare lower convolution row + extend_pixel(src[pixCanvasWidth-1], rmm3); + extend_pixel(src[pixCanvasWidth], rmm4); + extend_pixel(src[pixCanvasWidth+1], rmm7); + + // calc weightened value + extpix_add(rmm2, rmm6); + extpix_add(rmm1, rmm3); + extpix_add(rmm2, rmm4); + extpix_add(rmm1, rmm7); + extpix_mul(rmm2, rmmMe); + extpix_mul(rmm1, rmmMc); + extpix_add(rmm2, rmm5); + extpix_add(rmm1, rmm2); + + // calc and store wightened value + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + dst[-pixCanvasWidth] = *rowptr; + *(rowptr++) = unextend_pixel(rmm1); + + // advance to next pixel + src++; + dst++; + } + + // process right edge pixel + extend_pixel(src[(-pixCanvasWidth)-1], rmm1); + extend_pixel(src[-pixCanvasWidth], rmm2); + extend_pixel(src[-1], rmm3); + extend_pixel(src[0], rmm4); + extend_pixel(src[pixCanvasWidth-1], rmm5); + extend_pixel(src[pixCanvasWidth], rmm6); + + extpix_add(rmm1, rmm5); + extpix_add(rmm2, rmm6); + extpix_mul(rmm1, rmmEcl); + extpix_mul(rmm2, rmmEch); + extpix_mul(rmm3, rmmEe); + extpix_mul(rmm4, rmmEm); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm3); + extpix_add(rmm1, rmm4); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + dst[-pixCanvasWidth] = *rowptr; + *rowptr = unextend_pixel(rmm1); + + // advance to next row + src += slModulo1; + dst += slModulo1; + } + + // ----------------------- process lower left corner + rowptr = aulRows; + extend_pixel(src[-pixCanvasWidth], rmm1); + extend_pixel(src[(-pixCanvasWidth)+1], rmm2); + extend_pixel(src[0], rmm3); + extend_pixel(src[1], rmm4); + + extpix_add(rmm1, rmm4); + extpix_mul(rmm1, rmmCe); + extpix_mul(rmm2, rmmCc); + extpix_mul(rmm3, rmmCm); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm3); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + dst[-pixCanvasWidth] = *rowptr; + dst[0] = unextend_pixel(rmm1); + + src++; + dst++; + rowptr++; + + // ----------------------- process lower edge pixels + for (size_t i = pixWidth-2; i != 0; i--) // lowerLoop + { + // for each pixel + extend_pixel(src[(-pixCanvasWidth)-1], rmm1); + extend_pixel(src[-pixCanvasWidth], rmm2); + extend_pixel(src[(-pixCanvasWidth)+1], rmm3); + extend_pixel(src[-1], rmm4); + extend_pixel(src[0], rmm5); + extend_pixel(src[1], rmm6); + + extpix_add(rmm1, rmm3); + extpix_add(rmm4, rmm6); + 
extpix_mul(rmm1, rmmEcl); + extpix_mul(rmm2, rmmEe); + extpix_mul(rmm4, rmmEch); + extpix_mul(rmm5, rmmEm); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm4); + extpix_add(rmm1, rmm5); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + dst[-pixCanvasWidth] = *rowptr; + dst[0] = unextend_pixel(rmm1); + + // advance to next pixel + src++; + dst++; + rowptr++; + } + + // ----------------------- lower right corners + extend_pixel(src[(-pixCanvasWidth)-1], rmm1); + extend_pixel(src[-pixCanvasWidth], rmm2); + extend_pixel(src[-1], rmm3); + extend_pixel(src[0], rmm4); + + extpix_add(rmm2, rmm3); + extpix_mul(rmm1, rmmCc); + extpix_mul(rmm2, rmmCe); + extpix_mul(rmm4, rmmCm); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm4); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + dst[-pixCanvasWidth] = *rowptr; + dst[0] = unextend_pixel(rmm1); + #endif // all done (finally) diff --git a/Sources/Engine/Graphics/OpenGL.h b/Sources/Engine/Graphics/OpenGL.h index e803301..37b5038 100644 --- a/Sources/Engine/Graphics/OpenGL.h +++ b/Sources/Engine/Graphics/OpenGL.h @@ -89,20 +89,14 @@ extern void (__stdcall *pglPNTrianglesfATI)( GLenum pname, GLfloat param); inline void glCOLOR( COLOR col) { /* rcg10052001 Platform-wrappers. */ -#if (defined USE_PORTABLE_C) - col = ( ((col << 24) ) | - ((col << 8) & 0x00FF0000) | - ((col >> 8) & 0x0000FF00) | - ((col >> 24) ) ); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { mov eax,dword ptr [col] bswap eax mov dword ptr [col],eax } -#elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) __asm__ __volatile__ ( "bswapl %%eax \n\t" : "=a" (col) @@ -110,7 +104,11 @@ inline void glCOLOR( COLOR col) ); #else - #error please define for your platform. + col = ( ((col << 24) ) | + ((col << 8) & 0x00FF0000) | + ((col >> 8) & 0x0000FF00) | + ((col >> 24) ) ); + #endif pglColor4ubv((GLubyte*)&col); diff --git a/Sources/Engine/Graphics/TextureEffects.cpp b/Sources/Engine/Graphics/TextureEffects.cpp index 994814e..91fc839 100644 --- a/Sources/Engine/Graphics/TextureEffects.cpp +++ b/Sources/Engine/Graphics/TextureEffects.cpp @@ -32,11 +32,9 @@ with this program; if not, write to the Free Software Foundation, Inc., #define W word ptr #define B byte ptr -#if (defined USE_PORTABLE_C) -#define ASMOPT 0 -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) #define ASMOPT 1 -#elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) #define ASMOPT 1 #else #define ASMOPT 0 @@ -1285,8 +1283,7 @@ static void RenderWater(void) { // SUB-SAMPLING SLONG slHeightMapStep, slHeightRowStep; -#if ASMOPT == 1 - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { push ebx bsf ecx,D [_pixTexWidth] @@ -1357,7 +1354,7 @@ pixLoop: pop ebx } - #elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) // rcg12152001 needed extra registers. :( _slHeightMapStep_renderWater = slHeightMapStep; _pixBaseWidth_renderWater = pixBaseWidth; @@ -1460,10 +1457,6 @@ pixLoop: "cc", "memory" ); - #else - #error fill in for your platform. 
- #endif - #else PIX pixPos, pixDU, pixDV; @@ -1626,7 +1619,7 @@ pixLoop2: pop ebx } - #elif (defined __GNU_INLINE__) + #elif (defined __GNU_INLINE_X86_32__) __asm__ __volatile__ ( "bsfl %[pixBaseWidth], %%eax \n\t" "movl $32, %%edx \n\t" @@ -2146,7 +2139,7 @@ pixLoop4: pop ebx } - #elif (defined __GNU_INLINE__) + #elif (defined __GNU_INLINE_X86_32__) __asm__ __volatile__ ( "bsfl %[pixBaseWidth], %%eax \n\t" "movl $32, %%edx \n\t" @@ -2976,7 +2969,7 @@ pixDone: pop ebx } - #elif (defined __GNU_INLINE__) + #elif (defined __GNU_INLINE_X86_32__) __asm__ __volatile__ ( "movl %[slColumnModulo], %%edx \n\t" "movl %[slBufferMask], %%ecx \n\t" @@ -3119,7 +3112,7 @@ pixLoopF: jnz rowLoopF pop ebx } - #elif (defined __GNU_INLINE__) + #elif (defined __GNU_INLINE_X86_32__) _pubHeat_RenderPlasmaFire = pubHeat; // ran out of registers. :/ __asm__ __volatile__ ( "movl %[slHeatRowStep], %%eax \n\t" diff --git a/Sources/Engine/Light/LayerMixer.cpp b/Sources/Engine/Light/LayerMixer.cpp index d2aff1d..26c0e51 100755 --- a/Sources/Engine/Light/LayerMixer.cpp +++ b/Sources/Engine/Light/LayerMixer.cpp @@ -40,16 +40,6 @@ with this program; if not, write to the Free Software Foundation, Inc., #define W word ptr #define B byte ptr -#if (defined USE_PORTABLE_C) - #define ASMOPT 0 -#elif (defined __MSVC_INLINE__) - #define ASMOPT 1 -#elif (defined __GNU_INLINE__) - #define ASMOPT 1 -#else - #define ASMOPT 0 -#endif - extern INDEX shd_bFineQuality; extern INDEX shd_iFiltering; extern INDEX shd_iDithering; @@ -290,8 +280,7 @@ void CLayerMixer::AddAmbientPoint(void) _slLightMax<<=7; _slLightStep>>=1; -#if (ASMOPT == 1) - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { // prepare interpolants movd mm0,D [_slL2Row] @@ -364,7 +353,7 @@ skipPixel: emms } - #elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) ULONG tmp1, tmp2; __asm__ __volatile__ ( // prepare interpolants @@ -439,10 +428,6 @@ skipPixel: : FPU_REGS, MMX_REGS, "eax", "ecx", "edi", "cc", "memory" ); - #else - #error Write inline asm for your platform. - #endif - #else // !!! FIXME WARNING: I have not checked this code, and it could be @@ -496,8 +481,7 @@ void CLayerMixer::AddAmbientMaskPoint( UBYTE *pubMask, UBYTE ubMask) _slLightStep>>=1; -#if (ASMOPT == 1) - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { // prepare interpolants movd mm0,D [_slL2Row] @@ -576,7 +560,7 @@ skipPixel: emms } - #elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) ULONG tmp1, tmp2; __asm__ __volatile__ ( // prepare interpolants @@ -660,10 +644,6 @@ skipPixel: "cc", "memory" ); - #else - #error Please write inline assembly for your platform. - #endif - #else // Portable C version... UBYTE* pubLayer = (UBYTE*)_pulLayer; @@ -723,8 +703,7 @@ void CLayerMixer::AddDiffusionPoint(void) _slLightMax<<=7; _slLightStep>>=1; -#if ASMOPT == 1 - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { // prepare interpolants movd mm0,D [_slL2Row] @@ -796,7 +775,7 @@ skipPixel: emms } - #elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) ULONG tmp1, tmp2; __asm__ __volatile__ ( // prepare interpolants @@ -871,10 +850,6 @@ skipPixel: : FPU_REGS, MMX_REGS, "eax", "ecx", "edi", "cc", "memory" ); - #else - #error Write inline assembly for your platform. 
- #endif - #else // for each pixel in the shadow map UBYTE* pubLayer = (UBYTE*)_pulLayer; @@ -929,8 +904,7 @@ void CLayerMixer::AddDiffusionMaskPoint( UBYTE *pubMask, UBYTE ubMask) _slLightMax<<=7; _slLightStep>>=1; -#if (ASMOPT == 1) - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { // prepare interpolants movd mm0,D [_slL2Row] @@ -1008,7 +982,7 @@ skipPixel: emms } - #elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) ULONG tmp1, tmp2; __asm__ __volatile__ ( // prepare interpolants @@ -1091,11 +1065,6 @@ skipPixel: "cc", "memory" ); - #else - #error Write inline ASM for your platform. - - #endif - #else // for each pixel in the shadow map @@ -1201,8 +1170,7 @@ BOOL CLayerMixer::PrepareOneLayerPoint( CBrushShadowLayer *pbsl, BOOL bNoMask) FLOAT fDL2oDV = fDDL2oDV + 2*(lm_vStepV%v00); //_v00 = v00; -#if ((ASMOPT == 1) && (!defined __GNU_INLINE__)) - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { fld D [fDDL2oDU] fadd D [fDDL2oDU] @@ -1230,12 +1198,6 @@ BOOL CLayerMixer::PrepareOneLayerPoint( CBrushShadowLayer *pbsl, BOOL bNoMask) fistp D [_slDDL2oDV] fistp D [_slDDL2oDU] } - #elif (defined __GNU_INLINE__) - STUBBED("inline asm."); - #else - #error Please write inline assembly for your platform. - #endif - #else fDDL2oDU *= 2; fDDL2oDV *= 2; @@ -1321,8 +1283,7 @@ void CLayerMixer::AddOneLayerGradient( CGradientParameters &gp) _pulLayer = lm_pulShadowMap; FLOAT fStart = Clamp( fGr00-(fDGroDJ+fDGroDI)*0.5f, 0.0f, 1.0f); -#if ((ASMOPT == 1) && (!defined __GNU_INLINE__)) - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __int64 mmRowAdv; SLONG fixGRow = (fGr00-(fDGroDJ+fDGroDI)*0.5f)*32767.0f; // 16:15 SLONG slModulo = (lm_pixCanvasSizeU-lm_pixPolygonSizeU) *BYTES_PER_TEXEL; @@ -1436,14 +1397,6 @@ rowNext: rowDone: emms } - #elif (defined __GNU_INLINE__) - - STUBBED("WRITE ME. Argh."); - - #else - #error Need inline assembly for your platform. - #endif - #else // well, make gradient ... SLONG slR0=0,slG0=0,slB0=0; @@ -1528,9 +1481,8 @@ rowDone: // apply directional light or ambient to layer void CLayerMixer::AddDirectional(void) { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) ULONG ulLight = ByteSwap( lm_colLight); - #if (defined __MSVC_INLINE__) __asm { // prepare pointers and variables mov edi,D [_pulLayer] @@ -1565,7 +1517,8 @@ rowNext: emms } - #elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) + ULONG ulLight = ByteSwap( lm_colLight); ULONG tmp; __asm__ __volatile__ ( // prepare pointers and variables @@ -1608,10 +1561,6 @@ rowNext: : FPU_REGS, "mm5", "mm6", "ecx", "edi", "cc", "memory" ); - #else - #error Write inline assembly for your platform. - #endif - #else UBYTE* pubLayer = (UBYTE*)_pulLayer; // for each pixel in the shadow map @@ -1631,9 +1580,8 @@ rowNext: // apply directional light thru mask to layer void CLayerMixer::AddMaskDirectional( UBYTE *pubMask, UBYTE ubMask) { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) ULONG ulLight = ByteSwap( lm_colLight); - #if (defined __MSVC_INLINE__) // prepare some local variables __asm { // prepare pointers and variables @@ -1665,7 +1613,8 @@ skipLight: emms } - #elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) + ULONG ulLight = ByteSwap( lm_colLight); ULONG tmp; __asm__ __volatile__ ( // prepare pointers and variables @@ -1706,10 +1655,6 @@ skipLight: "cc", "memory" ); - #else - #error Please write inline assembly for your platform. 
- #endif - #else UBYTE* pubLayer = (UBYTE*)_pulLayer; // for each pixel in the shadow map @@ -1832,7 +1777,33 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap) } } // set initial color - #if (defined USE_PORTABLE_C) +#if (defined __MSVC_INLINE__) + __asm { + cld + mov ebx,D [this] + mov ecx,D [ebx].lm_pixCanvasSizeU + imul ecx,D [ebx].lm_pixCanvasSizeV + mov edi,D [ebx].lm_pulShadowMap + mov eax,D [colAmbient] + bswap eax + rep stosd + } + +#elif (defined __GNU_INLINE_X86_32__) + ULONG clob1, clob2, clob3; + __asm__ __volatile__ ( + "cld \n\t" + "imull %%esi, %%ecx \n\t" + "bswapl %%eax \n\t" + "rep \n\t" + "stosl \n\t" + : "=a" (clob1), "=c" (clob2), "=D" (clob3) + : "c" (this->lm_pixCanvasSizeU), "S" (this->lm_pixCanvasSizeV), + "a" (colAmbient), "D" (this->lm_pulShadowMap) + : "cc", "memory" + ); + +#else register ULONG count = this->lm_pixCanvasSizeU * this->lm_pixCanvasSizeV; #if PLATFORM_LITTLEENDIAN // Forces C fallback; BYTESWAP itself is a no-op on little endian. @@ -1850,35 +1821,7 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap) ptr++; } - #elif (defined __MSVC_INLINE__) - __asm { - cld - mov ebx,D [this] - mov ecx,D [ebx].lm_pixCanvasSizeU - imul ecx,D [ebx].lm_pixCanvasSizeV - mov edi,D [ebx].lm_pulShadowMap - mov eax,D [colAmbient] - bswap eax - rep stosd - } - - #elif (defined __GNU_INLINE__) - ULONG clob1, clob2, clob3; - __asm__ __volatile__ ( - "cld \n\t" - "imull %%esi, %%ecx \n\t" - "bswapl %%eax \n\t" - "rep \n\t" - "stosl \n\t" - : "=a" (clob1), "=c" (clob2), "=D" (clob3) - : "c" (this->lm_pixCanvasSizeU), "S" (this->lm_pixCanvasSizeV), - "a" (colAmbient), "D" (this->lm_pulShadowMap) - : "cc", "memory" - ); - - #else - #error Please write inline assembly for your platform. - #endif +#endif _pfWorldEditingProfile.StopTimer(CWorldEditingProfile::PTI_AMBIENTFILL); @@ -1955,9 +1898,7 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap) // copy from static shadow map to dynamic layer __forceinline void CLayerMixer::CopyShadowLayer(void) { - #if (defined USE_PORTABLE_C) - memcpy(lm_pulShadowMap, lm_pulStaticShadowMap, lm_pixCanvasSizeU*lm_pixCanvasSizeV*4); - #elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { cld mov ebx,D [this] @@ -1967,7 +1908,7 @@ __forceinline void CLayerMixer::CopyShadowLayer(void) mov edi,D [ebx].lm_pulShadowMap rep movsd } - #elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) ULONG clob1, clob2, clob3; __asm__ __volatile__ ( "cld \n\t" @@ -1980,21 +1921,16 @@ __forceinline void CLayerMixer::CopyShadowLayer(void) : "cc", "memory" ); - #else - #error Please write inline assembly for your platform. 
- #endif +#else + memcpy(lm_pulShadowMap, lm_pulStaticShadowMap, lm_pixCanvasSizeU*lm_pixCanvasSizeV*4); +#endif } // copy from static shadow map to dynamic layer __forceinline void CLayerMixer::FillShadowLayer( COLOR col) { - #if (defined USE_PORTABLE_C) - DWORD* dst = (DWORD*)lm_pulShadowMap; - int n = lm_pixCanvasSizeU*lm_pixCanvasSizeV; - DWORD color = __builtin_bswap32(col); - while(n--) {*(dst++)=color;} - #elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { cld mov ebx,D [this] @@ -2006,7 +1942,7 @@ __forceinline void CLayerMixer::FillShadowLayer( COLOR col) rep stosd } - #elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) ULONG clob1, clob2, clob3; __asm__ __volatile__ ( "cld \n\t" @@ -2020,9 +1956,12 @@ __forceinline void CLayerMixer::FillShadowLayer( COLOR col) : "cc", "memory" ); - #else - #error Please write inline assembly for your platform. - #endif +#else + DWORD* dst = (DWORD*)lm_pulShadowMap; + int n = lm_pixCanvasSizeU*lm_pixCanvasSizeV; + DWORD color = __builtin_bswap32(col); + while(n--) {*(dst++)=color;} +#endif } diff --git a/Sources/Engine/Math/Float.cpp b/Sources/Engine/Math/Float.cpp index 6c62b5f..9f04e16 100755 --- a/Sources/Engine/Math/Float.cpp +++ b/Sources/Engine/Math/Float.cpp @@ -24,20 +24,11 @@ with this program; if not, write to the Free Software Foundation, Inc., #define _PC_64 0x0300 // !!! FIXME: I'd like to remove any dependency on the FPU control word from the game, asap. --ryan. -#ifdef USE_PORTABLE_C -// Fake control87 for USE_PORTABLE_C version -inline ULONG _control87(WORD newcw, WORD mask) -{ - static WORD fpw=_PC_64; - if (mask != 0) - { - fpw &= ~mask; - fpw |= (newcw & mask); - } - return(fpw); -} +#if (defined _MSC_VER) -#elif (defined __GNU_INLINE__) +// _control87 is provided by the compiler + +#elif (defined __GNU_INLINE_X86_32__) inline ULONG _control87(WORD newcw, WORD mask) { @@ -74,8 +65,20 @@ inline ULONG _control87(WORD newcw, WORD mask) return(fpw); } -#elif (!defined _MSC_VER) -#error Implement for your platform, or add a stub conditional here. +#else + +// Fake control87 for USE_PORTABLE_C version +inline ULONG _control87(WORD newcw, WORD mask) +{ + static WORD fpw=_PC_64; + if (mask != 0) + { + fpw &= ~mask; + fpw |= (newcw & mask); + } + return(fpw); +} + #endif /* Get current precision setting of FPU. */ diff --git a/Sources/Engine/Math/Functions.h b/Sources/Engine/Math/Functions.h index d164c0c..1108e71 100755 --- a/Sources/Engine/Math/Functions.h +++ b/Sources/Engine/Math/Functions.h @@ -312,12 +312,7 @@ inline FLOAT NormByteToFloat( const ULONG ul) // fast float to int conversion inline SLONG FloatToInt( FLOAT f) { -#if defined(__arm__) || defined(USE_PORTABLE_C) - // round to nearest by adding/subtracting 0.5 (depending on f pos/neg) before converting to SLONG - float addToRound = copysignf(0.5f, f); // copy f's signbit to 0.5 => if f<0 then addToRound = -0.5, else 0.5 - return((SLONG) (f + addToRound)); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) SLONG slRet; __asm { fld D [f] @@ -325,7 +320,7 @@ inline SLONG FloatToInt( FLOAT f) } return slRet; -#elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) SLONG slRet; __asm__ __volatile__ ( "flds (%%eax) \n\t" @@ -336,16 +331,16 @@ inline SLONG FloatToInt( FLOAT f) ); return(slRet); #else - #error Fill this in for your platform. 
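// A rounding note on FloatToInt: fistp rounds with the FPU's current
// mode (round-to-nearest-even by default), while the fallback below
// adds copysignf(0.5f, f) and truncates, i.e. rounds halves away from
// zero -- so the two paths can disagree on exact ties (2.5f gives 2
// via fistp, 3 via the C path). If that ever matters, lrintf() from
// <cmath> follows the current rounding mode; a hedged alternative,
// not what the patch uses:
//
//   inline SLONG FloatToInt(FLOAT f) { return (SLONG) lrintf(f); }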
+ // round to nearest by adding/subtracting 0.5 (depending on f pos/neg) before converting to SLONG + float addToRound = copysignf(0.5f, f); // copy f's signbit to 0.5 => if f<0 then addToRound = -0.5, else 0.5 + return((SLONG) (f + addToRound)); + #endif } // log base 2 of any float numero inline FLOAT Log2( FLOAT f) { -#if (defined USE_PORTABLE_C) || defined(__arm__) - return log2f(f); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) FLOAT fRet; _asm { fld1 @@ -355,7 +350,7 @@ inline FLOAT Log2( FLOAT f) { } return fRet; -#elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) FLOAT fRet; __asm__ __volatile__ ( "fld1 \n\t" @@ -368,7 +363,8 @@ inline FLOAT Log2( FLOAT f) { ); return(fRet); #else - #error Fill this in for your platform. + return log2f(f); + #endif } @@ -376,8 +372,24 @@ inline FLOAT Log2( FLOAT f) { // returns accurate values only for integers that are power of 2 inline SLONG FastLog2( SLONG x) { -#if (defined USE_PORTABLE_C) -#ifdef __GNUC__ +#if (defined __MSVC_INLINE__) + SLONG slRet; + __asm { + bsr eax,D [x] + mov D [slRet],eax + } + return slRet; + +#elif (defined __GNU_INLINE_X86_32__) + SLONG slRet; + __asm__ __volatile__ ( + "bsrl %%ecx, %%eax \n\t" + : "=a" (slRet) + : "c" (x) + : "memory" + ); + return(slRet); +#elif (defined __GNUC__) if(x == 0) return 0; // __builtin_clz() is undefined for 0 int numLeadingZeros = __builtin_clz(x); return 31 - numLeadingZeros; @@ -393,38 +405,13 @@ inline SLONG FastLog2( SLONG x) return 0; #endif - -#elif (defined __MSVC_INLINE__) - SLONG slRet; - __asm { - bsr eax,D [x] - mov D [slRet],eax - } - return slRet; - -#elif (defined __GNU_INLINE__) - SLONG slRet; - __asm__ __volatile__ ( - "bsrl %%ecx, %%eax \n\t" - : "=a" (slRet) - : "c" (x) - : "memory" - ); - return(slRet); -#else - #error Fill this in for your platform. -#endif } /* DG: function is unused => doesn't matter that portable implementation is not optimal :) // returns log2 of first larger value that is a power of 2 inline SLONG FastMaxLog2( SLONG x) { -#if (defined USE_PORTABLE_C) -printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__); - return((SLONG) log2((double) x)); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) SLONG slRet; __asm { bsr eax,D [x] @@ -435,7 +422,7 @@ printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__); } return slRet; -#elif (defined __GNU_INLINE__) +#elif (defined __GNU_INLINE_X86_32__) SLONG slRet; __asm__ __volatile__ ( "bsrl %%ecx, %%eax \n\t" @@ -448,7 +435,9 @@ printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__); ); return(slRet); #else - #error Fill this in for your platform. +printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__); + return((SLONG) log2((double) x)); + #endif } */ diff --git a/Sources/Engine/Models/RenderModel_View.cpp b/Sources/Engine/Models/RenderModel_View.cpp index 2b18dda..73fb5ef 100644 --- a/Sources/Engine/Models/RenderModel_View.cpp +++ b/Sources/Engine/Models/RenderModel_View.cpp @@ -40,14 +40,6 @@ with this program; if not, write to the Free Software Foundation, Inc., #define W word ptr #define B byte ptr -#if (defined __MSVC_INLINE__) -#define ASMOPT 1 -#elif (defined __GNU_INLINE__) -#define ASMOPT 0 // !!! FIXME: rcg10112001 Write GCC inline asm versions... 
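// For x > 0 the FastLog2 variants above agree:
//   bsr(x) == 31 - __builtin_clz(x) == index of the highest set bit
// and both bsr and __builtin_clz leave the result undefined at
// x == 0, which is why the builtin path special-cases it.
// Quick sanity values:
//   FastLog2(1) == 0, FastLog2(2) == 1, FastLog2(4096) == 12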
-#else -#define ASMOPT 0 -#endif - extern BOOL CVA_bModels; extern BOOL GFX_bTruform; @@ -663,7 +655,7 @@ static FLOAT _fHazeAdd; // check vertex against fog static void GetFogMapInVertex( GFXVertex3 &vtx, GFXTexCoord &tex) { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) __asm { mov esi,D [vtx] mov edi,D [tex] @@ -708,7 +700,7 @@ static void GetFogMapInVertex( GFXVertex3 &vtx, GFXTexCoord &tex) // check vertex against haze static void GetHazeMapInVertex( GFXVertex3 &vtx, FLOAT &tx1) { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) __asm { mov esi,D [vtx] mov edi,D [tx1] @@ -1080,7 +1072,7 @@ static void UnpackFrame( CRenderModel &rm, BOOL bKeepNormals) const ModelFrameVertex16 *pFrame1 = rm.rm_pFrame16_1; if( pFrame0==pFrame1) { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) // for each vertex in mip const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8 SLONG slTmp1, slTmp2, slTmp3; @@ -1196,7 +1188,7 @@ vtxNext16: // if lerping else { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) // for each vertex in mip const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8 SLONG slTmp1, slTmp2, slTmp3; @@ -1365,7 +1357,7 @@ vtxNext16L: // if no lerping if( pFrame0==pFrame1) { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) // for each vertex in mip const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8 SLONG slTmp1, slTmp2, slTmp3; @@ -1464,7 +1456,7 @@ vtxNext8: // if lerping else { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8 SLONG slTmp1, slTmp2, slTmp3; // re-adjust stretching factors because of fixint lerping (divide by 256) @@ -1610,7 +1602,7 @@ vtxNext8L: } // generate colors from shades -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) __asm { pxor mm0,mm0 // construct 64-bit RGBA light @@ -1974,7 +1966,7 @@ void CModelObject::RenderModel_View( CRenderModel &rm) pvtxSrfBase = &_avtxSrfBase[iSrfVx0]; INDEX iSrfVx; -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) __asm { push ebx mov ebx,D [puwSrfToMip] @@ -2074,7 +2066,7 @@ srfVtxLoop: const COLOR colD = AdjustColor( ms.ms_colDiffuse, _slTexHueShift, _slTexSaturation); colSrfDiff.MultiplyRGBA( colD, colMdlDiff); -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) // setup texcoord array __asm { push ebx @@ -2134,7 +2126,7 @@ vtxEnd: for( INDEX iSrfVx=0; iSrfVxsl_SwfeFormat.nSamplesPerSec; // wipe destination mixer buffer - // (Mac OS X uses this path because Apple's memset() is customized for each CPU they support and way faster than this inline asm. --ryan.) - #if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX)) - memset(pvMixerBuffer, 0, slMixerBufferSize * 8); - - #elif (defined __MSVC_INLINE__) + #if (defined __MSVC_INLINE__) __asm { cld xor eax,eax @@ -94,19 +88,8 @@ void ResetMixer( const SLONG *pslBuffer, const SLONG slBufferSize) shl ecx,1 // *2 because of 32-bit src format rep stosd } - #elif (defined __GNU_INLINE__) - // !!! FIXME : rcg12172001 Is this REALLY any faster than memset()? - ULONG clob1, clob2; - __asm__ __volatile__ ( - "cld \n\t" - "rep \n\t" - "stosl \n\t" - : "=D" (clob1), "=c" (clob2) - : "a" (0), "D" (pvMixerBuffer), "c" (slMixerBufferSize*2) - : "cc", "memory" - ); #else - #error please write inline asm for your platform. 
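// The memset below clears exactly the range the removed asm did: the
// mixer accumulates each stereo frame as two 32-bit values, so
//   bytes = slMixerBufferSize * 2 channels * sizeof(SLONG)  // == * 8
// which matches the "shl ecx,1" + "rep stosd" (two dwords per frame)
// in the MSVC path kept above.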
+ memset(pvMixerBuffer, 0, slMixerBufferSize * 8); #endif } @@ -118,10 +101,7 @@ void CopyMixerBuffer_stereo( const SLONG slSrcOffset, void *pDstBuffer, const SL ASSERT( slBytes%4==0); if( slBytes<4) return; - #if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX)) - // (Mac OS X uses this path because Apple's memset() is customized for each CPU they support and way faster than this inline asm. --ryan.) - memcpy(pDstBuffer, ((const char *)pvMixerBuffer) + slSrcOffset, slBytes); - #elif (defined __MSVC_INLINE__) + #if (defined __MSVC_INLINE__) __asm { cld mov esi,D [slSrcOffset] @@ -131,21 +111,8 @@ void CopyMixerBuffer_stereo( const SLONG slSrcOffset, void *pDstBuffer, const SL shr ecx,2 // bytes to samples per channel rep movsd } - #elif (defined __GNU_INLINE__) - // !!! FIXME : rcg12172001 Is this REALLY any faster than memcpy()? - ULONG clob1, clob2, clob3; - __asm__ __volatile__ ( - "cld \n\t" - "rep \n\t" - "movsl \n\t" - : "=S" (clob1), "=D" (clob2), "=c" (clob3) - : "S" (((char *)pvMixerBuffer) + slSrcOffset), - "D" (pDstBuffer), - "c" (slBytes >> 2) - : "cc", "memory" - ); #else - #error please write inline asm for your platform. + memcpy(pDstBuffer, ((const char *)pvMixerBuffer) + slSrcOffset, slBytes); #endif } @@ -157,18 +124,7 @@ void CopyMixerBuffer_mono( const SLONG slSrcOffset, void *pDstBuffer, const SLON ASSERT( slBytes%2==0); if( slBytes<4) return; - #if (defined USE_PORTABLE_C) - // (This is untested, currently. --ryan.) - WORD *dest = (WORD *) pDstBuffer; - WORD *src = (WORD *) ( ((char *) pvMixerBuffer) + slSrcOffset ); - SLONG max = slBytes / 4; - for (SLONG i = 0; i < max; i++) { - *dest = *src; - dest++; // move 16 bits. - src+=2; // move 32 bits. - } - - #elif (defined __MSVC_INLINE__) + #if (defined __MSVC_INLINE__) __asm { mov esi,D [slSrcOffset] add esi,D [pvMixerBuffer] @@ -184,7 +140,7 @@ copyLoop: jnz copyLoop } - #elif (defined __GNU_INLINE__) + #elif (defined __GNU_INLINE_X86_32__) __asm__ __volatile__ ( "movl %[pvMixerBuffer], %%esi \n\t" "movl %[pDstBuffer], %%edi \n\t" @@ -204,7 +160,15 @@ copyLoop: ); #else - #error please write inline asm for your platform. + // (This is untested, currently. --ryan.) + WORD *dest = (WORD *) pDstBuffer; + WORD *src = (WORD *) ( ((char *) pvMixerBuffer) + slSrcOffset ); + SLONG max = slBytes / 4; + for (SLONG i = 0; i < max; i++) { + *dest = *src; + dest++; // move 16 bits. + src+=2; // move 32 bits. + } #endif } @@ -215,24 +179,7 @@ static void ConvertMixerBuffer( const SLONG slBytes) ASSERT( slBytes%4==0); if( slBytes<4) return; - #if (defined USE_PORTABLE_C) - //STUBBED("ConvertMixerBuffer"); - SWORD *dest = (SWORD *) pvMixerBuffer; - SLONG *src = (SLONG *) pvMixerBuffer; - SLONG max = slBytes / 2; - int tmp; - for (SLONG i = 0; i < max; i++) { - tmp = *src; - if (tmp>32767) tmp=32767; - if (tmp<-32767) tmp=-32767; - *dest=tmp; - dest++; // move 16 bits. - src++; // move 32 bits. - } - - - - #elif (defined __MSVC_INLINE__) + #if (defined __MSVC_INLINE__) __asm { cld mov esi,D [pvMixerBuffer] @@ -250,7 +197,7 @@ copyLoop: emms } - #elif (defined __GNU_INLINE__) + #elif (defined __GNU_INLINE_X86_32__) __asm__ __volatile__ ( "movl %[pvMixerBuffer], %%esi \n\t" "movl %[pvMixerBuffer], %%edi \n\t" @@ -271,7 +218,20 @@ copyLoop: ); #else - #error please write inline asm for your platform. 
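// ConvertMixerBuffer narrows the 32-bit accumulators to 16-bit in
// place with symmetric saturation at +/-32767. The clamp the C
// fallback below performs, pulled out as a helper (a sketch; the
// engine writes it inline):
//
//   static inline SWORD ClampTo16(SLONG sl)
//   {
//       if (sl >  32767) sl =  32767;
//       if (sl < -32767) sl = -32767;
//       return (SWORD)sl;
//   }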
+ + SWORD *dest = (SWORD *) pvMixerBuffer; + SLONG *src = (SLONG *) pvMixerBuffer; + SLONG max = slBytes / 2; + int tmp; + for (SLONG i = 0; i < max; i++) { + tmp = *src; + if (tmp>32767) tmp=32767; + if (tmp<-32767) tmp=-32767; + *dest=tmp; + dest++; // move 16 bits. + src++; // move 32 bits. + } + #endif } @@ -323,7 +283,7 @@ void NormalizeMixerBuffer( const FLOAT fNormStrength, const SLONG slBytes, FLOAT } -#ifdef __GNU_INLINE__ +#if (defined __GNU_INLINE_X86_32__) && (defined USE_I386_NASM_ASM) // These are implemented in an external NASM file. extern "C" { void MixStereo_asm(CSoundObject *pso); @@ -337,85 +297,7 @@ inline void MixMono( CSoundObject *pso) { _pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER); - #if (defined USE_PORTABLE_C) - // initialize some local vars - SLONG slLeftSample, slRightSample, slNextSample; - SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer; - fixLeftOfs = (__int64)(fLeftOfs * 65536.0); - fixRightOfs = (__int64)(fRightOfs * 65536.0); - __int64 fixLeftStep = (__int64)(fLeftStep * 65536.0); - __int64 fixRightStep = (__int64)(fRightStep * 65536.0); - __int64 fixSoundBufferSize = ((__int64)slSoundBufferSize)<<16; - mmSurroundFactor = (__int64)(SWORD)mmSurroundFactor; - - SLONG slLeftVolume_ = slLeftVolume >> 16; - SLONG slRightVolume_ = slRightVolume >> 16; - - // loop thru source buffer - INDEX iCt = slMixerBufferSize; - FOREVER - { - // if left channel source sample came to end of sample buffer - if( fixLeftOfs >= fixSoundBufferSize) { - fixLeftOfs -= fixSoundBufferSize; - // if has no loop, end it - bEndOfSound = bNotLoop; - } - // if right channel source sample came to end of sample buffer - if( fixRightOfs >= fixSoundBufferSize) { - fixRightOfs -= fixSoundBufferSize; - // if has no loop, end it - bEndOfSound = bNotLoop; - } - // end of buffer? 
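// The mixer offsets are 16.16 fixed point: the high 16 bits select
// the source sample and the low 16 bits weight a linear interpolation
// against the next one,
//   frac = fixOfs & 65535;
//   s    = (s0*(65535 - frac) + s1*frac) >> 16;
// so advancing the read position is a single integer add per output
// sample (fixOfs += fixStep), with the wrap against
// fixSoundBufferSize handled just above.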
- if( iCt<=0 || bEndOfSound) break; - - // fetch one lineary interpolated sample on left channel - slLeftSample = pswSrcBuffer[(fixLeftOfs>>16)+0]; - slNextSample = pswSrcBuffer[(fixLeftOfs>>16)+1]; - slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16; - // fetch one lineary interpolated sample on right channel - slRightSample = pswSrcBuffer[(fixRightOfs>>16)+0]; - slNextSample = pswSrcBuffer[(fixRightOfs>>16)+1]; - slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16; - - // filter samples - slLastLeftSample += ((slLeftSample -slLastLeftSample) *slLeftFilter) >>15; - slLastRightSample += ((slRightSample-slLastRightSample)*slRightFilter)>>15; - - // apply stereo volume to current sample - slLeftSample = (slLastLeftSample * slLeftVolume_) >>15; - slRightSample = (slLastRightSample * slRightVolume_)>>15; - - slLeftSample ^= (SLONG)((mmSurroundFactor>> 0)&0xFFFFFFFF); - slRightSample ^= (SLONG)((mmSurroundFactor>>32)&0xFFFFFFFF); - - // mix in current sample - slLeftSample += pslDstBuffer[0]; - slRightSample += pslDstBuffer[1]; - // upper clamp - if( slLeftSample > MAX_SWORD) slLeftSample = MAX_SWORD; - if( slRightSample > MAX_SWORD) slRightSample = MAX_SWORD; - // lower clamp - if( slLeftSample < MIN_SWORD) slLeftSample = MIN_SWORD; - if( slRightSample < MIN_SWORD) slRightSample = MIN_SWORD; - - // store samples (both channels) - pslDstBuffer[0] = slLeftSample; - pslDstBuffer[1] = slRightSample; - - // modify volume ` - slLeftVolume += (SWORD)((mmVolumeGain>> 0)&0xFFFF); - slRightVolume += (SWORD)((mmVolumeGain>>16)&0xFFFF); - - // advance to next sample - fixLeftOfs += fixLeftStep; - fixRightOfs += fixRightStep; - pslDstBuffer += 2; - iCt--; - } - - #elif (defined __MSVC_INLINE__) + #if (defined __MSVC_INLINE__) __asm { // convert from floats to fixints 32:16 fld D [fLeftOfs] @@ -548,24 +430,11 @@ loopEnd: emms } - #elif (defined __GNU_INLINE__) + #elif (defined __GNU_INLINE_X86_32__) && (defined USE_I386_NASM_ASM) // This is implemented in an external NASM file. MixMono_asm(pso); #else - #error please write inline asm for your platform. 
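// Two packed tricks in this fallback are easy to miss:
// mmSurroundFactor is sign-extended from a SWORD, so the XOR applies
// an all-ones or all-zeros mask to each 32-bit half -- one's
// complement as a cheap approximate negation for the surround effect
// -- and mmVolumeGain packs the per-sample left/right volume ramp
// deltas into one 64-bit value, unpacked with shifts and masks.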
- #endif - - _pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER); -} - - -// mixes one stereo 16-bit signed sound to destination buffer -inline void MixStereo( CSoundObject *pso) -{ - _pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER); - - #if (defined USE_PORTABLE_C) // initialize some local vars SLONG slLeftSample, slRightSample, slNextSample; SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer; @@ -599,12 +468,12 @@ inline void MixStereo( CSoundObject *pso) if( iCt<=0 || bEndOfSound) break; // fetch one lineary interpolated sample on left channel - slLeftSample = pswSrcBuffer[(fixLeftOfs>>15)+0]; - slNextSample = pswSrcBuffer[(fixLeftOfs>>15)+2]; + slLeftSample = pswSrcBuffer[(fixLeftOfs>>16)+0]; + slNextSample = pswSrcBuffer[(fixLeftOfs>>16)+1]; slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16; // fetch one lineary interpolated sample on right channel - slRightSample = pswSrcBuffer[(fixRightOfs>>15)+0]; - slNextSample = pswSrcBuffer[(fixRightOfs>>15)+2]; + slRightSample = pswSrcBuffer[(fixRightOfs>>16)+0]; + slNextSample = pswSrcBuffer[(fixRightOfs>>16)+1]; slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16; // filter samples @@ -643,7 +512,18 @@ inline void MixStereo( CSoundObject *pso) iCt--; } - #elif (defined __MSVC_INLINE__) + #endif + + _pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER); +} + + +// mixes one stereo 16-bit signed sound to destination buffer +inline void MixStereo( CSoundObject *pso) +{ + _pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER); + + #if (defined __MSVC_INLINE__) __asm { // convert from floats to fixints 32:16 fld D [fLeftOfs] @@ -778,12 +658,88 @@ loopEnd: emms } - #elif (defined __GNU_INLINE__) + #elif (defined __GNU_INLINE_X86_32__) && (defined USE_I386_NASM_ASM) // This is implemented in an external NASM file. MixStereo_asm(pso); #else - #error please write inline asm for your platform. + // initialize some local vars + SLONG slLeftSample, slRightSample, slNextSample; + SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer; + fixLeftOfs = (__int64)(fLeftOfs * 65536.0); + fixRightOfs = (__int64)(fRightOfs * 65536.0); + __int64 fixLeftStep = (__int64)(fLeftStep * 65536.0); + __int64 fixRightStep = (__int64)(fRightStep * 65536.0); + __int64 fixSoundBufferSize = ((__int64)slSoundBufferSize)<<16; + mmSurroundFactor = (__int64)(SWORD)mmSurroundFactor; + + SLONG slLeftVolume_ = slLeftVolume >> 16; + SLONG slRightVolume_ = slRightVolume >> 16; + + // loop thru source buffer + INDEX iCt = slMixerBufferSize; + FOREVER + { + // if left channel source sample came to end of sample buffer + if( fixLeftOfs >= fixSoundBufferSize) { + fixLeftOfs -= fixSoundBufferSize; + // if has no loop, end it + bEndOfSound = bNotLoop; + } + // if right channel source sample came to end of sample buffer + if( fixRightOfs >= fixSoundBufferSize) { + fixRightOfs -= fixSoundBufferSize; + // if has no loop, end it + bEndOfSound = bNotLoop; + } + // end of buffer? 
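// The shift width encodes the channel layout. The mono fallback
// corrected above indexes with
//   fixOfs >> 16, neighbour at +1
// while the interleaved stereo fallback continuing below uses
//   fixOfs >> 15, neighbour at +2
// (roughly frame*2, stepping two WORDs so each channel interpolates
// against its own next sample).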
+ if( iCt<=0 || bEndOfSound) break; + + // fetch one lineary interpolated sample on left channel + slLeftSample = pswSrcBuffer[(fixLeftOfs>>15)+0]; + slNextSample = pswSrcBuffer[(fixLeftOfs>>15)+2]; + slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16; + // fetch one lineary interpolated sample on right channel + slRightSample = pswSrcBuffer[(fixRightOfs>>15)+0]; + slNextSample = pswSrcBuffer[(fixRightOfs>>15)+2]; + slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16; + + // filter samples + slLastLeftSample += ((slLeftSample -slLastLeftSample) *slLeftFilter) >>15; + slLastRightSample += ((slRightSample-slLastRightSample)*slRightFilter)>>15; + + // apply stereo volume to current sample + slLeftSample = (slLastLeftSample * slLeftVolume_) >>15; + slRightSample = (slLastRightSample * slRightVolume_)>>15; + + slLeftSample ^= (SLONG)((mmSurroundFactor>> 0)&0xFFFFFFFF); + slRightSample ^= (SLONG)((mmSurroundFactor>>32)&0xFFFFFFFF); + + // mix in current sample + slLeftSample += pslDstBuffer[0]; + slRightSample += pslDstBuffer[1]; + // upper clamp + if( slLeftSample > MAX_SWORD) slLeftSample = MAX_SWORD; + if( slRightSample > MAX_SWORD) slRightSample = MAX_SWORD; + // lower clamp + if( slLeftSample < MIN_SWORD) slLeftSample = MIN_SWORD; + if( slRightSample < MIN_SWORD) slRightSample = MIN_SWORD; + + // store samples (both channels) + pslDstBuffer[0] = slLeftSample; + pslDstBuffer[1] = slRightSample; + + // modify volume ` + slLeftVolume += (SWORD)((mmVolumeGain>> 0)&0xFFFF); + slRightVolume += (SWORD)((mmVolumeGain>>16)&0xFFFF); + + // advance to next sample + fixLeftOfs += fixLeftStep; + fixRightOfs += fixRightStep; + pslDstBuffer += 2; + iCt--; + } + #endif _pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER); diff --git a/Sources/build-linux32.sh b/Sources/build-linux32.sh index 4f44315..f7bf3fa 100755 --- a/Sources/build-linux32.sh +++ b/Sources/build-linux32.sh @@ -14,10 +14,10 @@ cd $_ #ninja # This is the eventual path for amd64. -#cmake -DCMAKE_BUILD_TYPE=Debug -DUSE_I386_ASM=FALSE .. +#cmake -DCMAKE_BUILD_TYPE=Debug .. # Right now we force x86, though... -cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS=-m32 -DCMAKE_CXX_FLAGS=-m32 -DUSE_I386_ASM=TRUE .. +cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS=-m32 -DCMAKE_CXX_FLAGS=-m32 -DUSE_I386_NASM_ASM=TRUE .. make -j$NCPU diff --git a/Sources/build-linux64.sh b/Sources/build-linux64.sh index d81e4dd..44fe793 100755 --- a/Sources/build-linux64.sh +++ b/Sources/build-linux64.sh @@ -14,7 +14,7 @@ cd $_ #ninja # This is the eventual path for amd64. -cmake -DCMAKE_BUILD_TYPE=Debug -DUSE_I386_ASM=FALSE .. +cmake -DCMAKE_BUILD_TYPE=Debug .. # Right now we force x86, though... #cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS=-m32 -DCMAKE_CXX_FLAGS=-m32 .. diff --git a/Sources/build-mac.sh b/Sources/build-mac.sh index a0311cf..1f01255 100755 --- a/Sources/build-mac.sh +++ b/Sources/build-mac.sh @@ -9,6 +9,6 @@ set -x rm -rf cmake-build mkdir $_ cd $_ -cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_OSX_ARCHITECTURES=i386 -DUSE_I386_ASM=TRUE -DUSE_SYSTEM_SDL2=FALSE .. +cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_OSX_ARCHITECTURES=i386 -DUSE_I386_NASM_ASM=TRUE -DUSE_SYSTEM_SDL2=FALSE .. 
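# Only the 32-bit configurations pass -DUSE_I386_NASM_ASM=TRUE: the
# external NASM mixer routines are i386-only (see the
# __GNU_INLINE_X86_32__ && USE_I386_NASM_ASM guards above), so the
# 64-bit scripts simply drop the old flag and the engine builds the
# C fallbacks instead.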
make -j$NCPU diff --git a/Sources/build-mac64.sh b/Sources/build-mac64.sh index 52f295b..5265cdc 100755 --- a/Sources/build-mac64.sh +++ b/Sources/build-mac64.sh @@ -9,6 +9,6 @@ set -x rm -rf cmake-build mkdir $_ cd $_ -cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_OSX_ARCHITECTURES=x86_64 -DUSE_I386_ASM=FALSE .. +cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_OSX_ARCHITECTURES=x86_64 .. make -j$NCPU