From 1f70d4e242d96cac742e06a048774b70f9b06657 Mon Sep 17 00:00:00 2001 From: notaz Date: Sun, 24 Apr 2016 20:16:04 +0300 Subject: [PATCH] rework asm to always fall back to portable C code with this there is no need to worry about x86 asm switch for other platforms. --- Sources/Engine/Base/Base.h | 4 +- Sources/Engine/Base/Profiling.cpp | 26 +- Sources/Engine/Base/Timer.cpp | 2 +- Sources/Engine/Base/Types.h | 8 +- Sources/Engine/Engine.cpp | 19 +- Sources/Engine/Graphics/Color.cpp | 107 ++- Sources/Engine/Graphics/Color.h | 83 +- .../Engine/Graphics/DrawPort_RenderScene.cpp | 43 +- Sources/Engine/Graphics/Fog.cpp | 29 +- .../Engine/Graphics/Gfx_OpenGL_Textures.cpp | 52 +- Sources/Engine/Graphics/Graphics.cpp | 799 +++++++++--------- Sources/Engine/Graphics/OpenGL.h | 14 +- Sources/Engine/Graphics/TextureEffects.cpp | 13 +- Sources/Engine/Light/LayerMixer.cpp | 175 ++-- Sources/Engine/Math/Float.cpp | 31 +- Sources/Engine/Math/Functions.h | 65 +- Sources/Engine/Models/RenderModel_View.cpp | 32 +- Sources/Engine/Rendering/RendMisc.cpp | 8 +- Sources/Engine/Sound/SoundMixer.cpp | 298 +++---- 19 files changed, 790 insertions(+), 1018 deletions(-) diff --git a/Sources/Engine/Base/Base.h b/Sources/Engine/Base/Base.h index fd032f3..12151f1 100644 --- a/Sources/Engine/Base/Base.h +++ b/Sources/Engine/Base/Base.h @@ -65,9 +65,7 @@ with this program; if not, write to the Free Software Foundation, Inc., #else #warning "UNKNOWN PLATFORM IDENTIFIED!!!!" #define PLATFORM_UNKNOWN 1 - #warning "USING PORTABLE C!!!" - #define USE_PORTABLE_C -#endif +#endif #if PLATFORM_LINUX || PLATFORM_MACOSX #ifndef PLATFORM_UNIX diff --git a/Sources/Engine/Base/Profiling.cpp b/Sources/Engine/Base/Profiling.cpp index 744c740..87bd3b7 100644 --- a/Sources/Engine/Base/Profiling.cpp +++ b/Sources/Engine/Base/Profiling.cpp @@ -21,24 +21,13 @@ with this program; if not, write to the Free Software Foundation, Inc., template class CStaticArray; template class CStaticArray; -#if (defined USE_PORTABLE_C) +#if (defined PLATFORM_UNIX) && !defined(__GNU_INLINE_X86_32__) #include #endif static inline __int64 ReadTSC_profile(void) { -#if (defined USE_PORTABLE_C) - #ifdef __arm__ - struct timespec tv; - clock_gettime(CLOCK_MONOTONIC, &tv); - return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_nsec) / 1000000) ); - #else - struct timeval tv; - gettimeofday(&tv, NULL); - return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_usec) / 1000) ); - #endif - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __int64 mmRet; __asm { rdtsc @@ -60,7 +49,16 @@ static inline __int64 ReadTSC_profile(void) return(mmRet); #else - #error Please implement for your platform/compiler. + #ifdef __arm__ + struct timespec tv; + clock_gettime(CLOCK_MONOTONIC, &tv); + return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_nsec) / 1000000) ); + #else + struct timeval tv; + gettimeofday(&tv, NULL); + return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_usec) / 1000) ); + #endif + #endif } diff --git a/Sources/Engine/Base/Timer.cpp b/Sources/Engine/Base/Timer.cpp index 5c77caa..8c02846 100755 --- a/Sources/Engine/Base/Timer.cpp +++ b/Sources/Engine/Base/Timer.cpp @@ -29,7 +29,7 @@ with this program; if not, write to the Free Software Foundation, Inc., #include // !!! FIXME: use SDL timer code instead and rdtsc never? -#if (USE_PORTABLE_C) +#if (defined PLATFORM_UNIX) && !defined(__GNU_INLINE_X86_32__) #define USE_GETTIMEOFDAY 1 #endif diff --git a/Sources/Engine/Base/Types.h b/Sources/Engine/Base/Types.h index bfa030f..fe1672a 100644 --- a/Sources/Engine/Base/Types.h +++ b/Sources/Engine/Base/Types.h @@ -229,10 +229,7 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*)); inline ULONG _rotl(ULONG ul, int bits) { - #if (defined USE_PORTABLE_C) - // DG: according to http://blog.regehr.org/archives/1063 this is fast - return (ul<>(-bits&31)); - #elif (defined __GNU_INLINE_X86_32__) + #if (defined __GNU_INLINE_X86_32__) // This, on the other hand, is wicked fast. :) __asm__ __volatile__ ( "roll %%cl, %%eax \n\t" @@ -254,7 +251,8 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*)); return(ul); #else - #error need inline asm for your platform. + // DG: according to http://blog.regehr.org/archives/1063 this is fast + return (ul<>(-bits&31)); #endif } diff --git a/Sources/Engine/Engine.cpp b/Sources/Engine/Engine.cpp index 1ce48b1..4ca653e 100644 --- a/Sources/Engine/Engine.cpp +++ b/Sources/Engine/Engine.cpp @@ -125,14 +125,10 @@ BOOL APIENTRY DllMain( HANDLE hModule, DWORD ul_reason_for_call, LPVOID lpReser static void DetectCPU(void) { -#if (defined USE_PORTABLE_C) // rcg10072001 - CPrintF(TRANSV(" (No CPU detection in this binary.)\n")); - -#else - char strVendor[12+1]; + char strVendor[12+1] = { 0 }; strVendor[12] = 0; - ULONG ulTFMS; - ULONG ulFeatures; + ULONG ulTFMS = 0; + ULONG ulFeatures = 0; #if (defined __MSVC_INLINE__) // test MMX presence and update flag @@ -181,10 +177,13 @@ static void DetectCPU(void) : "eax", "ecx", "edx", "memory" ); - #else - #error Please implement for your platform or define USE_PORTABLE_C. #endif + if (ulTFMS == 0) { + CPrintF(TRANSV(" (No CPU detection in this binary.)\n")); + return; + } + INDEX iType = (ulTFMS>>12)&0x3; INDEX iFamily = (ulTFMS>> 8)&0xF; INDEX iModel = (ulTFMS>> 4)&0xF; @@ -215,8 +214,6 @@ static void DetectCPU(void) sys_iCPUMHz = INDEX(_pTimer->tm_llCPUSpeedHZ/1E6); if( !bMMX) FatalError( TRANS("MMX support required but not present!")); - -#endif // defined USE_PORTABLE_C } static void DetectCPUWrapper(void) diff --git a/Sources/Engine/Graphics/Color.cpp b/Sources/Engine/Graphics/Color.cpp index 1d88614..6a6519c 100644 --- a/Sources/Engine/Graphics/Color.cpp +++ b/Sources/Engine/Graphics/Color.cpp @@ -247,30 +247,7 @@ COLOR MulColors( COLOR col1, COLOR col2) if( col2==0xFFFFFFFF) return col1; if( col1==0 || col2==0) return 0; -#if (defined USE_PORTABLE_C) - // !!! FIXME: This...is not fast. - union - { - COLOR col; - UBYTE bytes[4]; - } conv1; - - union - { - COLOR col; - UBYTE bytes[4]; - } conv2; - - conv1.col = col1; - conv2.col = col2; - conv1.bytes[0] = (UBYTE) ((((DWORD) conv1.bytes[0]) * ((DWORD) conv2.bytes[0])) / 255); - conv1.bytes[1] = (UBYTE) ((((DWORD) conv1.bytes[1]) * ((DWORD) conv2.bytes[1])) / 255); - conv1.bytes[2] = (UBYTE) ((((DWORD) conv1.bytes[2]) * ((DWORD) conv2.bytes[2])) / 255); - conv1.bytes[3] = (UBYTE) ((((DWORD) conv1.bytes[3]) * ((DWORD) conv2.bytes[3])) / 255); - - return(conv1.col); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) COLOR colRet; __asm { xor ebx,ebx @@ -433,20 +410,6 @@ COLOR MulColors( COLOR col1, COLOR col2) return colRet; #else - #error please fill in inline assembly for your platform. -#endif -} - - -// fast color additon function - RES = clamp (1ST + 2ND) -COLOR AddColors( COLOR col1, COLOR col2) -{ - if( col1==0) return col2; - if( col2==0) return col1; - if( col1==0xFFFFFFFF || col2==0xFFFFFFFF) return 0xFFFFFFFF; - COLOR colRet; - -#if (defined USE_PORTABLE_C) // !!! FIXME: This...is not fast. union { @@ -459,19 +422,28 @@ COLOR AddColors( COLOR col1, COLOR col2) COLOR col; UBYTE bytes[4]; } conv2; - #define MINVAL(a, b) ((a)>(b))?(b):(a) conv1.col = col1; conv2.col = col2; - conv1.bytes[0] = (UBYTE) MINVAL((((WORD) conv1.bytes[0]) + ((WORD) conv2.bytes[0])) , 255); - conv1.bytes[1] = (UBYTE) MINVAL((((WORD) conv1.bytes[1]) + ((WORD) conv2.bytes[1])) , 255); - conv1.bytes[2] = (UBYTE) MINVAL((((WORD) conv1.bytes[2]) + ((WORD) conv2.bytes[2])) , 255); - conv1.bytes[3] = (UBYTE) MINVAL((((WORD) conv1.bytes[3]) + ((WORD) conv2.bytes[3])) , 255); - #undef MINVAL + conv1.bytes[0] = (UBYTE) ((((DWORD) conv1.bytes[0]) * ((DWORD) conv2.bytes[0])) / 255); + conv1.bytes[1] = (UBYTE) ((((DWORD) conv1.bytes[1]) * ((DWORD) conv2.bytes[1])) / 255); + conv1.bytes[2] = (UBYTE) ((((DWORD) conv1.bytes[2]) * ((DWORD) conv2.bytes[2])) / 255); + conv1.bytes[3] = (UBYTE) ((((DWORD) conv1.bytes[3]) * ((DWORD) conv2.bytes[3])) / 255); - colRet = conv1.col; + return(conv1.col); +#endif +} -#elif (defined __MSVC_INLINE__) + +// fast color additon function - RES = clamp (1ST + 2ND) +COLOR AddColors( COLOR col1, COLOR col2) +{ + if( col1==0) return col2; + if( col2==0) return col1; + if( col1==0xFFFFFFFF || col2==0xFFFFFFFF) return 0xFFFFFFFF; + COLOR colRet; + +#if (defined __MSVC_INLINE__) __asm { xor ebx,ebx mov esi,255 @@ -608,7 +580,29 @@ COLOR AddColors( COLOR col1, COLOR col2) ); #else - #error please fill in inline assembly for your platform. + // !!! FIXME: This...is not fast. + union + { + COLOR col; + UBYTE bytes[4]; + } conv1; + + union + { + COLOR col; + UBYTE bytes[4]; + } conv2; + #define MINVAL(a, b) ((a)>(b))?(b):(a) + + conv1.col = col1; + conv2.col = col2; + conv1.bytes[0] = (UBYTE) MINVAL((((WORD) conv1.bytes[0]) + ((WORD) conv2.bytes[0])) , 255); + conv1.bytes[1] = (UBYTE) MINVAL((((WORD) conv1.bytes[1]) + ((WORD) conv2.bytes[1])) , 255); + conv1.bytes[2] = (UBYTE) MINVAL((((WORD) conv1.bytes[2]) + ((WORD) conv2.bytes[2])) , 255); + conv1.bytes[3] = (UBYTE) MINVAL((((WORD) conv1.bytes[3]) + ((WORD) conv2.bytes[3])) , 255); + #undef MINVAL + + colRet = conv1.col; #endif return colRet; @@ -619,14 +613,7 @@ COLOR AddColors( COLOR col1, COLOR col2) // multiple conversion from OpenGL color to DirectX color extern void abgr2argb( ULONG *pulSrc, ULONG *pulDst, INDEX ct) { -#if (defined USE_PORTABLE_C) - //#error write me. - for (int i=0; i>16) | ((tmp&0x000000ff)<<16); - } - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { mov esi,dword ptr [pulSrc] mov edi,dword ptr [pulDst] @@ -678,12 +665,12 @@ colSkip2: mov dword ptr [edi],eax colSkip1: } - -#elif (defined __GNU_INLINE_X86_32__) - STUBBED("convert to inline asm."); - #else - #error please fill in inline assembly for your platform. + for (int i=0; i>16) | ((tmp&0x000000ff)<<16); + } + #endif } diff --git a/Sources/Engine/Graphics/Color.h b/Sources/Engine/Graphics/Color.h index a7f048a..de0c666 100644 --- a/Sources/Engine/Graphics/Color.h +++ b/Sources/Engine/Graphics/Color.h @@ -204,19 +204,7 @@ ENGINE_API extern COLOR AddColors( COLOR col1, COLOR col2); // fast color addito __forceinline ULONG ByteSwap( ULONG ul) { /* rcg10052001 Platform-wrappers. */ -#if (defined USE_PORTABLE_C) - ul = ( ((ul << 24) ) | - ((ul << 8) & 0x00FF0000) | - ((ul >> 8) & 0x0000FF00) | - ((ul >> 24) ) ); - - #if (defined PLATFORM_BIGENDIAN) - BYTESWAP(ul); // !!! FIXME: May not be right! - #endif - - return(ul); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) ULONG ulRet; __asm { mov eax,dword ptr [ul] @@ -234,16 +222,22 @@ __forceinline ULONG ByteSwap( ULONG ul) return(ul); #else - #error please define for your platform. + ul = ( ((ul << 24) ) | + ((ul << 8) & 0x00FF0000) | + ((ul >> 8) & 0x0000FF00) | + ((ul >> 24) ) ); + + #if (defined PLATFORM_BIGENDIAN) + BYTESWAP(ul); // !!! FIXME: May not be right! + #endif + + return(ul); #endif } __forceinline ULONG rgba2argb( ULONG ul) { -#if (defined USE_PORTABLE_C) - return( (ul << 24) | (ul >> 8) ); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) ULONG ulRet; __asm { mov eax,dword ptr [ul] @@ -263,21 +257,14 @@ __forceinline ULONG rgba2argb( ULONG ul) return ulRet; #else - #error please define for your platform. + return (ul << 24) | (ul >> 8); + #endif } __forceinline ULONG abgr2argb( COLOR col) { -#if (defined USE_PORTABLE_C) - // this could be simplified, this is just a safe conversion from asm code - col = ( ((col << 24) ) | - ((col << 8) & 0x00FF0000) | - ((col >> 8) & 0x0000FF00) | - ((col >> 24) ) ); - return( (col << 24) | (col >> 8) ); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) ULONG ulRet; __asm { mov eax,dword ptr [col] @@ -299,7 +286,13 @@ __forceinline ULONG abgr2argb( COLOR col) return ulRet; #else - #error please define for your platform. + // this could be simplified, this is just a safe conversion from asm code + col = ( ((col << 24) ) | + ((col << 8) & 0x00FF0000) | + ((col >> 8) & 0x0000FF00) | + ((col >> 24) ) ); + return( (col << 24) | (col >> 8) ); + #endif } @@ -311,10 +304,7 @@ extern void abgr2argb( ULONG *pulSrc, ULONG *pulDst, INDEX ct); // fast memory copy of ULONGs inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs) { -#if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX)) - memcpy( pulDst, pulSrc, ctLongs*4); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { cld mov esi,dword ptr [pulSrc] @@ -322,23 +312,8 @@ inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs) mov ecx,dword ptr [ctLongs] rep movsd } - -#elif (defined __GNU_INLINE_X86_32__) - // I haven't benchmarked it, but in many cases, memcpy() becomes an - // inline (asm?) macro on GNU platforms, so this might not be a - // speed gain at all over the USE_PORTABLE_C version. - // You Have Been Warned. --ryan. - __asm__ __volatile__ ( - "cld \n\t" - "rep \n\t" - "movsd \n\t" - : "=S" (pulSrc), "=D" (pulDst), "=c" (ctLongs) - : "S" (pulSrc), "D" (pulDst), "c" (ctLongs) - : "cc", "memory" - ); - #else -# error Please fill this in for your platform. + memcpy( pulDst, pulSrc, ctLongs*4); #endif } @@ -346,11 +321,7 @@ inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs) // fast memory set of ULONGs inline void StoreLongs( ULONG ulVal, ULONG *pulDst, INDEX ctLongs) { -#if (defined USE_PORTABLE_C) - for( INDEX i=0; ispo_ctElements; INDEX *piDst = _aiElements.Push(ctElems); -#if (ASMOPT == 1) - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { mov eax,D [pspo] mov ecx,D [ctElems] @@ -184,7 +173,7 @@ elemRest: mov D [edi],eax elemDone: } - #elif (defined __GNU_INLINE_X86_32__) +#elif (defined __GNU_INLINE_X86_32__) __asm__ __volatile__ ( "movl %[ctElems], %%ecx \n\t" "movl %[piDst], %%edi \n\t" @@ -219,11 +208,6 @@ elemDone: "cc", "memory" ); - #else - #error Please write inline ASM for your platform. - - #endif - #else const INDEX iVtx0Pass = pspo->spo_iVtx0Pass; const INDEX *piSrc = pspo->spo_piElements; @@ -495,9 +479,7 @@ static void RSBinToGroups( ScenePolygon *pspoFirst) // determine maximum used groups ASSERT( _ctGroupsCount); -#if ASMOPT == 1 - - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { mov eax,2 bsr ecx,D [_ctGroupsCount] @@ -505,7 +487,7 @@ static void RSBinToGroups( ScenePolygon *pspoFirst) mov D [_ctGroupsCount],eax } - #elif (defined __GNU_INLINE_X86_32__) +#elif (defined __GNU_INLINE_X86_32__) __asm__ __volatile__ ( "movl $2, %%eax \n\t" "bsrl (%%esi), %%ecx \n\t" @@ -516,11 +498,6 @@ static void RSBinToGroups( ScenePolygon *pspoFirst) : "eax", "ecx", "cc", "memory" ); - #else - #error Please write inline ASM for your platform. - - #endif - #else // emulate x86's bsr opcode...not fast. :/ register DWORD val = _ctGroupsCount; @@ -858,10 +835,7 @@ static void RSSetTextureCoords( ScenePolygon *pspoGroup, INDEX iLayer, INDEX iUn continue; } -// !!! FIXME: rcg11232001 This inline conversion is broken. Use the -// !!! FIXME: rcg11232001 C version for now with GCC. -#if ((ASMOPT == 1) && (!defined __GNU_INLINE_X86_32__) && (!defined __INTEL_COMPILER)) - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { mov esi,D [pspo] mov edi,D [iMappingOffset] @@ -915,7 +889,7 @@ vtxLoop: /* // !!! FIXME: rcg11232001 This inline conversion is broken. Use the // !!! FIXME: rcg11232001 C version for now on Linux. - #elif (defined __GNU_INLINE_X86_32__) +#elif (defined __GNU_INLINE_X86_32__) STUBBED("debug this"); __asm__ __volatile__ ( "0: \n\t" // vtxLoop @@ -956,11 +930,6 @@ vtxLoop: ); */ - #else - #error Please write inline ASM for your platform. - - #endif - #else // diffuse mapping diff --git a/Sources/Engine/Graphics/Fog.cpp b/Sources/Engine/Graphics/Fog.cpp index bf5fb30..2945cb4 100644 --- a/Sources/Engine/Graphics/Fog.cpp +++ b/Sources/Engine/Graphics/Fog.cpp @@ -67,18 +67,7 @@ ULONG PrepareTexture( UBYTE *pubTexture, PIX pixSizeI, PIX pixSizeJ) // need to upload from RGBA format const PIX pixTextureSize = pixSizeI*pixSizeJ; - #if (defined USE_PORTABLE_C) - const UBYTE* src = pubTexture; - DWORD* dst = (DWORD*)(pubTexture+pixTextureSize); - for (int i=0; i> 8) & 0x0000ff00 ) | ((tmp >> 24) & 0x000000ff ); - src++; - dst++; - } - - #elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { mov esi,D [pubTexture] mov edi,D [pubTexture] @@ -95,7 +84,7 @@ pixLoop: jnz pixLoop } - #elif (defined __GNU_INLINE_X86_32__) +#elif (defined __GNU_INLINE_X86_32__) __asm__ __volatile__ ( "movl %[pubTexture], %%esi \n\t" "movl %[pixTextureSize], %%ecx \n\t" @@ -115,10 +104,18 @@ pixLoop: : "eax", "ecx", "esi", "edi", "cc", "memory" ); - #else - #error Write inline ASM for your platform. +#else + const UBYTE* src = pubTexture; + DWORD* dst = (DWORD*)(pubTexture+pixTextureSize); + for (int i=0; i> 8) & 0x0000ff00 ) | ((tmp >> 24) & 0x000000ff ); + src++; + dst++; + } - #endif +#endif // determine internal format extern INDEX gap_bAllowGrayTextures; diff --git a/Sources/Engine/Graphics/Gfx_OpenGL_Textures.cpp b/Sources/Engine/Graphics/Gfx_OpenGL_Textures.cpp index e694f7b..4ad76ab 100644 --- a/Sources/Engine/Graphics/Gfx_OpenGL_Textures.cpp +++ b/Sources/Engine/Graphics/Gfx_OpenGL_Textures.cpp @@ -169,32 +169,7 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV, if( pixSizeV==0) pixSizeV=1; pixSize = pixSizeU*pixSizeV; - #if (defined USE_PORTABLE_C) - // Basically average every other pixel... - UWORD w = 0; - UBYTE *dptr = (UBYTE *) pulDst; - UBYTE *sptr = (UBYTE *) pulSrc; - #if 0 - pixSize *= 4; - for (PIX i = 0; i < pixSize; i++) - { - *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[1])) >> 1 ); - dptr++; - sptr += 2; - } - #else - for (PIX i = 0; i < pixSize; i++) - { - for (PIX j = 0; j < 4; j++) - { - *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[4])) >> 1 ); - dptr++; - sptr++; - } - sptr += 4; - } - #endif - #elif (defined __MSVC_INLINE__) + #if (defined __MSVC_INLINE__) __asm { pxor mm0,mm0 mov esi,D [pulSrc] @@ -244,7 +219,30 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV, ); #else - #error Please write inline ASM for your platform. + // Basically average every other pixel... + UWORD w = 0; + UBYTE *dptr = (UBYTE *) pulDst; + UBYTE *sptr = (UBYTE *) pulSrc; + #if 0 + pixSize *= 4; + for (PIX i = 0; i < pixSize; i++) + { + *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[1])) >> 1 ); + dptr++; + sptr += 2; + } + #else + for (PIX i = 0; i < pixSize; i++) + { + for (PIX j = 0; j < 4; j++) + { + *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[4])) >> 1 ); + dptr++; + sptr++; + } + sptr += 4; + } + #endif #endif // upload mipmap diff --git a/Sources/Engine/Graphics/Graphics.cpp b/Sources/Engine/Graphics/Graphics.cpp index 4693d8c..3d4be59 100644 --- a/Sources/Engine/Graphics/Graphics.cpp +++ b/Sources/Engine/Graphics/Graphics.cpp @@ -209,58 +209,7 @@ static void MakeOneMipmap( ULONG *pulSrcMipmap, ULONG *pulDstMipmap, PIX pixWidt if( bBilinear) // type of filtering? { // BILINEAR - #if (defined USE_PORTABLE_C) - UBYTE *src = (UBYTE *) pulSrcMipmap; - UBYTE *dest = (UBYTE *) pulDstMipmap; - for (int i = 0 ; i < pixHeight; i++) - { - for (int j = 0; j < pixWidth; j++) - { - // Grab pixels from image - UWORD upleft[4]; - UWORD upright[4]; - UWORD downleft[4]; - UWORD downright[4]; - upleft[0] = *(src + 0); - upleft[1] = *(src + 1); - upleft[2] = *(src + 2); - upleft[3] = *(src + 3); - upright[0] = *(src + 4); - upright[1] = *(src + 5); - upright[2] = *(src + 6); - upright[3] = *(src + 7); - - downleft[0] = *(src + pixWidth*8 + 0); - downleft[1] = *(src + pixWidth*8 + 1); - downleft[2] = *(src + pixWidth*8 + 2); - downleft[3] = *(src + pixWidth*8 + 3); - downright[0] = *(src + pixWidth*8 + 4); - downright[1] = *(src + pixWidth*8 + 5); - downright[2] = *(src + pixWidth*8 + 6); - downright[3] = *(src + pixWidth*8 + 7); - - UWORD answer[4]; - answer[0] = upleft[0] + upright[0] + downleft[0] + downright[0] + 2; - answer[1] = upleft[1] + upright[1] + downleft[1] + downright[1] + 2; - answer[2] = upleft[2] + upright[2] + downleft[2] + downright[2] + 2; - answer[3] = upleft[3] + upright[3] + downleft[3] + downright[3] + 2; - answer[0] /= 4; - answer[1] /= 4; - answer[2] /= 4; - answer[3] /= 4; - - *(dest + 0) = answer[0]; - *(dest + 1) = answer[1]; - *(dest + 2) = answer[2]; - *(dest + 3) = answer[3]; - - src += 8; - dest += 4; - } - src += 8*pixWidth; - } - - #elif (defined __MSVC_INLINE__) + #if (defined __MSVC_INLINE__) __asm { pxor mm0,mm0 mov ebx,D [pixWidth] @@ -346,43 +295,63 @@ pixLoopN: ); #else - #error Write inline asm for your platform. + UBYTE *src = (UBYTE *) pulSrcMipmap; + UBYTE *dest = (UBYTE *) pulDstMipmap; + for (int i = 0 ; i < pixHeight; i++) + { + for (int j = 0; j < pixWidth; j++) + { + // Grab pixels from image + UWORD upleft[4]; + UWORD upright[4]; + UWORD downleft[4]; + UWORD downright[4]; + upleft[0] = *(src + 0); + upleft[1] = *(src + 1); + upleft[2] = *(src + 2); + upleft[3] = *(src + 3); + upright[0] = *(src + 4); + upright[1] = *(src + 5); + upright[2] = *(src + 6); + upright[3] = *(src + 7); + + downleft[0] = *(src + pixWidth*8 + 0); + downleft[1] = *(src + pixWidth*8 + 1); + downleft[2] = *(src + pixWidth*8 + 2); + downleft[3] = *(src + pixWidth*8 + 3); + downright[0] = *(src + pixWidth*8 + 4); + downright[1] = *(src + pixWidth*8 + 5); + downright[2] = *(src + pixWidth*8 + 6); + downright[3] = *(src + pixWidth*8 + 7); + + UWORD answer[4]; + answer[0] = upleft[0] + upright[0] + downleft[0] + downright[0] + 2; + answer[1] = upleft[1] + upright[1] + downleft[1] + downright[1] + 2; + answer[2] = upleft[2] + upright[2] + downleft[2] + downright[2] + 2; + answer[3] = upleft[3] + upright[3] + downleft[3] + downright[3] + 2; + answer[0] /= 4; + answer[1] /= 4; + answer[2] /= 4; + answer[3] /= 4; + + *(dest + 0) = answer[0]; + *(dest + 1) = answer[1]; + *(dest + 2) = answer[2]; + *(dest + 3) = answer[3]; + + src += 8; + dest += 4; + } + src += 8*pixWidth; + } + #endif } else { // NEAREST-NEIGHBOUR but with border preserving ULONG ulRowModulo = pixWidth*2 *BYTES_PER_TEXEL; - #if (defined USE_PORTABLE_C) - - PIX offset = 0; - ulRowModulo /= 4; - - for (int q = 0; q < 2; q++) - { - for (PIX i = pixHeight / 2; i > 0; i--) - { - for (PIX j = pixWidth / 2; j > 0; j--) - { - *pulDstMipmap = *(pulSrcMipmap + offset); - pulSrcMipmap += 2; - pulDstMipmap++; - } - - for (PIX j = pixWidth / 2; j > 0; j--) - { - *pulDstMipmap = *(pulSrcMipmap + offset + 1); - pulSrcMipmap += 2; - pulDstMipmap++; - } - - pulSrcMipmap += ulRowModulo; - } - - offset = pixWidth * 2; - } - - #elif (defined __MSVC_INLINE__) + #if (defined __MSVC_INLINE__) __asm { xor ebx,ebx mov esi,D [pulSrcMipmap] @@ -493,7 +462,33 @@ fullEnd: ); #else - #error Write inline asm for your platform. + PIX offset = 0; + ulRowModulo /= 4; + + for (int q = 0; q < 2; q++) + { + for (PIX i = pixHeight / 2; i > 0; i--) + { + for (PIX j = pixWidth / 2; j > 0; j--) + { + *pulDstMipmap = *(pulSrcMipmap + offset); + pulSrcMipmap += 2; + pulDstMipmap++; + } + + for (PIX j = pixWidth / 2; j > 0; j--) + { + *pulDstMipmap = *(pulSrcMipmap + offset + 1); + pulSrcMipmap += 2; + pulDstMipmap++; + } + + pulSrcMipmap += ulRowModulo; + } + + offset = pixWidth * 2; + } + #endif } } @@ -649,7 +644,7 @@ __int64 mmShifter = 0; __int64 mmMask = 0; ULONG *pulDitherTable; -#ifdef USE_PORTABLE_C +#if !(defined __MSVC_INLINE__) && !(defined __GNU_INLINE_X86_32__) extern const UBYTE *pubClipByte; // increment a byte without overflowing it static inline void IncrementByteWithClip( UBYTE &ub, SLONG slAdd) @@ -778,35 +773,7 @@ void DitherBitmap( INDEX iDitherType, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth // ------------------------------- ordered matrix dithering routine ditherOrder: -#if (defined USE_PORTABLE_C) - union uConv - { - ULONG val; - DWORD dwords[2]; - UWORD words[4]; - WORD iwords[4]; - UBYTE bytes[8]; - }; - for (int i=0; i>= mmShifter; } - dith.val &= mmMask; - uConv* src = (uConv*)(pulSrc+i*pixWidth); - uConv* dst = (uConv*)(pulDst+i*pixWidth); - for (int j=0; j>= mmShifter; } + dith.val &= mmMask; + uConv* src = (uConv*)(pulSrc+i*pixWidth); + uConv* dst = (uConv*)(pulDst+i*pixWidth); + for (int j=0; j>4; - p5.words[k] = (p1.words[k]*5)>>4; - p7.words[k] = (p1.words[k]*7)>>4; } - for (int k=0; k<4; k++) { p1.words[k] -= (p3.words[k] + p5.words[k] + p7.words[k]);} - for (int k=0; k<4; k++) { - IncrementByteWithClip( src[k + step] , p7.words[k]); - IncrementByteWithClip( src[pixCanvasWidth*4 -step +k], p5.words[k]); - IncrementByteWithClip( src[pixCanvasWidth*4 +0 +k], p3.words[k]); - IncrementByteWithClip( src[pixCanvasWidth*4 +step +k], p1.words[k]); - } - } - } - #endif - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { pxor mm0,mm0 mov esi,D [pulDst] @@ -1157,7 +1123,32 @@ allDoneE: ); #else - #error Write inline asm for your platform. + #if 1 //SEB doesn't works.... + for (int i=0; i>4; + p5.words[k] = (p1.words[k]*5)>>4; + p7.words[k] = (p1.words[k]*7)>>4; } + for (int k=0; k<4; k++) { p1.words[k] -= (p3.words[k] + p5.words[k] + p7.words[k]);} + for (int k=0; k<4; k++) { + IncrementByteWithClip( src[k + step] , p7.words[k]); + IncrementByteWithClip( src[pixCanvasWidth*4 -step +k], p5.words[k]); + IncrementByteWithClip( src[pixCanvasWidth*4 +0 +k], p3.words[k]); + IncrementByteWithClip( src[pixCanvasWidth*4 +step +k], p1.words[k]); + } + } + } + #endif + #endif goto theEnd; @@ -1265,7 +1256,7 @@ extern "C" { } -#ifdef USE_PORTABLE_C +#if !(defined USE_MMX_INTRINSICS) && !(defined __MSVC_INLINE__) && !(defined __GNU_INLINE_X86_32__) typedef SWORD ExtPix[4]; static inline void extpix_fromi64(ExtPix &pix, const __int64 i64) @@ -1632,265 +1623,6 @@ void FilterBitmap( INDEX iFilter, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth, PI _mm_empty(); // we're done, clear out the MMX registers! -#elif (defined USE_PORTABLE_C) - slModulo1 /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type - slCanvasWidth /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type - - ULONG *src = pulSrc; - ULONG *dst = pulDst; - ULONG *rowptr = aulRows; - - ExtPix rmm1={0}, rmm2={0}, rmm3={0}, rmm4={0}, rmm5={0}, rmm6={0}, rmm7={0}; - #define EXTPIXFROMINT64(x) ExtPix r##x; extpix_fromi64(r##x, x); - EXTPIXFROMINT64(mmCm); - EXTPIXFROMINT64(mmCe); - EXTPIXFROMINT64(mmCc); - EXTPIXFROMINT64(mmEch); - EXTPIXFROMINT64(mmEcl); - EXTPIXFROMINT64(mmEe); - EXTPIXFROMINT64(mmEm); - EXTPIXFROMINT64(mmMm); - EXTPIXFROMINT64(mmMe); - EXTPIXFROMINT64(mmMc); - EXTPIXFROMINT64(mmAdd); - EXTPIXFROMINT64(mmInvDiv); - #undef EXTPIXFROMINT64 - - // ----------------------- process upper left corner - extend_pixel(src[0], rmm1); - extend_pixel(src[1], rmm2); - extend_pixel(src[pixCanvasWidth], rmm3); - extend_pixel(src[pixCanvasWidth+1], rmm4); - - extpix_add(rmm2, rmm3); - extpix_mul(rmm1, rmmCm); - extpix_mul(rmm2, rmmCe); - extpix_mul(rmm4, rmmCc); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm4); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - *(rowptr++) = unextend_pixel(rmm1); - - src++; - - // ----------------------- process upper edge pixels - for (PIX i = pixWidth - 2; i != 0; i--) - { - extend_pixel(src[-1], rmm1); - extend_pixel(src[0], rmm2); - extend_pixel(src[1], rmm3); - extend_pixel(src[pixCanvasWidth-1], rmm4); - extend_pixel(src[pixCanvasWidth], rmm5); - extend_pixel(src[pixCanvasWidth+1], rmm6); - - extpix_add(rmm1, rmm3); - extpix_add(rmm4, rmm6); - extpix_mul(rmm1, rmmEch); - extpix_mul(rmm2, rmmEm); - extpix_mul(rmm4, rmmEcl); - extpix_mul(rmm5, rmmEe); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm4); - extpix_add(rmm1, rmm5); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - *(rowptr++) = unextend_pixel(rmm1); - src++; - } - - // ----------------------- process upper right corner - - extend_pixel(src[-1], rmm1); - extend_pixel(src[0], rmm2); - extend_pixel(src[pixCanvasWidth-1], rmm3); - extend_pixel(src[pixCanvasWidth], rmm4); - - extpix_add(rmm1, rmm4); - extpix_mul(rmm1, rmmCe); - extpix_mul(rmm2, rmmCm); - extpix_mul(rmm3, rmmCc); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm3); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - *rowptr = unextend_pixel(rmm1); - -// ----------------------- process bitmap middle pixels - - dst += slCanvasWidth; - src += slModulo1; - - // for each row - for (size_t i = pixHeight-2; i != 0; i--) // rowLoop - { - rowptr = aulRows; - - // process left edge pixel - extend_pixel(src[-pixCanvasWidth], rmm1); - extend_pixel(src[(-pixCanvasWidth)+1], rmm2); - extend_pixel(src[0], rmm3); - extend_pixel(src[1], rmm4); - extend_pixel(src[pixCanvasWidth], rmm5); - extend_pixel(src[pixCanvasWidth+1], rmm6); - - extpix_add(rmm1, rmm5); - extpix_add(rmm2, rmm6); - extpix_mul(rmm1, rmmEch); - extpix_mul(rmm2, rmmEcl); - extpix_mul(rmm3, rmmEm); - extpix_mul(rmm4, rmmEe); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm3); - extpix_add(rmm1, rmm4); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - dst[-pixCanvasWidth] = *rowptr; - *(rowptr++) = unextend_pixel(rmm1); - src++; - dst++; - - // for each pixel in current row - for (size_t j = pixWidth-2; j != 0; j--) // pixLoop - { - // prepare upper convolution row - extend_pixel(src[(-pixCanvasWidth)-1], rmm1); - extend_pixel(src[-pixCanvasWidth], rmm2); - extend_pixel(src[(-pixCanvasWidth)+1], rmm3); - - // prepare middle convolution row - extend_pixel(src[-1], rmm4); - extend_pixel(src[0], rmm5); - extend_pixel(src[1], rmm6); - - // free some registers - extpix_add(rmm1, rmm3); - extpix_add(rmm2, rmm4); - extpix_mul(rmm5, rmmMm); - - // prepare lower convolution row - extend_pixel(src[pixCanvasWidth-1], rmm3); - extend_pixel(src[pixCanvasWidth], rmm4); - extend_pixel(src[pixCanvasWidth+1], rmm7); - - // calc weightened value - extpix_add(rmm2, rmm6); - extpix_add(rmm1, rmm3); - extpix_add(rmm2, rmm4); - extpix_add(rmm1, rmm7); - extpix_mul(rmm2, rmmMe); - extpix_mul(rmm1, rmmMc); - extpix_add(rmm2, rmm5); - extpix_add(rmm1, rmm2); - - // calc and store wightened value - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - dst[-pixCanvasWidth] = *rowptr; - *(rowptr++) = unextend_pixel(rmm1); - - // advance to next pixel - src++; - dst++; - } - - // process right edge pixel - extend_pixel(src[(-pixCanvasWidth)-1], rmm1); - extend_pixel(src[-pixCanvasWidth], rmm2); - extend_pixel(src[-1], rmm3); - extend_pixel(src[0], rmm4); - extend_pixel(src[pixCanvasWidth-1], rmm5); - extend_pixel(src[pixCanvasWidth], rmm6); - - extpix_add(rmm1, rmm5); - extpix_add(rmm2, rmm6); - extpix_mul(rmm1, rmmEcl); - extpix_mul(rmm2, rmmEch); - extpix_mul(rmm3, rmmEe); - extpix_mul(rmm4, rmmEm); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm3); - extpix_add(rmm1, rmm4); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - dst[-pixCanvasWidth] = *rowptr; - *rowptr = unextend_pixel(rmm1); - - // advance to next row - src += slModulo1; - dst += slModulo1; - } - - // ----------------------- process lower left corner - rowptr = aulRows; - extend_pixel(src[-pixCanvasWidth], rmm1); - extend_pixel(src[(-pixCanvasWidth)+1], rmm2); - extend_pixel(src[0], rmm3); - extend_pixel(src[1], rmm4); - - extpix_add(rmm1, rmm4); - extpix_mul(rmm1, rmmCe); - extpix_mul(rmm2, rmmCc); - extpix_mul(rmm3, rmmCm); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm3); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - dst[-pixCanvasWidth] = *rowptr; - dst[0] = unextend_pixel(rmm1); - - src++; - dst++; - rowptr++; - - // ----------------------- process lower edge pixels - for (size_t i = pixWidth-2; i != 0; i--) // lowerLoop - { - // for each pixel - extend_pixel(src[(-pixCanvasWidth)-1], rmm1); - extend_pixel(src[-pixCanvasWidth], rmm2); - extend_pixel(src[(-pixCanvasWidth)+1], rmm3); - extend_pixel(src[-1], rmm4); - extend_pixel(src[0], rmm5); - extend_pixel(src[1], rmm6); - - extpix_add(rmm1, rmm3); - extpix_add(rmm4, rmm6); - extpix_mul(rmm1, rmmEcl); - extpix_mul(rmm2, rmmEe); - extpix_mul(rmm4, rmmEch); - extpix_mul(rmm5, rmmEm); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm4); - extpix_add(rmm1, rmm5); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - dst[-pixCanvasWidth] = *rowptr; - dst[0] = unextend_pixel(rmm1); - - // advance to next pixel - src++; - dst++; - rowptr++; - } - - // ----------------------- lower right corners - extend_pixel(src[(-pixCanvasWidth)-1], rmm1); - extend_pixel(src[-pixCanvasWidth], rmm2); - extend_pixel(src[-1], rmm3); - extend_pixel(src[0], rmm4); - - extpix_add(rmm2, rmm3); - extpix_mul(rmm1, rmmCc); - extpix_mul(rmm2, rmmCe); - extpix_mul(rmm4, rmmCm); - extpix_add(rmm1, rmm2); - extpix_add(rmm1, rmm4); - extpix_adds(rmm1, rmmAdd); - extpix_mulhi(rmm1, rmmInvDiv); - dst[-pixCanvasWidth] = *rowptr; - dst[0] = unextend_pixel(rmm1); - #elif (defined __MSVC_INLINE__) __asm { cld @@ -2537,7 +2269,264 @@ lowerLoop: ); #else - #error Write inline asm for your platform. + slModulo1 /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type + slCanvasWidth /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type + + ULONG *src = pulSrc; + ULONG *dst = pulDst; + ULONG *rowptr = aulRows; + + ExtPix rmm1={0}, rmm2={0}, rmm3={0}, rmm4={0}, rmm5={0}, rmm6={0}, rmm7={0}; + #define EXTPIXFROMINT64(x) ExtPix r##x; extpix_fromi64(r##x, x); + EXTPIXFROMINT64(mmCm); + EXTPIXFROMINT64(mmCe); + EXTPIXFROMINT64(mmCc); + EXTPIXFROMINT64(mmEch); + EXTPIXFROMINT64(mmEcl); + EXTPIXFROMINT64(mmEe); + EXTPIXFROMINT64(mmEm); + EXTPIXFROMINT64(mmMm); + EXTPIXFROMINT64(mmMe); + EXTPIXFROMINT64(mmMc); + EXTPIXFROMINT64(mmAdd); + EXTPIXFROMINT64(mmInvDiv); + #undef EXTPIXFROMINT64 + + // ----------------------- process upper left corner + extend_pixel(src[0], rmm1); + extend_pixel(src[1], rmm2); + extend_pixel(src[pixCanvasWidth], rmm3); + extend_pixel(src[pixCanvasWidth+1], rmm4); + + extpix_add(rmm2, rmm3); + extpix_mul(rmm1, rmmCm); + extpix_mul(rmm2, rmmCe); + extpix_mul(rmm4, rmmCc); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm4); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + *(rowptr++) = unextend_pixel(rmm1); + + src++; + + // ----------------------- process upper edge pixels + for (PIX i = pixWidth - 2; i != 0; i--) + { + extend_pixel(src[-1], rmm1); + extend_pixel(src[0], rmm2); + extend_pixel(src[1], rmm3); + extend_pixel(src[pixCanvasWidth-1], rmm4); + extend_pixel(src[pixCanvasWidth], rmm5); + extend_pixel(src[pixCanvasWidth+1], rmm6); + + extpix_add(rmm1, rmm3); + extpix_add(rmm4, rmm6); + extpix_mul(rmm1, rmmEch); + extpix_mul(rmm2, rmmEm); + extpix_mul(rmm4, rmmEcl); + extpix_mul(rmm5, rmmEe); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm4); + extpix_add(rmm1, rmm5); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + *(rowptr++) = unextend_pixel(rmm1); + src++; + } + + // ----------------------- process upper right corner + + extend_pixel(src[-1], rmm1); + extend_pixel(src[0], rmm2); + extend_pixel(src[pixCanvasWidth-1], rmm3); + extend_pixel(src[pixCanvasWidth], rmm4); + + extpix_add(rmm1, rmm4); + extpix_mul(rmm1, rmmCe); + extpix_mul(rmm2, rmmCm); + extpix_mul(rmm3, rmmCc); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm3); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + *rowptr = unextend_pixel(rmm1); + +// ----------------------- process bitmap middle pixels + + dst += slCanvasWidth; + src += slModulo1; + + // for each row + for (size_t i = pixHeight-2; i != 0; i--) // rowLoop + { + rowptr = aulRows; + + // process left edge pixel + extend_pixel(src[-pixCanvasWidth], rmm1); + extend_pixel(src[(-pixCanvasWidth)+1], rmm2); + extend_pixel(src[0], rmm3); + extend_pixel(src[1], rmm4); + extend_pixel(src[pixCanvasWidth], rmm5); + extend_pixel(src[pixCanvasWidth+1], rmm6); + + extpix_add(rmm1, rmm5); + extpix_add(rmm2, rmm6); + extpix_mul(rmm1, rmmEch); + extpix_mul(rmm2, rmmEcl); + extpix_mul(rmm3, rmmEm); + extpix_mul(rmm4, rmmEe); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm3); + extpix_add(rmm1, rmm4); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + dst[-pixCanvasWidth] = *rowptr; + *(rowptr++) = unextend_pixel(rmm1); + src++; + dst++; + + // for each pixel in current row + for (size_t j = pixWidth-2; j != 0; j--) // pixLoop + { + // prepare upper convolution row + extend_pixel(src[(-pixCanvasWidth)-1], rmm1); + extend_pixel(src[-pixCanvasWidth], rmm2); + extend_pixel(src[(-pixCanvasWidth)+1], rmm3); + + // prepare middle convolution row + extend_pixel(src[-1], rmm4); + extend_pixel(src[0], rmm5); + extend_pixel(src[1], rmm6); + + // free some registers + extpix_add(rmm1, rmm3); + extpix_add(rmm2, rmm4); + extpix_mul(rmm5, rmmMm); + + // prepare lower convolution row + extend_pixel(src[pixCanvasWidth-1], rmm3); + extend_pixel(src[pixCanvasWidth], rmm4); + extend_pixel(src[pixCanvasWidth+1], rmm7); + + // calc weightened value + extpix_add(rmm2, rmm6); + extpix_add(rmm1, rmm3); + extpix_add(rmm2, rmm4); + extpix_add(rmm1, rmm7); + extpix_mul(rmm2, rmmMe); + extpix_mul(rmm1, rmmMc); + extpix_add(rmm2, rmm5); + extpix_add(rmm1, rmm2); + + // calc and store wightened value + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + dst[-pixCanvasWidth] = *rowptr; + *(rowptr++) = unextend_pixel(rmm1); + + // advance to next pixel + src++; + dst++; + } + + // process right edge pixel + extend_pixel(src[(-pixCanvasWidth)-1], rmm1); + extend_pixel(src[-pixCanvasWidth], rmm2); + extend_pixel(src[-1], rmm3); + extend_pixel(src[0], rmm4); + extend_pixel(src[pixCanvasWidth-1], rmm5); + extend_pixel(src[pixCanvasWidth], rmm6); + + extpix_add(rmm1, rmm5); + extpix_add(rmm2, rmm6); + extpix_mul(rmm1, rmmEcl); + extpix_mul(rmm2, rmmEch); + extpix_mul(rmm3, rmmEe); + extpix_mul(rmm4, rmmEm); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm3); + extpix_add(rmm1, rmm4); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + dst[-pixCanvasWidth] = *rowptr; + *rowptr = unextend_pixel(rmm1); + + // advance to next row + src += slModulo1; + dst += slModulo1; + } + + // ----------------------- process lower left corner + rowptr = aulRows; + extend_pixel(src[-pixCanvasWidth], rmm1); + extend_pixel(src[(-pixCanvasWidth)+1], rmm2); + extend_pixel(src[0], rmm3); + extend_pixel(src[1], rmm4); + + extpix_add(rmm1, rmm4); + extpix_mul(rmm1, rmmCe); + extpix_mul(rmm2, rmmCc); + extpix_mul(rmm3, rmmCm); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm3); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + dst[-pixCanvasWidth] = *rowptr; + dst[0] = unextend_pixel(rmm1); + + src++; + dst++; + rowptr++; + + // ----------------------- process lower edge pixels + for (size_t i = pixWidth-2; i != 0; i--) // lowerLoop + { + // for each pixel + extend_pixel(src[(-pixCanvasWidth)-1], rmm1); + extend_pixel(src[-pixCanvasWidth], rmm2); + extend_pixel(src[(-pixCanvasWidth)+1], rmm3); + extend_pixel(src[-1], rmm4); + extend_pixel(src[0], rmm5); + extend_pixel(src[1], rmm6); + + extpix_add(rmm1, rmm3); + extpix_add(rmm4, rmm6); + extpix_mul(rmm1, rmmEcl); + extpix_mul(rmm2, rmmEe); + extpix_mul(rmm4, rmmEch); + extpix_mul(rmm5, rmmEm); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm4); + extpix_add(rmm1, rmm5); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + dst[-pixCanvasWidth] = *rowptr; + dst[0] = unextend_pixel(rmm1); + + // advance to next pixel + src++; + dst++; + rowptr++; + } + + // ----------------------- lower right corners + extend_pixel(src[(-pixCanvasWidth)-1], rmm1); + extend_pixel(src[-pixCanvasWidth], rmm2); + extend_pixel(src[-1], rmm3); + extend_pixel(src[0], rmm4); + + extpix_add(rmm2, rmm3); + extpix_mul(rmm1, rmmCc); + extpix_mul(rmm2, rmmCe); + extpix_mul(rmm4, rmmCm); + extpix_add(rmm1, rmm2); + extpix_add(rmm1, rmm4); + extpix_adds(rmm1, rmmAdd); + extpix_mulhi(rmm1, rmmInvDiv); + dst[-pixCanvasWidth] = *rowptr; + dst[0] = unextend_pixel(rmm1); + #endif // all done (finally) diff --git a/Sources/Engine/Graphics/OpenGL.h b/Sources/Engine/Graphics/OpenGL.h index fe3f137..37b5038 100644 --- a/Sources/Engine/Graphics/OpenGL.h +++ b/Sources/Engine/Graphics/OpenGL.h @@ -89,13 +89,7 @@ extern void (__stdcall *pglPNTrianglesfATI)( GLenum pname, GLfloat param); inline void glCOLOR( COLOR col) { /* rcg10052001 Platform-wrappers. */ -#if (defined USE_PORTABLE_C) - col = ( ((col << 24) ) | - ((col << 8) & 0x00FF0000) | - ((col >> 8) & 0x0000FF00) | - ((col >> 24) ) ); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { mov eax,dword ptr [col] bswap eax @@ -110,7 +104,11 @@ inline void glCOLOR( COLOR col) ); #else - #error please define for your platform. + col = ( ((col << 24) ) | + ((col << 8) & 0x00FF0000) | + ((col >> 8) & 0x0000FF00) | + ((col >> 24) ) ); + #endif pglColor4ubv((GLubyte*)&col); diff --git a/Sources/Engine/Graphics/TextureEffects.cpp b/Sources/Engine/Graphics/TextureEffects.cpp index b50a33e..91fc839 100644 --- a/Sources/Engine/Graphics/TextureEffects.cpp +++ b/Sources/Engine/Graphics/TextureEffects.cpp @@ -32,9 +32,7 @@ with this program; if not, write to the Free Software Foundation, Inc., #define W word ptr #define B byte ptr -#if (defined USE_PORTABLE_C) -#define ASMOPT 0 -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) #define ASMOPT 1 #elif (defined __GNU_INLINE_X86_32__) #define ASMOPT 1 @@ -1285,8 +1283,7 @@ static void RenderWater(void) { // SUB-SAMPLING SLONG slHeightMapStep, slHeightRowStep; -#if ASMOPT == 1 - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { push ebx bsf ecx,D [_pixTexWidth] @@ -1357,7 +1354,7 @@ pixLoop: pop ebx } - #elif (defined __GNU_INLINE_X86_32__) +#elif (defined __GNU_INLINE_X86_32__) // rcg12152001 needed extra registers. :( _slHeightMapStep_renderWater = slHeightMapStep; _pixBaseWidth_renderWater = pixBaseWidth; @@ -1460,10 +1457,6 @@ pixLoop: "cc", "memory" ); - #else - #error fill in for your platform. - #endif - #else PIX pixPos, pixDU, pixDV; diff --git a/Sources/Engine/Light/LayerMixer.cpp b/Sources/Engine/Light/LayerMixer.cpp index 0f1b8d0..26c0e51 100755 --- a/Sources/Engine/Light/LayerMixer.cpp +++ b/Sources/Engine/Light/LayerMixer.cpp @@ -40,16 +40,6 @@ with this program; if not, write to the Free Software Foundation, Inc., #define W word ptr #define B byte ptr -#if (defined USE_PORTABLE_C) - #define ASMOPT 0 -#elif (defined __MSVC_INLINE__) - #define ASMOPT 1 -#elif (defined __GNU_INLINE_X86_32__) - #define ASMOPT 1 -#else - #define ASMOPT 0 -#endif - extern INDEX shd_bFineQuality; extern INDEX shd_iFiltering; extern INDEX shd_iDithering; @@ -290,8 +280,7 @@ void CLayerMixer::AddAmbientPoint(void) _slLightMax<<=7; _slLightStep>>=1; -#if (ASMOPT == 1) - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { // prepare interpolants movd mm0,D [_slL2Row] @@ -364,7 +353,7 @@ skipPixel: emms } - #elif (defined __GNU_INLINE_X86_32__) +#elif (defined __GNU_INLINE_X86_32__) ULONG tmp1, tmp2; __asm__ __volatile__ ( // prepare interpolants @@ -439,10 +428,6 @@ skipPixel: : FPU_REGS, MMX_REGS, "eax", "ecx", "edi", "cc", "memory" ); - #else - #error Write inline asm for your platform. - #endif - #else // !!! FIXME WARNING: I have not checked this code, and it could be @@ -496,8 +481,7 @@ void CLayerMixer::AddAmbientMaskPoint( UBYTE *pubMask, UBYTE ubMask) _slLightStep>>=1; -#if (ASMOPT == 1) - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { // prepare interpolants movd mm0,D [_slL2Row] @@ -576,7 +560,7 @@ skipPixel: emms } - #elif (defined __GNU_INLINE_X86_32__) +#elif (defined __GNU_INLINE_X86_32__) ULONG tmp1, tmp2; __asm__ __volatile__ ( // prepare interpolants @@ -660,10 +644,6 @@ skipPixel: "cc", "memory" ); - #else - #error Please write inline assembly for your platform. - #endif - #else // Portable C version... UBYTE* pubLayer = (UBYTE*)_pulLayer; @@ -723,8 +703,7 @@ void CLayerMixer::AddDiffusionPoint(void) _slLightMax<<=7; _slLightStep>>=1; -#if ASMOPT == 1 - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { // prepare interpolants movd mm0,D [_slL2Row] @@ -796,7 +775,7 @@ skipPixel: emms } - #elif (defined __GNU_INLINE_X86_32__) +#elif (defined __GNU_INLINE_X86_32__) ULONG tmp1, tmp2; __asm__ __volatile__ ( // prepare interpolants @@ -871,10 +850,6 @@ skipPixel: : FPU_REGS, MMX_REGS, "eax", "ecx", "edi", "cc", "memory" ); - #else - #error Write inline assembly for your platform. - #endif - #else // for each pixel in the shadow map UBYTE* pubLayer = (UBYTE*)_pulLayer; @@ -929,8 +904,7 @@ void CLayerMixer::AddDiffusionMaskPoint( UBYTE *pubMask, UBYTE ubMask) _slLightMax<<=7; _slLightStep>>=1; -#if (ASMOPT == 1) - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { // prepare interpolants movd mm0,D [_slL2Row] @@ -1008,7 +982,7 @@ skipPixel: emms } - #elif (defined __GNU_INLINE_X86_32__) +#elif (defined __GNU_INLINE_X86_32__) ULONG tmp1, tmp2; __asm__ __volatile__ ( // prepare interpolants @@ -1091,11 +1065,6 @@ skipPixel: "cc", "memory" ); - #else - #error Write inline ASM for your platform. - - #endif - #else // for each pixel in the shadow map @@ -1201,8 +1170,7 @@ BOOL CLayerMixer::PrepareOneLayerPoint( CBrushShadowLayer *pbsl, BOOL bNoMask) FLOAT fDL2oDV = fDDL2oDV + 2*(lm_vStepV%v00); //_v00 = v00; -#if ((ASMOPT == 1) && (!defined __GNU_INLINE_X86_32__)) - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { fld D [fDDL2oDU] fadd D [fDDL2oDU] @@ -1230,12 +1198,6 @@ BOOL CLayerMixer::PrepareOneLayerPoint( CBrushShadowLayer *pbsl, BOOL bNoMask) fistp D [_slDDL2oDV] fistp D [_slDDL2oDU] } - #elif (defined __GNU_INLINE_X86_32__) - STUBBED("inline asm."); - #else - #error Please write inline assembly for your platform. - #endif - #else fDDL2oDU *= 2; fDDL2oDV *= 2; @@ -1321,8 +1283,7 @@ void CLayerMixer::AddOneLayerGradient( CGradientParameters &gp) _pulLayer = lm_pulShadowMap; FLOAT fStart = Clamp( fGr00-(fDGroDJ+fDGroDI)*0.5f, 0.0f, 1.0f); -#if ((ASMOPT == 1) && (!defined __GNU_INLINE_X86_32__)) - #if (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __int64 mmRowAdv; SLONG fixGRow = (fGr00-(fDGroDJ+fDGroDI)*0.5f)*32767.0f; // 16:15 SLONG slModulo = (lm_pixCanvasSizeU-lm_pixPolygonSizeU) *BYTES_PER_TEXEL; @@ -1436,14 +1397,6 @@ rowNext: rowDone: emms } - #elif (defined __GNU_INLINE_X86_32__) - - STUBBED("WRITE ME. Argh."); - - #else - #error Need inline assembly for your platform. - #endif - #else // well, make gradient ... SLONG slR0=0,slG0=0,slB0=0; @@ -1528,9 +1481,8 @@ rowDone: // apply directional light or ambient to layer void CLayerMixer::AddDirectional(void) { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) ULONG ulLight = ByteSwap( lm_colLight); - #if (defined __MSVC_INLINE__) __asm { // prepare pointers and variables mov edi,D [_pulLayer] @@ -1565,7 +1517,8 @@ rowNext: emms } - #elif (defined __GNU_INLINE_X86_32__) +#elif (defined __GNU_INLINE_X86_32__) + ULONG ulLight = ByteSwap( lm_colLight); ULONG tmp; __asm__ __volatile__ ( // prepare pointers and variables @@ -1608,10 +1561,6 @@ rowNext: : FPU_REGS, "mm5", "mm6", "ecx", "edi", "cc", "memory" ); - #else - #error Write inline assembly for your platform. - #endif - #else UBYTE* pubLayer = (UBYTE*)_pulLayer; // for each pixel in the shadow map @@ -1631,9 +1580,8 @@ rowNext: // apply directional light thru mask to layer void CLayerMixer::AddMaskDirectional( UBYTE *pubMask, UBYTE ubMask) { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) ULONG ulLight = ByteSwap( lm_colLight); - #if (defined __MSVC_INLINE__) // prepare some local variables __asm { // prepare pointers and variables @@ -1665,7 +1613,8 @@ skipLight: emms } - #elif (defined __GNU_INLINE_X86_32__) +#elif (defined __GNU_INLINE_X86_32__) + ULONG ulLight = ByteSwap( lm_colLight); ULONG tmp; __asm__ __volatile__ ( // prepare pointers and variables @@ -1706,10 +1655,6 @@ skipLight: "cc", "memory" ); - #else - #error Please write inline assembly for your platform. - #endif - #else UBYTE* pubLayer = (UBYTE*)_pulLayer; // for each pixel in the shadow map @@ -1832,7 +1777,33 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap) } } // set initial color - #if (defined USE_PORTABLE_C) +#if (defined __MSVC_INLINE__) + __asm { + cld + mov ebx,D [this] + mov ecx,D [ebx].lm_pixCanvasSizeU + imul ecx,D [ebx].lm_pixCanvasSizeV + mov edi,D [ebx].lm_pulShadowMap + mov eax,D [colAmbient] + bswap eax + rep stosd + } + +#elif (defined __GNU_INLINE_X86_32__) + ULONG clob1, clob2, clob3; + __asm__ __volatile__ ( + "cld \n\t" + "imull %%esi, %%ecx \n\t" + "bswapl %%eax \n\t" + "rep \n\t" + "stosl \n\t" + : "=a" (clob1), "=c" (clob2), "=D" (clob3) + : "c" (this->lm_pixCanvasSizeU), "S" (this->lm_pixCanvasSizeV), + "a" (colAmbient), "D" (this->lm_pulShadowMap) + : "cc", "memory" + ); + +#else register ULONG count = this->lm_pixCanvasSizeU * this->lm_pixCanvasSizeV; #if PLATFORM_LITTLEENDIAN // Forces C fallback; BYTESWAP itself is a no-op on little endian. @@ -1850,35 +1821,7 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap) ptr++; } - #elif (defined __MSVC_INLINE__) - __asm { - cld - mov ebx,D [this] - mov ecx,D [ebx].lm_pixCanvasSizeU - imul ecx,D [ebx].lm_pixCanvasSizeV - mov edi,D [ebx].lm_pulShadowMap - mov eax,D [colAmbient] - bswap eax - rep stosd - } - - #elif (defined __GNU_INLINE_X86_32__) - ULONG clob1, clob2, clob3; - __asm__ __volatile__ ( - "cld \n\t" - "imull %%esi, %%ecx \n\t" - "bswapl %%eax \n\t" - "rep \n\t" - "stosl \n\t" - : "=a" (clob1), "=c" (clob2), "=D" (clob3) - : "c" (this->lm_pixCanvasSizeU), "S" (this->lm_pixCanvasSizeV), - "a" (colAmbient), "D" (this->lm_pulShadowMap) - : "cc", "memory" - ); - - #else - #error Please write inline assembly for your platform. - #endif +#endif _pfWorldEditingProfile.StopTimer(CWorldEditingProfile::PTI_AMBIENTFILL); @@ -1955,9 +1898,7 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap) // copy from static shadow map to dynamic layer __forceinline void CLayerMixer::CopyShadowLayer(void) { - #if (defined USE_PORTABLE_C) - memcpy(lm_pulShadowMap, lm_pulStaticShadowMap, lm_pixCanvasSizeU*lm_pixCanvasSizeV*4); - #elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { cld mov ebx,D [this] @@ -1967,7 +1908,7 @@ __forceinline void CLayerMixer::CopyShadowLayer(void) mov edi,D [ebx].lm_pulShadowMap rep movsd } - #elif (defined __GNU_INLINE_X86_32__) +#elif (defined __GNU_INLINE_X86_32__) ULONG clob1, clob2, clob3; __asm__ __volatile__ ( "cld \n\t" @@ -1980,21 +1921,16 @@ __forceinline void CLayerMixer::CopyShadowLayer(void) : "cc", "memory" ); - #else - #error Please write inline assembly for your platform. - #endif +#else + memcpy(lm_pulShadowMap, lm_pulStaticShadowMap, lm_pixCanvasSizeU*lm_pixCanvasSizeV*4); +#endif } // copy from static shadow map to dynamic layer __forceinline void CLayerMixer::FillShadowLayer( COLOR col) { - #if (defined USE_PORTABLE_C) - DWORD* dst = (DWORD*)lm_pulShadowMap; - int n = lm_pixCanvasSizeU*lm_pixCanvasSizeV; - DWORD color = __builtin_bswap32(col); - while(n--) {*(dst++)=color;} - #elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) __asm { cld mov ebx,D [this] @@ -2006,7 +1942,7 @@ __forceinline void CLayerMixer::FillShadowLayer( COLOR col) rep stosd } - #elif (defined __GNU_INLINE_X86_32__) +#elif (defined __GNU_INLINE_X86_32__) ULONG clob1, clob2, clob3; __asm__ __volatile__ ( "cld \n\t" @@ -2020,9 +1956,12 @@ __forceinline void CLayerMixer::FillShadowLayer( COLOR col) : "cc", "memory" ); - #else - #error Please write inline assembly for your platform. - #endif +#else + DWORD* dst = (DWORD*)lm_pulShadowMap; + int n = lm_pixCanvasSizeU*lm_pixCanvasSizeV; + DWORD color = __builtin_bswap32(col); + while(n--) {*(dst++)=color;} +#endif } diff --git a/Sources/Engine/Math/Float.cpp b/Sources/Engine/Math/Float.cpp index 7c9f0fc..9f04e16 100755 --- a/Sources/Engine/Math/Float.cpp +++ b/Sources/Engine/Math/Float.cpp @@ -24,18 +24,9 @@ with this program; if not, write to the Free Software Foundation, Inc., #define _PC_64 0x0300 // !!! FIXME: I'd like to remove any dependency on the FPU control word from the game, asap. --ryan. -#ifdef USE_PORTABLE_C -// Fake control87 for USE_PORTABLE_C version -inline ULONG _control87(WORD newcw, WORD mask) -{ - static WORD fpw=_PC_64; - if (mask != 0) - { - fpw &= ~mask; - fpw |= (newcw & mask); - } - return(fpw); -} +#if (defined _MSC_VER) + +// _control87 is provided by the compiler #elif (defined __GNU_INLINE_X86_32__) @@ -74,8 +65,20 @@ inline ULONG _control87(WORD newcw, WORD mask) return(fpw); } -#elif (!defined _MSC_VER) -#error Implement for your platform, or add a stub conditional here. +#else + +// Fake control87 for USE_PORTABLE_C version +inline ULONG _control87(WORD newcw, WORD mask) +{ + static WORD fpw=_PC_64; + if (mask != 0) + { + fpw &= ~mask; + fpw |= (newcw & mask); + } + return(fpw); +} + #endif /* Get current precision setting of FPU. */ diff --git a/Sources/Engine/Math/Functions.h b/Sources/Engine/Math/Functions.h index f0e8d03..1108e71 100755 --- a/Sources/Engine/Math/Functions.h +++ b/Sources/Engine/Math/Functions.h @@ -312,12 +312,7 @@ inline FLOAT NormByteToFloat( const ULONG ul) // fast float to int conversion inline SLONG FloatToInt( FLOAT f) { -#if defined(__arm__) || defined(USE_PORTABLE_C) - // round to nearest by adding/subtracting 0.5 (depending on f pos/neg) before converting to SLONG - float addToRound = copysignf(0.5f, f); // copy f's signbit to 0.5 => if f<0 then addToRound = -0.5, else 0.5 - return((SLONG) (f + addToRound)); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) SLONG slRet; __asm { fld D [f] @@ -336,16 +331,16 @@ inline SLONG FloatToInt( FLOAT f) ); return(slRet); #else - #error Fill this in for your platform. + // round to nearest by adding/subtracting 0.5 (depending on f pos/neg) before converting to SLONG + float addToRound = copysignf(0.5f, f); // copy f's signbit to 0.5 => if f<0 then addToRound = -0.5, else 0.5 + return((SLONG) (f + addToRound)); + #endif } // log base 2 of any float numero inline FLOAT Log2( FLOAT f) { -#if (defined USE_PORTABLE_C) || defined(__arm__) - return log2f(f); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) FLOAT fRet; _asm { fld1 @@ -368,7 +363,8 @@ inline FLOAT Log2( FLOAT f) { ); return(fRet); #else - #error Fill this in for your platform. + return log2f(f); + #endif } @@ -376,25 +372,7 @@ inline FLOAT Log2( FLOAT f) { // returns accurate values only for integers that are power of 2 inline SLONG FastLog2( SLONG x) { -#if (defined USE_PORTABLE_C) -#ifdef __GNUC__ - if(x == 0) return 0; // __builtin_clz() is undefined for 0 - int numLeadingZeros = __builtin_clz(x); - return 31 - numLeadingZeros; -#else - register SLONG val = x; - register SLONG retval = 31; - while (retval > 0) - { - if (val & (1 << retval)) - return retval; - retval--; - } - - return 0; -#endif - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) SLONG slRet; __asm { bsr eax,D [x] @@ -411,8 +389,21 @@ inline SLONG FastLog2( SLONG x) : "memory" ); return(slRet); +#elif (defined __GNUC__) + if(x == 0) return 0; // __builtin_clz() is undefined for 0 + int numLeadingZeros = __builtin_clz(x); + return 31 - numLeadingZeros; #else - #error Fill this in for your platform. + register SLONG val = x; + register SLONG retval = 31; + while (retval > 0) + { + if (val & (1 << retval)) + return retval; + retval--; + } + + return 0; #endif } @@ -420,11 +411,7 @@ inline SLONG FastLog2( SLONG x) // returns log2 of first larger value that is a power of 2 inline SLONG FastMaxLog2( SLONG x) { -#if (defined USE_PORTABLE_C) -printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__); - return((SLONG) log2((double) x)); - -#elif (defined __MSVC_INLINE__) +#if (defined __MSVC_INLINE__) SLONG slRet; __asm { bsr eax,D [x] @@ -448,7 +435,9 @@ printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__); ); return(slRet); #else - #error Fill this in for your platform. +printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__); + return((SLONG) log2((double) x)); + #endif } */ diff --git a/Sources/Engine/Models/RenderModel_View.cpp b/Sources/Engine/Models/RenderModel_View.cpp index 6c574fa..73fb5ef 100644 --- a/Sources/Engine/Models/RenderModel_View.cpp +++ b/Sources/Engine/Models/RenderModel_View.cpp @@ -40,14 +40,6 @@ with this program; if not, write to the Free Software Foundation, Inc., #define W word ptr #define B byte ptr -#if (defined __MSVC_INLINE__) -#define ASMOPT 1 -#elif (defined __GNU_INLINE_X86_32__) -#define ASMOPT 0 // !!! FIXME: rcg10112001 Write GCC inline asm versions... -#else -#define ASMOPT 0 -#endif - extern BOOL CVA_bModels; extern BOOL GFX_bTruform; @@ -663,7 +655,7 @@ static FLOAT _fHazeAdd; // check vertex against fog static void GetFogMapInVertex( GFXVertex3 &vtx, GFXTexCoord &tex) { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) __asm { mov esi,D [vtx] mov edi,D [tex] @@ -708,7 +700,7 @@ static void GetFogMapInVertex( GFXVertex3 &vtx, GFXTexCoord &tex) // check vertex against haze static void GetHazeMapInVertex( GFXVertex3 &vtx, FLOAT &tx1) { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) __asm { mov esi,D [vtx] mov edi,D [tx1] @@ -1080,7 +1072,7 @@ static void UnpackFrame( CRenderModel &rm, BOOL bKeepNormals) const ModelFrameVertex16 *pFrame1 = rm.rm_pFrame16_1; if( pFrame0==pFrame1) { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) // for each vertex in mip const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8 SLONG slTmp1, slTmp2, slTmp3; @@ -1196,7 +1188,7 @@ vtxNext16: // if lerping else { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) // for each vertex in mip const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8 SLONG slTmp1, slTmp2, slTmp3; @@ -1365,7 +1357,7 @@ vtxNext16L: // if no lerping if( pFrame0==pFrame1) { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) // for each vertex in mip const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8 SLONG slTmp1, slTmp2, slTmp3; @@ -1464,7 +1456,7 @@ vtxNext8: // if lerping else { -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8 SLONG slTmp1, slTmp2, slTmp3; // re-adjust stretching factors because of fixint lerping (divide by 256) @@ -1610,7 +1602,7 @@ vtxNext8L: } // generate colors from shades -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) __asm { pxor mm0,mm0 // construct 64-bit RGBA light @@ -1974,7 +1966,7 @@ void CModelObject::RenderModel_View( CRenderModel &rm) pvtxSrfBase = &_avtxSrfBase[iSrfVx0]; INDEX iSrfVx; -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) __asm { push ebx mov ebx,D [puwSrfToMip] @@ -2074,7 +2066,7 @@ srfVtxLoop: const COLOR colD = AdjustColor( ms.ms_colDiffuse, _slTexHueShift, _slTexSaturation); colSrfDiff.MultiplyRGBA( colD, colMdlDiff); -#if ASMOPT == 1 +#if (defined __MSVC_INLINE__) // setup texcoord array __asm { push ebx @@ -2134,7 +2126,7 @@ vtxEnd: for( INDEX iSrfVx=0; iSrfVxsl_SwfeFormat.nSamplesPerSec; // wipe destination mixer buffer - // (Mac OS X uses this path because Apple's memset() is customized for each CPU they support and way faster than this inline asm. --ryan.) - #if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX)) - memset(pvMixerBuffer, 0, slMixerBufferSize * 8); - - #elif (defined __MSVC_INLINE__) + #if (defined __MSVC_INLINE__) __asm { cld xor eax,eax @@ -94,19 +88,8 @@ void ResetMixer( const SLONG *pslBuffer, const SLONG slBufferSize) shl ecx,1 // *2 because of 32-bit src format rep stosd } - #elif (defined __GNU_INLINE_X86_32__) - // !!! FIXME : rcg12172001 Is this REALLY any faster than memset()? - ULONG clob1, clob2; - __asm__ __volatile__ ( - "cld \n\t" - "rep \n\t" - "stosl \n\t" - : "=D" (clob1), "=c" (clob2) - : "a" (0), "D" (pvMixerBuffer), "c" (slMixerBufferSize*2) - : "cc", "memory" - ); #else - #error please write inline asm for your platform. + memset(pvMixerBuffer, 0, slMixerBufferSize * 8); #endif } @@ -118,10 +101,7 @@ void CopyMixerBuffer_stereo( const SLONG slSrcOffset, void *pDstBuffer, const SL ASSERT( slBytes%4==0); if( slBytes<4) return; - #if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX)) - // (Mac OS X uses this path because Apple's memset() is customized for each CPU they support and way faster than this inline asm. --ryan.) - memcpy(pDstBuffer, ((const char *)pvMixerBuffer) + slSrcOffset, slBytes); - #elif (defined __MSVC_INLINE__) + #if (defined __MSVC_INLINE__) __asm { cld mov esi,D [slSrcOffset] @@ -131,21 +111,8 @@ void CopyMixerBuffer_stereo( const SLONG slSrcOffset, void *pDstBuffer, const SL shr ecx,2 // bytes to samples per channel rep movsd } - #elif (defined __GNU_INLINE_X86_32__) - // !!! FIXME : rcg12172001 Is this REALLY any faster than memcpy()? - ULONG clob1, clob2, clob3; - __asm__ __volatile__ ( - "cld \n\t" - "rep \n\t" - "movsl \n\t" - : "=S" (clob1), "=D" (clob2), "=c" (clob3) - : "S" (((char *)pvMixerBuffer) + slSrcOffset), - "D" (pDstBuffer), - "c" (slBytes >> 2) - : "cc", "memory" - ); #else - #error please write inline asm for your platform. + memcpy(pDstBuffer, ((const char *)pvMixerBuffer) + slSrcOffset, slBytes); #endif } @@ -157,18 +124,7 @@ void CopyMixerBuffer_mono( const SLONG slSrcOffset, void *pDstBuffer, const SLON ASSERT( slBytes%2==0); if( slBytes<4) return; - #if (defined USE_PORTABLE_C) - // (This is untested, currently. --ryan.) - WORD *dest = (WORD *) pDstBuffer; - WORD *src = (WORD *) ( ((char *) pvMixerBuffer) + slSrcOffset ); - SLONG max = slBytes / 4; - for (SLONG i = 0; i < max; i++) { - *dest = *src; - dest++; // move 16 bits. - src+=2; // move 32 bits. - } - - #elif (defined __MSVC_INLINE__) + #if (defined __MSVC_INLINE__) __asm { mov esi,D [slSrcOffset] add esi,D [pvMixerBuffer] @@ -204,7 +160,15 @@ copyLoop: ); #else - #error please write inline asm for your platform. + // (This is untested, currently. --ryan.) + WORD *dest = (WORD *) pDstBuffer; + WORD *src = (WORD *) ( ((char *) pvMixerBuffer) + slSrcOffset ); + SLONG max = slBytes / 4; + for (SLONG i = 0; i < max; i++) { + *dest = *src; + dest++; // move 16 bits. + src+=2; // move 32 bits. + } #endif } @@ -215,24 +179,7 @@ static void ConvertMixerBuffer( const SLONG slBytes) ASSERT( slBytes%4==0); if( slBytes<4) return; - #if (defined USE_PORTABLE_C) - //STUBBED("ConvertMixerBuffer"); - SWORD *dest = (SWORD *) pvMixerBuffer; - SLONG *src = (SLONG *) pvMixerBuffer; - SLONG max = slBytes / 2; - int tmp; - for (SLONG i = 0; i < max; i++) { - tmp = *src; - if (tmp>32767) tmp=32767; - if (tmp<-32767) tmp=-32767; - *dest=tmp; - dest++; // move 16 bits. - src++; // move 32 bits. - } - - - - #elif (defined __MSVC_INLINE__) + #if (defined __MSVC_INLINE__) __asm { cld mov esi,D [pvMixerBuffer] @@ -271,7 +218,20 @@ copyLoop: ); #else - #error please write inline asm for your platform. + + SWORD *dest = (SWORD *) pvMixerBuffer; + SLONG *src = (SLONG *) pvMixerBuffer; + SLONG max = slBytes / 2; + int tmp; + for (SLONG i = 0; i < max; i++) { + tmp = *src; + if (tmp>32767) tmp=32767; + if (tmp<-32767) tmp=-32767; + *dest=tmp; + dest++; // move 16 bits. + src++; // move 32 bits. + } + #endif } @@ -337,85 +297,7 @@ inline void MixMono( CSoundObject *pso) { _pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER); - #if (defined USE_PORTABLE_C) - // initialize some local vars - SLONG slLeftSample, slRightSample, slNextSample; - SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer; - fixLeftOfs = (__int64)(fLeftOfs * 65536.0); - fixRightOfs = (__int64)(fRightOfs * 65536.0); - __int64 fixLeftStep = (__int64)(fLeftStep * 65536.0); - __int64 fixRightStep = (__int64)(fRightStep * 65536.0); - __int64 fixSoundBufferSize = ((__int64)slSoundBufferSize)<<16; - mmSurroundFactor = (__int64)(SWORD)mmSurroundFactor; - - SLONG slLeftVolume_ = slLeftVolume >> 16; - SLONG slRightVolume_ = slRightVolume >> 16; - - // loop thru source buffer - INDEX iCt = slMixerBufferSize; - FOREVER - { - // if left channel source sample came to end of sample buffer - if( fixLeftOfs >= fixSoundBufferSize) { - fixLeftOfs -= fixSoundBufferSize; - // if has no loop, end it - bEndOfSound = bNotLoop; - } - // if right channel source sample came to end of sample buffer - if( fixRightOfs >= fixSoundBufferSize) { - fixRightOfs -= fixSoundBufferSize; - // if has no loop, end it - bEndOfSound = bNotLoop; - } - // end of buffer? - if( iCt<=0 || bEndOfSound) break; - - // fetch one lineary interpolated sample on left channel - slLeftSample = pswSrcBuffer[(fixLeftOfs>>16)+0]; - slNextSample = pswSrcBuffer[(fixLeftOfs>>16)+1]; - slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16; - // fetch one lineary interpolated sample on right channel - slRightSample = pswSrcBuffer[(fixRightOfs>>16)+0]; - slNextSample = pswSrcBuffer[(fixRightOfs>>16)+1]; - slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16; - - // filter samples - slLastLeftSample += ((slLeftSample -slLastLeftSample) *slLeftFilter) >>15; - slLastRightSample += ((slRightSample-slLastRightSample)*slRightFilter)>>15; - - // apply stereo volume to current sample - slLeftSample = (slLastLeftSample * slLeftVolume_) >>15; - slRightSample = (slLastRightSample * slRightVolume_)>>15; - - slLeftSample ^= (SLONG)((mmSurroundFactor>> 0)&0xFFFFFFFF); - slRightSample ^= (SLONG)((mmSurroundFactor>>32)&0xFFFFFFFF); - - // mix in current sample - slLeftSample += pslDstBuffer[0]; - slRightSample += pslDstBuffer[1]; - // upper clamp - if( slLeftSample > MAX_SWORD) slLeftSample = MAX_SWORD; - if( slRightSample > MAX_SWORD) slRightSample = MAX_SWORD; - // lower clamp - if( slLeftSample < MIN_SWORD) slLeftSample = MIN_SWORD; - if( slRightSample < MIN_SWORD) slRightSample = MIN_SWORD; - - // store samples (both channels) - pslDstBuffer[0] = slLeftSample; - pslDstBuffer[1] = slRightSample; - - // modify volume ` - slLeftVolume += (SWORD)((mmVolumeGain>> 0)&0xFFFF); - slRightVolume += (SWORD)((mmVolumeGain>>16)&0xFFFF); - - // advance to next sample - fixLeftOfs += fixLeftStep; - fixRightOfs += fixRightStep; - pslDstBuffer += 2; - iCt--; - } - - #elif (defined __MSVC_INLINE__) + #if (defined __MSVC_INLINE__) __asm { // convert from floats to fixints 32:16 fld D [fLeftOfs] @@ -553,19 +435,6 @@ loopEnd: MixMono_asm(pso); #else - #error please write inline asm for your platform. - #endif - - _pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER); -} - - -// mixes one stereo 16-bit signed sound to destination buffer -inline void MixStereo( CSoundObject *pso) -{ - _pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER); - - #if (defined USE_PORTABLE_C) // initialize some local vars SLONG slLeftSample, slRightSample, slNextSample; SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer; @@ -599,12 +468,12 @@ inline void MixStereo( CSoundObject *pso) if( iCt<=0 || bEndOfSound) break; // fetch one lineary interpolated sample on left channel - slLeftSample = pswSrcBuffer[(fixLeftOfs>>15)+0]; - slNextSample = pswSrcBuffer[(fixLeftOfs>>15)+2]; + slLeftSample = pswSrcBuffer[(fixLeftOfs>>16)+0]; + slNextSample = pswSrcBuffer[(fixLeftOfs>>16)+1]; slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16; // fetch one lineary interpolated sample on right channel - slRightSample = pswSrcBuffer[(fixRightOfs>>15)+0]; - slNextSample = pswSrcBuffer[(fixRightOfs>>15)+2]; + slRightSample = pswSrcBuffer[(fixRightOfs>>16)+0]; + slNextSample = pswSrcBuffer[(fixRightOfs>>16)+1]; slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16; // filter samples @@ -643,7 +512,18 @@ inline void MixStereo( CSoundObject *pso) iCt--; } - #elif (defined __MSVC_INLINE__) + #endif + + _pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER); +} + + +// mixes one stereo 16-bit signed sound to destination buffer +inline void MixStereo( CSoundObject *pso) +{ + _pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER); + + #if (defined __MSVC_INLINE__) __asm { // convert from floats to fixints 32:16 fld D [fLeftOfs] @@ -783,7 +663,83 @@ loopEnd: MixStereo_asm(pso); #else - #error please write inline asm for your platform. + // initialize some local vars + SLONG slLeftSample, slRightSample, slNextSample; + SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer; + fixLeftOfs = (__int64)(fLeftOfs * 65536.0); + fixRightOfs = (__int64)(fRightOfs * 65536.0); + __int64 fixLeftStep = (__int64)(fLeftStep * 65536.0); + __int64 fixRightStep = (__int64)(fRightStep * 65536.0); + __int64 fixSoundBufferSize = ((__int64)slSoundBufferSize)<<16; + mmSurroundFactor = (__int64)(SWORD)mmSurroundFactor; + + SLONG slLeftVolume_ = slLeftVolume >> 16; + SLONG slRightVolume_ = slRightVolume >> 16; + + // loop thru source buffer + INDEX iCt = slMixerBufferSize; + FOREVER + { + // if left channel source sample came to end of sample buffer + if( fixLeftOfs >= fixSoundBufferSize) { + fixLeftOfs -= fixSoundBufferSize; + // if has no loop, end it + bEndOfSound = bNotLoop; + } + // if right channel source sample came to end of sample buffer + if( fixRightOfs >= fixSoundBufferSize) { + fixRightOfs -= fixSoundBufferSize; + // if has no loop, end it + bEndOfSound = bNotLoop; + } + // end of buffer? + if( iCt<=0 || bEndOfSound) break; + + // fetch one lineary interpolated sample on left channel + slLeftSample = pswSrcBuffer[(fixLeftOfs>>15)+0]; + slNextSample = pswSrcBuffer[(fixLeftOfs>>15)+2]; + slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16; + // fetch one lineary interpolated sample on right channel + slRightSample = pswSrcBuffer[(fixRightOfs>>15)+0]; + slNextSample = pswSrcBuffer[(fixRightOfs>>15)+2]; + slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16; + + // filter samples + slLastLeftSample += ((slLeftSample -slLastLeftSample) *slLeftFilter) >>15; + slLastRightSample += ((slRightSample-slLastRightSample)*slRightFilter)>>15; + + // apply stereo volume to current sample + slLeftSample = (slLastLeftSample * slLeftVolume_) >>15; + slRightSample = (slLastRightSample * slRightVolume_)>>15; + + slLeftSample ^= (SLONG)((mmSurroundFactor>> 0)&0xFFFFFFFF); + slRightSample ^= (SLONG)((mmSurroundFactor>>32)&0xFFFFFFFF); + + // mix in current sample + slLeftSample += pslDstBuffer[0]; + slRightSample += pslDstBuffer[1]; + // upper clamp + if( slLeftSample > MAX_SWORD) slLeftSample = MAX_SWORD; + if( slRightSample > MAX_SWORD) slRightSample = MAX_SWORD; + // lower clamp + if( slLeftSample < MIN_SWORD) slLeftSample = MIN_SWORD; + if( slRightSample < MIN_SWORD) slRightSample = MIN_SWORD; + + // store samples (both channels) + pslDstBuffer[0] = slLeftSample; + pslDstBuffer[1] = slRightSample; + + // modify volume ` + slLeftVolume += (SWORD)((mmVolumeGain>> 0)&0xFFFF); + slRightVolume += (SWORD)((mmVolumeGain>>16)&0xFFFF); + + // advance to next sample + fixLeftOfs += fixLeftStep; + fixRightOfs += fixRightStep; + pslDstBuffer += 2; + iCt--; + } + #endif _pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER);