Merge pull request #47 from notaz/asm_rework

Asm rework
2025-09-18 18:30:05 +02:00 · 2016-05-09 12:03:16 -04:00 · 2016-05-09 12:03:16 -04:00 · a8c6c77309
commit a8c6c77309
parent 056d77f479 ce46bd1e99
24 changed files with 926 additions and 1142 deletions
--- a/Sources/CMakeLists.txt
+++ b/Sources/CMakeLists.txt
@ -188,13 +188,20 @@ else()
    set(DEBUGSUFFIX "")
 endif()
-# This should not be needed anymore, but might be faster on 32bit x86
+option(USE_ASM "Use ASM code" TRUE)
-option(USE_I386_ASM "Use X86 ASM" FALSE)
+if (USE_ASM)
    MESSAGE(STATUS "Using assembler code (when available)")
 else()
    add_definitions(-DUSE_PORTABLE_C=1)
    MESSAGE(STATUS "Using portable C instead of all ASM")
 endif()
-if (USE_I386_ASM)
+option(USE_I386_NASM_ASM "Use i386 nasm ASM code" FALSE)
 if (USE_ASM AND USE_I386_NASM_ASM)
    # You need the Netwide Assembler (NASM) to build this on Intel systems.
    #   http://nasm.sf.net/
-    add_definitions(-DUSE_I386_ASM=1)
+    add_definitions(-DUSE_I386_NASM_ASM=1)
    if (MACOSX)
        set(ASMOBJFMT "macho")
        list(APPEND ASMFLAGS --prefix _)
@ -203,10 +210,9 @@ if (USE_I386_ASM)
    else()
        set(ASMOBJFMT "elf")
    endif()
-    MESSAGE(STATUS "Using i386 assembler")
+    MESSAGE(STATUS "Using i386 nasm ASM")
 else()
-    add_definitions(-DUSE_PORTABLE_C=1)
+    MESSAGE(STATUS "Not using i386 nasm ASM")
    MESSAGE(STATUS "Using portable C instead of ASM")
 endif()
 option(PANDORA "Compile for Pandora" FALSE)
@ -655,7 +661,7 @@ add_dependencies(${SHADERSLIB} ParseEntities)
 add_parser_and_scanner("Engine/Base/Parser" "Engine/Base/Scanner")
 add_parser_and_scanner("Engine/Ska/smcPars" "Engine/Ska/smcScan")
-if (USE_I386_ASM)
+if (USE_I386_NASM_ASM)
    add_custom_command(
        OUTPUT "SoundMixer386.o"
        MAIN_DEPENDENCY "${CMAKE_CURRENT_SOURCE_DIR}/Engine/Sound/SoundMixer386.asm"
--- a/Sources/Engine/Base/Base.h
+++ b/Sources/Engine/Base/Base.h
@ -65,8 +65,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #else
  #warning "UNKNOWN PLATFORM IDENTIFIED!!!!"
  #define PLATFORM_UNKNOWN 1
  #warning "USING PORTABLE C!!!"
  #define USE_PORTABLE_C
 #endif
 #if PLATFORM_LINUX || PLATFORM_MACOSX
--- a/Sources/Engine/Base/Profiling.cpp
+++ b/Sources/Engine/Base/Profiling.cpp
@ -21,24 +21,13 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 template class CStaticArray<CProfileCounter>;
 template class CStaticArray<CProfileTimer>;
-#if (defined USE_PORTABLE_C)
+#if (defined PLATFORM_UNIX) && !defined(__GNU_INLINE_X86_32__)
 #include <sys/time.h>
 #endif
 static inline __int64 ReadTSC_profile(void)
 {
-#if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
  #ifdef __arm__
  struct timespec tv;
  clock_gettime(CLOCK_MONOTONIC, &tv);
  return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_nsec) / 1000000) );
  #else
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_usec) / 1000) );
  #endif
 #elif (defined __MSVC_INLINE__)
  __int64 mmRet;
  __asm {
    rdtsc
@ -47,7 +36,7 @@ static inline __int64 ReadTSC_profile(void)
  }
  return mmRet;
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __int64 mmRet;
  __asm__ __volatile__ (
    "rdtsc                    \n\t"
@ -60,7 +49,16 @@ static inline __int64 ReadTSC_profile(void)
  return(mmRet);
 #else
-  #error Please implement for your platform/compiler.
+  #ifdef __arm__
  struct timespec tv;
  clock_gettime(CLOCK_MONOTONIC, &tv);
  return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_nsec) / 1000000) );
  #else
  struct timeval tv;
  gettimeofday(&tv, NULL);
  return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_usec) / 1000) );
  #endif
 #endif
 }
--- a/Sources/Engine/Base/Timer.cpp
+++ b/Sources/Engine/Base/Timer.cpp
@ -29,7 +29,7 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #include <Engine/Base/Priority.inl>
 // !!! FIXME: use SDL timer code instead and never use rdtsc?
-#if (USE_PORTABLE_C) 
+#if (defined PLATFORM_UNIX) && !defined(__GNU_INLINE_X86_32__)
 #define USE_GETTIMEOFDAY 1
 #endif
@ -64,7 +64,7 @@ static inline __int64 ReadTSC(void)
  }
  return mmRet;
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __int64 mmRet;
  __asm__ __volatile__ (
    "rdtsc                    \n\t"
--- a/Sources/Engine/Base/Types.h
+++ b/Sources/Engine/Base/Types.h
@ -109,6 +109,30 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*));
  #define ASMSYM(x) #x
 #endif
 /* should we enable inline asm? */
 #ifndef USE_PORTABLE_C
  #if defined(__MSVC_INLINE__)
    /* the build system selected __MSVC_INLINE__ */
  #elif defined(__GNU_INLINE_X86_32__)
    /* the build system selected __GNU_INLINE_X86_32__ */
  #elif defined(_MSC_VER) && defined(_M_IX86)
    #define __MSVC_INLINE__
  #elif defined (__GNUC__) && defined(__i386)
    #define __GNU_INLINE_X86_32__
  #elif defined (__GNUC__) && defined(__x86_64__)
    #define __GNU_INLINE_X86_64__
  #endif
  #if defined(__GNU_INLINE_X86_32__) || defined(__GNU_INLINE_X86_64__)
    #define __GNU_INLINE_X86__
  #endif
  #if defined(__GNU_INLINE_X86__)
    #define FPU_REGS "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
    #define MMX_REGS "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"
  #endif
 #endif
 #ifdef PLATFORM_UNIX  /* rcg10042001 */
    #include <stdio.h>
    #include <string.h>
@ -134,25 +158,6 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*));
      #endif
    #endif
    #if ((defined __GNUC__) && (!defined __GNU_INLINE__))
      #define __GNU_INLINE__
    #endif
    #if (defined __INTEL_COMPILER)
      #if ((!defined __GNU_INLINE__) && (!defined __MSVC_INLINE__))
        #error Please define __GNU_INLINE__ or __MSVC_INLINE__ with Intel C++.
      #endif
      #if ((defined __GNU_INLINE__) && (defined __MSVC_INLINE__))
        #error Define either __GNU_INLINE__ or __MSVC_INLINE__ with Intel C++.
      #endif
    #endif
    #if defined(__GNU_INLINE__) && defined(__i386__)
      #define FPU_REGS "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
      #define MMX_REGS "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"
    #endif
    #ifndef PAGESIZE
      #define PAGESIZE 4096
    #endif
@ -230,10 +235,7 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*));
    inline ULONG _rotl(ULONG ul, int bits)
    {
-        #if (defined USE_PORTABLE_C)
+        #if (defined __GNU_INLINE_X86_32__)
            // DG: according to http://blog.regehr.org/archives/1063 this is fast
            return (ul<<bits) | (ul>>(-bits&31));
        #elif (defined __GNU_INLINE__)
            // This, on the other hand, is wicked fast.  :)
            __asm__ __volatile__ (
                "roll %%cl, %%eax    \n\t"
@ -255,7 +257,8 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*));
            return(ul);
        #else
-            #error need inline asm for your platform.
+            // DG: according to http://blog.regehr.org/archives/1063 this is fast
            return (ul<<bits) | (ul>>(-bits&31));
        #endif
    }
--- a/Sources/Engine/Engine.cpp
+++ b/Sources/Engine/Engine.cpp
@ -125,14 +125,10 @@ BOOL APIENTRY DllMain( HANDLE hModule, DWORD  ul_reason_for_call, LPVOID lpReser
 static void DetectCPU(void)
 {
-#if (defined USE_PORTABLE_C)  // rcg10072001
+  char strVendor[12+1] = { 0 };
  CPrintF(TRANSV("  (No CPU detection in this binary.)\n"));
 #else
  char strVendor[12+1];
  strVendor[12] = 0;
-  ULONG ulTFMS;
+  ULONG ulTFMS = 0;
-  ULONG ulFeatures;
+  ULONG ulFeatures = 0;
  #if (defined __MSVC_INLINE__)
  // test MMX presence and update flag
@ -148,43 +144,47 @@ static void DetectCPU(void)
    mov     dword ptr [ulFeatures], edx
  }
-  #elif (defined __GNU_INLINE__)
+  #elif (defined __GNU_INLINE_X86__)
    ULONG eax, ebx, ecx, edx;
    // test MMX presence and update flag
    __asm__ __volatile__ (
-        "pushl   %%ebx            \n\t"
+    #if (defined __GNU_INLINE_X86_64__)
        "xorl    %%eax,%%eax      \n\t"  // request for basic id
        "cpuid                    \n\t"
-        "movl    %%ebx,  (%%esi)  \n\t"
+            : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
-        "movl    %%edx, 4(%%esi)  \n\t"
+    #else
-        "movl    %%ecx, 8(%%esi)  \n\t"
+        "movl    %%ebx, %%esi     \n\t"
-        "popl    %%ebx            \n\t"
+        "cpuid                    \n\t"
-            : // no specific outputs.
+        "xchgl   %%ebx, %%esi     \n\t"
-            : "S" (strVendor)
+            : "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx)
-            : "eax", "ecx", "edx", "memory"
+    #endif
            : "a" (0) // request for basic id
    );
-
+    memcpy(strVendor + 0, &ebx, 4);
-        // need to break this into a separate asm block, since I'm clobbering
+    memcpy(strVendor + 4, &edx, 4);
-        //  too many registers. There's something to be said for letting MSVC
+    memcpy(strVendor + 8, &ecx, 4);
        //  figure out where on the stack your locals are resting, but yeah,
        //  I know, that's x86-specific anyhow...
        // !!! FIXME: can probably do this right with modern GCC.
    __asm__ __volatile__ (
-        "pushl   %%ebx                  \n\t"
+    #if (defined __GNU_INLINE_X86_64__)
-        "movl    $1, %%eax              \n\t"  // request for TFMS feature flags
+        "cpuid                    \n\t"
-        "cpuid                          \n\t"
+            : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
-        "mov     %%eax, (%%esi)         \n\t"  // remember type, family, model and stepping
+    #else
-        "mov     %%edx, (%%edi)         \n\t"
+        "movl    %%ebx, %%esi     \n\t"
-        "popl    %%ebx                  \n\t"
+        "cpuid                    \n\t"
-            : // no specific outputs.
+        "xchgl   %%ebx, %%esi     \n\t"
-            : "S" (&ulTFMS), "D" (&ulFeatures)
+            : "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx)
-            : "eax", "ecx", "edx", "memory"
+    #endif
            : "a" (1) // request for TFMS feature flags
    );
    ulTFMS = eax;
    ulFeatures = edx;
  #else
    #error Please implement for your platform or define USE_PORTABLE_C.
  #endif
  if (ulTFMS == 0) {
    CPrintF(TRANSV("  (No CPU detection in this binary.)\n"));
    return;
  }
  INDEX iType     = (ulTFMS>>12)&0x3;
  INDEX iFamily   = (ulTFMS>> 8)&0xF;
  INDEX iModel    = (ulTFMS>> 4)&0xF;
@ -215,8 +215,6 @@ static void DetectCPU(void)
  sys_iCPUMHz = INDEX(_pTimer->tm_llCPUSpeedHZ/1E6);
  if( !bMMX) FatalError( TRANS("MMX support required but not present!"));
 #endif  // defined USE_PORTABLE_C
 }
 static void DetectCPUWrapper(void)
--- a/Sources/Engine/Graphics/Color.cpp
+++ b/Sources/Engine/Graphics/Color.cpp
@ -247,30 +247,7 @@ COLOR MulColors( COLOR col1, COLOR col2)
  if( col2==0xFFFFFFFF)   return col1;
  if( col1==0 || col2==0) return 0;
-#if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
  // !!! FIXME: This...is not fast.
  union
  {
    COLOR col;
    UBYTE bytes[4];
  } conv1;
  union
  {
    COLOR col;
    UBYTE bytes[4];
  } conv2;
  conv1.col = col1;
  conv2.col = col2;
  conv1.bytes[0] = (UBYTE) ((((DWORD) conv1.bytes[0]) * ((DWORD) conv2.bytes[0])) / 255);
  conv1.bytes[1] = (UBYTE) ((((DWORD) conv1.bytes[1]) * ((DWORD) conv2.bytes[1])) / 255);
  conv1.bytes[2] = (UBYTE) ((((DWORD) conv1.bytes[2]) * ((DWORD) conv2.bytes[2])) / 255);
  conv1.bytes[3] = (UBYTE) ((((DWORD) conv1.bytes[3]) * ((DWORD) conv2.bytes[3])) / 255);
  return(conv1.col);
 #elif (defined __MSVC_INLINE__)
  COLOR colRet;
  __asm {
    xor     ebx,ebx
@ -347,7 +324,7 @@ COLOR MulColors( COLOR col1, COLOR col2)
  }
  return colRet;
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  COLOR colRet;
  __asm__ __volatile__ (
    "pushl     %%ebx                \n\t"
@ -433,20 +410,6 @@ COLOR MulColors( COLOR col1, COLOR col2)
  return colRet;
 #else
  #error please fill in inline assembly for your platform.
 #endif
 }
 // fast color addition function - RES = clamp (1ST + 2ND)
 COLOR AddColors( COLOR col1, COLOR col2) 
 {
  if( col1==0) return col2;
  if( col2==0) return col1;
  if( col1==0xFFFFFFFF || col2==0xFFFFFFFF) return 0xFFFFFFFF;
  COLOR colRet;
 #if (defined USE_PORTABLE_C)
  // !!! FIXME: This...is not fast.
  union
  {
@ -459,19 +422,28 @@ COLOR AddColors( COLOR col1, COLOR col2)
    COLOR col;
    UBYTE bytes[4];
  } conv2;
  #define MINVAL(a, b) ((a)>(b))?(b):(a)
  conv1.col = col1;
  conv2.col = col2;
-  conv1.bytes[0] = (UBYTE) MINVAL((((WORD) conv1.bytes[0]) + ((WORD) conv2.bytes[0])) , 255);
+  conv1.bytes[0] = (UBYTE) ((((DWORD) conv1.bytes[0]) * ((DWORD) conv2.bytes[0])) / 255);
-  conv1.bytes[1] = (UBYTE) MINVAL((((WORD) conv1.bytes[1]) + ((WORD) conv2.bytes[1])) , 255);
+  conv1.bytes[1] = (UBYTE) ((((DWORD) conv1.bytes[1]) * ((DWORD) conv2.bytes[1])) / 255);
-  conv1.bytes[2] = (UBYTE) MINVAL((((WORD) conv1.bytes[2]) + ((WORD) conv2.bytes[2])) , 255);
+  conv1.bytes[2] = (UBYTE) ((((DWORD) conv1.bytes[2]) * ((DWORD) conv2.bytes[2])) / 255);
-  conv1.bytes[3] = (UBYTE) MINVAL((((WORD) conv1.bytes[3]) + ((WORD) conv2.bytes[3])) , 255);
+  conv1.bytes[3] = (UBYTE) ((((DWORD) conv1.bytes[3]) * ((DWORD) conv2.bytes[3])) / 255);
  #undef MINVAL
-  colRet = conv1.col;
+  return(conv1.col);
 #endif
 }
-#elif (defined __MSVC_INLINE__)
+
 // fast color addition function - RES = clamp (1ST + 2ND)
 COLOR AddColors( COLOR col1, COLOR col2) 
 {
  if( col1==0) return col2;
  if( col2==0) return col1;
  if( col1==0xFFFFFFFF || col2==0xFFFFFFFF) return 0xFFFFFFFF;
  COLOR colRet;
 #if (defined __MSVC_INLINE__)
  __asm {
    xor     ebx,ebx
    mov     esi,255
@ -535,7 +507,7 @@ COLOR AddColors( COLOR col1, COLOR col2)
    mov     D [colRet],ebx
  }
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp;
  __asm__ __volatile__ (
    // if xbx is "r", gcc runs out of regs in -fPIC + -fno-omit-fp :(
@ -608,7 +580,29 @@ COLOR AddColors( COLOR col1, COLOR col2)
  );
 #else
-  #error please fill in inline assembly for your platform.
+  // !!! FIXME: This...is not fast.
  union
  {
    COLOR col;
    UBYTE bytes[4];
  } conv1;
  union
  {
    COLOR col;
    UBYTE bytes[4];
  } conv2;
  #define MINVAL(a, b) ((a)>(b))?(b):(a)
  conv1.col = col1;
  conv2.col = col2;
  conv1.bytes[0] = (UBYTE) MINVAL((((WORD) conv1.bytes[0]) + ((WORD) conv2.bytes[0])) , 255);
  conv1.bytes[1] = (UBYTE) MINVAL((((WORD) conv1.bytes[1]) + ((WORD) conv2.bytes[1])) , 255);
  conv1.bytes[2] = (UBYTE) MINVAL((((WORD) conv1.bytes[2]) + ((WORD) conv2.bytes[2])) , 255);
  conv1.bytes[3] = (UBYTE) MINVAL((((WORD) conv1.bytes[3]) + ((WORD) conv2.bytes[3])) , 255);
  #undef MINVAL
  colRet = conv1.col;
 #endif
  return colRet;
@ -619,14 +613,7 @@ COLOR AddColors( COLOR col1, COLOR col2)
 // multiple conversion from OpenGL color to DirectX color
 extern void abgr2argb( ULONG *pulSrc, ULONG *pulDst, INDEX ct)
 {
-#if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
  //#error write me.
  for (int i=0; i<ct; i++) {
    ULONG tmp = pulSrc[i];
    pulDst[i] = (tmp&0xff00ff00) | ((tmp&0x00ff0000)>>16) | ((tmp&0x000000ff)<<16);
  }
 #elif (defined __MSVC_INLINE__)
  __asm {
    mov   esi,dword ptr [pulSrc]
    mov   edi,dword ptr [pulDst]
@ -678,12 +665,12 @@ colSkip2:
    mov   dword ptr [edi],eax
 colSkip1:
  }
 #elif (defined __GNU_INLINE__)
  STUBBED("convert to inline asm.");
 #else
-  #error please fill in inline assembly for your platform.
+  for (int i=0; i<ct; i++) {
    ULONG tmp = pulSrc[i];
    pulDst[i] = (tmp&0xff00ff00) | ((tmp&0x00ff0000)>>16) | ((tmp&0x000000ff)<<16);
  }
 #endif
 }
--- a/Sources/Engine/Graphics/Color.h
+++ b/Sources/Engine/Graphics/Color.h
@ -204,19 +204,7 @@ ENGINE_API extern COLOR AddColors( COLOR col1, COLOR col2); // fast color addito
 __forceinline ULONG ByteSwap( ULONG ul)
 {
 /* rcg10052001 Platform-wrappers. */
-#if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
 	ul = ( ((ul << 24)            ) |
           ((ul << 8) & 0x00FF0000) |
           ((ul >> 8) & 0x0000FF00) |
           ((ul >> 24)            ) );
    #if (defined PLATFORM_BIGENDIAN)
    BYTESWAP(ul);  // !!! FIXME: May not be right!
    #endif
    return(ul);
 #elif (defined __MSVC_INLINE__)
  ULONG ulRet;
  __asm {
    mov   eax,dword ptr [ul]
@ -225,7 +213,7 @@ __forceinline ULONG ByteSwap( ULONG ul)
  }
  return ulRet;
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "bswapl   %%eax    \n\t"
        : "=a" (ul)
@ -234,16 +222,22 @@ __forceinline ULONG ByteSwap( ULONG ul)
  return(ul);
 #else
-  #error please define for your platform.
+  ul = ( ((ul << 24)            ) |
         ((ul << 8) & 0x00FF0000) |
         ((ul >> 8) & 0x0000FF00) |
         ((ul >> 24)            ) );
  #if (defined PLATFORM_BIGENDIAN)
  BYTESWAP(ul);  // !!! FIXME: May not be right!
  #endif
  return(ul);
 #endif
 }
 __forceinline ULONG rgba2argb( ULONG ul)
 {
-#if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
 	return( (ul << 24) | (ul >> 8) );
 #elif (defined __MSVC_INLINE__)
  ULONG ulRet;
  __asm {
    mov   eax,dword ptr [ul]
@ -252,7 +246,7 @@ __forceinline ULONG rgba2argb( ULONG ul)
  }
  return ulRet;
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG ulRet;
  __asm__ __volatile__ (
    "rorl   $8, %%eax       \n\t"
@ -263,21 +257,14 @@ __forceinline ULONG rgba2argb( ULONG ul)
  return ulRet;
 #else
-  #error please define for your platform.
+  return (ul << 24) | (ul >> 8);
 #endif
 }
 __forceinline ULONG abgr2argb( COLOR col)
 {
-#if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
 	// this could be simplified; it is just a safe conversion from the asm code
 	col = ( ((col << 24)            ) |
            ((col << 8) & 0x00FF0000) |
            ((col >> 8) & 0x0000FF00) |
            ((col >> 24)            ) );
 	return( (col << 24) | (col >> 8) );
 #elif (defined __MSVC_INLINE__)
  ULONG ulRet;
  __asm {
    mov   eax,dword ptr [col]
@ -287,7 +274,7 @@ __forceinline ULONG abgr2argb( COLOR col)
  }
  return ulRet;
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG ulRet;
  __asm__ __volatile__ (
    "bswapl %%eax           \n\t"
@ -299,7 +286,13 @@ __forceinline ULONG abgr2argb( COLOR col)
  return ulRet;
 #else
-  #error please define for your platform.
+  // this could be simplified; it is just a safe conversion from the asm code
  col = ( ((col << 24)            ) |
          ((col << 8) & 0x00FF0000) |
          ((col >> 8) & 0x0000FF00) |
          ((col >> 24)            ) );
  return( (col << 24) | (col >> 8) );
 #endif
 }
@ -311,10 +304,7 @@ extern void abgr2argb( ULONG *pulSrc, ULONG *pulDst, INDEX ct);
 // fast memory copy of ULONGs
 inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs)
 {
-#if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX))
+#if (defined __MSVC_INLINE__)
  memcpy( pulDst, pulSrc, ctLongs*4);
 #elif (defined __MSVC_INLINE__)
  __asm {
    cld
    mov   esi,dword ptr [pulSrc]
@ -322,23 +312,8 @@ inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs)
    mov   ecx,dword ptr [ctLongs]
    rep   movsd
  }
 #elif (defined __GNU_INLINE__)
    // I haven't benchmarked it, but in many cases, memcpy() becomes an
    //  inline (asm?) macro on GNU platforms, so this might not be a
    //  speed gain at all over the USE_PORTABLE_C version.
    // You Have Been Warned. --ryan.
  __asm__ __volatile__ (
    "cld    \n\t"
    "rep    \n\t"
    "movsd  \n\t"
        : "=S" (pulSrc), "=D" (pulDst), "=c" (ctLongs)
        : "S" (pulSrc), "D" (pulDst), "c" (ctLongs)
        : "cc", "memory"
  );
 #else
-# error Please fill this in for your platform.
+  memcpy( pulDst, pulSrc, ctLongs*4);
 #endif
 }
@ -346,11 +321,7 @@ inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs)
 // fast memory set of ULONGs
 inline void StoreLongs( ULONG ulVal, ULONG *pulDst, INDEX ctLongs)
 {
-#if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
  for( INDEX i=0; i<ctLongs; i++)
    pulDst[i] = ulVal;
 #elif (defined __MSVC_INLINE__)
  __asm {
    cld
    mov   eax,dword ptr [ulVal]
@ -359,7 +330,7 @@ inline void StoreLongs( ULONG ulVal, ULONG *pulDst, INDEX ctLongs)
    rep   stosd
  }
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "cld    \n\t"
    "rep    \n\t"
@ -370,7 +341,9 @@ inline void StoreLongs( ULONG ulVal, ULONG *pulDst, INDEX ctLongs)
  );
 #else
-# error Please fill this in for your platform.
+  for( INDEX i=0; i<ctLongs; i++)
    pulDst[i] = ulVal;
 #endif
 }
--- a/Sources/Engine/Graphics/DrawPort_RenderScene.cpp
+++ b/Sources/Engine/Graphics/DrawPort_RenderScene.cpp
@ -38,16 +38,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #define W  word ptr
 #define B  byte ptr
 #if (defined USE_PORTABLE_C)
 #define ASMOPT 0
 #elif (defined __MSVC_INLINE__)
 #define ASMOPT 1
 #elif (defined __GNU_INLINE__)
 #define ASMOPT 1
 #else
 #define ASMOPT 0
 #endif
 #define MAXTEXUNITS   4
 #define SHADOWTEXTURE 3
@ -153,8 +143,7 @@ void AddElements( ScenePolygon *pspo)
  const INDEX ctElems = pspo->spo_ctElements;
  INDEX *piDst = _aiElements.Push(ctElems);
-#if (ASMOPT == 1)
+#if (defined __MSVC_INLINE__)
 #if (defined __MSVC_INLINE__)
  __asm {
    mov     eax,D [pspo]
    mov     ecx,D [ctElems]
@ -184,7 +173,7 @@ elemRest:
    mov     D [edi],eax
 elemDone:
  }
- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "movl    %[ctElems], %%ecx      \n\t"
    "movl    %[piDst], %%edi        \n\t"
@ -219,11 +208,6 @@ elemDone:
          "cc", "memory"
  );
 #else
   #error Please write inline ASM for your platform.
 #endif
 #else
  const INDEX iVtx0Pass = pspo->spo_iVtx0Pass;
  const INDEX *piSrc = pspo->spo_piElements;
@ -495,9 +479,7 @@ static void RSBinToGroups( ScenePolygon *pspoFirst)
  // determine maximum used groups
  ASSERT( _ctGroupsCount);
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
 #if (defined __MSVC_INLINE__)
  __asm {
    mov     eax,2
    bsr     ecx,D [_ctGroupsCount]
@ -505,7 +487,7 @@ static void RSBinToGroups( ScenePolygon *pspoFirst)
    mov     D [_ctGroupsCount],eax
  }
- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "movl     $2, %%eax          \n\t"
    "bsrl     (%%esi), %%ecx     \n\t"
@ -516,11 +498,6 @@ static void RSBinToGroups( ScenePolygon *pspoFirst)
        : "eax", "ecx", "cc", "memory"
  );
 #else
   #error Please write inline ASM for your platform.
 #endif
 #else
  // emulate x86's bsr opcode...not fast.  :/
  register DWORD val = _ctGroupsCount;
@ -858,10 +835,7 @@ static void RSSetTextureCoords( ScenePolygon *pspoGroup, INDEX iLayer, INDEX iUn
      continue;
    }
-// !!! FIXME: rcg11232001 This inline conversion is broken. Use the
+#if (defined __MSVC_INLINE__)
 // !!! FIXME: rcg11232001  C version for now with GCC.
 #if ((ASMOPT == 1) && (!defined __GNU_INLINE__) && (!defined __INTEL_COMPILER))
  #if (defined __MSVC_INLINE__)
    __asm {
      mov     esi,D [pspo]
      mov     edi,D [iMappingOffset]
@ -915,7 +889,7 @@ vtxLoop:
 /*
    // !!! FIXME: rcg11232001 This inline conversion is broken. Use the
    // !!! FIXME: rcg11232001  C version for now on Linux.
- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
    STUBBED("debug this");
    __asm__ __volatile__ (
      "0:                                  \n\t" // vtxLoop
@ -956,11 +930,6 @@ vtxLoop:
    );
 */
 #else
   #error Please write inline ASM for your platform.
 #endif
 #else
    // diffuse mapping
--- a/Sources/Engine/Graphics/Fog.cpp
+++ b/Sources/Engine/Graphics/Fog.cpp
@ -67,18 +67,7 @@ ULONG PrepareTexture( UBYTE *pubTexture, PIX pixSizeI, PIX pixSizeJ)
  // need to upload from RGBA format
  const PIX pixTextureSize = pixSizeI*pixSizeJ;
- #if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
   const UBYTE* src = pubTexture;
   DWORD* dst = (DWORD*)(pubTexture+pixTextureSize);
   for (int i=0; i<pixTextureSize; i++) {
    const DWORD tmp = ((DWORD)*src) | 0xFFFFFF00;
    *dst = ((tmp << 24) & 0xff000000 ) | ((tmp <<  8) & 0x00ff0000 ) |
      ((tmp >>  8) & 0x0000ff00 ) | ((tmp >> 24) & 0x000000ff );
    src++;
    dst++;
   }
 #elif (defined __MSVC_INLINE__)
  __asm {
    mov     esi,D [pubTexture]
    mov     edi,D [pubTexture]
@ -95,7 +84,7 @@ pixLoop:
    jnz     pixLoop
  }
- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "movl    %[pubTexture], %%esi      \n\t"
    "movl    %[pixTextureSize], %%ecx  \n\t"
@ -115,10 +104,18 @@ pixLoop:
        : "eax", "ecx", "esi", "edi", "cc", "memory"
  );
- #else
+#else
-   #error Write inline ASM for your platform.
+   const UBYTE* src = pubTexture;
   DWORD* dst = (DWORD*)(pubTexture+pixTextureSize);
   for (int i=0; i<pixTextureSize; i++) {
    const DWORD tmp = ((DWORD)*src) | 0xFFFFFF00;
    *dst = ((tmp << 24) & 0xff000000 ) | ((tmp <<  8) & 0x00ff0000 ) |
      ((tmp >>  8) & 0x0000ff00 ) | ((tmp >> 24) & 0x000000ff );
    src++;
    dst++;
   }
- #endif
+#endif
  // determine internal format
  extern INDEX gap_bAllowGrayTextures;
--- a/Sources/Engine/Graphics/Gfx_OpenGL_Textures.cpp
+++ b/Sources/Engine/Graphics/Gfx_OpenGL_Textures.cpp
@ -169,32 +169,7 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV,
      if( pixSizeV==0) pixSizeV=1;
      pixSize = pixSizeU*pixSizeV;
-      #if (defined USE_PORTABLE_C)
+      #if (defined __MSVC_INLINE__)
      // Basically average every other pixel...
      UWORD w = 0;
      UBYTE *dptr = (UBYTE *) pulDst;
      UBYTE *sptr = (UBYTE *) pulSrc;
      #if 0
      pixSize *= 4;
      for (PIX i = 0; i < pixSize; i++)
      {
        *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[1])) >> 1 );
        dptr++;
        sptr += 2;
      }
      #else
      for (PIX i = 0; i < pixSize; i++)
      {
        for (PIX j = 0; j < 4; j++)
        {
          *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[4])) >> 1 );
          dptr++;
          sptr++;
        }
        sptr += 4;
      }
      #endif
      #elif (defined __MSVC_INLINE__)
      __asm {   
        pxor    mm0,mm0
        mov     esi,D [pulSrc]
@ -216,7 +191,7 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV,
        emms
      }
-      #elif (defined __GNU_INLINE__)
+      #elif (defined __GNU_INLINE_X86_32__)
      __asm__ __volatile__ (
        "pxor      %%mm0,%%mm0                \n\t"
        "movl      %[pulSrc],%%esi            \n\t"
@ -244,7 +219,30 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV,
      );
      #else
-      #error Please write inline ASM for your platform.
+      // Basically average every other pixel...
      UWORD w = 0;
      UBYTE *dptr = (UBYTE *) pulDst;
      UBYTE *sptr = (UBYTE *) pulSrc;
      #if 0
      pixSize *= 4;
      for (PIX i = 0; i < pixSize; i++)
      {
        *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[1])) >> 1 );
        dptr++;
        sptr += 2;
      }
      #else
      for (PIX i = 0; i < pixSize; i++)
      {
        for (PIX j = 0; j < 4; j++)
        {
          *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[4])) >> 1 );
          dptr++;
          sptr++;
        }
        sptr += 4;
      }
      #endif
      #endif
      // upload mipmap
--- a/Sources/Engine/Graphics/Graphics.cpp
+++ b/Sources/Engine/Graphics/Graphics.cpp
@ -209,7 +209,92 @@ static void MakeOneMipmap( ULONG *pulSrcMipmap, ULONG *pulDstMipmap, PIX pixWidt
  if( bBilinear) // type of filtering?
  { // BILINEAR
-   #if (defined USE_PORTABLE_C)
+   #if (defined __MSVC_INLINE__)
    __asm {
      pxor    mm0,mm0
      mov     ebx,D [pixWidth]
      mov     esi,D [pulSrcMipmap]
      mov     edi,D [pulDstMipmap]
      mov     edx,D [pixHeight]
 rowLoop:
      mov     ecx,D [pixWidth]
 pixLoopN:           
      movd    mm1,D [esi+ 0]        // up-left
      movd    mm2,D [esi+ 4]        // up-right
      movd    mm3,D [esi+ ebx*8 +0] // down-left
      movd    mm4,D [esi+ ebx*8 +4] // down-right
      punpcklbw mm1,mm0
      punpcklbw mm2,mm0
      punpcklbw mm3,mm0
      punpcklbw mm4,mm0
      paddw   mm1,mm2
      paddw   mm1,mm3
      paddw   mm1,mm4
      paddw   mm1,Q [mmRounder]
      psrlw   mm1,2
      packuswb mm1,mm0
      movd    D [edi],mm1
      // advance to next pixel
      add     esi,4*2
      add     edi,4
      dec     ecx
      jnz     pixLoopN
      // advance to next row
      lea     esi,[esi+ ebx*8] // skip one row in source mip-map
      dec     edx
      jnz     rowLoop
      emms
    }
   #elif (defined __GNU_INLINE_X86_32__)
    __asm__ __volatile__ (
      "pxor    %%mm0, %%mm0                 \n\t"
      "movl    %[pulSrcMipmap], %%esi       \n\t"
      "movl    %[pulDstMipmap], %%edi       \n\t"
      "movl    %[pixHeight], %%edx          \n\t"
      "0:                                   \n\t"  // rowLoop
      "movl    %[pixWidth], %%ecx           \n\t"
      "1:                                   \n\t"  // pixLoopN
      "movd      0(%%esi), %%mm1            \n\t"  // up-left
      "movd      4(%%esi), %%mm2            \n\t"  // up-right
      "movd      0(%%esi, %[pixWidth], 8), %%mm3 \n\t" // down-left
      "movd      4(%%esi, %[pixWidth], 8), %%mm4 \n\t" // down-right
      "punpcklbw %%mm0, %%mm1               \n\t"
      "punpcklbw %%mm0, %%mm2               \n\t"
      "punpcklbw %%mm0, %%mm3               \n\t"
      "punpcklbw %%mm0, %%mm4               \n\t"
      "paddw     %%mm2, %%mm1               \n\t"
      "paddw     %%mm3, %%mm1               \n\t"
      "paddw     %%mm4, %%mm1               \n\t"
      "paddw     (" ASMSYM(mmRounder) "), %%mm1 \n\t"
      "psrlw     $2, %%mm1                  \n\t"
      "packuswb  %%mm0, %%mm1               \n\t"
      "movd      %%mm1, (%%edi)             \n\t"
      // advance to next pixel
      "addl     $8, %%esi                   \n\t"
      "addl     $4, %%edi                   \n\t"
      "decl     %%ecx                       \n\t"
      "jnz      1b                          \n\t"  // pixLoopN
      // advance to next row
      // skip one row in source mip-map
      "leal     0(%%esi, %[pixWidth], 8), %%esi \n\t"
      "decl     %%edx                       \n\t"
      "jnz      0b                          \n\t"  // rowLoop
      "emms                                 \n\t"
          : // no outputs.
          : [pixWidth] "r" (pixWidth),
            [pulSrcMipmap] "g" (pulSrcMipmap),
            [pulDstMipmap] "g" (pulDstMipmap),
            [pixHeight] "g" (pixHeight)
          : FPU_REGS, MMX_REGS, "ecx", "edx", "esi", "edi",
            "cc", "memory"
    );
   #else
 	UBYTE *src = (UBYTE *) pulSrcMipmap;
 	UBYTE *dest = (UBYTE *) pulDstMipmap;
 	for (int i = 0 ; i < pixHeight; i++)
@ -260,129 +345,13 @@ static void MakeOneMipmap( ULONG *pulSrcMipmap, ULONG *pulDstMipmap, PIX pixWidt
 		src += 8*pixWidth;
    }
   #elif (defined __MSVC_INLINE__)
    __asm {
      pxor    mm0,mm0
      mov     ebx,D [pixWidth]
      mov     esi,D [pulSrcMipmap]
      mov     edi,D [pulDstMipmap]
      mov     edx,D [pixHeight]
 rowLoop:
      mov     ecx,D [pixWidth]
 pixLoopN:           
      movd    mm1,D [esi+ 0]        // up-left
      movd    mm2,D [esi+ 4]        // up-right
      movd    mm3,D [esi+ ebx*8 +0] // down-left
      movd    mm4,D [esi+ ebx*8 +4] // down-right
      punpcklbw mm1,mm0
      punpcklbw mm2,mm0
      punpcklbw mm3,mm0
      punpcklbw mm4,mm0
      paddw   mm1,mm2
      paddw   mm1,mm3
      paddw   mm1,mm4
      paddw   mm1,Q [mmRounder]
      psrlw   mm1,2
      packuswb mm1,mm0
      movd    D [edi],mm1
      // advance to next pixel
      add     esi,4*2
      add     edi,4
      dec     ecx
      jnz     pixLoopN
      // advance to next row
      lea     esi,[esi+ ebx*8] // skip one row in source mip-map
      dec     edx
      jnz     rowLoop
      emms
    }
   #elif (defined __GNU_INLINE__)
    __asm__ __volatile__ (
      "pxor    %%mm0, %%mm0                 \n\t"
      "movl    %[pulSrcMipmap], %%esi       \n\t"
      "movl    %[pulDstMipmap], %%edi       \n\t"
      "movl    %[pixHeight], %%edx          \n\t"
      "0:                                   \n\t"  // rowLoop
      "movl    %[pixWidth], %%ecx           \n\t"
      "1:                                   \n\t"  // pixLoopN
      "movd      0(%%esi), %%mm1            \n\t"  // up-left
      "movd      4(%%esi), %%mm2            \n\t"  // up-right
      "movd      0(%%esi, %[pixWidth], 8), %%mm3 \n\t" // down-left
      "movd      4(%%esi, %[pixWidth], 8), %%mm4 \n\t" // down-right
      "punpcklbw %%mm0, %%mm1               \n\t"
      "punpcklbw %%mm0, %%mm2               \n\t"
      "punpcklbw %%mm0, %%mm3               \n\t"
      "punpcklbw %%mm0, %%mm4               \n\t"
      "paddw     %%mm2, %%mm1               \n\t"
      "paddw     %%mm3, %%mm1               \n\t"
      "paddw     %%mm4, %%mm1               \n\t"
      "paddw     (" ASMSYM(mmRounder) "), %%mm1 \n\t"
      "psrlw     $2, %%mm1                  \n\t"
      "packuswb  %%mm0, %%mm1               \n\t"
      "movd      %%mm1, (%%edi)             \n\t"
      // advance to next pixel
      "addl     $8, %%esi                   \n\t"
      "addl     $4, %%edi                   \n\t"
      "decl     %%ecx                       \n\t"
      "jnz      1b                          \n\t"  // pixLoopN
      // advance to next row
      // skip one row in source mip-map
      "leal     0(%%esi, %[pixWidth], 8), %%esi \n\t"
      "decl     %%edx                       \n\t"
      "jnz      0b                          \n\t"  // rowLoop
      "emms                                 \n\t"
          : // no outputs.
          : [pixWidth] "r" (pixWidth),
            [pulSrcMipmap] "g" (pulSrcMipmap),
            [pulDstMipmap] "g" (pulDstMipmap),
            [pixHeight] "g" (pixHeight)
          : FPU_REGS, MMX_REGS, "ecx", "edx", "esi", "edi",
            "cc", "memory"
    );
   #else
     #error Write inline asm for your platform.
   #endif
    }
    else
    { // NEAREST-NEIGHBOUR but with border preserving
       ULONG ulRowModulo = pixWidth*2 *BYTES_PER_TEXEL;
-   #if (defined USE_PORTABLE_C)
+   #if (defined __MSVC_INLINE__)
     PIX offset = 0;
     ulRowModulo /= 4;
     for (int q = 0; q < 2; q++)
     {
         for (PIX i = pixHeight / 2; i > 0; i--)
         {
             for (PIX j = pixWidth / 2; j > 0; j--)
             {
                 *pulDstMipmap = *(pulSrcMipmap + offset);
                 pulSrcMipmap += 2;
                 pulDstMipmap++;
             }
             for (PIX j = pixWidth / 2; j > 0; j--)
             {
                 *pulDstMipmap = *(pulSrcMipmap + offset + 1);
                 pulSrcMipmap += 2;
                 pulDstMipmap++;
             }
             pulSrcMipmap += ulRowModulo;
        }
        offset = pixWidth * 2;
     }
   #elif (defined __MSVC_INLINE__)
    __asm {
      xor     ebx,ebx
      mov     esi,D [pulSrcMipmap]
@ -428,7 +397,7 @@ halfEnd:
 fullEnd:
    }
-   #elif (defined __GNU_INLINE__)
+   #elif (defined __GNU_INLINE_X86_32__)
    ULONG tmp, tmp2;
    __asm__ __volatile__ (
      "xorl     %[xbx], %[xbx]             \n\t"
@ -493,7 +462,33 @@ fullEnd:
    );
   #else
-     #error Write inline asm for your platform.
+     PIX offset = 0;  // extra source offset in texels; 0 during the first pass
     // Portable fallback for the nearest-neighbour path: every destination texel
     // is a straight copy of one source texel, and the source advances two
     // texels per destination texel.
     // ulRowModulo was computed as a byte count; dividing by 4 turns it into
     // ULONG-pointer units (assumes 4-byte texels/ULONG - TODO confirm).
     ulRowModulo /= 4;
     // two passes, each producing pixHeight/2 destination rows
     for (int q = 0; q < 2; q++)
     {
         for (PIX i = pixHeight / 2; i > 0; i--)
         {
             // first half of the destination row: take the first texel of each source pair
             for (PIX j = pixWidth / 2; j > 0; j--)
             {
                 *pulDstMipmap = *(pulSrcMipmap + offset);
                 pulSrcMipmap += 2;
                 pulDstMipmap++;
             }
             // second half of the destination row: take the second texel of each source pair
             for (PIX j = pixWidth / 2; j > 0; j--)
             {
                 *pulDstMipmap = *(pulSrcMipmap + offset + 1);
                 pulSrcMipmap += 2;
                 pulDstMipmap++;
             }
             // skip ahead in the source by the row modulo before the next destination row
             pulSrcMipmap += ulRowModulo;
        }
        // second pass samples pixWidth*2 texels further on
        // (presumably one source row down, if the source is twice as wide as the destination - confirm)
        offset = pixWidth * 2;
     }
   #endif
  }
 }
@ -649,7 +644,7 @@ __int64 mmShifter = 0;
 __int64 mmMask  = 0;
 ULONG *pulDitherTable;
-#ifdef USE_PORTABLE_C
+#if !(defined __MSVC_INLINE__) && !(defined __GNU_INLINE_X86_32__)
 extern const UBYTE *pubClipByte;
 // increment a byte without overflowing it
 static inline void IncrementByteWithClip( UBYTE &ub, SLONG slAdd)
@ -778,35 +773,7 @@ void DitherBitmap( INDEX iDitherType, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth
 // ------------------------------- ordered matrix dithering routine
 ditherOrder:
-#if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
  union uConv
  {
    ULONG val;
    DWORD dwords[2];
    UWORD words[4];
    WORD  iwords[4];
    UBYTE bytes[8];
  };
  for (int i=0; i<pixHeight; i++) {
    int idx = i&3;
    uConv dith;
    dith.val = pulDitherTable[idx];
    for (int j=0; j<4; j++) { dith.words[j] >>= mmShifter; }
    dith.val &= mmMask;
    uConv* src = (uConv*)(pulSrc+i*pixWidth);
    uConv* dst = (uConv*)(pulDst+i*pixWidth);
    for (int j=0; j<pixWidth; j+=2) {
      uConv p=src[0];
      for (int k=0; k<8; k++) {
        IncrementByteWithClip(p.bytes[k], dith.bytes[k]);
      }
      dst[0] = p;
      src++;
      dst++;
    }
  }
 #elif (defined __MSVC_INLINE__)
  __asm {
    mov     esi,D [pulSrc]
    mov     edi,D [pulDst]
@ -852,7 +819,7 @@ nextRowO:
    emms;
  }
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp;
  __asm__ __volatile__ (
    "movl     %[pulSrc], %%esi           \n\t"
@ -912,7 +879,33 @@ nextRowO:
  );
 #else
-  #error Write inline asm for your platform.
+  union uConv
  {
    // Overlapping views of one packed value, used to emulate the per-lane
    // operations of the asm variants in plain C.
    // NOTE(review): if ULONG is 32 bits wide, 'val' covers only the first 4 of
    // the 8 bytes exposed by the array members - confirm the intended width.
    ULONG val;
    DWORD dwords[2];
    UWORD words[4];
    WORD  iwords[4];
    UBYTE bytes[8];
  };
  // Portable ordered dithering: adds a per-row pattern to every byte, clipped.
  for (int i=0; i<pixHeight; i++) {
    int idx = i&3;  // the dither table is indexed with a period of 4 rows
    uConv dith;
    dith.val = pulDitherTable[idx];
    // scale the pattern down per 16-bit lane, then mask it
    // NOTE(review): only 'val' is assigned before words[0..3] are shifted and
    // bytes[0..7] are read below; if 'val' is narrower than 8 bytes the upper
    // lanes are uninitialised here - confirm.
    for (int j=0; j<4; j++) { dith.words[j] >>= mmShifter; }
    dith.val &= mmMask;
    // rows are addressed with a stride of pixWidth ULONGs
    uConv* src = (uConv*)(pulSrc+i*pixWidth);
    uConv* dst = (uConv*)(pulDst+i*pixWidth);
    // one uConv per iteration while the counter advances by 2
    // (presumably two 4-byte texels per uConv - confirm)
    for (int j=0; j<pixWidth; j+=2) {
      uConv p=src[0];
      for (int k=0; k<8; k++) {
        IncrementByteWithClip(p.bytes[k], dith.bytes[k]);
      }
      dst[0] = p;
      src++;
      dst++;
    }
  }
 #endif
  goto theEnd;
@ -924,34 +917,7 @@ ditherError:
  if( pulDst!=pulSrc) memcpy( pulDst, pulSrc, pixCanvasWidth*pixCanvasHeight *BYTES_PER_TEXEL);
  // slModulo+=4;
  // now, dither destination
-#if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
  #if 1 //SEB doesn't works....
  for (int i=0; i<pixHeight-1; i++) {
    int step = (i&1)?-4:+4;
    const UBYTE ubMask = (mmErrDiffMask&0xff);
    UBYTE *src = ((UBYTE*)pulDst)+i*pixCanvasWidth*4;
    if(i&1) src+=pixWidth*4;
    // left to right or right to left
    for (int j=0; j<pixWidth-1; j++) {
      uConv p1, p3, p5, p7;
      src+=step;
      for (int k=0; k<4; k++) { p1.words[k] = src[k]&ubMask; }
      //p1.val &= mmErrDiffMask;
      for (int k=0; k<4; k++) { p3.words[k] = (p1.words[k]*3)>>4;
                                p5.words[k] = (p1.words[k]*5)>>4;
                                p7.words[k] = (p1.words[k]*7)>>4; }
      for (int k=0; k<4; k++) { p1.words[k] -= (p3.words[k] + p5.words[k] + p7.words[k]);}
      for (int k=0; k<4; k++) { 
        IncrementByteWithClip( src[k + step]                 , p7.words[k]);
        IncrementByteWithClip( src[pixCanvasWidth*4 -step +k], p5.words[k]);
        IncrementByteWithClip( src[pixCanvasWidth*4 +0    +k], p3.words[k]);
        IncrementByteWithClip( src[pixCanvasWidth*4 +step +k], p1.words[k]);
      }
    }
  }
  #endif
 #elif (defined __MSVC_INLINE__)
  __asm {
    pxor    mm0,mm0
    mov     esi,D [pulDst]
@ -1046,7 +1012,7 @@ allDoneE:
    emms;
  }
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "pxor    %%mm0, %%mm0                 \n\t"
    "movl    %[pulDst], %%esi             \n\t"
@ -1157,7 +1123,32 @@ allDoneE:
  );
 #else
-  #error Write inline asm for your platform.
+  #if 1 //SEB doesn't works....
  // Portable error-diffusion pass, applied in place to pulDst one byte at a
  // time (4 bytes per texel). Rows are scanned in alternating directions and
  // the loop stops before the last row (i < pixHeight-1).
  // NOTE(review): the marker comment on the #if above suggests the author did
  // not trust this path - verify its output against the asm variants.
  for (int i=0; i<pixHeight-1; i++) {
    int step = (i&1)?-4:+4;  // byte step to the next texel; negative on odd rows
    const UBYTE ubMask = (mmErrDiffMask&0xff);  // low byte of the error mask, applied to each channel
    UBYTE *src = ((UBYTE*)pulDst)+i*pixCanvasWidth*4;  // first byte of row i
    if(i&1) src+=pixWidth*4;  // odd rows start at column pixWidth and step backwards
    // left to right or right to left
    // NOTE(review): src is advanced before it is read, so even rows process
    // columns 1..pixWidth-1 and odd rows pixWidth-1..1; the writes below also
    // reach column pixWidth - presumably the canvas row has room; confirm.
    for (int j=0; j<pixWidth-1; j++) {
      uConv p1, p3, p5, p7;  // uConv is the union declared in the ordered-dither branch
      src+=step;
      // per-channel error = the masked bits of the current texel
      for (int k=0; k<4; k++) { p1.words[k] = src[k]&ubMask; }
      //p1.val &= mmErrDiffMask;
      // split the error into 3/16, 5/16 and 7/16 shares
      for (int k=0; k<4; k++) { p3.words[k] = (p1.words[k]*3)>>4;
                                p5.words[k] = (p1.words[k]*5)>>4;
                                p7.words[k] = (p1.words[k]*7)>>4; }
      // p1 keeps whatever is left after the three shares are taken out
      for (int k=0; k<4; k++) { p1.words[k] -= (p3.words[k] + p5.words[k] + p7.words[k]);}
      // distribute: 7/16 to the next texel in scan direction, 5/16 to the row
      // below one texel behind, 3/16 directly below, remainder below one ahead
      // NOTE(review): resembles a Floyd-Steinberg kernel but with the 3 and 5
      // positions exchanged - confirm this matches the asm variants.
      for (int k=0; k<4; k++) { 
        IncrementByteWithClip( src[k + step]                 , p7.words[k]);
        IncrementByteWithClip( src[pixCanvasWidth*4 -step +k], p5.words[k]);
        IncrementByteWithClip( src[pixCanvasWidth*4 +0    +k], p3.words[k]);
        IncrementByteWithClip( src[pixCanvasWidth*4 +step +k], p1.words[k]);
      }
    }
  }
  #endif
 #endif
  goto theEnd;
@ -1265,7 +1256,7 @@ extern "C" {
 }
-#ifdef USE_PORTABLE_C
+#if !(defined USE_MMX_INTRINSICS) && !(defined __MSVC_INLINE__) && !(defined __GNU_INLINE_X86_32__)
 typedef SWORD ExtPix[4];
 static inline void extpix_fromi64(ExtPix &pix, const __int64 i64)
@ -1632,265 +1623,6 @@ void FilterBitmap( INDEX iFilter, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth, PI
    _mm_empty();  // we're done, clear out the MMX registers!
 #elif (defined USE_PORTABLE_C)
    slModulo1 /= BYTES_PER_TEXEL;  // C++ handles incrementing by sizeof type
    slCanvasWidth /= BYTES_PER_TEXEL;  // C++ handles incrementing by sizeof type
    ULONG *src = pulSrc;
    ULONG *dst = pulDst;
    ULONG *rowptr = aulRows;
    ExtPix rmm1={0}, rmm2={0}, rmm3={0}, rmm4={0}, rmm5={0}, rmm6={0}, rmm7={0};
    #define EXTPIXFROMINT64(x) ExtPix r##x; extpix_fromi64(r##x, x);
    EXTPIXFROMINT64(mmCm);
    EXTPIXFROMINT64(mmCe);
    EXTPIXFROMINT64(mmCc);
    EXTPIXFROMINT64(mmEch);
    EXTPIXFROMINT64(mmEcl);
    EXTPIXFROMINT64(mmEe);
    EXTPIXFROMINT64(mmEm);
    EXTPIXFROMINT64(mmMm);
    EXTPIXFROMINT64(mmMe);
    EXTPIXFROMINT64(mmMc);
    EXTPIXFROMINT64(mmAdd);
    EXTPIXFROMINT64(mmInvDiv);
    #undef EXTPIXFROMINT64
    // ----------------------- process upper left corner
    extend_pixel(src[0], rmm1);
    extend_pixel(src[1], rmm2);
    extend_pixel(src[pixCanvasWidth], rmm3);
    extend_pixel(src[pixCanvasWidth+1], rmm4);
    extpix_add(rmm2, rmm3);
    extpix_mul(rmm1, rmmCm);
    extpix_mul(rmm2, rmmCe);
    extpix_mul(rmm4, rmmCc);
    extpix_add(rmm1, rmm2);
    extpix_add(rmm1, rmm4);
    extpix_adds(rmm1, rmmAdd);
    extpix_mulhi(rmm1, rmmInvDiv);
    *(rowptr++) = unextend_pixel(rmm1);
    src++;
    // ----------------------- process upper edge pixels
    for (PIX i = pixWidth - 2; i != 0; i--)
    {
        extend_pixel(src[-1], rmm1);
        extend_pixel(src[0], rmm2);
        extend_pixel(src[1], rmm3);
        extend_pixel(src[pixCanvasWidth-1], rmm4);
        extend_pixel(src[pixCanvasWidth], rmm5);
        extend_pixel(src[pixCanvasWidth+1], rmm6);
        extpix_add(rmm1, rmm3);
        extpix_add(rmm4, rmm6);
        extpix_mul(rmm1, rmmEch);
        extpix_mul(rmm2, rmmEm);
        extpix_mul(rmm4, rmmEcl);
        extpix_mul(rmm5, rmmEe);
        extpix_add(rmm1, rmm2);
        extpix_add(rmm1, rmm4);
        extpix_add(rmm1, rmm5);
        extpix_adds(rmm1, rmmAdd);
        extpix_mulhi(rmm1, rmmInvDiv);
        *(rowptr++) = unextend_pixel(rmm1);
        src++;
    }
    // ----------------------- process upper right corner
    extend_pixel(src[-1], rmm1);
    extend_pixel(src[0], rmm2);
    extend_pixel(src[pixCanvasWidth-1], rmm3);
    extend_pixel(src[pixCanvasWidth], rmm4);
    extpix_add(rmm1, rmm4);
    extpix_mul(rmm1, rmmCe);
    extpix_mul(rmm2, rmmCm);
    extpix_mul(rmm3, rmmCc);
    extpix_add(rmm1, rmm2);
    extpix_add(rmm1, rmm3);
    extpix_adds(rmm1, rmmAdd);
    extpix_mulhi(rmm1, rmmInvDiv);
    *rowptr = unextend_pixel(rmm1);
 // ----------------------- process bitmap middle pixels
    dst += slCanvasWidth;
    src += slModulo1;
    // for each row
    for (size_t i = pixHeight-2; i != 0; i--)  // rowLoop
    {
        rowptr = aulRows;
        // process left edge pixel
        extend_pixel(src[-pixCanvasWidth], rmm1);
        extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
        extend_pixel(src[0], rmm3);
        extend_pixel(src[1], rmm4);
        extend_pixel(src[pixCanvasWidth], rmm5);
        extend_pixel(src[pixCanvasWidth+1], rmm6);
        extpix_add(rmm1, rmm5);
        extpix_add(rmm2, rmm6);
        extpix_mul(rmm1, rmmEch);
        extpix_mul(rmm2, rmmEcl);
        extpix_mul(rmm3, rmmEm);
        extpix_mul(rmm4, rmmEe);
        extpix_add(rmm1, rmm2);
        extpix_add(rmm1, rmm3);
        extpix_add(rmm1, rmm4);
        extpix_adds(rmm1, rmmAdd);
        extpix_mulhi(rmm1, rmmInvDiv);
        dst[-pixCanvasWidth] = *rowptr;
        *(rowptr++) = unextend_pixel(rmm1);
        src++;
        dst++;
        // for each pixel in current row
        for (size_t j = pixWidth-2; j != 0; j--)  // pixLoop
        {
            // prepare upper convolution row
            extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
            extend_pixel(src[-pixCanvasWidth], rmm2);
            extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
            // prepare middle convolution row
            extend_pixel(src[-1], rmm4);
            extend_pixel(src[0], rmm5);
            extend_pixel(src[1], rmm6);
            // free some registers
            extpix_add(rmm1, rmm3);
            extpix_add(rmm2, rmm4);
            extpix_mul(rmm5, rmmMm);
            // prepare lower convolution row
            extend_pixel(src[pixCanvasWidth-1], rmm3);
            extend_pixel(src[pixCanvasWidth], rmm4);
            extend_pixel(src[pixCanvasWidth+1], rmm7);
            // calc weightened value
            extpix_add(rmm2, rmm6);
            extpix_add(rmm1, rmm3);
            extpix_add(rmm2, rmm4);
            extpix_add(rmm1, rmm7);
            extpix_mul(rmm2, rmmMe);
            extpix_mul(rmm1, rmmMc);
            extpix_add(rmm2, rmm5);
            extpix_add(rmm1, rmm2);
            // calc and store wightened value
            extpix_adds(rmm1, rmmAdd);
            extpix_mulhi(rmm1, rmmInvDiv);
            dst[-pixCanvasWidth] = *rowptr;
            *(rowptr++) = unextend_pixel(rmm1);
            // advance to next pixel
            src++;
            dst++;
        }
        // process right edge pixel
        extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
        extend_pixel(src[-pixCanvasWidth], rmm2);
        extend_pixel(src[-1], rmm3);
        extend_pixel(src[0], rmm4);
        extend_pixel(src[pixCanvasWidth-1], rmm5);
        extend_pixel(src[pixCanvasWidth], rmm6);
        extpix_add(rmm1, rmm5);
        extpix_add(rmm2, rmm6);
        extpix_mul(rmm1, rmmEcl);
        extpix_mul(rmm2, rmmEch);
        extpix_mul(rmm3, rmmEe);
        extpix_mul(rmm4, rmmEm);
        extpix_add(rmm1, rmm2);
        extpix_add(rmm1, rmm3);
        extpix_add(rmm1, rmm4);
        extpix_adds(rmm1, rmmAdd);
        extpix_mulhi(rmm1, rmmInvDiv);
        dst[-pixCanvasWidth] = *rowptr;
        *rowptr = unextend_pixel(rmm1);
        // advance to next row
        src += slModulo1;
        dst += slModulo1;
    }
    // ----------------------- process lower left corner
    rowptr = aulRows;
    extend_pixel(src[-pixCanvasWidth], rmm1);
    extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
    extend_pixel(src[0], rmm3);
    extend_pixel(src[1], rmm4);
    extpix_add(rmm1, rmm4);
    extpix_mul(rmm1, rmmCe);
    extpix_mul(rmm2, rmmCc);
    extpix_mul(rmm3, rmmCm);
    extpix_add(rmm1, rmm2);
    extpix_add(rmm1, rmm3);
    extpix_adds(rmm1, rmmAdd);
    extpix_mulhi(rmm1, rmmInvDiv);
    dst[-pixCanvasWidth] = *rowptr;
    dst[0] = unextend_pixel(rmm1);
    src++;
    dst++;
    rowptr++;
    // ----------------------- process lower edge pixels
    for (size_t i = pixWidth-2; i != 0; i--)  // lowerLoop
    {
        // for each pixel
        extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
        extend_pixel(src[-pixCanvasWidth], rmm2);
        extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
        extend_pixel(src[-1], rmm4);
        extend_pixel(src[0], rmm5);
        extend_pixel(src[1], rmm6);
        extpix_add(rmm1, rmm3);
        extpix_add(rmm4, rmm6);
        extpix_mul(rmm1, rmmEcl);
        extpix_mul(rmm2, rmmEe);
        extpix_mul(rmm4, rmmEch);
        extpix_mul(rmm5, rmmEm);
        extpix_add(rmm1, rmm2);
        extpix_add(rmm1, rmm4);
        extpix_add(rmm1, rmm5);
        extpix_adds(rmm1, rmmAdd);
        extpix_mulhi(rmm1, rmmInvDiv);
        dst[-pixCanvasWidth] = *rowptr;
        dst[0] = unextend_pixel(rmm1);
        // advance to next pixel
        src++;
        dst++;
        rowptr++;
    }
    // ----------------------- lower right corners
    extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
    extend_pixel(src[-pixCanvasWidth], rmm2);
    extend_pixel(src[-1], rmm3);
    extend_pixel(src[0], rmm4);
    extpix_add(rmm2, rmm3);
    extpix_mul(rmm1, rmmCc);
    extpix_mul(rmm2, rmmCe);
    extpix_mul(rmm4, rmmCm);
    extpix_add(rmm1, rmm2);
    extpix_add(rmm1, rmm4);
    extpix_adds(rmm1, rmmAdd);
    extpix_mulhi(rmm1, rmmInvDiv);
    dst[-pixCanvasWidth] = *rowptr;
    dst[0] = unextend_pixel(rmm1);
 #elif (defined __MSVC_INLINE__)
  __asm {
    cld
@ -2204,7 +1936,7 @@ lowerLoop:
    emms
  }
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  FB_pulSrc = pulSrc;
  FB_pulDst = pulDst;
@ -2537,7 +2269,264 @@ lowerLoop:
  );
 #else
-  #error Write inline asm for your platform.
+    slModulo1 /= BYTES_PER_TEXEL;  // bytes -> texel units; ULONG pointer arithmetic below scales by sizeof
    slCanvasWidth /= BYTES_PER_TEXEL;  // bytes -> texel units, same reason
    ULONG *src = pulSrc;      // read cursor
    ULONG *dst = pulDst;      // write cursor
    ULONG *rowptr = aulRows;  // cursor into the aulRows row buffer (declared outside this view)
    // scratch accumulators (names presumably mirror the mm1..mm7 registers of the asm variants)
    ExtPix rmm1={0}, rmm2={0}, rmm3={0}, rmm4={0}, rmm5={0}, rmm6={0}, rmm7={0};
    // Declares a local ExtPix named r<x> and fills it from the __int64 variable
    // <x> through extpix_fromi64, e.g. mmCm -> rmmCm.
    #define EXTPIXFROMINT64(x) ExtPix r##x; extpix_fromi64(r##x, x);
    // weights applied in the corner sections
    EXTPIXFROMINT64(mmCm);
    EXTPIXFROMINT64(mmCe);
    EXTPIXFROMINT64(mmCc);
    // weights applied in the edge sections
    EXTPIXFROMINT64(mmEch);
    EXTPIXFROMINT64(mmEcl);
    EXTPIXFROMINT64(mmEe);
    EXTPIXFROMINT64(mmEm);
    // weights applied in the interior (middle) pixel loop
    EXTPIXFROMINT64(mmMm);
    EXTPIXFROMINT64(mmMe);
    EXTPIXFROMINT64(mmMc);
    // added to every weighted sum, then scaled via extpix_mulhi with mmInvDiv
    EXTPIXFROMINT64(mmAdd);
    EXTPIXFROMINT64(mmInvDiv);
    #undef EXTPIXFROMINT64
    // ----------------------- process upper left corner
    extend_pixel(src[0], rmm1);
    extend_pixel(src[1], rmm2);
    extend_pixel(src[pixCanvasWidth], rmm3);
    extend_pixel(src[pixCanvasWidth+1], rmm4);
    extpix_add(rmm2, rmm3);
    extpix_mul(rmm1, rmmCm);
    extpix_mul(rmm2, rmmCe);
    extpix_mul(rmm4, rmmCc);
    extpix_add(rmm1, rmm2);
    extpix_add(rmm1, rmm4);
    extpix_adds(rmm1, rmmAdd);
    extpix_mulhi(rmm1, rmmInvDiv);
    *(rowptr++) = unextend_pixel(rmm1);
    src++;
    // ----------------------- process upper edge pixels
    for (PIX i = pixWidth - 2; i != 0; i--)
    {
        extend_pixel(src[-1], rmm1);
        extend_pixel(src[0], rmm2);
        extend_pixel(src[1], rmm3);
        extend_pixel(src[pixCanvasWidth-1], rmm4);
        extend_pixel(src[pixCanvasWidth], rmm5);
        extend_pixel(src[pixCanvasWidth+1], rmm6);
        extpix_add(rmm1, rmm3);
        extpix_add(rmm4, rmm6);
        extpix_mul(rmm1, rmmEch);
        extpix_mul(rmm2, rmmEm);
        extpix_mul(rmm4, rmmEcl);
        extpix_mul(rmm5, rmmEe);
        extpix_add(rmm1, rmm2);
        extpix_add(rmm1, rmm4);
        extpix_add(rmm1, rmm5);
        extpix_adds(rmm1, rmmAdd);
        extpix_mulhi(rmm1, rmmInvDiv);
        *(rowptr++) = unextend_pixel(rmm1);
        src++;
    }
    // ----------------------- process upper right corner
    extend_pixel(src[-1], rmm1);
    extend_pixel(src[0], rmm2);
    extend_pixel(src[pixCanvasWidth-1], rmm3);
    extend_pixel(src[pixCanvasWidth], rmm4);
    extpix_add(rmm1, rmm4);
    extpix_mul(rmm1, rmmCe);
    extpix_mul(rmm2, rmmCm);
    extpix_mul(rmm3, rmmCc);
    extpix_add(rmm1, rmm2);
    extpix_add(rmm1, rmm3);
    extpix_adds(rmm1, rmmAdd);
    extpix_mulhi(rmm1, rmmInvDiv);
    *rowptr = unextend_pixel(rmm1);
 // ----------------------- process bitmap middle pixels
    dst += slCanvasWidth;
    src += slModulo1;
    // for each row
    for (size_t i = pixHeight-2; i != 0; i--)  // rowLoop
    {
        rowptr = aulRows;
        // process left edge pixel
        extend_pixel(src[-pixCanvasWidth], rmm1);
        extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
        extend_pixel(src[0], rmm3);
        extend_pixel(src[1], rmm4);
        extend_pixel(src[pixCanvasWidth], rmm5);
        extend_pixel(src[pixCanvasWidth+1], rmm6);
        extpix_add(rmm1, rmm5);
        extpix_add(rmm2, rmm6);
        extpix_mul(rmm1, rmmEch);
        extpix_mul(rmm2, rmmEcl);
        extpix_mul(rmm3, rmmEm);
        extpix_mul(rmm4, rmmEe);
        extpix_add(rmm1, rmm2);
        extpix_add(rmm1, rmm3);
        extpix_add(rmm1, rmm4);
        extpix_adds(rmm1, rmmAdd);
        extpix_mulhi(rmm1, rmmInvDiv);
        dst[-pixCanvasWidth] = *rowptr;
        *(rowptr++) = unextend_pixel(rmm1);
        src++;
        dst++;
        // for each pixel in current row
        for (size_t j = pixWidth-2; j != 0; j--)  // pixLoop
        {
            // Interior texel: reads the 3x3 neighbourhood around src[0].
            // Assuming extpix_add(a,b) accumulates b into a and extpix_mul(a,b)
            // scales a by b (helpers are defined outside this view), the four
            // corner samples end up in rmm1 weighted by rmmMc, the four
            // edge-adjacent samples in rmm2 weighted by rmmMe, and the centre
            // in rmm5 weighted by rmmMm.
            // NOTE(review): j is size_t, so pixWidth < 2 would wrap the counter -
            // presumably callers never pass such widths; confirm.
            // prepare upper convolution row
            extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
            extend_pixel(src[-pixCanvasWidth], rmm2);
            extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
            // prepare middle convolution row
            extend_pixel(src[-1], rmm4);
            extend_pixel(src[0], rmm5);
            extend_pixel(src[1], rmm6);
            // free some registers (rmm3 and rmm4 are reloaded with the lower row next)
            extpix_add(rmm1, rmm3);
            extpix_add(rmm2, rmm4);
            extpix_mul(rmm5, rmmMm);
            // prepare lower convolution row
            extend_pixel(src[pixCanvasWidth-1], rmm3);
            extend_pixel(src[pixCanvasWidth], rmm4);
            extend_pixel(src[pixCanvasWidth+1], rmm7);
            // calc weighted value
            extpix_add(rmm2, rmm6);
            extpix_add(rmm1, rmm3);
            extpix_add(rmm2, rmm4);
            extpix_add(rmm1, rmm7);
            extpix_mul(rmm2, rmmMe);
            extpix_mul(rmm1, rmmMc);
            extpix_add(rmm2, rmm5);
            extpix_add(rmm1, rmm2);
            // calc and store weighted value
            extpix_adds(rmm1, rmmAdd);
            extpix_mulhi(rmm1, rmmInvDiv);
            // write the buffered value from *rowptr one row up in the destination,
            // then buffer this texel's result in its place (presumably so that
            // in-place filtering still sees unfiltered texels above - confirm)
            dst[-pixCanvasWidth] = *rowptr;
            *(rowptr++) = unextend_pixel(rmm1);
            // advance to next pixel
            src++;
            dst++;
        }
        // process right edge pixel
        extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
        extend_pixel(src[-pixCanvasWidth], rmm2);
        extend_pixel(src[-1], rmm3);
        extend_pixel(src[0], rmm4);
        extend_pixel(src[pixCanvasWidth-1], rmm5);
        extend_pixel(src[pixCanvasWidth], rmm6);
        extpix_add(rmm1, rmm5);
        extpix_add(rmm2, rmm6);
        extpix_mul(rmm1, rmmEcl);
        extpix_mul(rmm2, rmmEch);
        extpix_mul(rmm3, rmmEe);
        extpix_mul(rmm4, rmmEm);
        extpix_add(rmm1, rmm2);
        extpix_add(rmm1, rmm3);
        extpix_add(rmm1, rmm4);
        extpix_adds(rmm1, rmmAdd);
        extpix_mulhi(rmm1, rmmInvDiv);
        dst[-pixCanvasWidth] = *rowptr;
        *rowptr = unextend_pixel(rmm1);
        // advance to next row
        src += slModulo1;
        dst += slModulo1;
    }
    // ----------------------- process lower left corner
    rowptr = aulRows;
    extend_pixel(src[-pixCanvasWidth], rmm1);
    extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
    extend_pixel(src[0], rmm3);
    extend_pixel(src[1], rmm4);
    extpix_add(rmm1, rmm4);
    extpix_mul(rmm1, rmmCe);
    extpix_mul(rmm2, rmmCc);
    extpix_mul(rmm3, rmmCm);
    extpix_add(rmm1, rmm2);
    extpix_add(rmm1, rmm3);
    extpix_adds(rmm1, rmmAdd);
    extpix_mulhi(rmm1, rmmInvDiv);
    dst[-pixCanvasWidth] = *rowptr;
    dst[0] = unextend_pixel(rmm1);
    src++;
    dst++;
    rowptr++;
    // ----------------------- process lower edge pixels
    for (size_t i = pixWidth-2; i != 0; i--)  // lowerLoop
    {
        // for each pixel
        extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
        extend_pixel(src[-pixCanvasWidth], rmm2);
        extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
        extend_pixel(src[-1], rmm4);
        extend_pixel(src[0], rmm5);
        extend_pixel(src[1], rmm6);
        extpix_add(rmm1, rmm3);
        extpix_add(rmm4, rmm6);
        extpix_mul(rmm1, rmmEcl);
        extpix_mul(rmm2, rmmEe);
        extpix_mul(rmm4, rmmEch);
        extpix_mul(rmm5, rmmEm);
        extpix_add(rmm1, rmm2);
        extpix_add(rmm1, rmm4);
        extpix_add(rmm1, rmm5);
        extpix_adds(rmm1, rmmAdd);
        extpix_mulhi(rmm1, rmmInvDiv);
        dst[-pixCanvasWidth] = *rowptr;
        dst[0] = unextend_pixel(rmm1);
        // advance to next pixel
        src++;
        dst++;
        rowptr++;
    }
    // ----------------------- lower right corners
    extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
    extend_pixel(src[-pixCanvasWidth], rmm2);
    extend_pixel(src[-1], rmm3);
    extend_pixel(src[0], rmm4);
    extpix_add(rmm2, rmm3);
    extpix_mul(rmm1, rmmCc);
    extpix_mul(rmm2, rmmCe);
    extpix_mul(rmm4, rmmCm);
    extpix_add(rmm1, rmm2);
    extpix_add(rmm1, rmm4);
    extpix_adds(rmm1, rmmAdd);
    extpix_mulhi(rmm1, rmmInvDiv);
    dst[-pixCanvasWidth] = *rowptr;
    dst[0] = unextend_pixel(rmm1);
 #endif
  // all done (finally)
--- a/Sources/Engine/Graphics/OpenGL.h
+++ b/Sources/Engine/Graphics/OpenGL.h
@ -89,20 +89,14 @@ extern void  (__stdcall *pglPNTrianglesfATI)( GLenum pname, GLfloat param);
 inline void glCOLOR( COLOR col)
 {
 /* rcg10052001 Platform-wrappers. */
-#if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
 	col = ( ((col << 24)            ) |
            ((col << 8) & 0x00FF0000) |
            ((col >> 8) & 0x0000FF00) |
            ((col >> 24)            ) );
 #elif (defined __MSVC_INLINE__)
  __asm {
    mov     eax,dword ptr [col]
    bswap   eax
    mov     dword ptr [col],eax
  }
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "bswapl   %%eax    \n\t"
        : "=a" (col)
@ -110,7 +104,11 @@ inline void glCOLOR( COLOR col)
  );
 #else
-  #error please define for your platform.
+  col = ( ((col << 24)            ) |  // byte 0 -> byte 3
          ((col << 8) & 0x00FF0000) |  // byte 1 -> byte 2
          ((col >> 8) & 0x0000FF00) |  // byte 2 -> byte 1
          ((col >> 24)            ) ); // byte 3 -> byte 0
  // The four bytes of col are now reversed, matching the bswap instruction used
  // by the inline-asm branches. The outer terms are unmasked, so this assumes
  // COLOR is an unsigned 32-bit type - TODO confirm.
 #endif
  pglColor4ubv((GLubyte*)&col);
--- a/Sources/Engine/Graphics/TextureEffects.cpp
+++ b/Sources/Engine/Graphics/TextureEffects.cpp
@ -32,11 +32,9 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #define W  word ptr
 #define B  byte ptr
-#if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
 #define ASMOPT 0
 #elif (defined __MSVC_INLINE__)
 #define ASMOPT 1
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
 #define ASMOPT 1
 #else
 #define ASMOPT 0
@ -1285,8 +1283,7 @@ static void RenderWater(void)
  { // SUB-SAMPLING
    SLONG slHeightMapStep, slHeightRowStep;
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
  #if (defined __MSVC_INLINE__)
    __asm {
      push    ebx
      bsf     ecx,D [_pixTexWidth]
@ -1357,7 +1354,7 @@ pixLoop:
      pop     ebx
    }
-  #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
    // rcg12152001 needed extra registers. :(
    _slHeightMapStep_renderWater = slHeightMapStep;
    _pixBaseWidth_renderWater = pixBaseWidth;
@ -1460,10 +1457,6 @@ pixLoop:
          "cc", "memory"
    );
  #else
    #error fill in for your platform.
  #endif
 #else
    PIX pixPos, pixDU, pixDV;
@ -1626,7 +1619,7 @@ pixLoop2:
      pop     ebx
    }
-  #elif (defined __GNU_INLINE__)
+  #elif (defined __GNU_INLINE_X86_32__)
    __asm__ __volatile__ (
      "bsfl      %[pixBaseWidth], %%eax             \n\t"
      "movl      $32, %%edx                         \n\t"
@ -2146,7 +2139,7 @@ pixLoop4:
      pop     ebx
    }
-  #elif (defined __GNU_INLINE__)
+  #elif (defined __GNU_INLINE_X86_32__)
    __asm__ __volatile__ (
      "bsfl      %[pixBaseWidth], %%eax             \n\t"
      "movl      $32, %%edx                         \n\t"
@ -2976,7 +2969,7 @@ pixDone:
    pop     ebx
  }
- #elif (defined __GNU_INLINE__)
+ #elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "movl    %[slColumnModulo], %%edx             \n\t"
    "movl    %[slBufferMask], %%ecx               \n\t"
@ -3119,7 +3112,7 @@ pixLoopF:
    jnz     rowLoopF
    pop     ebx
  }
- #elif (defined __GNU_INLINE__)
+ #elif (defined __GNU_INLINE_X86_32__)
  _pubHeat_RenderPlasmaFire = pubHeat;  // ran out of registers.  :/
  __asm__ __volatile__ (
    "movl    %[slHeatRowStep], %%eax     \n\t"
--- a/Sources/Engine/Light/LayerMixer.cpp
+++ b/Sources/Engine/Light/LayerMixer.cpp
@ -40,16 +40,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #define W  word ptr
 #define B  byte ptr
 #if (defined USE_PORTABLE_C)
  #define ASMOPT 0
 #elif (defined __MSVC_INLINE__)
  #define ASMOPT 1
 #elif (defined __GNU_INLINE__)
  #define ASMOPT 1
 #else
  #define ASMOPT 0
 #endif
 extern INDEX shd_bFineQuality;
 extern INDEX shd_iFiltering;
 extern INDEX shd_iDithering;
@ -290,8 +280,7 @@ void CLayerMixer::AddAmbientPoint(void)
  _slLightMax<<=7;
  _slLightStep>>=1;
-#if (ASMOPT == 1)
+#if (defined __MSVC_INLINE__)
 #if (defined __MSVC_INLINE__)
  __asm {
    // prepare interpolants
    movd    mm0,D [_slL2Row]
@ -364,7 +353,7 @@ skipPixel:
    emms
  }
- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp1, tmp2;
  __asm__ __volatile__ (
    // prepare interpolants
@ -439,10 +428,6 @@ skipPixel:
        : FPU_REGS, MMX_REGS, "eax", "ecx", "edi", "cc", "memory"
  );
 #else
  #error Write inline asm for your platform.
 #endif
 #else
    // !!! FIXME WARNING: I have not checked this code, and it could be
@ -496,8 +481,7 @@ void CLayerMixer::AddAmbientMaskPoint( UBYTE *pubMask, UBYTE ubMask)
  _slLightStep>>=1;
-#if (ASMOPT == 1)
+#if (defined __MSVC_INLINE__)
 #if (defined __MSVC_INLINE__)
  __asm {
    // prepare interpolants
    movd    mm0,D [_slL2Row]
@ -576,7 +560,7 @@ skipPixel:
    emms
  }
- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp1, tmp2;
  __asm__ __volatile__ (
    // prepare interpolants
@ -660,10 +644,6 @@ skipPixel:
          "cc", "memory"
  );
 #else
  #error Please write inline assembly for your platform.
 #endif
 #else   // Portable C version...
  UBYTE* pubLayer = (UBYTE*)_pulLayer;
@ -723,8 +703,7 @@ void CLayerMixer::AddDiffusionPoint(void)
  _slLightMax<<=7;
  _slLightStep>>=1;
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
 #if (defined __MSVC_INLINE__)
  __asm {
    // prepare interpolants
    movd    mm0,D [_slL2Row]
@ -796,7 +775,7 @@ skipPixel:
    emms
  }
- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp1, tmp2;
  __asm__ __volatile__ (
    // prepare interpolants
@ -871,10 +850,6 @@ skipPixel:
        : FPU_REGS, MMX_REGS, "eax", "ecx", "edi", "cc", "memory"
  );
 #else
  #error Write inline assembly for your platform.
 #endif
 #else
  // for each pixel in the shadow map
  UBYTE* pubLayer = (UBYTE*)_pulLayer;
@ -929,8 +904,7 @@ void CLayerMixer::AddDiffusionMaskPoint( UBYTE *pubMask, UBYTE ubMask)
  _slLightMax<<=7;
  _slLightStep>>=1;
-#if (ASMOPT == 1)
+#if (defined __MSVC_INLINE__)
 #if (defined __MSVC_INLINE__)
  __asm {
    // prepare interpolants
    movd    mm0,D [_slL2Row]
@ -1008,7 +982,7 @@ skipPixel:
    emms
  }
- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp1, tmp2;
  __asm__ __volatile__ (
    // prepare interpolants
@ -1091,11 +1065,6 @@ skipPixel:
          "cc", "memory"
  );
 #else
  #error Write inline ASM for your platform.
 #endif
 #else
  // for each pixel in the shadow map
@ -1201,8 +1170,7 @@ BOOL CLayerMixer::PrepareOneLayerPoint( CBrushShadowLayer *pbsl, BOOL bNoMask)
  FLOAT fDL2oDV     = fDDL2oDV + 2*(lm_vStepV%v00);
  //_v00 = v00;
-#if ((ASMOPT == 1) && (!defined __GNU_INLINE__))
+#if (defined __MSVC_INLINE__)
 #if (defined __MSVC_INLINE__)
  __asm {
    fld     D [fDDL2oDU]
    fadd    D [fDDL2oDU]
@ -1230,12 +1198,6 @@ BOOL CLayerMixer::PrepareOneLayerPoint( CBrushShadowLayer *pbsl, BOOL bNoMask)
    fistp   D [_slDDL2oDV]
    fistp   D [_slDDL2oDU]
  }
 #elif (defined __GNU_INLINE__)
    STUBBED("inline asm.");
 #else
   #error Please write inline assembly for your platform.
 #endif
 #else
  fDDL2oDU     *= 2;
  fDDL2oDV     *= 2;
@ -1321,8 +1283,7 @@ void CLayerMixer::AddOneLayerGradient( CGradientParameters &gp)
  _pulLayer  = lm_pulShadowMap;
  FLOAT fStart = Clamp( fGr00-(fDGroDJ+fDGroDI)*0.5f, 0.0f, 1.0f);
-#if ((ASMOPT == 1) && (!defined __GNU_INLINE__))
+#if (defined __MSVC_INLINE__)
 #if (defined __MSVC_INLINE__)
  __int64 mmRowAdv;
  SLONG fixGRow  = (fGr00-(fDGroDJ+fDGroDI)*0.5f)*32767.0f; // 16:15
  SLONG slModulo = (lm_pixCanvasSizeU-lm_pixPolygonSizeU) *BYTES_PER_TEXEL;
@ -1436,14 +1397,6 @@ rowNext:
 rowDone:
    emms
  }
 #elif (defined __GNU_INLINE__)
    STUBBED("WRITE ME. Argh.");
 #else
  #error Need inline assembly for your platform.
 #endif
 #else
  // well, make gradient ...
  SLONG slR0=0,slG0=0,slB0=0;
@ -1528,9 +1481,8 @@ rowDone:
 // apply directional light or ambient to layer
 void CLayerMixer::AddDirectional(void)
 {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
  ULONG ulLight = ByteSwap( lm_colLight);
 #if (defined __MSVC_INLINE__)
  __asm {
    // prepare pointers and variables
    mov     edi,D [_pulLayer]
@ -1565,7 +1517,8 @@ rowNext:
    emms
  }
- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG ulLight = ByteSwap( lm_colLight);
  ULONG tmp;
  __asm__ __volatile__ (
    // prepare pointers and variables
@ -1608,10 +1561,6 @@ rowNext:
        : FPU_REGS, "mm5", "mm6", "ecx", "edi", "cc", "memory"
  );
 #else
   #error Write inline assembly for your platform.
 #endif
 #else
  UBYTE* pubLayer = (UBYTE*)_pulLayer;
  // for each pixel in the shadow map
@ -1631,9 +1580,8 @@ rowNext:
 // apply directional light thru mask to layer
 void CLayerMixer::AddMaskDirectional( UBYTE *pubMask, UBYTE ubMask)
 {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
  ULONG ulLight = ByteSwap( lm_colLight);
 #if (defined __MSVC_INLINE__)
  // prepare some local variables
  __asm {
    // prepare pointers and variables
@ -1665,7 +1613,8 @@ skipLight:
    emms
  }
- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG ulLight = ByteSwap( lm_colLight);
  ULONG tmp;
  __asm__ __volatile__ (
    // prepare pointers and variables
@ -1706,10 +1655,6 @@ skipLight:
          "cc", "memory"
  );
 #else
  #error Please write inline assembly for your platform.
 #endif
 #else
  UBYTE* pubLayer = (UBYTE*)_pulLayer;
  // for each pixel in the shadow map
@ -1832,7 +1777,33 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap)
    }
  } // set initial color
- #if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
  __asm {
    cld
    mov     ebx,D [this]
    mov     ecx,D [ebx].lm_pixCanvasSizeU
    imul    ecx,D [ebx].lm_pixCanvasSizeV
    mov     edi,D [ebx].lm_pulShadowMap
    mov     eax,D [colAmbient]
    bswap   eax
    rep     stosd
  }
 #elif (defined __GNU_INLINE_X86_32__)
  ULONG clob1, clob2, clob3;
  __asm__ __volatile__ (
    "cld                    \n\t"
    "imull   %%esi, %%ecx   \n\t"
    "bswapl  %%eax          \n\t"
    "rep                    \n\t"
    "stosl                  \n\t"
        : "=a" (clob1), "=c" (clob2), "=D" (clob3)
        : "c" (this->lm_pixCanvasSizeU), "S" (this->lm_pixCanvasSizeV),
          "a" (colAmbient), "D" (this->lm_pulShadowMap)
        : "cc", "memory"
  );
 #else
  register ULONG count = this->lm_pixCanvasSizeU * this->lm_pixCanvasSizeV;
  #if PLATFORM_LITTLEENDIAN
  // Forces C fallback; BYTESWAP itself is a no-op on little endian.
@ -1850,35 +1821,7 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap)
    ptr++;
  }
- #elif (defined __MSVC_INLINE__)
+#endif
  __asm {
    cld
    mov     ebx,D [this]
    mov     ecx,D [ebx].lm_pixCanvasSizeU
    imul    ecx,D [ebx].lm_pixCanvasSizeV
    mov     edi,D [ebx].lm_pulShadowMap
    mov     eax,D [colAmbient]
    bswap   eax
    rep     stosd
  }
 #elif (defined __GNU_INLINE__)
  ULONG clob1, clob2, clob3;
  __asm__ __volatile__ (
    "cld                    \n\t"
    "imull   %%esi, %%ecx   \n\t"
    "bswapl  %%eax          \n\t"
    "rep                    \n\t"
    "stosl                  \n\t"
        : "=a" (clob1), "=c" (clob2), "=D" (clob3)
        : "c" (this->lm_pixCanvasSizeU), "S" (this->lm_pixCanvasSizeV),
          "a" (colAmbient), "D" (this->lm_pulShadowMap)
        : "cc", "memory"
  );
 #else
  #error Please write inline assembly for your platform.
 #endif
  _pfWorldEditingProfile.StopTimer(CWorldEditingProfile::PTI_AMBIENTFILL);
@ -1955,9 +1898,7 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap)
 // copy from static shadow map to dynamic layer
 __forceinline void CLayerMixer::CopyShadowLayer(void)
 {
- #if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
   memcpy(lm_pulShadowMap, lm_pulStaticShadowMap, lm_pixCanvasSizeU*lm_pixCanvasSizeV*4);
 #elif (defined __MSVC_INLINE__)
  __asm {
    cld
    mov     ebx,D [this]
@ -1967,7 +1908,7 @@ __forceinline void CLayerMixer::CopyShadowLayer(void)
    mov     edi,D [ebx].lm_pulShadowMap
    rep     movsd
  }
- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG clob1, clob2, clob3;
  __asm__ __volatile__ (
    "cld                    \n\t"
@ -1980,21 +1921,16 @@ __forceinline void CLayerMixer::CopyShadowLayer(void)
        : "cc", "memory"
  );
- #else
+#else
-  #error Please write inline assembly for your platform.
+  memcpy(lm_pulShadowMap, lm_pulStaticShadowMap, lm_pixCanvasSizeU*lm_pixCanvasSizeV*4);
- #endif
+#endif
 }
 // copy from static shadow map to dynamic layer
 __forceinline void CLayerMixer::FillShadowLayer( COLOR col)
 {
- #if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
   DWORD* dst = (DWORD*)lm_pulShadowMap;
   int n = lm_pixCanvasSizeU*lm_pixCanvasSizeV;   
   DWORD color = __builtin_bswap32(col);
   while(n--) {*(dst++)=color;}
 #elif (defined __MSVC_INLINE__)
  __asm {
    cld
    mov     ebx,D [this]
@ -2006,7 +1942,7 @@ __forceinline void CLayerMixer::FillShadowLayer( COLOR col)
    rep     stosd
  }
- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG clob1, clob2, clob3;
  __asm__ __volatile__ (
    "cld                    \n\t"
@ -2020,9 +1956,12 @@ __forceinline void CLayerMixer::FillShadowLayer( COLOR col)
        : "cc", "memory"
  );
- #else
+#else
-  #error Please write inline assembly for your platform.
+   DWORD* dst = (DWORD*)lm_pulShadowMap;
- #endif
+   int n = lm_pixCanvasSizeU*lm_pixCanvasSizeV;   
   DWORD color = __builtin_bswap32(col);
   while(n--) {*(dst++)=color;}
 #endif
 }
--- a/Sources/Engine/Math/Float.cpp
+++ b/Sources/Engine/Math/Float.cpp
@ -24,20 +24,11 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #define _PC_64    0x0300
 // !!! FIXME: I'd like to remove any dependency on the FPU control word from the game, asap.  --ryan.
-#ifdef USE_PORTABLE_C
+#if (defined _MSC_VER)
 // Fake control87 for USE_PORTABLE_C version
 inline ULONG _control87(WORD newcw, WORD mask)
 {
    static WORD fpw=_PC_64;
    if (mask != 0)
    {
        fpw &= ~mask;
        fpw |= (newcw & mask);
    }
    return(fpw);
 }
-#elif (defined __GNU_INLINE__)
+// _control87 is provided by the compiler
 #elif (defined __GNU_INLINE_X86_32__)
 inline ULONG _control87(WORD newcw, WORD mask)
 {
@ -74,8 +65,20 @@ inline ULONG _control87(WORD newcw, WORD mask)
    return(fpw);
 }
-#elif (!defined _MSC_VER)
+#else
-#error Implement for your platform, or add a stub conditional here.
+
 // Fake control87 for USE_PORTABLE_C version
 inline ULONG _control87(WORD newcw, WORD mask)
 {
    static WORD fpw=_PC_64;
    if (mask != 0)
    {
        fpw &= ~mask;
        fpw |= (newcw & mask);
    }
    return(fpw);
 }
 #endif
 /* Get current precision setting of FPU. */
--- a/Sources/Engine/Math/Functions.h
+++ b/Sources/Engine/Math/Functions.h
@ -312,12 +312,7 @@ inline FLOAT NormByteToFloat( const ULONG ul)
 // fast float to int conversion
 inline SLONG FloatToInt( FLOAT f)
 {
-#if defined(__arm__) || defined(USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
  // round to nearest by adding/subtracting 0.5 (depending on f pos/neg) before converting to SLONG
  float addToRound = copysignf(0.5f, f); // copy f's signbit to 0.5 => if f<0 then addToRound = -0.5, else 0.5
  return((SLONG) (f + addToRound));
 #elif (defined __MSVC_INLINE__)
  SLONG slRet;
  __asm {
    fld    D [f]
@ -325,7 +320,7 @@ inline SLONG FloatToInt( FLOAT f)
  }
  return slRet;
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  SLONG slRet;
  __asm__ __volatile__ (
    "flds     (%%eax)   \n\t"
@ -336,16 +331,16 @@ inline SLONG FloatToInt( FLOAT f)
  );
  return(slRet);
 #else
-  #error Fill this in for your platform.
+  // round to nearest by adding/subtracting 0.5 (depending on f pos/neg) before converting to SLONG
  float addToRound = copysignf(0.5f, f); // copy f's signbit to 0.5 => if f<0 then addToRound = -0.5, else 0.5
  return((SLONG) (f + addToRound));
 #endif
 }
 // log base 2 of any float numero
 inline FLOAT Log2( FLOAT f) {
-#if (defined USE_PORTABLE_C) || defined(__arm__)
+#if (defined __MSVC_INLINE__)
  return log2f(f);
 #elif (defined __MSVC_INLINE__)
  FLOAT fRet;
  _asm {
    fld1
@ -355,7 +350,7 @@ inline FLOAT Log2( FLOAT f) {
  }
  return fRet;
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  FLOAT fRet;
  __asm__ __volatile__ (
    "fld1               \n\t"
@ -368,7 +363,8 @@ inline FLOAT Log2( FLOAT f) {
  );
  return(fRet);
 #else
-  #error Fill this in for your platform.
+  return log2f(f);
 #endif
 }
@ -376,8 +372,24 @@ inline FLOAT Log2( FLOAT f) {
 // returns accurate values only for integers that are power of 2
 inline SLONG FastLog2( SLONG x)
 {
-#if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
-#ifdef __GNUC__
+  SLONG slRet;
  __asm {
    bsr   eax,D [x]
    mov   D [slRet],eax
  }
  return slRet;
 #elif (defined __GNU_INLINE_X86_32__)
  SLONG slRet;
  __asm__ __volatile__ (
    "bsrl   %%ecx, %%eax      \n\t"
        : "=a" (slRet)
        : "c" (x)
        : "memory"
  );
  return(slRet);
 #elif (defined __GNUC__)
  if(x == 0) return 0; // __builtin_clz() is undefined for 0
  int numLeadingZeros  = __builtin_clz(x);
  return 31 - numLeadingZeros;
@ -393,38 +405,13 @@ inline SLONG FastLog2( SLONG x)
  return 0;
 #endif
 #elif (defined __MSVC_INLINE__)
  SLONG slRet;
  __asm {
    bsr   eax,D [x]
    mov   D [slRet],eax
  }
  return slRet;
 #elif (defined __GNU_INLINE__)
  SLONG slRet;
  __asm__ __volatile__ (
    "bsrl   %%ecx, %%eax      \n\t"
        : "=a" (slRet)
        : "c" (x)
        : "memory"
  );
  return(slRet);
 #else
  #error Fill this in for your platform.
 #endif
 }
 /* DG: function is unused => doesn't matter that portable implementation is not optimal :)
 // returns log2 of first larger value that is a power of 2
 inline SLONG FastMaxLog2( SLONG x)
 { 
-#if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
 printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__);
  return((SLONG) log2((double) x));
 #elif (defined __MSVC_INLINE__)
  SLONG slRet;
  __asm {
    bsr   eax,D [x]
@ -435,7 +422,7 @@ printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__);
  }
  return slRet;
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  SLONG slRet;
  __asm__ __volatile__ (
    "bsrl  %%ecx, %%eax     \n\t"
@ -448,7 +435,9 @@ printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__);
  );
  return(slRet);
 #else
-  #error Fill this in for your platform.
+printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__);
  return((SLONG) log2((double) x));
 #endif
 }
 */
--- a/Sources/Engine/Models/RenderModel_View.cpp
+++ b/Sources/Engine/Models/RenderModel_View.cpp
@ -40,14 +40,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #define W  word ptr
 #define B  byte ptr
 #if (defined __MSVC_INLINE__)
 #define ASMOPT 1
 #elif (defined __GNU_INLINE__)
 #define ASMOPT 0  // !!! FIXME: rcg10112001 Write GCC inline asm versions...
 #else
 #define ASMOPT 0
 #endif
 extern BOOL CVA_bModels;
 extern BOOL GFX_bTruform;
@ -663,7 +655,7 @@ static FLOAT   _fHazeAdd;
 // check vertex against fog
 static void GetFogMapInVertex( GFXVertex3 &vtx, GFXTexCoord &tex)
 {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
  __asm {
    mov     esi,D [vtx]
    mov     edi,D [tex]
@ -708,7 +700,7 @@ static void GetFogMapInVertex( GFXVertex3 &vtx, GFXTexCoord &tex)
 // check vertex against haze
 static void GetHazeMapInVertex( GFXVertex3 &vtx, FLOAT &tx1)
 {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
  __asm {
    mov     esi,D [vtx]
    mov     edi,D [tx1]
@ -1080,7 +1072,7 @@ static void UnpackFrame( CRenderModel &rm, BOOL bKeepNormals)
    const ModelFrameVertex16 *pFrame1 = rm.rm_pFrame16_1;
    if( pFrame0==pFrame1)
    {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
      // for each vertex in mip
      const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
      SLONG slTmp1, slTmp2, slTmp3;
@ -1196,7 +1188,7 @@ vtxNext16:
    // if lerping
    else
    {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
      // for each vertex in mip
      const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
      SLONG slTmp1, slTmp2, slTmp3;
@ -1365,7 +1357,7 @@ vtxNext16L:
    // if no lerping
    if( pFrame0==pFrame1)
    {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
      // for each vertex in mip
      const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
      SLONG slTmp1, slTmp2, slTmp3;
@ -1464,7 +1456,7 @@ vtxNext8:
    // if lerping
    else
    {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
      const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
      SLONG slTmp1, slTmp2, slTmp3;
      // re-adjust stretching factors because of fixint lerping (divide by 256)
@ -1610,7 +1602,7 @@ vtxNext8L:
  }
  // generate colors from shades
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
  __asm {
    pxor    mm0,mm0
    // construct 64-bit RGBA light
@ -1974,7 +1966,7 @@ void CModelObject::RenderModel_View( CRenderModel &rm)
    pvtxSrfBase = &_avtxSrfBase[iSrfVx0];
    INDEX iSrfVx;
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
    __asm {
      push    ebx
      mov     ebx,D [puwSrfToMip]
@ -2074,7 +2066,7 @@ srfVtxLoop:
    const COLOR colD = AdjustColor( ms.ms_colDiffuse, _slTexHueShift, _slTexSaturation);
    colSrfDiff.MultiplyRGBA( colD, colMdlDiff);
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
    // setup texcoord array
    __asm {
      push    ebx
@ -2134,7 +2126,7 @@ vtxEnd:
      for( INDEX iSrfVx=0; iSrfVx<ctSrfVx; iSrfVx++) pcolSrfBase[iSrfVx] = colSrfDiffAdj;
    }
    else {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
      // setup color array
      const COLOR colS = colSrfDiff.ul.abgr;
      __asm {
@ -2335,7 +2327,7 @@ diffColLoop:
    // cache rotation
    const FLOATmatrix3D &m = rm.rm_mObjectRotation;
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
    __asm {
      push    ebx
      mov     ebx,D [m]
@ -2530,7 +2522,7 @@ reflMipLoop:
    // cache object view rotation
    const FLOATmatrix3D &m = rm.rm_mObjectToView;
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
    __asm {
      push    ebx
      mov     ebx,D [m]
--- a/Sources/Engine/Rendering/RendMisc.cpp
+++ b/Sources/Engine/Rendering/RendMisc.cpp
@ -105,10 +105,7 @@ static SLONG slTmp;
 static inline PIX PIXCoord(FLOAT f) // (f+0.9999f) or (ceil(f))
 {
- #if (defined USE_PORTABLE_C)
+ #if (defined __MSVC_INLINE__)
  return((PIX) (f+0.9999f));
 #elif (defined __MSVC_INLINE__)
  PIX pixRet;
  __asm {
    fld     dword ptr [f]
@ -123,7 +120,7 @@ static inline PIX PIXCoord(FLOAT f) // (f+0.9999f) or (ceil(f))
  }
  return pixRet;
- #elif (defined __GNU_INLINE__)
+ #elif (defined __GNU_INLINE_X86_32__)
  PIX pixRet;
  SLONG clobber;
  __asm__ __volatile__ (
@ -142,7 +139,8 @@ static inline PIX PIXCoord(FLOAT f) // (f+0.9999f) or (ceil(f))
  return pixRet;
 #else
-  #error Please write inline ASM for your platform.
+  return((PIX) (f+0.9999f));
 #endif
 }
--- a/Sources/Engine/Sound/SoundMixer.cpp
+++ b/Sources/Engine/Sound/SoundMixer.cpp
@ -43,17 +43,15 @@ static CSoundData *psd;
 // nasm on MacOS X is getting wrong addresses of external globals, so I have
 //  to define them in the .asm file...lame.
-#ifdef __GNU_INLINE__
+#if (defined __GNU_INLINE_X86_32__) && (defined USE_I386_NASM_ASM)
 #ifdef USE_PORTABLE_C
 #define INASM 
 #else
 #define INASM extern
-#endif
+#elif (defined __MSVC_INLINE__)
 #else
 #define INASM static
 static __int64 mmInvFactor   = 0x00007FFF00007FFF;
 static FLOAT f65536 = 65536.0f;
 static FLOAT f4G    = 4294967296.0f;
 #else
 #define INASM static
 #endif
 INASM SLONG slMixerBufferSize;        // size in samples per channel of the destination buffers
@ -81,11 +79,7 @@ void ResetMixer( const SLONG *pslBuffer, const SLONG slBufferSize)
  slMixerBufferSampleRate = _pSound->sl_SwfeFormat.nSamplesPerSec;
  // wipe destination mixer buffer
-  // (Mac OS X uses this path because Apple's memset() is customized for each CPU they support and way faster than this inline asm. --ryan.)
+  #if (defined __MSVC_INLINE__)
  #if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX))
  memset(pvMixerBuffer, 0, slMixerBufferSize * 8);
  #elif (defined __MSVC_INLINE__)
  __asm {
    cld
    xor     eax,eax
@ -94,19 +88,8 @@ void ResetMixer( const SLONG *pslBuffer, const SLONG slBufferSize)
    shl     ecx,1 // *2 because of 32-bit src format
    rep     stosd
  }
  #elif (defined __GNU_INLINE__)
  // !!! FIXME : rcg12172001 Is this REALLY any faster than memset()?
  ULONG clob1, clob2;
  __asm__ __volatile__ (
    "cld                  \n\t"
    "rep                  \n\t"
    "stosl                \n\t"
        : "=D" (clob1), "=c" (clob2)
        : "a" (0), "D" (pvMixerBuffer), "c" (slMixerBufferSize*2)
        : "cc", "memory"
  );
  #else
-    #error please write inline asm for your platform.
+  memset(pvMixerBuffer, 0, slMixerBufferSize * 8);
  #endif
 }
@ -118,10 +101,7 @@ void CopyMixerBuffer_stereo( const SLONG slSrcOffset, void *pDstBuffer, const SL
  ASSERT( slBytes%4==0);
  if( slBytes<4) return;
-  #if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX))
+  #if (defined __MSVC_INLINE__)
  // (Mac OS X uses this path because Apple's memset() is customized for each CPU they support and way faster than this inline asm. --ryan.)
  memcpy(pDstBuffer, ((const char *)pvMixerBuffer) + slSrcOffset, slBytes);
  #elif (defined __MSVC_INLINE__)
  __asm {
    cld
    mov     esi,D [slSrcOffset]
@ -131,21 +111,8 @@ void CopyMixerBuffer_stereo( const SLONG slSrcOffset, void *pDstBuffer, const SL
    shr     ecx,2   // bytes to samples per channel
    rep     movsd
  }
  #elif (defined __GNU_INLINE__)
  // !!! FIXME : rcg12172001 Is this REALLY any faster than memcpy()?
  ULONG clob1, clob2, clob3;
  __asm__ __volatile__ (
    "cld                 \n\t"
    "rep                 \n\t"
    "movsl               \n\t"
      : "=S" (clob1), "=D" (clob2), "=c" (clob3)
      : "S" (((char *)pvMixerBuffer) + slSrcOffset),
        "D" (pDstBuffer),
        "c" (slBytes >> 2)
      : "cc", "memory"
  );
  #else
-  #error please write inline asm for your platform.
+  memcpy(pDstBuffer, ((const char *)pvMixerBuffer) + slSrcOffset, slBytes);
  #endif
 }
@ -157,18 +124,7 @@ void CopyMixerBuffer_mono( const SLONG slSrcOffset, void *pDstBuffer, const SLON
  ASSERT( slBytes%2==0);
  if( slBytes<4) return;
-  #if (defined USE_PORTABLE_C)
+  #if (defined __MSVC_INLINE__)
  // (This is untested, currently. --ryan.)
  WORD *dest = (WORD *) pDstBuffer;
  WORD *src = (WORD *) ( ((char *) pvMixerBuffer) + slSrcOffset );
  SLONG max = slBytes / 4;
  for (SLONG i = 0; i < max; i++) {
      *dest = *src;
      dest++;    // move 16 bits.
      src+=2;    // move 32 bits.
  }
  #elif (defined __MSVC_INLINE__)
  __asm {
    mov     esi,D [slSrcOffset]
    add     esi,D [pvMixerBuffer]
@ -184,7 +140,7 @@ copyLoop:
    jnz     copyLoop
  }
-  #elif (defined __GNU_INLINE__)
+  #elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "movl     %[pvMixerBuffer], %%esi         \n\t"
    "movl     %[pDstBuffer], %%edi            \n\t"
@ -204,7 +160,15 @@ copyLoop:
  );
  #else
-  #error please write inline asm for your platform.
+  // (This is untested, currently. --ryan.)
  WORD *dest = (WORD *) pDstBuffer;
  WORD *src = (WORD *) ( ((char *) pvMixerBuffer) + slSrcOffset );
  SLONG max = slBytes / 4;
  for (SLONG i = 0; i < max; i++) {
      *dest = *src;
      dest++;    // move 16 bits.
      src+=2;    // move 32 bits.
  }
  #endif
 }
@ -215,24 +179,7 @@ static void ConvertMixerBuffer( const SLONG slBytes)
  ASSERT( slBytes%4==0);
  if( slBytes<4) return;
-  #if (defined USE_PORTABLE_C)
+  #if (defined __MSVC_INLINE__)
  //STUBBED("ConvertMixerBuffer");
  SWORD *dest = (SWORD *) pvMixerBuffer;
  SLONG *src = (SLONG *) pvMixerBuffer;
  SLONG max = slBytes / 2;
  int tmp;
  for (SLONG i = 0; i < max; i++) {
      tmp = *src;
      if (tmp>32767) tmp=32767;
      if (tmp<-32767) tmp=-32767;
      *dest=tmp;
      dest++;    // move 16 bits.
      src++;     // move 32 bits.
  }
  #elif (defined __MSVC_INLINE__)
  __asm {
    cld
    mov     esi,D [pvMixerBuffer]
@ -250,7 +197,7 @@ copyLoop:
    emms
  }
-  #elif (defined __GNU_INLINE__)
+  #elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "movl     %[pvMixerBuffer], %%esi      \n\t"
    "movl     %[pvMixerBuffer], %%edi      \n\t"
@ -271,7 +218,20 @@ copyLoop:
  );
  #else
-  #error please write inline asm for your platform.
+
  SWORD *dest = (SWORD *) pvMixerBuffer;
  SLONG *src = (SLONG *) pvMixerBuffer;
  SLONG max = slBytes / 2;
  int tmp;
  for (SLONG i = 0; i < max; i++) {
      tmp = *src;
      if (tmp>32767) tmp=32767;
      if (tmp<-32767) tmp=-32767;
      *dest=tmp;
      dest++;    // move 16 bits.
      src++;     // move 32 bits.
  }
  #endif
 }
@ -323,7 +283,7 @@ void NormalizeMixerBuffer( const FLOAT fNormStrength, const SLONG slBytes, FLOAT
 }
-#ifdef __GNU_INLINE__
+#if (defined __GNU_INLINE_X86_32__) && (defined USE_I386_NASM_ASM)
 // These are implemented in an external NASM file.
 extern "C" {
    void MixStereo_asm(CSoundObject *pso);
@ -337,85 +297,7 @@ inline void MixMono( CSoundObject *pso)
 {
  _pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER);
- #if (defined USE_PORTABLE_C)
+ #if (defined __MSVC_INLINE__)
  // initialize some local vars
  SLONG slLeftSample, slRightSample, slNextSample;
  SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer;
  fixLeftOfs   = (__int64)(fLeftOfs   * 65536.0);
  fixRightOfs  = (__int64)(fRightOfs  * 65536.0);
  __int64 fixLeftStep  = (__int64)(fLeftStep  * 65536.0);
  __int64 fixRightStep = (__int64)(fRightStep * 65536.0);
  __int64 fixSoundBufferSize = ((__int64)slSoundBufferSize)<<16;
  mmSurroundFactor = (__int64)(SWORD)mmSurroundFactor;
  SLONG slLeftVolume_ = slLeftVolume >> 16;
  SLONG slRightVolume_ = slRightVolume >> 16;
  // loop thru source buffer
  INDEX iCt = slMixerBufferSize;
  FOREVER
  {
    // if left channel source sample came to end of sample buffer
    if( fixLeftOfs >= fixSoundBufferSize) {
      fixLeftOfs -= fixSoundBufferSize;
      // if has no loop, end it
      bEndOfSound = bNotLoop;
    }
    // if right channel source sample came to end of sample buffer
    if( fixRightOfs >= fixSoundBufferSize) {
      fixRightOfs -= fixSoundBufferSize;
      // if has no loop, end it
      bEndOfSound = bNotLoop;
    }
    // end of buffer?
    if( iCt<=0 || bEndOfSound) break;
    // fetch one lineary interpolated sample on left channel
    slLeftSample = pswSrcBuffer[(fixLeftOfs>>16)+0];
    slNextSample = pswSrcBuffer[(fixLeftOfs>>16)+1];
    slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16;
    // fetch one lineary interpolated sample on right channel
    slRightSample = pswSrcBuffer[(fixRightOfs>>16)+0];
    slNextSample  = pswSrcBuffer[(fixRightOfs>>16)+1];
    slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16;
    // filter samples
    slLastLeftSample  += ((slLeftSample -slLastLeftSample) *slLeftFilter) >>15;
    slLastRightSample += ((slRightSample-slLastRightSample)*slRightFilter)>>15;
    // apply stereo volume to current sample
    slLeftSample  = (slLastLeftSample  * slLeftVolume_) >>15;
    slRightSample = (slLastRightSample * slRightVolume_)>>15;
    slLeftSample  ^= (SLONG)((mmSurroundFactor>> 0)&0xFFFFFFFF);
    slRightSample ^= (SLONG)((mmSurroundFactor>>32)&0xFFFFFFFF);
    // mix in current sample
    slLeftSample  += pslDstBuffer[0];
    slRightSample += pslDstBuffer[1];
    // upper clamp
    if( slLeftSample  > MAX_SWORD) slLeftSample  = MAX_SWORD;
    if( slRightSample > MAX_SWORD) slRightSample = MAX_SWORD;
    // lower clamp
    if( slLeftSample  < MIN_SWORD) slLeftSample  = MIN_SWORD;
    if( slRightSample < MIN_SWORD) slRightSample = MIN_SWORD;
    // store samples (both channels)
    pslDstBuffer[0] = slLeftSample;
    pslDstBuffer[1] = slRightSample;
    // modify volume  `
    slLeftVolume  += (SWORD)((mmVolumeGain>> 0)&0xFFFF);
    slRightVolume += (SWORD)((mmVolumeGain>>16)&0xFFFF);
    // advance to next sample
    fixLeftOfs   += fixLeftStep;
    fixRightOfs  += fixRightStep;
    pslDstBuffer += 2;
    iCt--;
  }
 #elif (defined __MSVC_INLINE__)
  __asm {
    // convert from floats to fixints 32:16
    fld     D [fLeftOfs]
@ -548,24 +430,11 @@ loopEnd:
    emms
  }
- #elif (defined __GNU_INLINE__)
+ #elif (defined __GNU_INLINE_X86_32__) && (defined USE_I386_NASM_ASM)
   // This is implemented in an external NASM file.
   MixMono_asm(pso);
 #else
   #error please write inline asm for your platform.
 #endif
  _pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER);
 }
 // mixes one stereo 16-bit signed sound to destination buffer
 inline void MixStereo( CSoundObject *pso)
 {
  _pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER);
 #if (defined USE_PORTABLE_C)
  // initialize some local vars
  SLONG slLeftSample, slRightSample, slNextSample;
  SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer;
@ -599,12 +468,12 @@ inline void MixStereo( CSoundObject *pso)
    if( iCt<=0 || bEndOfSound) break;
    // fetch one lineary interpolated sample on left channel
-    slLeftSample = pswSrcBuffer[(fixLeftOfs>>15)+0];
+    slLeftSample = pswSrcBuffer[(fixLeftOfs>>16)+0];
-    slNextSample = pswSrcBuffer[(fixLeftOfs>>15)+2];
+    slNextSample = pswSrcBuffer[(fixLeftOfs>>16)+1];
    slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16;
    // fetch one lineary interpolated sample on right channel
-    slRightSample = pswSrcBuffer[(fixRightOfs>>15)+0];
+    slRightSample = pswSrcBuffer[(fixRightOfs>>16)+0];
-    slNextSample  = pswSrcBuffer[(fixRightOfs>>15)+2];
+    slNextSample  = pswSrcBuffer[(fixRightOfs>>16)+1];
    slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16;
    // filter samples
@ -643,7 +512,18 @@ inline void MixStereo( CSoundObject *pso)
    iCt--;
  }
- #elif (defined __MSVC_INLINE__)
+ #endif
  _pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER);
 }
 // mixes one stereo 16-bit signed sound to destination buffer
 inline void MixStereo( CSoundObject *pso)
 {
  _pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER);
 #if (defined __MSVC_INLINE__)
  __asm {
    // convert from floats to fixints 32:16
    fld     D [fLeftOfs]
@ -778,12 +658,88 @@ loopEnd:
    emms
  }
- #elif (defined __GNU_INLINE__)
+ #elif (defined __GNU_INLINE_X86_32__) && (defined USE_I386_NASM_ASM)
   // This is implemented in an external NASM file.
   MixStereo_asm(pso);
 #else
-   #error please write inline asm for your platform.
+  // initialize some local vars
  SLONG slLeftSample, slRightSample, slNextSample;
  SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer;
  fixLeftOfs   = (__int64)(fLeftOfs   * 65536.0);
  fixRightOfs  = (__int64)(fRightOfs  * 65536.0);
  __int64 fixLeftStep  = (__int64)(fLeftStep  * 65536.0);
  __int64 fixRightStep = (__int64)(fRightStep * 65536.0);
  __int64 fixSoundBufferSize = ((__int64)slSoundBufferSize)<<16;
  mmSurroundFactor = (__int64)(SWORD)mmSurroundFactor;
  SLONG slLeftVolume_ = slLeftVolume >> 16;
  SLONG slRightVolume_ = slRightVolume >> 16;
  // loop thru source buffer
  INDEX iCt = slMixerBufferSize;
  FOREVER
  {
    // if left channel source sample came to end of sample buffer
    if( fixLeftOfs >= fixSoundBufferSize) {
      fixLeftOfs -= fixSoundBufferSize;
      // if has no loop, end it
      bEndOfSound = bNotLoop;
    }
    // if right channel source sample came to end of sample buffer
    if( fixRightOfs >= fixSoundBufferSize) {
      fixRightOfs -= fixSoundBufferSize;
      // if has no loop, end it
      bEndOfSound = bNotLoop;
    }
    // end of buffer?
    if( iCt<=0 || bEndOfSound) break;
    // fetch one lineary interpolated sample on left channel
    slLeftSample = pswSrcBuffer[(fixLeftOfs>>15)+0];
    slNextSample = pswSrcBuffer[(fixLeftOfs>>15)+2];
    slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16;
    // fetch one lineary interpolated sample on right channel
    slRightSample = pswSrcBuffer[(fixRightOfs>>15)+0];
    slNextSample  = pswSrcBuffer[(fixRightOfs>>15)+2];
    slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16;
    // filter samples
    slLastLeftSample  += ((slLeftSample -slLastLeftSample) *slLeftFilter) >>15;
    slLastRightSample += ((slRightSample-slLastRightSample)*slRightFilter)>>15;
    // apply stereo volume to current sample
    slLeftSample  = (slLastLeftSample  * slLeftVolume_) >>15;
    slRightSample = (slLastRightSample * slRightVolume_)>>15;
    slLeftSample  ^= (SLONG)((mmSurroundFactor>> 0)&0xFFFFFFFF);
    slRightSample ^= (SLONG)((mmSurroundFactor>>32)&0xFFFFFFFF);
    // mix in current sample
    slLeftSample  += pslDstBuffer[0];
    slRightSample += pslDstBuffer[1];
    // upper clamp
    if( slLeftSample  > MAX_SWORD) slLeftSample  = MAX_SWORD;
    if( slRightSample > MAX_SWORD) slRightSample = MAX_SWORD;
    // lower clamp
    if( slLeftSample  < MIN_SWORD) slLeftSample  = MIN_SWORD;
    if( slRightSample < MIN_SWORD) slRightSample = MIN_SWORD;
    // store samples (both channels)
    pslDstBuffer[0] = slLeftSample;
    pslDstBuffer[1] = slRightSample;
    // modify volume  `
    slLeftVolume  += (SWORD)((mmVolumeGain>> 0)&0xFFFF);
    slRightVolume += (SWORD)((mmVolumeGain>>16)&0xFFFF);
    // advance to next sample
    fixLeftOfs   += fixLeftStep;
    fixRightOfs  += fixRightStep;
    pslDstBuffer += 2;
    iCt--;
  }
 #endif
  _pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER);
--- a/Sources/build-linux32.sh
+++ b/Sources/build-linux32.sh
@ -14,10 +14,10 @@ cd $_
 #ninja
 # This is the eventual path for amd64.
-#cmake -DCMAKE_BUILD_TYPE=Debug -DUSE_I386_ASM=FALSE ..
+#cmake -DCMAKE_BUILD_TYPE=Debug ..
 # Right now we force x86, though...
-cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS=-m32 -DCMAKE_CXX_FLAGS=-m32 -DUSE_I386_ASM=TRUE ..
+cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS=-m32 -DCMAKE_CXX_FLAGS=-m32 -DUSE_I386_NASM_ASM=TRUE ..
 make -j$NCPU
--- a/Sources/build-linux64.sh
+++ b/Sources/build-linux64.sh
@ -14,7 +14,7 @@ cd $_
 #ninja
 # This is the eventual path for amd64.
-cmake -DCMAKE_BUILD_TYPE=Debug -DUSE_I386_ASM=FALSE ..
+cmake -DCMAKE_BUILD_TYPE=Debug ..
 # Right now we force x86, though...
 #cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS=-m32 -DCMAKE_CXX_FLAGS=-m32 ..
--- a/Sources/build-mac.sh
+++ b/Sources/build-mac.sh
@ -9,6 +9,6 @@ set -x
 rm -rf cmake-build
 mkdir $_
 cd $_
-cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_OSX_ARCHITECTURES=i386 -DUSE_I386_ASM=TRUE -DUSE_SYSTEM_SDL2=FALSE ..
+cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_OSX_ARCHITECTURES=i386 -DUSE_I386_NASM_ASM=TRUE -DUSE_SYSTEM_SDL2=FALSE ..
 make -j$NCPU
--- a/Sources/build-mac64.sh
+++ b/Sources/build-mac64.sh
@ -9,6 +9,6 @@ set -x
 rm -rf cmake-build
 mkdir $_
 cd $_
-cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_OSX_ARCHITECTURES=x86_64 -DUSE_I386_ASM=FALSE ..
+cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_OSX_ARCHITECTURES=x86_64 ..
 make -j$NCPU