Merge pull request #47 from notaz/asm_rework

Asm rework
2025-09-18 18:30:05 +02:00 · 2016-05-09 12:03:16 -04:00 · 2016-05-09 12:03:16 -04:00 · a8c6c77309
commit a8c6c77309
parent 056d77f479 ce46bd1e99
24 changed files with 926 additions and 1142 deletions
--- a/Sources/CMakeLists.txt
+++ b/Sources/CMakeLists.txt
@ -188,13 +188,20 @@ else()
    set(DEBUGSUFFIX "")
 endif()

-# This should not be needed anymore, but might be faster on 32bit x86
-option(USE_I386_ASM "Use X86 ASM" FALSE)
+option(USE_ASM "Use ASM code" TRUE)  # OFF => every source is built with USE_PORTABLE_C
+if (NOT USE_ASM)
+    add_definitions(-DUSE_PORTABLE_C=1)  # Types.h skips inline-asm selection when this is set
+    message(STATUS "Using portable C instead of all ASM")
+else()
+    message(STATUS "Using assembler code (when available)")
+endif()

-if (USE_I386_ASM)
+option(USE_I386_NASM_ASM "Use i386 nasm ASM code" FALSE)
+
+if (USE_ASM AND USE_I386_NASM_ASM)
    # You need the Netwide Assembler (NASM) to build this on Intel systems.
    #   http://nasm.sf.net/
-    add_definitions(-DUSE_I386_ASM=1)
+    add_definitions(-DUSE_I386_NASM_ASM=1)
    if (MACOSX)
        set(ASMOBJFMT "macho")
        list(APPEND ASMFLAGS --prefix _)
@ -203,10 +210,9 @@ if (USE_I386_ASM)
    else()
        set(ASMOBJFMT "elf")
    endif()
-    MESSAGE(STATUS "Using i386 assembler")
+    MESSAGE(STATUS "Using i386 nasm ASM")
 else()
-    add_definitions(-DUSE_PORTABLE_C=1)
-    MESSAGE(STATUS "Using portable C instead of ASM")
+    MESSAGE(STATUS "Not using i386 nasm ASM")
 endif()

 option(PANDORA "Compile for Pandora" FALSE)
@ -655,7 +661,7 @@ add_dependencies(${SHADERSLIB} ParseEntities)
 add_parser_and_scanner("Engine/Base/Parser" "Engine/Base/Scanner")
 add_parser_and_scanner("Engine/Ska/smcPars" "Engine/Ska/smcScan")

-if (USE_I386_ASM)
+if (USE_ASM AND USE_I386_NASM_ASM)  # same gate as the -DUSE_I386_NASM_ASM definition above; no NASM objects when USE_ASM is OFF
    add_custom_command(
        OUTPUT "SoundMixer386.o"
        MAIN_DEPENDENCY "${CMAKE_CURRENT_SOURCE_DIR}/Engine/Sound/SoundMixer386.asm"
--- a/Sources/Engine/Base/Base.h
+++ b/Sources/Engine/Base/Base.h
@ -65,8 +65,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #else
  #warning "UNKNOWN PLATFORM IDENTIFIED!!!!"
  #define PLATFORM_UNKNOWN 1
-  #warning "USING PORTABLE C!!!"
-  #define USE_PORTABLE_C
 #endif

 #if PLATFORM_LINUX || PLATFORM_MACOSX
--- a/Sources/Engine/Base/Profiling.cpp
+++ b/Sources/Engine/Base/Profiling.cpp
@ -21,24 +21,13 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 template class CStaticArray<CProfileCounter>;
 template class CStaticArray<CProfileTimer>;

-#if (defined USE_PORTABLE_C)
+#if (defined PLATFORM_UNIX) && !defined(__GNU_INLINE_X86_32__)
 #include <sys/time.h>
 #endif

 static inline __int64 ReadTSC_profile(void)
 {
-#if (defined USE_PORTABLE_C)
-  #ifdef __arm__
-  struct timespec tv;
-  clock_gettime(CLOCK_MONOTONIC, &tv);
-  return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_nsec) / 1000000) );
-  #else
-  struct timeval tv;
-  gettimeofday(&tv, NULL);
-  return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_usec) / 1000) );
-  #endif
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __int64 mmRet;
  __asm {
    rdtsc
@ -47,7 +36,7 @@ static inline __int64 ReadTSC_profile(void)
  }
  return mmRet;

-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __int64 mmRet;
  __asm__ __volatile__ (
    "rdtsc                    \n\t"
@ -60,7 +49,16 @@ static inline __int64 ReadTSC_profile(void)
  return(mmRet);

 #else
-  #error Please implement for your platform/compiler.
+  #ifdef __arm__
+  struct timespec tv;
+  clock_gettime(CLOCK_MONOTONIC, &tv);
+  return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_nsec) / 1000000) );
+  #else
+  struct timeval tv;
+  gettimeofday(&tv, NULL);
+  return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_usec) / 1000) );
+  #endif
+
 #endif
 }

--- a/Sources/Engine/Base/Timer.cpp
+++ b/Sources/Engine/Base/Timer.cpp
@ -29,7 +29,7 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #include <Engine/Base/Priority.inl>

 // !!! FIXME: use SDL timer code instead and rdtsc never?
-#if (USE_PORTABLE_C) 
+#if (defined PLATFORM_UNIX) && !defined(__GNU_INLINE_X86_32__)
 #define USE_GETTIMEOFDAY 1
 #endif

@ -64,7 +64,7 @@ static inline __int64 ReadTSC(void)
  }
  return mmRet;

-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __int64 mmRet;
  __asm__ __volatile__ (
    "rdtsc                    \n\t"
--- a/Sources/Engine/Base/Types.h
+++ b/Sources/Engine/Base/Types.h
@ -109,6 +109,30 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*));
  #define ASMSYM(x) #x
 #endif

+/* Pick one inline-asm flavour; defining USE_PORTABLE_C disables all of them. */
+#ifndef USE_PORTABLE_C
+  #if defined(__MSVC_INLINE__)
+    /* the build system selected __MSVC_INLINE__ */
+  #elif defined(__GNU_INLINE_X86_32__)
+    /* the build system selected __GNU_INLINE_X86_32__ */
+  #elif defined(__GNU_INLINE_X86_64__)
+    /* the build system selected __GNU_INLINE_X86_64__ (do not auto-pick another flavour) */
+  #elif defined(_MSC_VER) && defined(_M_IX86)
+    #define __MSVC_INLINE__
+  #elif defined (__GNUC__) && defined(__i386)
+    #define __GNU_INLINE_X86_32__
+  #elif defined (__GNUC__) && defined(__x86_64__)
+    #define __GNU_INLINE_X86_64__
+  #endif
+  #if defined(__GNU_INLINE_X86_32__) || defined(__GNU_INLINE_X86_64__)  /* umbrella: either GNU x86 flavour */
+    #define __GNU_INLINE_X86__
+  #endif
+  #if defined(__GNU_INLINE_X86__)  /* register lists for the clobber section of GNU asm statements */
+    #define FPU_REGS "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
+    #define MMX_REGS "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"
+  #endif
+#endif
+
 #ifdef PLATFORM_UNIX  /* rcg10042001 */
    #include <stdio.h>
    #include <string.h>
@ -134,25 +158,6 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*));
      #endif
    #endif

-    #if ((defined __GNUC__) && (!defined __GNU_INLINE__))
-      #define __GNU_INLINE__
-    #endif
-
-    #if (defined __INTEL_COMPILER)
-      #if ((!defined __GNU_INLINE__) && (!defined __MSVC_INLINE__))
-        #error Please define __GNU_INLINE__ or __MSVC_INLINE__ with Intel C++.
-      #endif
-
-      #if ((defined __GNU_INLINE__) && (defined __MSVC_INLINE__))
-        #error Define either __GNU_INLINE__ or __MSVC_INLINE__ with Intel C++.
-      #endif
-    #endif
-
-    #if defined(__GNU_INLINE__) && defined(__i386__)
-      #define FPU_REGS "st", "st(1)", "st(2)", "st(3)", "st(4)", "st(5)", "st(6)", "st(7)"
-      #define MMX_REGS "mm0", "mm1", "mm2", "mm3", "mm4", "mm5", "mm6", "mm7"
-    #endif
-
    #ifndef PAGESIZE
      #define PAGESIZE 4096
    #endif
@ -230,10 +235,7 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*));

    inline ULONG _rotl(ULONG ul, int bits)
    {
-        #if (defined USE_PORTABLE_C)
-            // DG: according to http://blog.regehr.org/archives/1063 this is fast
-            return (ul<<bits) | (ul>>(-bits&31));
-        #elif (defined __GNU_INLINE__)
+        #if (defined __GNU_INLINE_X86_32__)
            // This, on the other hand, is wicked fast.  :)
            __asm__ __volatile__ (
                "roll %%cl, %%eax    \n\t"
@ -255,7 +257,8 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*));
            return(ul);

        #else
-            #error need inline asm for your platform.
+            // DG: according to http://blog.regehr.org/archives/1063 this is fast
+            return (ul<<bits) | (ul>>(-bits&31));
        #endif
    }

--- a/Sources/Engine/Engine.cpp
+++ b/Sources/Engine/Engine.cpp
@ -125,14 +125,10 @@ BOOL APIENTRY DllMain( HANDLE hModule, DWORD  ul_reason_for_call, LPVOID lpReser

 static void DetectCPU(void)
 {
-#if (defined USE_PORTABLE_C)  // rcg10072001
-  CPrintF(TRANSV("  (No CPU detection in this binary.)\n"));
-
-#else
-  char strVendor[12+1];
+  char strVendor[12+1] = { 0 };
  strVendor[12] = 0;
-  ULONG ulTFMS;
-  ULONG ulFeatures;
+  ULONG ulTFMS = 0;
+  ULONG ulFeatures = 0;

  #if (defined __MSVC_INLINE__)
  // test MMX presence and update flag
@ -148,42 +144,46 @@ static void DetectCPU(void)
    mov     dword ptr [ulFeatures], edx
  }

-  #elif (defined __GNU_INLINE__)
+  #elif (defined __GNU_INLINE_X86__)
+    ULONG eax, ebx, ecx, edx;
    // test MMX presence and update flag
    __asm__ __volatile__ (
-        "pushl   %%ebx            \n\t"
-        "xorl    %%eax,%%eax      \n\t"  // request for basic id
+    #if (defined __GNU_INLINE_X86_64__)
        "cpuid                    \n\t"
-        "movl    %%ebx,  (%%esi)  \n\t"
-        "movl    %%edx, 4(%%esi)  \n\t"
-        "movl    %%ecx, 8(%%esi)  \n\t"
-        "popl    %%ebx            \n\t"
-            : // no specific outputs.
-            : "S" (strVendor)
-            : "eax", "ecx", "edx", "memory"
+            : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
+    #else
+        "movl    %%ebx, %%esi     \n\t"
+        "cpuid                    \n\t"
+        "xchgl   %%ebx, %%esi     \n\t"
+            : "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx)
+    #endif
+            : "a" (0) // request for basic id
    );
-
-        // need to break this into a separate asm block, since I'm clobbering
-        //  too many registers. There's something to be said for letting MSVC
-        //  figure out where on the stack your locals are resting, but yeah,
-        //  I know, that's x86-specific anyhow...
-        // !!! FIXME: can probably do this right with modern GCC.
+    memcpy(strVendor + 0, &ebx, 4);
+    memcpy(strVendor + 4, &edx, 4);
+    memcpy(strVendor + 8, &ecx, 4);

    __asm__ __volatile__ (
-        "pushl   %%ebx                  \n\t"
-        "movl    $1, %%eax              \n\t"  // request for TFMS feature flags
+    #if (defined __GNU_INLINE_X86_64__)
        "cpuid                    \n\t"
-        "mov     %%eax, (%%esi)         \n\t"  // remember type, family, model and stepping
-        "mov     %%edx, (%%edi)         \n\t"
-        "popl    %%ebx                  \n\t"
-            : // no specific outputs.
-            : "S" (&ulTFMS), "D" (&ulFeatures)
-            : "eax", "ecx", "edx", "memory"
-    );
-
+            : "=a" (eax), "=b" (ebx), "=c" (ecx), "=d" (edx)
    #else
-    #error Please implement for your platform or define USE_PORTABLE_C.
+        "movl    %%ebx, %%esi     \n\t"
+        "cpuid                    \n\t"
+        "xchgl   %%ebx, %%esi     \n\t"
+            : "=a" (eax), "=S" (ebx), "=c" (ecx), "=d" (edx)
    #endif
+            : "a" (1) // request for TFMS feature flags
+    );
+    ulTFMS = eax;
+    ulFeatures = edx;
+
+  #endif
+
+  if (ulTFMS == 0) {
+    CPrintF(TRANSV("  (No CPU detection in this binary.)\n"));
+    return;
+  }

  INDEX iType     = (ulTFMS>>12)&0x3;
  INDEX iFamily   = (ulTFMS>> 8)&0xF;
@ -215,8 +215,6 @@ static void DetectCPU(void)
  sys_iCPUMHz = INDEX(_pTimer->tm_llCPUSpeedHZ/1E6);

  if( !bMMX) FatalError( TRANS("MMX support required but not present!"));
-
-#endif  // defined USE_PORTABLE_C
 }

 static void DetectCPUWrapper(void)
--- a/Sources/Engine/Graphics/Color.cpp
+++ b/Sources/Engine/Graphics/Color.cpp
@ -247,30 +247,7 @@ COLOR MulColors( COLOR col1, COLOR col2)
  if( col2==0xFFFFFFFF)   return col1;
  if( col1==0 || col2==0) return 0;

-#if (defined USE_PORTABLE_C)
-  // !!! FIXME: This...is not fast.
-  union
-  {
-    COLOR col;
-    UBYTE bytes[4];
-  } conv1;
-
-  union
-  {
-    COLOR col;
-    UBYTE bytes[4];
-  } conv2;
-
-  conv1.col = col1;
-  conv2.col = col2;
-  conv1.bytes[0] = (UBYTE) ((((DWORD) conv1.bytes[0]) * ((DWORD) conv2.bytes[0])) / 255);
-  conv1.bytes[1] = (UBYTE) ((((DWORD) conv1.bytes[1]) * ((DWORD) conv2.bytes[1])) / 255);
-  conv1.bytes[2] = (UBYTE) ((((DWORD) conv1.bytes[2]) * ((DWORD) conv2.bytes[2])) / 255);
-  conv1.bytes[3] = (UBYTE) ((((DWORD) conv1.bytes[3]) * ((DWORD) conv2.bytes[3])) / 255);
-
-  return(conv1.col);
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  COLOR colRet;
  __asm {
    xor     ebx,ebx
@ -347,7 +324,7 @@ COLOR MulColors( COLOR col1, COLOR col2)
  }
  return colRet;

-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  COLOR colRet;
  __asm__ __volatile__ (
    "pushl     %%ebx                \n\t"
@ -433,20 +410,6 @@ COLOR MulColors( COLOR col1, COLOR col2)

  return colRet;
 #else
-  #error please fill in inline assembly for your platform.
-#endif
-}
-
-
-// fast color additon function - RES = clamp (1ST + 2ND)
-COLOR AddColors( COLOR col1, COLOR col2) 
-{
-  if( col1==0) return col2;
-  if( col2==0) return col1;
-  if( col1==0xFFFFFFFF || col2==0xFFFFFFFF) return 0xFFFFFFFF;
-  COLOR colRet;
-
-#if (defined USE_PORTABLE_C)
  // !!! FIXME: This...is not fast.
  union
  {
@ -459,19 +422,28 @@ COLOR AddColors( COLOR col1, COLOR col2)
    COLOR col;
    UBYTE bytes[4];
  } conv2;
-  #define MINVAL(a, b) ((a)>(b))?(b):(a)

  conv1.col = col1;
  conv2.col = col2;
-  conv1.bytes[0] = (UBYTE) MINVAL((((WORD) conv1.bytes[0]) + ((WORD) conv2.bytes[0])) , 255);
-  conv1.bytes[1] = (UBYTE) MINVAL((((WORD) conv1.bytes[1]) + ((WORD) conv2.bytes[1])) , 255);
-  conv1.bytes[2] = (UBYTE) MINVAL((((WORD) conv1.bytes[2]) + ((WORD) conv2.bytes[2])) , 255);
-  conv1.bytes[3] = (UBYTE) MINVAL((((WORD) conv1.bytes[3]) + ((WORD) conv2.bytes[3])) , 255);
-  #undef MINVAL
+  conv1.bytes[0] = (UBYTE) ((((DWORD) conv1.bytes[0]) * ((DWORD) conv2.bytes[0])) / 255);
+  conv1.bytes[1] = (UBYTE) ((((DWORD) conv1.bytes[1]) * ((DWORD) conv2.bytes[1])) / 255);
+  conv1.bytes[2] = (UBYTE) ((((DWORD) conv1.bytes[2]) * ((DWORD) conv2.bytes[2])) / 255);
+  conv1.bytes[3] = (UBYTE) ((((DWORD) conv1.bytes[3]) * ((DWORD) conv2.bytes[3])) / 255);

-  colRet = conv1.col;
+  return(conv1.col);
+#endif
+}

-#elif (defined __MSVC_INLINE__)
+
+// fast color addition function - RES = clamp (1ST + 2ND)
+COLOR AddColors( COLOR col1, COLOR col2) 
+{
+  if( col1==0) return col2;
+  if( col2==0) return col1;
+  if( col1==0xFFFFFFFF || col2==0xFFFFFFFF) return 0xFFFFFFFF;
+  COLOR colRet;
+
+#if (defined __MSVC_INLINE__)
  __asm {
    xor     ebx,ebx
    mov     esi,255
@ -535,7 +507,7 @@ COLOR AddColors( COLOR col1, COLOR col2)
    mov     D [colRet],ebx
  }

-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp;
  __asm__ __volatile__ (
    // if xbx is "r", gcc runs out of regs in -fPIC + -fno-omit-fp :(
@ -608,7 +580,29 @@ COLOR AddColors( COLOR col1, COLOR col2)
  );

 #else
-  #error please fill in inline assembly for your platform.
+  // !!! FIXME: This...is not fast.
+  union
+  {
+    COLOR col;
+    UBYTE bytes[4];
+  } conv1;
+
+  union
+  {
+    COLOR col;
+    UBYTE bytes[4];
+  } conv2;
+  #define MINVAL(a, b) ((a)>(b))?(b):(a)
+
+  conv1.col = col1;
+  conv2.col = col2;
+  conv1.bytes[0] = (UBYTE) MINVAL((((WORD) conv1.bytes[0]) + ((WORD) conv2.bytes[0])) , 255);
+  conv1.bytes[1] = (UBYTE) MINVAL((((WORD) conv1.bytes[1]) + ((WORD) conv2.bytes[1])) , 255);
+  conv1.bytes[2] = (UBYTE) MINVAL((((WORD) conv1.bytes[2]) + ((WORD) conv2.bytes[2])) , 255);
+  conv1.bytes[3] = (UBYTE) MINVAL((((WORD) conv1.bytes[3]) + ((WORD) conv2.bytes[3])) , 255);
+  #undef MINVAL
+
+  colRet = conv1.col;
 #endif

  return colRet;
@ -619,14 +613,7 @@ COLOR AddColors( COLOR col1, COLOR col2)
 // multiple conversion from OpenGL color to DirectX color
 extern void abgr2argb( ULONG *pulSrc, ULONG *pulDst, INDEX ct)
 {
-#if (defined USE_PORTABLE_C)
-  //#error write me.
-  for (int i=0; i<ct; i++) {
-    ULONG tmp = pulSrc[i];
-    pulDst[i] = (tmp&0xff00ff00) | ((tmp&0x00ff0000)>>16) | ((tmp&0x000000ff)<<16);
-  }
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    mov   esi,dword ptr [pulSrc]
    mov   edi,dword ptr [pulDst]
@ -678,12 +665,12 @@ colSkip2:
    mov   dword ptr [edi],eax
 colSkip1:
  }
-
-#elif (defined __GNU_INLINE__)
-  STUBBED("convert to inline asm.");
-
 #else
-  #error please fill in inline assembly for your platform.
+  for (int i=0; i<ct; i++) {
+    ULONG tmp = pulSrc[i];
+    pulDst[i] = (tmp&0xff00ff00) | ((tmp&0x00ff0000)>>16) | ((tmp&0x000000ff)<<16);
+  }
+
 #endif
 }

--- a/Sources/Engine/Graphics/Color.h
+++ b/Sources/Engine/Graphics/Color.h
@ -204,7 +204,24 @@ ENGINE_API extern COLOR AddColors( COLOR col1, COLOR col2); // fast color addito
 __forceinline ULONG ByteSwap( ULONG ul)
 {
 /* rcg10052001 Platform-wrappers. */
-#if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
+  ULONG ulRet;
+  __asm {
+    mov   eax,dword ptr [ul]
+    bswap eax
+    mov   dword ptr [ulRet],eax
+  }
+  return ulRet;
+
+#elif (defined __GNU_INLINE_X86_32__)
+  __asm__ __volatile__ (
+    "bswapl   %%eax    \n\t"
+        : "=a" (ul)
+        : "a" (ul)
+  );
+  return(ul);
+
+#else
  ul = ( ((ul << 24)            ) |
         ((ul << 8) & 0x00FF0000) |
         ((ul >> 8) & 0x0000FF00) |
@ -215,35 +232,12 @@ __forceinline ULONG ByteSwap( ULONG ul)
  #endif

  return(ul);
-
-#elif (defined __MSVC_INLINE__)
-  ULONG ulRet;
-  __asm {
-    mov   eax,dword ptr [ul]
-    bswap eax
-    mov   dword ptr [ulRet],eax
-  }
-  return ulRet;
-
-#elif (defined __GNU_INLINE__)
-  __asm__ __volatile__ (
-    "bswapl   %%eax    \n\t"
-        : "=a" (ul)
-        : "a" (ul)
-  );
-  return(ul);
-
-#else
-  #error please define for your platform.
 #endif
 }

 __forceinline ULONG rgba2argb( ULONG ul)
 {
-#if (defined USE_PORTABLE_C)
-	return( (ul << 24) | (ul >> 8) );
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  ULONG ulRet;
  __asm {
    mov   eax,dword ptr [ul]
@ -252,7 +246,7 @@ __forceinline ULONG rgba2argb( ULONG ul)
  }
  return ulRet;

-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG ulRet;
  __asm__ __volatile__ (
    "rorl   $8, %%eax       \n\t"
@ -263,21 +257,14 @@ __forceinline ULONG rgba2argb( ULONG ul)
  return ulRet;

 #else
-  #error please define for your platform.
+  return (ul << 24) | (ul >> 8);
+
 #endif
 }

 __forceinline ULONG abgr2argb( COLOR col)
 {
-#if (defined USE_PORTABLE_C)
-	// this could be simplified, this is just a safe conversion from asm code
-	col = ( ((col << 24)            ) |
-            ((col << 8) & 0x00FF0000) |
-            ((col >> 8) & 0x0000FF00) |
-            ((col >> 24)            ) );
-	return( (col << 24) | (col >> 8) );
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  ULONG ulRet;
  __asm {
    mov   eax,dword ptr [col]
@ -287,7 +274,7 @@ __forceinline ULONG abgr2argb( COLOR col)
  }
  return ulRet;

-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG ulRet;
  __asm__ __volatile__ (
    "bswapl %%eax           \n\t"
@ -299,7 +286,13 @@ __forceinline ULONG abgr2argb( COLOR col)
  return ulRet;

 #else
-  #error please define for your platform.
+  // this could be simplified, this is just a safe conversion from asm code
+  col = ( ((col << 24)            ) |
+          ((col << 8) & 0x00FF0000) |
+          ((col >> 8) & 0x0000FF00) |
+          ((col >> 24)            ) );
+  return( (col << 24) | (col >> 8) );
+
 #endif
 }

@ -311,10 +304,7 @@ extern void abgr2argb( ULONG *pulSrc, ULONG *pulDst, INDEX ct);
 // fast memory copy of ULONGs
 inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs)
 {
-#if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX))
-  memcpy( pulDst, pulSrc, ctLongs*4);
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    cld
    mov   esi,dword ptr [pulSrc]
@ -322,23 +312,8 @@ inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs)
    mov   ecx,dword ptr [ctLongs]
    rep   movsd
  }
-
-#elif (defined __GNU_INLINE__)
-    // I haven't benchmarked it, but in many cases, memcpy() becomes an
-    //  inline (asm?) macro on GNU platforms, so this might not be a
-    //  speed gain at all over the USE_PORTABLE_C version.
-    // You Have Been Warned. --ryan.
-  __asm__ __volatile__ (
-    "cld    \n\t"
-    "rep    \n\t"
-    "movsd  \n\t"
-        : "=S" (pulSrc), "=D" (pulDst), "=c" (ctLongs)
-        : "S" (pulSrc), "D" (pulDst), "c" (ctLongs)
-        : "cc", "memory"
-  );
-
 #else
-# error Please fill this in for your platform.
+  memcpy( pulDst, pulSrc, ctLongs*4);
 #endif
 }

@ -346,11 +321,7 @@ inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs)
 // fast memory set of ULONGs
 inline void StoreLongs( ULONG ulVal, ULONG *pulDst, INDEX ctLongs)
 {
-#if (defined USE_PORTABLE_C)
-  for( INDEX i=0; i<ctLongs; i++)
-    pulDst[i] = ulVal;
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    cld
    mov   eax,dword ptr [ulVal]
@ -359,7 +330,7 @@ inline void StoreLongs( ULONG ulVal, ULONG *pulDst, INDEX ctLongs)
    rep   stosd
  }

-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "cld    \n\t"
    "rep    \n\t"
@ -370,7 +341,9 @@ inline void StoreLongs( ULONG ulVal, ULONG *pulDst, INDEX ctLongs)
  );

 #else
-# error Please fill this in for your platform.
+  for( INDEX i=0; i<ctLongs; i++)
+    pulDst[i] = ulVal;
+
 #endif
 }

--- a/Sources/Engine/Graphics/DrawPort_RenderScene.cpp
+++ b/Sources/Engine/Graphics/DrawPort_RenderScene.cpp
@ -38,16 +38,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #define W  word ptr
 #define B  byte ptr

-#if (defined USE_PORTABLE_C)
-#define ASMOPT 0
-#elif (defined __MSVC_INLINE__)
-#define ASMOPT 1
-#elif (defined __GNU_INLINE__)
-#define ASMOPT 1
-#else
-#define ASMOPT 0
-#endif
-
 #define MAXTEXUNITS   4
 #define SHADOWTEXTURE 3

@ -153,7 +143,6 @@ void AddElements( ScenePolygon *pspo)
  const INDEX ctElems = pspo->spo_ctElements;
  INDEX *piDst = _aiElements.Push(ctElems);

-#if (ASMOPT == 1)
 #if (defined __MSVC_INLINE__)
  __asm {
    mov     eax,D [pspo]
@ -184,7 +173,7 @@ elemRest:
    mov     D [edi],eax
 elemDone:
  }
- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "movl    %[ctElems], %%ecx      \n\t"
    "movl    %[piDst], %%edi        \n\t"
@ -219,11 +208,6 @@ elemDone:
          "cc", "memory"
  );

- #else
-   #error Please write inline ASM for your platform.
-
- #endif
-
 #else
  const INDEX iVtx0Pass = pspo->spo_iVtx0Pass;
  const INDEX *piSrc = pspo->spo_piElements;
@ -495,8 +479,6 @@ static void RSBinToGroups( ScenePolygon *pspoFirst)
  // determine maximum used groups
  ASSERT( _ctGroupsCount);

-#if ASMOPT == 1
-
 #if (defined __MSVC_INLINE__)
  __asm {
    mov     eax,2
@ -505,7 +487,7 @@ static void RSBinToGroups( ScenePolygon *pspoFirst)
    mov     D [_ctGroupsCount],eax
  }

- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "movl     $2, %%eax          \n\t"
    "bsrl     (%%esi), %%ecx     \n\t"
@ -516,11 +498,6 @@ static void RSBinToGroups( ScenePolygon *pspoFirst)
        : "eax", "ecx", "cc", "memory"
  );

- #else
-   #error Please write inline ASM for your platform.
-
- #endif
-
 #else
  // emulate x86's bsr opcode...not fast.  :/
  register DWORD val = _ctGroupsCount;
@ -858,9 +835,6 @@ static void RSSetTextureCoords( ScenePolygon *pspoGroup, INDEX iLayer, INDEX iUn
      continue;
    }

-// !!! FIXME: rcg11232001 This inline conversion is broken. Use the
-// !!! FIXME: rcg11232001  C version for now with GCC.
-#if ((ASMOPT == 1) && (!defined __GNU_INLINE__) && (!defined __INTEL_COMPILER))
 #if (defined __MSVC_INLINE__)
    __asm {
      mov     esi,D [pspo]
@ -915,7 +889,7 @@ vtxLoop:
 /*
    // !!! FIXME: rcg11232001 This inline conversion is broken. Use the
    // !!! FIXME: rcg11232001  C version for now on Linux.
- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
    STUBBED("debug this");
    __asm__ __volatile__ (
      "0:                                  \n\t" // vtxLoop
@ -956,11 +930,6 @@ vtxLoop:
    );
 */

- #else
-   #error Please write inline ASM for your platform.
-
- #endif
-
 #else

    // diffuse mapping
--- a/Sources/Engine/Graphics/Fog.cpp
+++ b/Sources/Engine/Graphics/Fog.cpp
@ -67,18 +67,7 @@ ULONG PrepareTexture( UBYTE *pubTexture, PIX pixSizeI, PIX pixSizeJ)
  // need to upload from RGBA format
  const PIX pixTextureSize = pixSizeI*pixSizeJ;

- #if (defined USE_PORTABLE_C)
-   const UBYTE* src = pubTexture;
-   DWORD* dst = (DWORD*)(pubTexture+pixTextureSize);
-   for (int i=0; i<pixTextureSize; i++) {
-    const DWORD tmp = ((DWORD)*src) | 0xFFFFFF00;
-    *dst = ((tmp << 24) & 0xff000000 ) | ((tmp <<  8) & 0x00ff0000 ) |
-      ((tmp >>  8) & 0x0000ff00 ) | ((tmp >> 24) & 0x000000ff );
-    src++;
-    dst++;
-   }
-
- #elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    mov     esi,D [pubTexture]
    mov     edi,D [pubTexture]
@ -95,7 +84,7 @@ pixLoop:
    jnz     pixLoop
  }

- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "movl    %[pubTexture], %%esi      \n\t"
    "movl    %[pixTextureSize], %%ecx  \n\t"
@ -116,7 +105,15 @@ pixLoop:
  );

 #else
-   #error Write inline ASM for your platform.
+   const UBYTE* src = pubTexture;
+   DWORD* dst = (DWORD*)(pubTexture+pixTextureSize);
+   for (int i=0; i<pixTextureSize; i++) {
+    const DWORD tmp = ((DWORD)*src) | 0xFFFFFF00;
+    *dst = ((tmp << 24) & 0xff000000 ) | ((tmp <<  8) & 0x00ff0000 ) |
+      ((tmp >>  8) & 0x0000ff00 ) | ((tmp >> 24) & 0x000000ff );
+    src++;
+    dst++;
+   }

 #endif

--- a/Sources/Engine/Graphics/Gfx_OpenGL_Textures.cpp
+++ b/Sources/Engine/Graphics/Gfx_OpenGL_Textures.cpp
@ -169,32 +169,7 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV,
      if( pixSizeV==0) pixSizeV=1;
      pixSize = pixSizeU*pixSizeV;

-      #if (defined USE_PORTABLE_C)
-      // Basically average every other pixel...
-      UWORD w = 0;
-      UBYTE *dptr = (UBYTE *) pulDst;
-      UBYTE *sptr = (UBYTE *) pulSrc;
-      #if 0
-      pixSize *= 4;
-      for (PIX i = 0; i < pixSize; i++)
-      {
-        *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[1])) >> 1 );
-        dptr++;
-        sptr += 2;
-      }
-      #else
-      for (PIX i = 0; i < pixSize; i++)
-      {
-        for (PIX j = 0; j < 4; j++)
-        {
-          *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[4])) >> 1 );
-          dptr++;
-          sptr++;
-        }
-        sptr += 4;
-      }
-      #endif
-      #elif (defined __MSVC_INLINE__)
+      #if (defined __MSVC_INLINE__)
      __asm {   
        pxor    mm0,mm0
        mov     esi,D [pulSrc]
@ -216,7 +191,7 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV,
        emms
      }

-      #elif (defined __GNU_INLINE__)
+      #elif (defined __GNU_INLINE_X86_32__)
      __asm__ __volatile__ (
        "pxor      %%mm0,%%mm0                \n\t"
        "movl      %[pulSrc],%%esi            \n\t"
@ -244,7 +219,30 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV,
      );

      #else
-      #error Please write inline ASM for your platform.
+      // Basically average every other pixel...
+      UWORD w = 0;
+      UBYTE *dptr = (UBYTE *) pulDst;
+      UBYTE *sptr = (UBYTE *) pulSrc;
+      #if 0
+      pixSize *= 4;
+      for (PIX i = 0; i < pixSize; i++)
+      {
+        *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[1])) >> 1 );
+        dptr++;
+        sptr += 2;
+      }
+      #else
+      for (PIX i = 0; i < pixSize; i++)
+      {
+        for (PIX j = 0; j < 4; j++)
+        {
+          *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[4])) >> 1 );
+          dptr++;
+          sptr++;
+        }
+        sptr += 4;
+      }
+      #endif
      #endif

      // upload mipmap
--- a/Sources/Engine/Graphics/Graphics.cpp
+++ b/Sources/Engine/Graphics/Graphics.cpp
@ -209,7 +209,92 @@ static void MakeOneMipmap( ULONG *pulSrcMipmap, ULONG *pulDstMipmap, PIX pixWidt
  if( bBilinear) // type of filtering?
  { // BILINEAR

-   #if (defined USE_PORTABLE_C)
+   #if (defined __MSVC_INLINE__)
+    __asm {
+      pxor    mm0,mm0
+      mov     ebx,D [pixWidth]
+      mov     esi,D [pulSrcMipmap]
+      mov     edi,D [pulDstMipmap]
+      mov     edx,D [pixHeight]
+rowLoop:
+      mov     ecx,D [pixWidth]
+pixLoopN:           
+      movd    mm1,D [esi+ 0]        // up-left
+      movd    mm2,D [esi+ 4]        // up-right
+      movd    mm3,D [esi+ ebx*8 +0] // down-left
+      movd    mm4,D [esi+ ebx*8 +4] // down-right
+      punpcklbw mm1,mm0
+      punpcklbw mm2,mm0
+      punpcklbw mm3,mm0
+      punpcklbw mm4,mm0
+      paddw   mm1,mm2
+      paddw   mm1,mm3
+      paddw   mm1,mm4
+      paddw   mm1,Q [mmRounder]
+      psrlw   mm1,2
+      packuswb mm1,mm0
+      movd    D [edi],mm1
+      // advance to next pixel
+      add     esi,4*2
+      add     edi,4
+      dec     ecx
+      jnz     pixLoopN
+      // advance to next row
+      lea     esi,[esi+ ebx*8] // skip one row in source mip-map
+      dec     edx
+      jnz     rowLoop
+      emms
+    }
+
+   #elif (defined __GNU_INLINE_X86_32__)
+    __asm__ __volatile__ (
+      "pxor    %%mm0, %%mm0                 \n\t"
+      "movl    %[pulSrcMipmap], %%esi       \n\t"
+      "movl    %[pulDstMipmap], %%edi       \n\t"
+      "movl    %[pixHeight], %%edx          \n\t"
+
+      "0:                                   \n\t"  // rowLoop
+      "movl    %[pixWidth], %%ecx           \n\t"
+
+      "1:                                   \n\t"  // pixLoopN
+      "movd      0(%%esi), %%mm1            \n\t"  // up-left
+      "movd      4(%%esi), %%mm2            \n\t"  // up-right
+      "movd      0(%%esi, %[pixWidth], 8), %%mm3 \n\t" // down-left
+      "movd      4(%%esi, %[pixWidth], 8), %%mm4 \n\t" // down-right
+      "punpcklbw %%mm0, %%mm1               \n\t"
+      "punpcklbw %%mm0, %%mm2               \n\t"
+      "punpcklbw %%mm0, %%mm3               \n\t"
+      "punpcklbw %%mm0, %%mm4               \n\t"
+      "paddw     %%mm2, %%mm1               \n\t"
+      "paddw     %%mm3, %%mm1               \n\t"
+      "paddw     %%mm4, %%mm1               \n\t"
+      "paddw     (" ASMSYM(mmRounder) "), %%mm1 \n\t"
+      "psrlw     $2, %%mm1                  \n\t"
+      "packuswb  %%mm0, %%mm1               \n\t"
+      "movd      %%mm1, (%%edi)             \n\t"
+
+      // advance to next pixel
+      "addl     $8, %%esi                   \n\t"
+      "addl     $4, %%edi                   \n\t"
+      "decl     %%ecx                       \n\t"
+      "jnz      1b                          \n\t"  // pixLoopN
+
+      // advance to next row
+      // skip one row in source mip-map
+      "leal     0(%%esi, %[pixWidth], 8), %%esi \n\t"
+      "decl     %%edx                       \n\t"
+      "jnz      0b                          \n\t"  // rowLoop
+      "emms                                 \n\t"
+          : // no outputs.
+          : [pixWidth] "r" (pixWidth),
+            [pulSrcMipmap] "g" (pulSrcMipmap),
+            [pulDstMipmap] "g" (pulDstMipmap),
+            [pixHeight] "g" (pixHeight)
+          : FPU_REGS, MMX_REGS, "ecx", "edx", "esi", "edi",
+            "cc", "memory"
+    );
+
+   #else
 	UBYTE *src = (UBYTE *) pulSrcMipmap;
 	UBYTE *dest = (UBYTE *) pulDstMipmap;
 	for (int i = 0 ; i < pixHeight; i++)
@ -260,129 +345,13 @@ static void MakeOneMipmap( ULONG *pulSrcMipmap, ULONG *pulDstMipmap, PIX pixWidt
 		src += 8*pixWidth;
    }

-   #elif (defined __MSVC_INLINE__)
-    __asm {
-      pxor    mm0,mm0
-      mov     ebx,D [pixWidth]
-      mov     esi,D [pulSrcMipmap]
-      mov     edi,D [pulDstMipmap]
-      mov     edx,D [pixHeight]
-rowLoop:
-      mov     ecx,D [pixWidth]
-pixLoopN:           
-      movd    mm1,D [esi+ 0]        // up-left
-      movd    mm2,D [esi+ 4]        // up-right
-      movd    mm3,D [esi+ ebx*8 +0] // down-left
-      movd    mm4,D [esi+ ebx*8 +4] // down-right
-      punpcklbw mm1,mm0
-      punpcklbw mm2,mm0
-      punpcklbw mm3,mm0
-      punpcklbw mm4,mm0
-      paddw   mm1,mm2
-      paddw   mm1,mm3
-      paddw   mm1,mm4
-      paddw   mm1,Q [mmRounder]
-      psrlw   mm1,2
-      packuswb mm1,mm0
-      movd    D [edi],mm1
-      // advance to next pixel
-      add     esi,4*2
-      add     edi,4
-      dec     ecx
-      jnz     pixLoopN
-      // advance to next row
-      lea     esi,[esi+ ebx*8] // skip one row in source mip-map
-      dec     edx
-      jnz     rowLoop
-      emms
-    }
-
-   #elif (defined __GNU_INLINE__)
-    __asm__ __volatile__ (
-      "pxor    %%mm0, %%mm0                 \n\t"
-      "movl    %[pulSrcMipmap], %%esi       \n\t"
-      "movl    %[pulDstMipmap], %%edi       \n\t"
-      "movl    %[pixHeight], %%edx          \n\t"
-
-      "0:                                   \n\t"  // rowLoop
-      "movl    %[pixWidth], %%ecx           \n\t"
-
-      "1:                                   \n\t"  // pixLoopN
-      "movd      0(%%esi), %%mm1            \n\t"  // up-left
-      "movd      4(%%esi), %%mm2            \n\t"  // up-right
-      "movd      0(%%esi, %[pixWidth], 8), %%mm3 \n\t" // down-left
-      "movd      4(%%esi, %[pixWidth], 8), %%mm4 \n\t" // down-right
-      "punpcklbw %%mm0, %%mm1               \n\t"
-      "punpcklbw %%mm0, %%mm2               \n\t"
-      "punpcklbw %%mm0, %%mm3               \n\t"
-      "punpcklbw %%mm0, %%mm4               \n\t"
-      "paddw     %%mm2, %%mm1               \n\t"
-      "paddw     %%mm3, %%mm1               \n\t"
-      "paddw     %%mm4, %%mm1               \n\t"
-      "paddw     (" ASMSYM(mmRounder) "), %%mm1 \n\t"
-      "psrlw     $2, %%mm1                  \n\t"
-      "packuswb  %%mm0, %%mm1               \n\t"
-      "movd      %%mm1, (%%edi)             \n\t"
-
-      // advance to next pixel
-      "addl     $8, %%esi                   \n\t"
-      "addl     $4, %%edi                   \n\t"
-      "decl     %%ecx                       \n\t"
-      "jnz      1b                          \n\t"  // pixLoopN
-
-      // advance to next row
-      // skip one row in source mip-map
-      "leal     0(%%esi, %[pixWidth], 8), %%esi \n\t"
-      "decl     %%edx                       \n\t"
-      "jnz      0b                          \n\t"  // rowLoop
-      "emms                                 \n\t"
-          : // no outputs.
-          : [pixWidth] "r" (pixWidth),
-            [pulSrcMipmap] "g" (pulSrcMipmap),
-            [pulDstMipmap] "g" (pulDstMipmap),
-            [pixHeight] "g" (pixHeight)
-          : FPU_REGS, MMX_REGS, "ecx", "edx", "esi", "edi",
-            "cc", "memory"
-    );
-
-   #else
-     #error Write inline asm for your platform.
   #endif
    }
    else
    { // NEAREST-NEIGHBOUR but with border preserving
       ULONG ulRowModulo = pixWidth*2 *BYTES_PER_TEXEL;

-   #if (defined USE_PORTABLE_C)
-
-     PIX offset = 0;
-     ulRowModulo /= 4;
-
-     for (int q = 0; q < 2; q++)
-     {
-         for (PIX i = pixHeight / 2; i > 0; i--)
-         {
-             for (PIX j = pixWidth / 2; j > 0; j--)
-             {
-                 *pulDstMipmap = *(pulSrcMipmap + offset);
-                 pulSrcMipmap += 2;
-                 pulDstMipmap++;
-             }
-
-             for (PIX j = pixWidth / 2; j > 0; j--)
-             {
-                 *pulDstMipmap = *(pulSrcMipmap + offset + 1);
-                 pulSrcMipmap += 2;
-                 pulDstMipmap++;
-             }
-
-             pulSrcMipmap += ulRowModulo;
-        }
-
-        offset = pixWidth * 2;
-     }
-
-   #elif (defined __MSVC_INLINE__)
+   #if (defined __MSVC_INLINE__)
    __asm {
      xor     ebx,ebx
      mov     esi,D [pulSrcMipmap]
@ -428,7 +397,7 @@ halfEnd:
 fullEnd:
    }

-   #elif (defined __GNU_INLINE__)
+   #elif (defined __GNU_INLINE_X86_32__)
    ULONG tmp, tmp2;
    __asm__ __volatile__ (
      "xorl     %[xbx], %[xbx]             \n\t"
@ -493,7 +462,33 @@ fullEnd:
    );

   #else
-     #error Write inline asm for your platform.
+     PIX offset = 0;  // portable nearest-neighbour path; extra source offset in texels: 0 on the first pass, pixWidth*2 on the second
+     ulRowModulo /= 4;  // initialised above as pixWidth*2*BYTES_PER_TEXEL; presumably converts bytes to ULONG elements (assumes 4-byte texels -- TODO confirm)
+
+     for (int q = 0; q < 2; q++)  // two passes of pixHeight/2 iterations each
+     {
+         for (PIX i = pixHeight / 2; i > 0; i--)
+         {
+             for (PIX j = pixWidth / 2; j > 0; j--)  // first pixWidth/2 destination texels
+             {
+                 *pulDstMipmap = *(pulSrcMipmap + offset);  // take the first texel of each 2-texel step
+                 pulSrcMipmap += 2;
+                 pulDstMipmap++;
+             }
+
+             for (PIX j = pixWidth / 2; j > 0; j--)  // next pixWidth/2 destination texels
+             {
+                 *pulDstMipmap = *(pulSrcMipmap + offset + 1);  // take the second texel of each 2-texel step
+                 pulSrcMipmap += 2;
+                 pulDstMipmap++;
+             }
+
+             pulSrcMipmap += ulRowModulo;  // skip a further ulRowModulo texels of source before the next iteration
+        }
+
+        offset = pixWidth * 2;  // second pass reads pixWidth*2 texels further into the source
+     }
+
   #endif
  }
 }
@ -649,7 +644,7 @@ __int64 mmShifter = 0;
 __int64 mmMask  = 0;
 ULONG *pulDitherTable;

-#ifdef USE_PORTABLE_C
+#if !(defined __MSVC_INLINE__) && !(defined __GNU_INLINE_X86_32__)
 extern const UBYTE *pubClipByte;
 // increment a byte without overflowing it
 static inline void IncrementByteWithClip( UBYTE &ub, SLONG slAdd)
@ -778,35 +773,7 @@ void DitherBitmap( INDEX iDitherType, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth
 // ------------------------------- ordered matrix dithering routine

 ditherOrder:
-#if (defined USE_PORTABLE_C)
-  union uConv
-  {
-    ULONG val;
-    DWORD dwords[2];
-    UWORD words[4];
-    WORD  iwords[4];
-    UBYTE bytes[8];
-  };
-  for (int i=0; i<pixHeight; i++) {
-    int idx = i&3;
-    uConv dith;
-    dith.val = pulDitherTable[idx];
-    for (int j=0; j<4; j++) { dith.words[j] >>= mmShifter; }
-    dith.val &= mmMask;
-    uConv* src = (uConv*)(pulSrc+i*pixWidth);
-    uConv* dst = (uConv*)(pulDst+i*pixWidth);
-    for (int j=0; j<pixWidth; j+=2) {
-      uConv p=src[0];
-      for (int k=0; k<8; k++) {
-        IncrementByteWithClip(p.bytes[k], dith.bytes[k]);
-      }
-      dst[0] = p;
-      src++;
-      dst++;
-    }
-  }
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    mov     esi,D [pulSrc]
    mov     edi,D [pulDst]
@ -852,7 +819,7 @@ nextRowO:
    emms;
  }

-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp;
  __asm__ __volatile__ (
    "movl     %[pulSrc], %%esi           \n\t"
@ -912,7 +879,33 @@ nextRowO:
  );

 #else
-  #error Write inline asm for your platform.
+  union uConv  // overlays dword/word/byte views on the same storage (portable ordered-dither path)
+  {
+    ULONG val;  // NOTE(review): if ULONG is 32-bit this member spans only bytes[0..3] -- confirm
+    DWORD dwords[2];
+    UWORD words[4];
+    WORD  iwords[4];
+    UBYTE bytes[8];
+  };
+  for (int i=0; i<pixHeight; i++) {  // one iteration per row
+    int idx = i&3;  // dither table index cycles every 4 rows
+    uConv dith;
+    dith.val = pulDitherTable[idx];  // NOTE(review): only val is assigned; whether all 8 bytes get initialised depends on sizeof(ULONG) -- confirm
+    for (int j=0; j<4; j++) { dith.words[j] >>= mmShifter; }  // shift each 16-bit lane right by mmShifter
+    dith.val &= mmMask;  // keep only the bits selected by mmMask
+    uConv* src = (uConv*)(pulSrc+i*pixWidth);  // row i of the source, viewed in uConv-sized steps
+    uConv* dst = (uConv*)(pulDst+i*pixWidth);  // row i of the destination, viewed in uConv-sized steps
+    for (int j=0; j<pixWidth; j+=2) {  // j+=2: each uConv step presumably covers two 4-byte pixels -- TODO confirm
+      uConv p=src[0];
+      for (int k=0; k<8; k++) {  // add the dither amount to every byte of the pair
+        IncrementByteWithClip(p.bytes[k], dith.bytes[k]);
+      }
+      dst[0] = p;
+      src++;
+      dst++;
+    }
+  }
+
 #endif

  goto theEnd;
@ -924,34 +917,7 @@ ditherError:
  if( pulDst!=pulSrc) memcpy( pulDst, pulSrc, pixCanvasWidth*pixCanvasHeight *BYTES_PER_TEXEL);
  // slModulo+=4;
  // now, dither destination
-#if (defined USE_PORTABLE_C)
-  #if 1 //SEB doesn't works....
-  for (int i=0; i<pixHeight-1; i++) {
-    int step = (i&1)?-4:+4;
-    const UBYTE ubMask = (mmErrDiffMask&0xff);
-    UBYTE *src = ((UBYTE*)pulDst)+i*pixCanvasWidth*4;
-    if(i&1) src+=pixWidth*4;
-    // left to right or right to left
-    for (int j=0; j<pixWidth-1; j++) {
-      uConv p1, p3, p5, p7;
-      src+=step;
-      for (int k=0; k<4; k++) { p1.words[k] = src[k]&ubMask; }
-      //p1.val &= mmErrDiffMask;
-      for (int k=0; k<4; k++) { p3.words[k] = (p1.words[k]*3)>>4;
-                                p5.words[k] = (p1.words[k]*5)>>4;
-                                p7.words[k] = (p1.words[k]*7)>>4; }
-      for (int k=0; k<4; k++) { p1.words[k] -= (p3.words[k] + p5.words[k] + p7.words[k]);}
-      for (int k=0; k<4; k++) { 
-        IncrementByteWithClip( src[k + step]                 , p7.words[k]);
-        IncrementByteWithClip( src[pixCanvasWidth*4 -step +k], p5.words[k]);
-        IncrementByteWithClip( src[pixCanvasWidth*4 +0    +k], p3.words[k]);
-        IncrementByteWithClip( src[pixCanvasWidth*4 +step +k], p1.words[k]);
-      }
-    }
-  }
-  #endif
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    pxor    mm0,mm0
    mov     esi,D [pulDst]
@ -1046,7 +1012,7 @@ allDoneE:
    emms;
  }

-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "pxor    %%mm0, %%mm0                 \n\t"
    "movl    %[pulDst], %%esi             \n\t"
@ -1157,7 +1123,32 @@ allDoneE:
  );

 #else
-  #error Write inline asm for your platform.
+  #if 1 //SEB doesn't work.... NOTE(review): comment says this does not work, yet it is enabled by '#if 1' -- confirm
+  for (int i=0; i<pixHeight-1; i++) {  // every row except the last (errors are pushed into the row below)
+    int step = (i&1)?-4:+4;  // odd rows walk backwards, even rows forwards, 4 bytes per pixel
+    const UBYTE ubMask = (mmErrDiffMask&0xff);  // low byte of mmErrDiffMask selects the bits treated as error
+    UBYTE *src = ((UBYTE*)pulDst)+i*pixCanvasWidth*4;  // start of row i in the destination
+    if(i&1) src+=pixWidth*4;  // odd rows start from the far end
+    // left to right or right to left
+    for (int j=0; j<pixWidth-1; j++) {
+      uConv p1, p3, p5, p7;
+      src+=step;
+      for (int k=0; k<4; k++) { p1.words[k] = src[k]&ubMask; }  // per-channel error of the current pixel
+      //p1.val &= mmErrDiffMask;
+      for (int k=0; k<4; k++) { p3.words[k] = (p1.words[k]*3)>>4;   // 3/16 of the error
+                                p5.words[k] = (p1.words[k]*5)>>4;   // 5/16 of the error
+                                p7.words[k] = (p1.words[k]*7)>>4; } // 7/16 of the error
+      for (int k=0; k<4; k++) { p1.words[k] -= (p3.words[k] + p5.words[k] + p7.words[k]);}  // p1 keeps whatever remains
+      for (int k=0; k<4; k++) {  // spread the error over four neighbouring pixels
+        IncrementByteWithClip( src[k + step]                 , p7.words[k]);  // same row, next pixel in walking direction
+        IncrementByteWithClip( src[pixCanvasWidth*4 -step +k], p5.words[k]);  // row below, one pixel behind
+        IncrementByteWithClip( src[pixCanvasWidth*4 +0    +k], p3.words[k]);  // row below, same column
+        IncrementByteWithClip( src[pixCanvasWidth*4 +step +k], p1.words[k]);  // row below, one pixel ahead
+      }
+    }
+  }
+  #endif
+
 #endif

  goto theEnd;
@ -1265,7 +1256,7 @@ extern "C" {
 }


-#ifdef USE_PORTABLE_C
+#if !(defined USE_MMX_INTRINSICS) && !(defined __MSVC_INLINE__) && !(defined __GNU_INLINE_X86_32__)
 typedef SWORD ExtPix[4];

 static inline void extpix_fromi64(ExtPix &pix, const __int64 i64)
@ -1632,265 +1623,6 @@ void FilterBitmap( INDEX iFilter, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth, PI
    _mm_empty();  // we're done, clear out the MMX registers!


-#elif (defined USE_PORTABLE_C)
-    slModulo1 /= BYTES_PER_TEXEL;  // C++ handles incrementing by sizeof type
-    slCanvasWidth /= BYTES_PER_TEXEL;  // C++ handles incrementing by sizeof type
-
-    ULONG *src = pulSrc;
-    ULONG *dst = pulDst;
-    ULONG *rowptr = aulRows;
-
-    ExtPix rmm1={0}, rmm2={0}, rmm3={0}, rmm4={0}, rmm5={0}, rmm6={0}, rmm7={0};
-    #define EXTPIXFROMINT64(x) ExtPix r##x; extpix_fromi64(r##x, x);
-    EXTPIXFROMINT64(mmCm);
-    EXTPIXFROMINT64(mmCe);
-    EXTPIXFROMINT64(mmCc);
-    EXTPIXFROMINT64(mmEch);
-    EXTPIXFROMINT64(mmEcl);
-    EXTPIXFROMINT64(mmEe);
-    EXTPIXFROMINT64(mmEm);
-    EXTPIXFROMINT64(mmMm);
-    EXTPIXFROMINT64(mmMe);
-    EXTPIXFROMINT64(mmMc);
-    EXTPIXFROMINT64(mmAdd);
-    EXTPIXFROMINT64(mmInvDiv);
-    #undef EXTPIXFROMINT64
-
-    // ----------------------- process upper left corner
-    extend_pixel(src[0], rmm1);
-    extend_pixel(src[1], rmm2);
-    extend_pixel(src[pixCanvasWidth], rmm3);
-    extend_pixel(src[pixCanvasWidth+1], rmm4);
-
-    extpix_add(rmm2, rmm3);
-    extpix_mul(rmm1, rmmCm);
-    extpix_mul(rmm2, rmmCe);
-    extpix_mul(rmm4, rmmCc);
-    extpix_add(rmm1, rmm2);
-    extpix_add(rmm1, rmm4);
-    extpix_adds(rmm1, rmmAdd);
-    extpix_mulhi(rmm1, rmmInvDiv);
-    *(rowptr++) = unextend_pixel(rmm1);
-    
-    src++;
-
-    // ----------------------- process upper edge pixels
-    for (PIX i = pixWidth - 2; i != 0; i--)
-    {
-        extend_pixel(src[-1], rmm1);
-        extend_pixel(src[0], rmm2);
-        extend_pixel(src[1], rmm3);
-        extend_pixel(src[pixCanvasWidth-1], rmm4);
-        extend_pixel(src[pixCanvasWidth], rmm5);
-        extend_pixel(src[pixCanvasWidth+1], rmm6);
-
-        extpix_add(rmm1, rmm3);
-        extpix_add(rmm4, rmm6);
-        extpix_mul(rmm1, rmmEch);
-        extpix_mul(rmm2, rmmEm);
-        extpix_mul(rmm4, rmmEcl);
-        extpix_mul(rmm5, rmmEe);
-        extpix_add(rmm1, rmm2);
-        extpix_add(rmm1, rmm4);
-        extpix_add(rmm1, rmm5);
-        extpix_adds(rmm1, rmmAdd);
-        extpix_mulhi(rmm1, rmmInvDiv);
-        *(rowptr++) = unextend_pixel(rmm1);
-        src++;
-    }
-
-    // ----------------------- process upper right corner
-
-    extend_pixel(src[-1], rmm1);
-    extend_pixel(src[0], rmm2);
-    extend_pixel(src[pixCanvasWidth-1], rmm3);
-    extend_pixel(src[pixCanvasWidth], rmm4);
-
-    extpix_add(rmm1, rmm4);
-    extpix_mul(rmm1, rmmCe);
-    extpix_mul(rmm2, rmmCm);
-    extpix_mul(rmm3, rmmCc);
-    extpix_add(rmm1, rmm2);
-    extpix_add(rmm1, rmm3);
-    extpix_adds(rmm1, rmmAdd);
-    extpix_mulhi(rmm1, rmmInvDiv);
-    *rowptr = unextend_pixel(rmm1);
-
-// ----------------------- process bitmap middle pixels
-
-    dst += slCanvasWidth;
-    src += slModulo1;
-
-    // for each row
-    for (size_t i = pixHeight-2; i != 0; i--)  // rowLoop
-    {
-        rowptr = aulRows;
-
-        // process left edge pixel
-        extend_pixel(src[-pixCanvasWidth], rmm1);
-        extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
-        extend_pixel(src[0], rmm3);
-        extend_pixel(src[1], rmm4);
-        extend_pixel(src[pixCanvasWidth], rmm5);
-        extend_pixel(src[pixCanvasWidth+1], rmm6);
-
-        extpix_add(rmm1, rmm5);
-        extpix_add(rmm2, rmm6);
-        extpix_mul(rmm1, rmmEch);
-        extpix_mul(rmm2, rmmEcl);
-        extpix_mul(rmm3, rmmEm);
-        extpix_mul(rmm4, rmmEe);
-        extpix_add(rmm1, rmm2);
-        extpix_add(rmm1, rmm3);
-        extpix_add(rmm1, rmm4);
-        extpix_adds(rmm1, rmmAdd);
-        extpix_mulhi(rmm1, rmmInvDiv);
-        dst[-pixCanvasWidth] = *rowptr;
-        *(rowptr++) = unextend_pixel(rmm1);
-        src++;
-        dst++;
-
-        // for each pixel in current row
-        for (size_t j = pixWidth-2; j != 0; j--)  // pixLoop
-        {
-            // prepare upper convolution row
-            extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
-            extend_pixel(src[-pixCanvasWidth], rmm2);
-            extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
-
-            // prepare middle convolution row
-            extend_pixel(src[-1], rmm4);
-            extend_pixel(src[0], rmm5);
-            extend_pixel(src[1], rmm6);
-
-            // free some registers
-            extpix_add(rmm1, rmm3);
-            extpix_add(rmm2, rmm4);
-            extpix_mul(rmm5, rmmMm);
-
-            // prepare lower convolution row
-            extend_pixel(src[pixCanvasWidth-1], rmm3);
-            extend_pixel(src[pixCanvasWidth], rmm4);
-            extend_pixel(src[pixCanvasWidth+1], rmm7);
-
-            // calc weightened value
-            extpix_add(rmm2, rmm6);
-            extpix_add(rmm1, rmm3);
-            extpix_add(rmm2, rmm4);
-            extpix_add(rmm1, rmm7);
-            extpix_mul(rmm2, rmmMe);
-            extpix_mul(rmm1, rmmMc);
-            extpix_add(rmm2, rmm5);
-            extpix_add(rmm1, rmm2);
-
-            // calc and store wightened value
-            extpix_adds(rmm1, rmmAdd);
-            extpix_mulhi(rmm1, rmmInvDiv);
-            dst[-pixCanvasWidth] = *rowptr;
-            *(rowptr++) = unextend_pixel(rmm1);
-
-            // advance to next pixel
-            src++;
-            dst++;
-        }
-
-        // process right edge pixel
-        extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
-        extend_pixel(src[-pixCanvasWidth], rmm2);
-        extend_pixel(src[-1], rmm3);
-        extend_pixel(src[0], rmm4);
-        extend_pixel(src[pixCanvasWidth-1], rmm5);
-        extend_pixel(src[pixCanvasWidth], rmm6);
-
-        extpix_add(rmm1, rmm5);
-        extpix_add(rmm2, rmm6);
-        extpix_mul(rmm1, rmmEcl);
-        extpix_mul(rmm2, rmmEch);
-        extpix_mul(rmm3, rmmEe);
-        extpix_mul(rmm4, rmmEm);
-        extpix_add(rmm1, rmm2);
-        extpix_add(rmm1, rmm3);
-        extpix_add(rmm1, rmm4);
-        extpix_adds(rmm1, rmmAdd);
-        extpix_mulhi(rmm1, rmmInvDiv);
-        dst[-pixCanvasWidth] = *rowptr;
-        *rowptr = unextend_pixel(rmm1);
-
-        // advance to next row
-        src += slModulo1;
-        dst += slModulo1;
-    }
-
-    // ----------------------- process lower left corner
-    rowptr = aulRows;
-    extend_pixel(src[-pixCanvasWidth], rmm1);
-    extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
-    extend_pixel(src[0], rmm3);
-    extend_pixel(src[1], rmm4);
-
-    extpix_add(rmm1, rmm4);
-    extpix_mul(rmm1, rmmCe);
-    extpix_mul(rmm2, rmmCc);
-    extpix_mul(rmm3, rmmCm);
-    extpix_add(rmm1, rmm2);
-    extpix_add(rmm1, rmm3);
-    extpix_adds(rmm1, rmmAdd);
-    extpix_mulhi(rmm1, rmmInvDiv);
-    dst[-pixCanvasWidth] = *rowptr;
-    dst[0] = unextend_pixel(rmm1);
-
-    src++;
-    dst++;
-    rowptr++;
-
-    // ----------------------- process lower edge pixels
-    for (size_t i = pixWidth-2; i != 0; i--)  // lowerLoop
-    {
-        // for each pixel
-        extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
-        extend_pixel(src[-pixCanvasWidth], rmm2);
-        extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
-        extend_pixel(src[-1], rmm4);
-        extend_pixel(src[0], rmm5);
-        extend_pixel(src[1], rmm6);
-
-        extpix_add(rmm1, rmm3);
-        extpix_add(rmm4, rmm6);
-        extpix_mul(rmm1, rmmEcl);
-        extpix_mul(rmm2, rmmEe);
-        extpix_mul(rmm4, rmmEch);
-        extpix_mul(rmm5, rmmEm);
-        extpix_add(rmm1, rmm2);
-        extpix_add(rmm1, rmm4);
-        extpix_add(rmm1, rmm5);
-        extpix_adds(rmm1, rmmAdd);
-        extpix_mulhi(rmm1, rmmInvDiv);
-        dst[-pixCanvasWidth] = *rowptr;
-        dst[0] = unextend_pixel(rmm1);
-
-        // advance to next pixel
-        src++;
-        dst++;
-        rowptr++;
-    }
-
-    // ----------------------- lower right corners
-    extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
-    extend_pixel(src[-pixCanvasWidth], rmm2);
-    extend_pixel(src[-1], rmm3);
-    extend_pixel(src[0], rmm4);
-
-    extpix_add(rmm2, rmm3);
-    extpix_mul(rmm1, rmmCc);
-    extpix_mul(rmm2, rmmCe);
-    extpix_mul(rmm4, rmmCm);
-    extpix_add(rmm1, rmm2);
-    extpix_add(rmm1, rmm4);
-    extpix_adds(rmm1, rmmAdd);
-    extpix_mulhi(rmm1, rmmInvDiv);
-    dst[-pixCanvasWidth] = *rowptr;
-    dst[0] = unextend_pixel(rmm1);
-
 #elif (defined __MSVC_INLINE__)
  __asm {
    cld
@ -2204,7 +1936,7 @@ lowerLoop:
    emms
  }

-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)

  FB_pulSrc = pulSrc;
  FB_pulDst = pulDst;
@ -2537,7 +2269,264 @@ lowerLoop:
  );

 #else
-  #error Write inline asm for your platform.
+    slModulo1 /= BYTES_PER_TEXEL;  // C++ handles incrementing by sizeof type
+    slCanvasWidth /= BYTES_PER_TEXEL;  // C++ handles incrementing by sizeof type
+
+    ULONG *src = pulSrc;
+    ULONG *dst = pulDst;
+    ULONG *rowptr = aulRows;
+
+    ExtPix rmm1={0}, rmm2={0}, rmm3={0}, rmm4={0}, rmm5={0}, rmm6={0}, rmm7={0};
+    #define EXTPIXFROMINT64(x) ExtPix r##x; extpix_fromi64(r##x, x);
+    EXTPIXFROMINT64(mmCm);
+    EXTPIXFROMINT64(mmCe);
+    EXTPIXFROMINT64(mmCc);
+    EXTPIXFROMINT64(mmEch);
+    EXTPIXFROMINT64(mmEcl);
+    EXTPIXFROMINT64(mmEe);
+    EXTPIXFROMINT64(mmEm);
+    EXTPIXFROMINT64(mmMm);
+    EXTPIXFROMINT64(mmMe);
+    EXTPIXFROMINT64(mmMc);
+    EXTPIXFROMINT64(mmAdd);
+    EXTPIXFROMINT64(mmInvDiv);
+    #undef EXTPIXFROMINT64
+
+    // ----------------------- process upper left corner
+    extend_pixel(src[0], rmm1);
+    extend_pixel(src[1], rmm2);
+    extend_pixel(src[pixCanvasWidth], rmm3);
+    extend_pixel(src[pixCanvasWidth+1], rmm4);
+
+    extpix_add(rmm2, rmm3);
+    extpix_mul(rmm1, rmmCm);
+    extpix_mul(rmm2, rmmCe);
+    extpix_mul(rmm4, rmmCc);
+    extpix_add(rmm1, rmm2);
+    extpix_add(rmm1, rmm4);
+    extpix_adds(rmm1, rmmAdd);
+    extpix_mulhi(rmm1, rmmInvDiv);
+    *(rowptr++) = unextend_pixel(rmm1);
+    
+    src++;
+
+    // ----------------------- process upper edge pixels
+    for (PIX i = pixWidth - 2; i != 0; i--)
+    {
+        extend_pixel(src[-1], rmm1);
+        extend_pixel(src[0], rmm2);
+        extend_pixel(src[1], rmm3);
+        extend_pixel(src[pixCanvasWidth-1], rmm4);
+        extend_pixel(src[pixCanvasWidth], rmm5);
+        extend_pixel(src[pixCanvasWidth+1], rmm6);
+
+        extpix_add(rmm1, rmm3);
+        extpix_add(rmm4, rmm6);
+        extpix_mul(rmm1, rmmEch);
+        extpix_mul(rmm2, rmmEm);
+        extpix_mul(rmm4, rmmEcl);
+        extpix_mul(rmm5, rmmEe);
+        extpix_add(rmm1, rmm2);
+        extpix_add(rmm1, rmm4);
+        extpix_add(rmm1, rmm5);
+        extpix_adds(rmm1, rmmAdd);
+        extpix_mulhi(rmm1, rmmInvDiv);
+        *(rowptr++) = unextend_pixel(rmm1);
+        src++;
+    }
+
+    // ----------------------- process upper right corner
+
+    extend_pixel(src[-1], rmm1);
+    extend_pixel(src[0], rmm2);
+    extend_pixel(src[pixCanvasWidth-1], rmm3);
+    extend_pixel(src[pixCanvasWidth], rmm4);
+
+    extpix_add(rmm1, rmm4);
+    extpix_mul(rmm1, rmmCe);
+    extpix_mul(rmm2, rmmCm);
+    extpix_mul(rmm3, rmmCc);
+    extpix_add(rmm1, rmm2);
+    extpix_add(rmm1, rmm3);
+    extpix_adds(rmm1, rmmAdd);
+    extpix_mulhi(rmm1, rmmInvDiv);
+    *rowptr = unextend_pixel(rmm1);
+
+// ----------------------- process bitmap middle pixels
+
+    dst += slCanvasWidth;
+    src += slModulo1;
+
+    // for each row
+    for (size_t i = pixHeight-2; i != 0; i--)  // rowLoop
+    {
+        rowptr = aulRows;
+
+        // process left edge pixel
+        extend_pixel(src[-pixCanvasWidth], rmm1);
+        extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
+        extend_pixel(src[0], rmm3);
+        extend_pixel(src[1], rmm4);
+        extend_pixel(src[pixCanvasWidth], rmm5);
+        extend_pixel(src[pixCanvasWidth+1], rmm6);
+
+        extpix_add(rmm1, rmm5);
+        extpix_add(rmm2, rmm6);
+        extpix_mul(rmm1, rmmEch);
+        extpix_mul(rmm2, rmmEcl);
+        extpix_mul(rmm3, rmmEm);
+        extpix_mul(rmm4, rmmEe);
+        extpix_add(rmm1, rmm2);
+        extpix_add(rmm1, rmm3);
+        extpix_add(rmm1, rmm4);
+        extpix_adds(rmm1, rmmAdd);
+        extpix_mulhi(rmm1, rmmInvDiv);
+        dst[-pixCanvasWidth] = *rowptr;
+        *(rowptr++) = unextend_pixel(rmm1);
+        src++;
+        dst++;
+
+        // for each interior pixel in the current row (left and right edge pixels are handled separately)
+        for (size_t j = pixWidth-2; j != 0; j--)  // pixLoop
+        {
+            // prepare upper convolution row
+            extend_pixel(src[(-pixCanvasWidth)-1], rmm1);  // up-left
+            extend_pixel(src[-pixCanvasWidth], rmm2);      // up
+            extend_pixel(src[(-pixCanvasWidth)+1], rmm3);  // up-right
+
+            // prepare middle convolution row
+            extend_pixel(src[-1], rmm4);  // left
+            extend_pixel(src[0], rmm5);   // centre
+            extend_pixel(src[1], rmm6);   // right
+
+            // fold loaded pixels together so rmm3/rmm4 can be reused below (presumably extpix_add(a, b) accumulates b into a -- confirm)
+            extpix_add(rmm1, rmm3);
+            extpix_add(rmm2, rmm4);
+            extpix_mul(rmm5, rmmMm);
+
+            // prepare lower convolution row
+            extend_pixel(src[pixCanvasWidth-1], rmm3);  // down-left
+            extend_pixel(src[pixCanvasWidth], rmm4);    // down
+            extend_pixel(src[pixCanvasWidth+1], rmm7);  // down-right
+
+            // calc weighted value
+            extpix_add(rmm2, rmm6);
+            extpix_add(rmm1, rmm3);
+            extpix_add(rmm2, rmm4);
+            extpix_add(rmm1, rmm7);
+            extpix_mul(rmm2, rmmMe);
+            extpix_mul(rmm1, rmmMc);
+            extpix_add(rmm2, rmm5);
+            extpix_add(rmm1, rmm2);
+
+            // calc and store weighted value
+            extpix_adds(rmm1, rmmAdd);
+            extpix_mulhi(rmm1, rmmInvDiv);
+            dst[-pixCanvasWidth] = *rowptr;  // write out the value buffered for the pixel one row up
+            *(rowptr++) = unextend_pixel(rmm1);  // buffer this pixel's result in its place
+
+            // advance to next pixel
+            src++;
+            dst++;
+        }
+
+        // process right edge pixel
+        extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
+        extend_pixel(src[-pixCanvasWidth], rmm2);
+        extend_pixel(src[-1], rmm3);
+        extend_pixel(src[0], rmm4);
+        extend_pixel(src[pixCanvasWidth-1], rmm5);
+        extend_pixel(src[pixCanvasWidth], rmm6);
+
+        extpix_add(rmm1, rmm5);
+        extpix_add(rmm2, rmm6);
+        extpix_mul(rmm1, rmmEcl);
+        extpix_mul(rmm2, rmmEch);
+        extpix_mul(rmm3, rmmEe);
+        extpix_mul(rmm4, rmmEm);
+        extpix_add(rmm1, rmm2);
+        extpix_add(rmm1, rmm3);
+        extpix_add(rmm1, rmm4);
+        extpix_adds(rmm1, rmmAdd);
+        extpix_mulhi(rmm1, rmmInvDiv);
+        dst[-pixCanvasWidth] = *rowptr;
+        *rowptr = unextend_pixel(rmm1);
+
+        // advance to next row
+        src += slModulo1;
+        dst += slModulo1;
+    }
+
+    // ----------------------- process lower left corner
+    rowptr = aulRows;
+    extend_pixel(src[-pixCanvasWidth], rmm1);
+    extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
+    extend_pixel(src[0], rmm3);
+    extend_pixel(src[1], rmm4);
+
+    extpix_add(rmm1, rmm4);
+    extpix_mul(rmm1, rmmCe);
+    extpix_mul(rmm2, rmmCc);
+    extpix_mul(rmm3, rmmCm);
+    extpix_add(rmm1, rmm2);
+    extpix_add(rmm1, rmm3);
+    extpix_adds(rmm1, rmmAdd);
+    extpix_mulhi(rmm1, rmmInvDiv);
+    dst[-pixCanvasWidth] = *rowptr;
+    dst[0] = unextend_pixel(rmm1);
+
+    src++;
+    dst++;
+    rowptr++;
+
+    // ----------------------- process lower edge pixels
+    for (size_t i = pixWidth-2; i != 0; i--)  // lowerLoop
+    {
+        // for each pixel
+        extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
+        extend_pixel(src[-pixCanvasWidth], rmm2);
+        extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
+        extend_pixel(src[-1], rmm4);
+        extend_pixel(src[0], rmm5);
+        extend_pixel(src[1], rmm6);
+
+        extpix_add(rmm1, rmm3);
+        extpix_add(rmm4, rmm6);
+        extpix_mul(rmm1, rmmEcl);
+        extpix_mul(rmm2, rmmEe);
+        extpix_mul(rmm4, rmmEch);
+        extpix_mul(rmm5, rmmEm);
+        extpix_add(rmm1, rmm2);
+        extpix_add(rmm1, rmm4);
+        extpix_add(rmm1, rmm5);
+        extpix_adds(rmm1, rmmAdd);
+        extpix_mulhi(rmm1, rmmInvDiv);
+        dst[-pixCanvasWidth] = *rowptr;
+        dst[0] = unextend_pixel(rmm1);
+
+        // advance to next pixel
+        src++;
+        dst++;
+        rowptr++;
+    }
+
+    // ----------------------- lower right corners
+    extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
+    extend_pixel(src[-pixCanvasWidth], rmm2);
+    extend_pixel(src[-1], rmm3);
+    extend_pixel(src[0], rmm4);
+
+    extpix_add(rmm2, rmm3);
+    extpix_mul(rmm1, rmmCc);
+    extpix_mul(rmm2, rmmCe);
+    extpix_mul(rmm4, rmmCm);
+    extpix_add(rmm1, rmm2);
+    extpix_add(rmm1, rmm4);
+    extpix_adds(rmm1, rmmAdd);
+    extpix_mulhi(rmm1, rmmInvDiv);
+    dst[-pixCanvasWidth] = *rowptr;
+    dst[0] = unextend_pixel(rmm1);
+
 #endif

  // all done (finally)
--- a/Sources/Engine/Graphics/OpenGL.h
+++ b/Sources/Engine/Graphics/OpenGL.h
@ -89,20 +89,14 @@ extern void  (__stdcall *pglPNTrianglesfATI)( GLenum pname, GLfloat param);
 inline void glCOLOR( COLOR col)
 {
 /* rcg10052001 Platform-wrappers. */
-#if (defined USE_PORTABLE_C)
-	col = ( ((col << 24)            ) |
-            ((col << 8) & 0x00FF0000) |
-            ((col >> 8) & 0x0000FF00) |
-            ((col >> 24)            ) );
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    mov     eax,dword ptr [col]
    bswap   eax
    mov     dword ptr [col],eax
  }

-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "bswapl   %%eax    \n\t"
        : "=a" (col)
@ -110,7 +104,11 @@ inline void glCOLOR( COLOR col)
  );

 #else
-  #error please define for your platform.
+  col = ( ((col << 24)            ) |   // portable counterpart of the bswap in the asm branches: lowest byte -> highest
+          ((col << 8) & 0x00FF0000) |   // second-lowest byte -> second-highest
+          ((col >> 8) & 0x0000FF00) |   // second-highest byte -> second-lowest
+          ((col >> 24)            ) );  // highest byte -> lowest; unmasked, so assumes COLOR is an unsigned 32-bit type -- TODO confirm
+
 #endif

  pglColor4ubv((GLubyte*)&col);
--- a/Sources/Engine/Graphics/TextureEffects.cpp
+++ b/Sources/Engine/Graphics/TextureEffects.cpp
@ -32,11 +32,9 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #define W  word ptr
 #define B  byte ptr

-#if (defined USE_PORTABLE_C)
-#define ASMOPT 0
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
 #define ASMOPT 1
-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
 #define ASMOPT 1
 #else
 #define ASMOPT 0
@ -1285,7 +1283,6 @@ static void RenderWater(void)
  { // SUB-SAMPLING
    SLONG slHeightMapStep, slHeightRowStep;

-#if ASMOPT == 1
 #if (defined __MSVC_INLINE__)
    __asm {
      push    ebx
@ -1357,7 +1354,7 @@ pixLoop:
      pop     ebx
    }

-  #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
    // rcg12152001 needed extra registers. :(
    _slHeightMapStep_renderWater = slHeightMapStep;
    _pixBaseWidth_renderWater = pixBaseWidth;
@ -1460,10 +1457,6 @@ pixLoop:
          "cc", "memory"
    );

-  #else
-    #error fill in for your platform.
-  #endif
-
 #else

    PIX pixPos, pixDU, pixDV;
@ -1626,7 +1619,7 @@ pixLoop2:
      pop     ebx
    }

-  #elif (defined __GNU_INLINE__)
+  #elif (defined __GNU_INLINE_X86_32__)
    __asm__ __volatile__ (
      "bsfl      %[pixBaseWidth], %%eax             \n\t"
      "movl      $32, %%edx                         \n\t"
@ -2146,7 +2139,7 @@ pixLoop4:
      pop     ebx
    }

-  #elif (defined __GNU_INLINE__)
+  #elif (defined __GNU_INLINE_X86_32__)
    __asm__ __volatile__ (
      "bsfl      %[pixBaseWidth], %%eax             \n\t"
      "movl      $32, %%edx                         \n\t"
@ -2976,7 +2969,7 @@ pixDone:
    pop     ebx
  }

- #elif (defined __GNU_INLINE__)
+ #elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "movl    %[slColumnModulo], %%edx             \n\t"
    "movl    %[slBufferMask], %%ecx               \n\t"
@ -3119,7 +3112,7 @@ pixLoopF:
    jnz     rowLoopF
    pop     ebx
  }
- #elif (defined __GNU_INLINE__)
+ #elif (defined __GNU_INLINE_X86_32__)
  _pubHeat_RenderPlasmaFire = pubHeat;  // ran out of registers.  :/
  __asm__ __volatile__ (
    "movl    %[slHeatRowStep], %%eax     \n\t"
--- a/Sources/Engine/Light/LayerMixer.cpp
+++ b/Sources/Engine/Light/LayerMixer.cpp
@ -40,16 +40,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #define W  word ptr
 #define B  byte ptr

-#if (defined USE_PORTABLE_C)
-  #define ASMOPT 0
-#elif (defined __MSVC_INLINE__)
-  #define ASMOPT 1
-#elif (defined __GNU_INLINE__)
-  #define ASMOPT 1
-#else
-  #define ASMOPT 0
-#endif
-
 extern INDEX shd_bFineQuality;
 extern INDEX shd_iFiltering;
 extern INDEX shd_iDithering;
@ -290,7 +280,6 @@ void CLayerMixer::AddAmbientPoint(void)
  _slLightMax<<=7;
  _slLightStep>>=1;

-#if (ASMOPT == 1)
 #if (defined __MSVC_INLINE__)
  __asm {
    // prepare interpolants
@ -364,7 +353,7 @@ skipPixel:
    emms
  }

- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp1, tmp2;
  __asm__ __volatile__ (
    // prepare interpolants
@ -439,10 +428,6 @@ skipPixel:
        : FPU_REGS, MMX_REGS, "eax", "ecx", "edi", "cc", "memory"
  );

- #else
-  #error Write inline asm for your platform.
- #endif
-
 #else

    // !!! FIXME WARNING: I have not checked this code, and it could be
@ -496,7 +481,6 @@ void CLayerMixer::AddAmbientMaskPoint( UBYTE *pubMask, UBYTE ubMask)
  _slLightStep>>=1;


-#if (ASMOPT == 1)
 #if (defined __MSVC_INLINE__)
  __asm {
    // prepare interpolants
@ -576,7 +560,7 @@ skipPixel:
    emms
  }

- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp1, tmp2;
  __asm__ __volatile__ (
    // prepare interpolants
@ -660,10 +644,6 @@ skipPixel:
          "cc", "memory"
  );

- #else
-  #error Please write inline assembly for your platform.
- #endif
-
 #else   // Portable C version...

  UBYTE* pubLayer = (UBYTE*)_pulLayer;
@ -723,7 +703,6 @@ void CLayerMixer::AddDiffusionPoint(void)
  _slLightMax<<=7;
  _slLightStep>>=1;

-#if ASMOPT == 1
 #if (defined __MSVC_INLINE__)
  __asm {
    // prepare interpolants
@ -796,7 +775,7 @@ skipPixel:
    emms
  }

- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp1, tmp2;
  __asm__ __volatile__ (
    // prepare interpolants
@ -871,10 +850,6 @@ skipPixel:
        : FPU_REGS, MMX_REGS, "eax", "ecx", "edi", "cc", "memory"
  );

- #else
-  #error Write inline assembly for your platform.
- #endif
-
 #else
  // for each pixel in the shadow map
  UBYTE* pubLayer = (UBYTE*)_pulLayer;
@ -929,7 +904,6 @@ void CLayerMixer::AddDiffusionMaskPoint( UBYTE *pubMask, UBYTE ubMask)
  _slLightMax<<=7;
  _slLightStep>>=1;

-#if (ASMOPT == 1)
 #if (defined __MSVC_INLINE__)
  __asm {
    // prepare interpolants
@ -1008,7 +982,7 @@ skipPixel:
    emms
  }

- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp1, tmp2;
  __asm__ __volatile__ (
    // prepare interpolants
@ -1091,11 +1065,6 @@ skipPixel:
          "cc", "memory"
  );

- #else
-  #error Write inline ASM for your platform.
-
- #endif
-
 #else

  // for each pixel in the shadow map
@ -1201,7 +1170,6 @@ BOOL CLayerMixer::PrepareOneLayerPoint( CBrushShadowLayer *pbsl, BOOL bNoMask)
  FLOAT fDL2oDV     = fDDL2oDV + 2*(lm_vStepV%v00);
  //_v00 = v00;

-#if ((ASMOPT == 1) && (!defined __GNU_INLINE__))
 #if (defined __MSVC_INLINE__)
  __asm {
    fld     D [fDDL2oDU]
@ -1230,12 +1198,6 @@ BOOL CLayerMixer::PrepareOneLayerPoint( CBrushShadowLayer *pbsl, BOOL bNoMask)
    fistp   D [_slDDL2oDV]
    fistp   D [_slDDL2oDU]
  }
- #elif (defined __GNU_INLINE__)
-    STUBBED("inline asm.");
- #else
-   #error Please write inline assembly for your platform.
- #endif
-
 #else
  fDDL2oDU     *= 2;
  fDDL2oDV     *= 2;
@ -1321,7 +1283,6 @@ void CLayerMixer::AddOneLayerGradient( CGradientParameters &gp)
  _pulLayer  = lm_pulShadowMap;
  FLOAT fStart = Clamp( fGr00-(fDGroDJ+fDGroDI)*0.5f, 0.0f, 1.0f);

-#if ((ASMOPT == 1) && (!defined __GNU_INLINE__))
 #if (defined __MSVC_INLINE__)
  __int64 mmRowAdv;
  SLONG fixGRow  = (fGr00-(fDGroDJ+fDGroDI)*0.5f)*32767.0f; // 16:15
@ -1436,14 +1397,6 @@ rowNext:
 rowDone:
    emms
  }
- #elif (defined __GNU_INLINE__)
-
-    STUBBED("WRITE ME. Argh.");
-
- #else
-  #error Need inline assembly for your platform.
- #endif
-
 #else
  // well, make gradient ...
  SLONG slR0=0,slG0=0,slB0=0;
@ -1528,9 +1481,8 @@ rowDone:
 // apply directional light or ambient to layer
 void CLayerMixer::AddDirectional(void)
 {
-#if ASMOPT == 1
-  ULONG ulLight = ByteSwap( lm_colLight);
 #if (defined __MSVC_INLINE__)
+  ULONG ulLight = ByteSwap( lm_colLight);
  __asm {
    // prepare pointers and variables
    mov     edi,D [_pulLayer]
@ -1565,7 +1517,8 @@ rowNext:
    emms
  }

- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
+  ULONG ulLight = ByteSwap( lm_colLight);
  ULONG tmp;
  __asm__ __volatile__ (
    // prepare pointers and variables
@ -1608,10 +1561,6 @@ rowNext:
        : FPU_REGS, "mm5", "mm6", "ecx", "edi", "cc", "memory"
  );

- #else
-   #error Write inline assembly for your platform.
- #endif
-
 #else
  UBYTE* pubLayer = (UBYTE*)_pulLayer;
  // for each pixel in the shadow map
@ -1631,9 +1580,8 @@ rowNext:
 // apply directional light thru mask to layer
 void CLayerMixer::AddMaskDirectional( UBYTE *pubMask, UBYTE ubMask)
 {
-#if ASMOPT == 1
-  ULONG ulLight = ByteSwap( lm_colLight);
 #if (defined __MSVC_INLINE__)
+  ULONG ulLight = ByteSwap( lm_colLight);
  // prepare some local variables
  __asm {
    // prepare pointers and variables
@ -1665,7 +1613,8 @@ skipLight:
    emms
  }

- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
+  ULONG ulLight = ByteSwap( lm_colLight);
  ULONG tmp;
  __asm__ __volatile__ (
    // prepare pointers and variables
@ -1706,10 +1655,6 @@ skipLight:
          "cc", "memory"
  );

- #else
-  #error Please write inline assembly for your platform.
- #endif
-
 #else
  UBYTE* pubLayer = (UBYTE*)_pulLayer;
  // for each pixel in the shadow map
@ -1832,7 +1777,33 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap)
    }
  } // set initial color

- #if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
+  __asm {
+    cld
+    mov     ebx,D [this]
+    mov     ecx,D [ebx].lm_pixCanvasSizeU
+    imul    ecx,D [ebx].lm_pixCanvasSizeV
+    mov     edi,D [ebx].lm_pulShadowMap
+    mov     eax,D [colAmbient]
+    bswap   eax
+    rep     stosd
+  }
+
+#elif (defined __GNU_INLINE_X86_32__)
+  ULONG clob1, clob2, clob3;
+  __asm__ __volatile__ (
+    "cld                    \n\t"
+    "imull   %%esi, %%ecx   \n\t"
+    "bswapl  %%eax          \n\t"
+    "rep                    \n\t"
+    "stosl                  \n\t"
+        : "=a" (clob1), "=c" (clob2), "=D" (clob3)
+        : "c" (this->lm_pixCanvasSizeU), "S" (this->lm_pixCanvasSizeV),
+          "a" (colAmbient), "D" (this->lm_pulShadowMap)
+        : "cc", "memory"
+  );
+
+#else
  register ULONG count = this->lm_pixCanvasSizeU * this->lm_pixCanvasSizeV;
  #if PLATFORM_LITTLEENDIAN
  // Forces C fallback; BYTESWAP itself is a no-op on little endian.
@ -1850,34 +1821,6 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap)
    ptr++;
  }

- #elif (defined __MSVC_INLINE__)
-  __asm {
-    cld
-    mov     ebx,D [this]
-    mov     ecx,D [ebx].lm_pixCanvasSizeU
-    imul    ecx,D [ebx].lm_pixCanvasSizeV
-    mov     edi,D [ebx].lm_pulShadowMap
-    mov     eax,D [colAmbient]
-    bswap   eax
-    rep     stosd
-  }
-
- #elif (defined __GNU_INLINE__)
-  ULONG clob1, clob2, clob3;
-  __asm__ __volatile__ (
-    "cld                    \n\t"
-    "imull   %%esi, %%ecx   \n\t"
-    "bswapl  %%eax          \n\t"
-    "rep                    \n\t"
-    "stosl                  \n\t"
-        : "=a" (clob1), "=c" (clob2), "=D" (clob3)
-        : "c" (this->lm_pixCanvasSizeU), "S" (this->lm_pixCanvasSizeV),
-          "a" (colAmbient), "D" (this->lm_pulShadowMap)
-        : "cc", "memory"
-  );
-
- #else
-  #error Please write inline assembly for your platform.
 #endif

  _pfWorldEditingProfile.StopTimer(CWorldEditingProfile::PTI_AMBIENTFILL);
@ -1955,9 +1898,7 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap)
 // copy from static shadow map to dynamic layer
 __forceinline void CLayerMixer::CopyShadowLayer(void)
 {
- #if (defined USE_PORTABLE_C)
-   memcpy(lm_pulShadowMap, lm_pulStaticShadowMap, lm_pixCanvasSizeU*lm_pixCanvasSizeV*4);
- #elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    cld
    mov     ebx,D [this]
@ -1967,7 +1908,7 @@ __forceinline void CLayerMixer::CopyShadowLayer(void)
    mov     edi,D [ebx].lm_pulShadowMap
    rep     movsd
  }
- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG clob1, clob2, clob3;
  __asm__ __volatile__ (
    "cld                    \n\t"
@ -1981,7 +1922,7 @@ __forceinline void CLayerMixer::CopyShadowLayer(void)
  );

 #else
-  #error Please write inline assembly for your platform.
+  memcpy(lm_pulShadowMap, lm_pulStaticShadowMap, lm_pixCanvasSizeU*lm_pixCanvasSizeV*4);
 #endif
 }

@ -1989,12 +1930,7 @@ __forceinline void CLayerMixer::CopyShadowLayer(void)
 // copy from static shadow map to dynamic layer
 __forceinline void CLayerMixer::FillShadowLayer( COLOR col)
 {
- #if (defined USE_PORTABLE_C)
-   DWORD* dst = (DWORD*)lm_pulShadowMap;
-   int n = lm_pixCanvasSizeU*lm_pixCanvasSizeV;   
-   DWORD color = __builtin_bswap32(col);
-   while(n--) {*(dst++)=color;}
- #elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    cld
    mov     ebx,D [this]
@ -2006,7 +1942,7 @@ __forceinline void CLayerMixer::FillShadowLayer( COLOR col)
    rep     stosd
  }

- #elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG clob1, clob2, clob3;
  __asm__ __volatile__ (
    "cld                    \n\t"
@ -2021,7 +1957,10 @@ __forceinline void CLayerMixer::FillShadowLayer( COLOR col)
  );

 #else
-  #error Please write inline assembly for your platform.
+   DWORD* dst = (DWORD*)lm_pulShadowMap;
+   int n = lm_pixCanvasSizeU*lm_pixCanvasSizeV;   
+   DWORD color = __builtin_bswap32(col);
+   while(n--) {*(dst++)=color;}
 #endif
 }

--- a/Sources/Engine/Math/Float.cpp
+++ b/Sources/Engine/Math/Float.cpp
@ -24,20 +24,11 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #define _PC_64    0x0300

 // !!! FIXME: I'd like to remove any dependency on the FPU control word from the game, asap.  --ryan.
-#ifdef USE_PORTABLE_C
-// Fake control87 for USE_PORTABLE_C version
-inline ULONG _control87(WORD newcw, WORD mask)
-{
-    static WORD fpw=_PC_64;
-    if (mask != 0)
-    {
-        fpw &= ~mask;
-        fpw |= (newcw & mask);
-    }
-    return(fpw);
-}
+#if (defined _MSC_VER)

-#elif (defined __GNU_INLINE__)
+// _control87 is provided by the compiler
+
+#elif (defined __GNU_INLINE_X86_32__)

 inline ULONG _control87(WORD newcw, WORD mask)
 {
@ -74,8 +65,20 @@ inline ULONG _control87(WORD newcw, WORD mask)
    return(fpw);
 }

-#elif (!defined _MSC_VER)
-#error Implement for your platform, or add a stub conditional here.
+#else
+
+// Fake control87 for USE_PORTABLE_C version
+inline ULONG _control87(WORD newcw, WORD mask)
+{
+    static WORD fpw=_PC_64;
+    if (mask != 0)
+    {
+        fpw &= ~mask;
+        fpw |= (newcw & mask);
+    }
+    return(fpw);
+}
+
 #endif

 /* Get current precision setting of FPU. */
--- a/Sources/Engine/Math/Functions.h
+++ b/Sources/Engine/Math/Functions.h
@ -312,12 +312,7 @@ inline FLOAT NormByteToFloat( const ULONG ul)
 // fast float to int conversion
 inline SLONG FloatToInt( FLOAT f)
 {
-#if defined(__arm__) || defined(USE_PORTABLE_C)
-  // round to nearest by adding/subtracting 0.5 (depending on f pos/neg) before converting to SLONG
-  float addToRound = copysignf(0.5f, f); // copy f's signbit to 0.5 => if f<0 then addToRound = -0.5, else 0.5
-  return((SLONG) (f + addToRound));
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  SLONG slRet;
  __asm {
    fld    D [f]
@ -325,7 +320,7 @@ inline SLONG FloatToInt( FLOAT f)
  }
  return slRet;

-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  SLONG slRet;
  __asm__ __volatile__ (
    "flds     (%%eax)   \n\t"
@ -336,16 +331,16 @@ inline SLONG FloatToInt( FLOAT f)
  );
  return(slRet);
 #else
-  #error Fill this in for your platform.
+  // round to nearest by adding/subtracting 0.5 (depending on f pos/neg) before converting to SLONG
+  float addToRound = copysignf(0.5f, f); // copy f's signbit to 0.5 => if f<0 then addToRound = -0.5, else 0.5
+  return((SLONG) (f + addToRound));
+
 #endif
 }

 // log base 2 of any float numero
 inline FLOAT Log2( FLOAT f) {
-#if (defined USE_PORTABLE_C) || defined(__arm__)
-  return log2f(f);
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  FLOAT fRet;
  _asm {
    fld1
@ -355,7 +350,7 @@ inline FLOAT Log2( FLOAT f) {
  }
  return fRet;

-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  FLOAT fRet;
  __asm__ __volatile__ (
    "fld1               \n\t"
@ -368,7 +363,8 @@ inline FLOAT Log2( FLOAT f) {
  );
  return(fRet);
 #else
-  #error Fill this in for your platform.
+  return log2f(f);
+
 #endif
 }

@ -376,8 +372,24 @@ inline FLOAT Log2( FLOAT f) {
 // returns accurate values only for integers that are power of 2
 inline SLONG FastLog2( SLONG x)
 {
-#if (defined USE_PORTABLE_C)
-#ifdef __GNUC__
+#if (defined __MSVC_INLINE__)
+  SLONG slRet;
+  __asm {
+    bsr   eax,D [x]
+    mov   D [slRet],eax
+  }
+  return slRet;
+
+#elif (defined __GNU_INLINE_X86_32__)
+  SLONG slRet;
+  __asm__ __volatile__ (
+    "bsrl   %%ecx, %%eax      \n\t"
+        : "=a" (slRet)
+        : "c" (x)
+        : "memory"
+  );
+  return(slRet);
+#elif (defined __GNUC__)
  if(x == 0) return 0; // __builtin_clz() is undefined for 0
  int numLeadingZeros  = __builtin_clz(x);
  return 31 - numLeadingZeros;
@ -393,38 +405,13 @@ inline SLONG FastLog2( SLONG x)

  return 0;
 #endif
-
-#elif (defined __MSVC_INLINE__)
-  SLONG slRet;
-  __asm {
-    bsr   eax,D [x]
-    mov   D [slRet],eax
-  }
-  return slRet;
-
-#elif (defined __GNU_INLINE__)
-  SLONG slRet;
-  __asm__ __volatile__ (
-    "bsrl   %%ecx, %%eax      \n\t"
-        : "=a" (slRet)
-        : "c" (x)
-        : "memory"
-  );
-  return(slRet);
-#else
-  #error Fill this in for your platform.
-#endif
 }

 /* DG: function is unused => doesn't matter that portable implementation is not optimal :)
 // returns log2 of first larger value that is a power of 2
 inline SLONG FastMaxLog2( SLONG x)
 { 
-#if (defined USE_PORTABLE_C)
-printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__);
-  return((SLONG) log2((double) x));
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  SLONG slRet;
  __asm {
    bsr   eax,D [x]
@ -435,7 +422,7 @@ printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__);
  }
  return slRet;

-#elif (defined __GNU_INLINE__)
+#elif (defined __GNU_INLINE_X86_32__)
  SLONG slRet;
  __asm__ __volatile__ (
    "bsrl  %%ecx, %%eax     \n\t"
@ -448,7 +435,9 @@ printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__);
  );
  return(slRet);
 #else
-  #error Fill this in for your platform.
+printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__);
+  return((SLONG) log2((double) x));
+
 #endif
 }
 */
--- a/Sources/Engine/Models/RenderModel_View.cpp
+++ b/Sources/Engine/Models/RenderModel_View.cpp
@ -40,14 +40,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #define W  word ptr
 #define B  byte ptr

-#if (defined __MSVC_INLINE__)
-#define ASMOPT 1
-#elif (defined __GNU_INLINE__)
-#define ASMOPT 0  // !!! FIXME: rcg10112001 Write GCC inline asm versions...
-#else
-#define ASMOPT 0
-#endif
-

 extern BOOL CVA_bModels;
 extern BOOL GFX_bTruform;
@ -663,7 +655,7 @@ static FLOAT   _fHazeAdd;
 // check vertex against fog
 static void GetFogMapInVertex( GFXVertex3 &vtx, GFXTexCoord &tex)
 {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
  __asm {
    mov     esi,D [vtx]
    mov     edi,D [tex]
@ -708,7 +700,7 @@ static void GetFogMapInVertex( GFXVertex3 &vtx, GFXTexCoord &tex)
 // check vertex against haze
 static void GetHazeMapInVertex( GFXVertex3 &vtx, FLOAT &tx1)
 {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
  __asm {
    mov     esi,D [vtx]
    mov     edi,D [tx1]
@ -1080,7 +1072,7 @@ static void UnpackFrame( CRenderModel &rm, BOOL bKeepNormals)
    const ModelFrameVertex16 *pFrame1 = rm.rm_pFrame16_1;
    if( pFrame0==pFrame1)
    {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
      // for each vertex in mip
      const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
      SLONG slTmp1, slTmp2, slTmp3;
@ -1196,7 +1188,7 @@ vtxNext16:
    // if lerping
    else
    {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
      // for each vertex in mip
      const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
      SLONG slTmp1, slTmp2, slTmp3;
@ -1365,7 +1357,7 @@ vtxNext16L:
    // if no lerping
    if( pFrame0==pFrame1)
    {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
      // for each vertex in mip
      const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
      SLONG slTmp1, slTmp2, slTmp3;
@ -1464,7 +1456,7 @@ vtxNext8:
    // if lerping
    else
    {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
      const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
      SLONG slTmp1, slTmp2, slTmp3;
      // re-adjust stretching factors because of fixint lerping (divide by 256)
@ -1610,7 +1602,7 @@ vtxNext8L:
  }

  // generate colors from shades
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
  __asm {
    pxor    mm0,mm0
    // construct 64-bit RGBA light
@ -1974,7 +1966,7 @@ void CModelObject::RenderModel_View( CRenderModel &rm)
    pvtxSrfBase = &_avtxSrfBase[iSrfVx0];
    INDEX iSrfVx;

-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
    __asm {
      push    ebx
      mov     ebx,D [puwSrfToMip]
@ -2074,7 +2066,7 @@ srfVtxLoop:
    const COLOR colD = AdjustColor( ms.ms_colDiffuse, _slTexHueShift, _slTexSaturation);
    colSrfDiff.MultiplyRGBA( colD, colMdlDiff);

-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
    // setup texcoord array
    __asm {
      push    ebx
@ -2134,7 +2126,7 @@ vtxEnd:
      for( INDEX iSrfVx=0; iSrfVx<ctSrfVx; iSrfVx++) pcolSrfBase[iSrfVx] = colSrfDiffAdj;
    }
    else {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
      // setup color array
      const COLOR colS = colSrfDiff.ul.abgr;
      __asm {
@ -2335,7 +2327,7 @@ diffColLoop:
    // cache rotation
    const FLOATmatrix3D &m = rm.rm_mObjectRotation;

-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
    __asm {
      push    ebx
      mov     ebx,D [m]
@ -2530,7 +2522,7 @@ reflMipLoop:
    // cache object view rotation
    const FLOATmatrix3D &m = rm.rm_mObjectToView;

-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
    __asm {
      push    ebx
      mov     ebx,D [m]
--- a/Sources/Engine/Rendering/RendMisc.cpp
+++ b/Sources/Engine/Rendering/RendMisc.cpp
@ -105,10 +105,7 @@ static SLONG slTmp;

 static inline PIX PIXCoord(FLOAT f) // (f+0.9999f) or (ceil(f))
 {
- #if (defined USE_PORTABLE_C)
-  return((PIX) (f+0.9999f));
-
- #elif (defined __MSVC_INLINE__)
+ #if (defined __MSVC_INLINE__)
  PIX pixRet;
  __asm {
    fld     dword ptr [f]
@ -123,7 +120,7 @@ static inline PIX PIXCoord(FLOAT f) // (f+0.9999f) or (ceil(f))
  }
  return pixRet;

- #elif (defined __GNU_INLINE__)
+ #elif (defined __GNU_INLINE_X86_32__)
  PIX pixRet;
  SLONG clobber;
  __asm__ __volatile__ (
@ -142,7 +139,8 @@ static inline PIX PIXCoord(FLOAT f) // (f+0.9999f) or (ceil(f))
  return pixRet;

 #else
-  #error Please write inline ASM for your platform.
+  return((PIX) (f+0.9999f));
+
 #endif
 }

--- a/Sources/Engine/Sound/SoundMixer.cpp
+++ b/Sources/Engine/Sound/SoundMixer.cpp
@ -43,17 +43,15 @@ static CSoundData *psd;

 // nasm on MacOS X is getting wrong addresses of external globals, so I have
 //  to define them in the .asm file...lame.
-#ifdef __GNU_INLINE__
-#ifdef USE_PORTABLE_C
-#define INASM 
-#else
+#if (defined __GNU_INLINE_X86_32__) && (defined USE_I386_NASM_ASM)
 #define INASM extern
-#endif
-#else
+#elif (defined __MSVC_INLINE__)
 #define INASM static
 static __int64 mmInvFactor   = 0x00007FFF00007FFF;
 static FLOAT f65536 = 65536.0f;
 static FLOAT f4G    = 4294967296.0f;
+#else
+#define INASM static
 #endif

 INASM SLONG slMixerBufferSize;        // size in samples per channel of the destination buffers
@ -81,11 +79,7 @@ void ResetMixer( const SLONG *pslBuffer, const SLONG slBufferSize)
  slMixerBufferSampleRate = _pSound->sl_SwfeFormat.nSamplesPerSec;

  // wipe destination mixer buffer
-  // (Mac OS X uses this path because Apple's memset() is customized for each CPU they support and way faster than this inline asm. --ryan.)
-  #if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX))
-  memset(pvMixerBuffer, 0, slMixerBufferSize * 8);
-
-  #elif (defined __MSVC_INLINE__)
+  #if (defined __MSVC_INLINE__)
  __asm {
    cld
    xor     eax,eax
@ -94,19 +88,8 @@ void ResetMixer( const SLONG *pslBuffer, const SLONG slBufferSize)
    shl     ecx,1 // *2 because of 32-bit src format
    rep     stosd
  }
-  #elif (defined __GNU_INLINE__)
-  // !!! FIXME : rcg12172001 Is this REALLY any faster than memset()?
-  ULONG clob1, clob2;
-  __asm__ __volatile__ (
-    "cld                  \n\t"
-    "rep                  \n\t"
-    "stosl                \n\t"
-        : "=D" (clob1), "=c" (clob2)
-        : "a" (0), "D" (pvMixerBuffer), "c" (slMixerBufferSize*2)
-        : "cc", "memory"
-  );
  #else
-    #error please write inline asm for your platform.
+  memset(pvMixerBuffer, 0, slMixerBufferSize * 8);
  #endif
 }

@ -118,10 +101,7 @@ void CopyMixerBuffer_stereo( const SLONG slSrcOffset, void *pDstBuffer, const SL
  ASSERT( slBytes%4==0);
  if( slBytes<4) return;

-  #if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX))
-  // (Mac OS X uses this path because Apple's memset() is customized for each CPU they support and way faster than this inline asm. --ryan.)
-  memcpy(pDstBuffer, ((const char *)pvMixerBuffer) + slSrcOffset, slBytes);
-  #elif (defined __MSVC_INLINE__)
+  #if (defined __MSVC_INLINE__)
  __asm {
    cld
    mov     esi,D [slSrcOffset]
@ -131,21 +111,8 @@ void CopyMixerBuffer_stereo( const SLONG slSrcOffset, void *pDstBuffer, const SL
    shr     ecx,2   // bytes to samples per channel
    rep     movsd
  }
-  #elif (defined __GNU_INLINE__)
-  // !!! FIXME : rcg12172001 Is this REALLY any faster than memcpy()?
-  ULONG clob1, clob2, clob3;
-  __asm__ __volatile__ (
-    "cld                 \n\t"
-    "rep                 \n\t"
-    "movsl               \n\t"
-      : "=S" (clob1), "=D" (clob2), "=c" (clob3)
-      : "S" (((char *)pvMixerBuffer) + slSrcOffset),
-        "D" (pDstBuffer),
-        "c" (slBytes >> 2)
-      : "cc", "memory"
-  );
  #else
-  #error please write inline asm for your platform.
+  memcpy(pDstBuffer, ((const char *)pvMixerBuffer) + slSrcOffset, slBytes);
  #endif
 }

@ -157,18 +124,7 @@ void CopyMixerBuffer_mono( const SLONG slSrcOffset, void *pDstBuffer, const SLON
  ASSERT( slBytes%2==0);
  if( slBytes<4) return;

-  #if (defined USE_PORTABLE_C)
-  // (This is untested, currently. --ryan.)
-  WORD *dest = (WORD *) pDstBuffer;
-  WORD *src = (WORD *) ( ((char *) pvMixerBuffer) + slSrcOffset );
-  SLONG max = slBytes / 4;
-  for (SLONG i = 0; i < max; i++) {
-      *dest = *src;
-      dest++;    // move 16 bits.
-      src+=2;    // move 32 bits.
-  }
-
-  #elif (defined __MSVC_INLINE__)
+  #if (defined __MSVC_INLINE__)
  __asm {
    mov     esi,D [slSrcOffset]
    add     esi,D [pvMixerBuffer]
@ -184,7 +140,7 @@ copyLoop:
    jnz     copyLoop
  }

-  #elif (defined __GNU_INLINE__)
+  #elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "movl     %[pvMixerBuffer], %%esi         \n\t"
    "movl     %[pDstBuffer], %%edi            \n\t"
@ -204,7 +160,15 @@ copyLoop:
  );

  #else
-  #error please write inline asm for your platform.
+  // (This is untested, currently. --ryan.)
+  WORD *dest = (WORD *) pDstBuffer;
+  WORD *src = (WORD *) ( ((char *) pvMixerBuffer) + slSrcOffset );
+  SLONG max = slBytes / 4;
+  for (SLONG i = 0; i < max; i++) {
+      *dest = *src;
+      dest++;    // move 16 bits.
+      src+=2;    // move 32 bits.
+  }
  #endif
 }

@ -215,24 +179,7 @@ static void ConvertMixerBuffer( const SLONG slBytes)
  ASSERT( slBytes%4==0);
  if( slBytes<4) return;

-  #if (defined USE_PORTABLE_C)
-  //STUBBED("ConvertMixerBuffer");
-  SWORD *dest = (SWORD *) pvMixerBuffer;
-  SLONG *src = (SLONG *) pvMixerBuffer;
-  SLONG max = slBytes / 2;
-  int tmp;
-  for (SLONG i = 0; i < max; i++) {
-      tmp = *src;
-      if (tmp>32767) tmp=32767;
-      if (tmp<-32767) tmp=-32767;
-      *dest=tmp;
-      dest++;    // move 16 bits.
-      src++;     // move 32 bits.
-  }
-
-
-
-  #elif (defined __MSVC_INLINE__)
+  #if (defined __MSVC_INLINE__)
  __asm {
    cld
    mov     esi,D [pvMixerBuffer]
@ -250,7 +197,7 @@ copyLoop:
    emms
  }

-  #elif (defined __GNU_INLINE__)
+  #elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "movl     %[pvMixerBuffer], %%esi      \n\t"
    "movl     %[pvMixerBuffer], %%edi      \n\t"
@ -271,7 +218,20 @@ copyLoop:
  );

  #else
-  #error please write inline asm for your platform.
+
+  SWORD *dest = (SWORD *) pvMixerBuffer;
+  SLONG *src = (SLONG *) pvMixerBuffer;
+  SLONG max = slBytes / 2;
+  int tmp;
+  for (SLONG i = 0; i < max; i++) {
+      tmp = *src;
+      if (tmp>32767) tmp=32767;
+      if (tmp<-32767) tmp=-32767;
+      *dest=tmp;
+      dest++;    // move 16 bits.
+      src++;     // move 32 bits.
+  }
+
  #endif
 }

@ -323,7 +283,7 @@ void NormalizeMixerBuffer( const FLOAT fNormStrength, const SLONG slBytes, FLOAT
 }
 

-#ifdef __GNU_INLINE__
+#if (defined __GNU_INLINE_X86_32__) && (defined USE_I386_NASM_ASM)
 // These are implemented in an external NASM file.
 extern "C" {
    void MixStereo_asm(CSoundObject *pso);
@ -337,85 +297,7 @@ inline void MixMono( CSoundObject *pso)
 {
  _pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER);

- #if (defined USE_PORTABLE_C)
-  // initialize some local vars
-  SLONG slLeftSample, slRightSample, slNextSample;
-  SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer;
-  fixLeftOfs   = (__int64)(fLeftOfs   * 65536.0);
-  fixRightOfs  = (__int64)(fRightOfs  * 65536.0);
-  __int64 fixLeftStep  = (__int64)(fLeftStep  * 65536.0);
-  __int64 fixRightStep = (__int64)(fRightStep * 65536.0);
-  __int64 fixSoundBufferSize = ((__int64)slSoundBufferSize)<<16;
-  mmSurroundFactor = (__int64)(SWORD)mmSurroundFactor;
-
-  SLONG slLeftVolume_ = slLeftVolume >> 16;
-  SLONG slRightVolume_ = slRightVolume >> 16;
-
-  // loop thru source buffer
-  INDEX iCt = slMixerBufferSize;
-  FOREVER
-  {
-    // if left channel source sample came to end of sample buffer
-    if( fixLeftOfs >= fixSoundBufferSize) {
-      fixLeftOfs -= fixSoundBufferSize;
-      // if has no loop, end it
-      bEndOfSound = bNotLoop;
-    }
-    // if right channel source sample came to end of sample buffer
-    if( fixRightOfs >= fixSoundBufferSize) {
-      fixRightOfs -= fixSoundBufferSize;
-      // if has no loop, end it
-      bEndOfSound = bNotLoop;
-    }
-    // end of buffer?
-    if( iCt<=0 || bEndOfSound) break;
-
-    // fetch one lineary interpolated sample on left channel
-    slLeftSample = pswSrcBuffer[(fixLeftOfs>>16)+0];
-    slNextSample = pswSrcBuffer[(fixLeftOfs>>16)+1];
-    slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16;
-    // fetch one lineary interpolated sample on right channel
-    slRightSample = pswSrcBuffer[(fixRightOfs>>16)+0];
-    slNextSample  = pswSrcBuffer[(fixRightOfs>>16)+1];
-    slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16;
-
-    // filter samples
-    slLastLeftSample  += ((slLeftSample -slLastLeftSample) *slLeftFilter) >>15;
-    slLastRightSample += ((slRightSample-slLastRightSample)*slRightFilter)>>15;
-
-    // apply stereo volume to current sample
-    slLeftSample  = (slLastLeftSample  * slLeftVolume_) >>15;
-    slRightSample = (slLastRightSample * slRightVolume_)>>15;
-
-    slLeftSample  ^= (SLONG)((mmSurroundFactor>> 0)&0xFFFFFFFF);
-    slRightSample ^= (SLONG)((mmSurroundFactor>>32)&0xFFFFFFFF);
-
-    // mix in current sample
-    slLeftSample  += pslDstBuffer[0];
-    slRightSample += pslDstBuffer[1];
-    // upper clamp
-    if( slLeftSample  > MAX_SWORD) slLeftSample  = MAX_SWORD;
-    if( slRightSample > MAX_SWORD) slRightSample = MAX_SWORD;
-    // lower clamp
-    if( slLeftSample  < MIN_SWORD) slLeftSample  = MIN_SWORD;
-    if( slRightSample < MIN_SWORD) slRightSample = MIN_SWORD;
-
-    // store samples (both channels)
-    pslDstBuffer[0] = slLeftSample;
-    pslDstBuffer[1] = slRightSample;
-
-    // modify volume  `
-    slLeftVolume  += (SWORD)((mmVolumeGain>> 0)&0xFFFF);
-    slRightVolume += (SWORD)((mmVolumeGain>>16)&0xFFFF);
-
-    // advance to next sample
-    fixLeftOfs   += fixLeftStep;
-    fixRightOfs  += fixRightStep;
-    pslDstBuffer += 2;
-    iCt--;
-  }
-
- #elif (defined __MSVC_INLINE__)
+ #if (defined __MSVC_INLINE__)
  __asm {
    // convert from floats to fixints 32:16
    fld     D [fLeftOfs]
@ -548,24 +430,11 @@ loopEnd:
    emms
  }

- #elif (defined __GNU_INLINE__)
+ #elif (defined __GNU_INLINE_X86_32__) && (defined USE_I386_NASM_ASM)
   // This is implemented in an external NASM file.
   MixMono_asm(pso);

 #else
-   #error please write inline asm for your platform.
- #endif
-
-  _pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER);
-}
-
-
-// mixes one stereo 16-bit signed sound to destination buffer
-inline void MixStereo( CSoundObject *pso)
-{
-  _pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER);
-
- #if (defined USE_PORTABLE_C)
  // initialize some local vars
  SLONG slLeftSample, slRightSample, slNextSample;
  SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer;
@ -599,12 +468,12 @@ inline void MixStereo( CSoundObject *pso)
    if( iCt<=0 || bEndOfSound) break;

    // fetch one lineary interpolated sample on left channel
-    slLeftSample = pswSrcBuffer[(fixLeftOfs>>15)+0];
-    slNextSample = pswSrcBuffer[(fixLeftOfs>>15)+2];
+    slLeftSample = pswSrcBuffer[(fixLeftOfs>>16)+0];
+    slNextSample = pswSrcBuffer[(fixLeftOfs>>16)+1];
    slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16;
    // fetch one lineary interpolated sample on right channel
-    slRightSample = pswSrcBuffer[(fixRightOfs>>15)+0];
-    slNextSample  = pswSrcBuffer[(fixRightOfs>>15)+2];
+    slRightSample = pswSrcBuffer[(fixRightOfs>>16)+0];
+    slNextSample  = pswSrcBuffer[(fixRightOfs>>16)+1];
    slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16;

    // filter samples
@ -643,7 +512,18 @@ inline void MixStereo( CSoundObject *pso)
    iCt--;
  }

- #elif (defined __MSVC_INLINE__)
+ #endif
+
+  _pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER);
+}
+
+
+// mixes one stereo 16-bit signed sound to destination buffer
+inline void MixStereo( CSoundObject *pso)
+{
+  _pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER);
+
+ #if (defined __MSVC_INLINE__)
  __asm {
    // convert from floats to fixints 32:16
    fld     D [fLeftOfs]
@ -778,12 +658,88 @@ loopEnd:
    emms
  }

- #elif (defined __GNU_INLINE__)
+ #elif (defined __GNU_INLINE_X86_32__) && (defined USE_I386_NASM_ASM)
   // This is implemented in an external NASM file.
   MixStereo_asm(pso);

 #else
-   #error please write inline asm for your platform.
+  // initialize some local vars
+  SLONG slLeftSample, slRightSample, slNextSample;
+  SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer;
+  fixLeftOfs   = (__int64)(fLeftOfs   * 65536.0);
+  fixRightOfs  = (__int64)(fRightOfs  * 65536.0);
+  __int64 fixLeftStep  = (__int64)(fLeftStep  * 65536.0);
+  __int64 fixRightStep = (__int64)(fRightStep * 65536.0);
+  __int64 fixSoundBufferSize = ((__int64)slSoundBufferSize)<<16;
+  mmSurroundFactor = (__int64)(SWORD)mmSurroundFactor;
+
+  SLONG slLeftVolume_ = slLeftVolume >> 16;
+  SLONG slRightVolume_ = slRightVolume >> 16;
+
+  // loop thru source buffer
+  INDEX iCt = slMixerBufferSize;
+  FOREVER
+  {
+    // if left channel source sample came to end of sample buffer
+    if( fixLeftOfs >= fixSoundBufferSize) {
+      fixLeftOfs -= fixSoundBufferSize;
+      // if has no loop, end it
+      bEndOfSound = bNotLoop;
+    }
+    // if right channel source sample came to end of sample buffer
+    if( fixRightOfs >= fixSoundBufferSize) {
+      fixRightOfs -= fixSoundBufferSize;
+      // if has no loop, end it
+      bEndOfSound = bNotLoop;
+    }
+    // end of buffer?
+    if( iCt<=0 || bEndOfSound) break;
+
+    // fetch one lineary interpolated sample on left channel
+    slLeftSample = pswSrcBuffer[(fixLeftOfs>>15)+0];
+    slNextSample = pswSrcBuffer[(fixLeftOfs>>15)+2];
+    slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16;
+    // fetch one lineary interpolated sample on right channel
+    slRightSample = pswSrcBuffer[(fixRightOfs>>15)+0];
+    slNextSample  = pswSrcBuffer[(fixRightOfs>>15)+2];
+    slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16;
+
+    // filter samples
+    slLastLeftSample  += ((slLeftSample -slLastLeftSample) *slLeftFilter) >>15;
+    slLastRightSample += ((slRightSample-slLastRightSample)*slRightFilter)>>15;
+
+    // apply stereo volume to current sample
+    slLeftSample  = (slLastLeftSample  * slLeftVolume_) >>15;
+    slRightSample = (slLastRightSample * slRightVolume_)>>15;
+
+    slLeftSample  ^= (SLONG)((mmSurroundFactor>> 0)&0xFFFFFFFF);
+    slRightSample ^= (SLONG)((mmSurroundFactor>>32)&0xFFFFFFFF);
+
+    // mix in current sample
+    slLeftSample  += pslDstBuffer[0];
+    slRightSample += pslDstBuffer[1];
+    // upper clamp
+    if( slLeftSample  > MAX_SWORD) slLeftSample  = MAX_SWORD;
+    if( slRightSample > MAX_SWORD) slRightSample = MAX_SWORD;
+    // lower clamp
+    if( slLeftSample  < MIN_SWORD) slLeftSample  = MIN_SWORD;
+    if( slRightSample < MIN_SWORD) slRightSample = MIN_SWORD;
+
+    // store samples (both channels)
+    pslDstBuffer[0] = slLeftSample;
+    pslDstBuffer[1] = slRightSample;
+
+    // modify volume  `
+    slLeftVolume  += (SWORD)((mmVolumeGain>> 0)&0xFFFF);
+    slRightVolume += (SWORD)((mmVolumeGain>>16)&0xFFFF);
+
+    // advance to next sample
+    fixLeftOfs   += fixLeftStep;
+    fixRightOfs  += fixRightStep;
+    pslDstBuffer += 2;
+    iCt--;
+  }
+
 #endif

  _pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER);
--- a/Sources/build-linux32.sh
+++ b/Sources/build-linux32.sh
@ -14,10 +14,10 @@ cd $_
 #ninja

 # This is the eventual path for amd64.
-#cmake -DCMAKE_BUILD_TYPE=Debug -DUSE_I386_ASM=FALSE ..
+#cmake -DCMAKE_BUILD_TYPE=Debug ..

 # Right now we force x86, though...
-cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS=-m32 -DCMAKE_CXX_FLAGS=-m32 -DUSE_I386_ASM=TRUE ..
+cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS=-m32 -DCMAKE_CXX_FLAGS=-m32 -DUSE_I386_NASM_ASM=TRUE ..

 make -j$NCPU

--- a/Sources/build-linux64.sh
+++ b/Sources/build-linux64.sh
@ -14,7 +14,7 @@ cd $_
 #ninja

 # This is the eventual path for amd64.
-cmake -DCMAKE_BUILD_TYPE=Debug -DUSE_I386_ASM=FALSE ..
+cmake -DCMAKE_BUILD_TYPE=Debug ..

 # Right now we force x86, though...
 #cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_C_FLAGS=-m32 -DCMAKE_CXX_FLAGS=-m32 ..
--- a/Sources/build-mac.sh
+++ b/Sources/build-mac.sh
@ -9,6 +9,6 @@ set -x
 rm -rf cmake-build
 mkdir $_
 cd $_
-cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_OSX_ARCHITECTURES=i386 -DUSE_I386_ASM=TRUE -DUSE_SYSTEM_SDL2=FALSE ..
+cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_OSX_ARCHITECTURES=i386 -DUSE_I386_NASM_ASM=TRUE -DUSE_SYSTEM_SDL2=FALSE ..
 make -j$NCPU

--- a/Sources/build-mac64.sh
+++ b/Sources/build-mac64.sh
@ -9,6 +9,6 @@ set -x
 rm -rf cmake-build
 mkdir $_
 cd $_
-cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_OSX_ARCHITECTURES=x86_64 -DUSE_I386_ASM=FALSE ..
+cmake -DCMAKE_BUILD_TYPE=Debug -DCMAKE_OSX_ARCHITECTURES=x86_64 ..
 make -j$NCPU