rework asm to always fall back to portable C code

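With this change, other (non-x86) platforms no longer need a separate switch (USE_PORTABLE_C) to disable the x86 asm; the portable C code is used automatically whenever no inline-asm variant applies.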
2025-09-18 10:20:28 +02:00 · 2016-04-24 20:16:04 +03:00 · 2016-04-24 20:16:04 +03:00 · 1f70d4e242
commit 1f70d4e242
parent 78b26698ac
19 changed files with 790 additions and 1018 deletions
--- a/Sources/Engine/Base/Base.h
+++ b/Sources/Engine/Base/Base.h
@ -65,8 +65,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #else
  #warning "UNKNOWN PLATFORM IDENTIFIED!!!!"
  #define PLATFORM_UNKNOWN 1
-  #warning "USING PORTABLE C!!!"
-  #define USE_PORTABLE_C
 #endif

 #if PLATFORM_LINUX || PLATFORM_MACOSX
--- a/Sources/Engine/Base/Profiling.cpp
+++ b/Sources/Engine/Base/Profiling.cpp
@ -21,24 +21,13 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 template class CStaticArray<CProfileCounter>;
 template class CStaticArray<CProfileTimer>;

-#if (defined USE_PORTABLE_C)
+#if (defined PLATFORM_UNIX) && !defined(__GNU_INLINE_X86_32__)
 #include <sys/time.h>
 #endif

 static inline __int64 ReadTSC_profile(void)
 {
-#if (defined USE_PORTABLE_C)
-  #ifdef __arm__
-  struct timespec tv;
-  clock_gettime(CLOCK_MONOTONIC, &tv);
-  return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_nsec) / 1000000) );
-  #else
-  struct timeval tv;
-  gettimeofday(&tv, NULL);
-  return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_usec) / 1000) );
-  #endif
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __int64 mmRet;
  __asm {
    rdtsc
@ -60,7 +49,16 @@ static inline __int64 ReadTSC_profile(void)
  return(mmRet);

 #else
-  #error Please implement for your platform/compiler.
+  #ifdef __arm__
+  struct timespec tv;
+  clock_gettime(CLOCK_MONOTONIC, &tv);
+  return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_nsec) / 1000000) );
+  #else
+  struct timeval tv;
+  gettimeofday(&tv, NULL);
+  return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_usec) / 1000) );
+  #endif
+
 #endif
 }

--- a/Sources/Engine/Base/Timer.cpp
+++ b/Sources/Engine/Base/Timer.cpp
@ -29,7 +29,7 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #include <Engine/Base/Priority.inl>

 // !!! FIXME: use SDL timer code instead and rdtsc never?
-#if (USE_PORTABLE_C) 
+#if (defined PLATFORM_UNIX) && !defined(__GNU_INLINE_X86_32__)
 #define USE_GETTIMEOFDAY 1
 #endif

--- a/Sources/Engine/Base/Types.h
+++ b/Sources/Engine/Base/Types.h
@ -229,10 +229,7 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*));

    inline ULONG _rotl(ULONG ul, int bits)
    {
-        #if (defined USE_PORTABLE_C)
-            // DG: according to http://blog.regehr.org/archives/1063 this is fast
-            return (ul<<bits) | (ul>>(-bits&31));
-        #elif (defined __GNU_INLINE_X86_32__)
+        #if (defined __GNU_INLINE_X86_32__)
            // This, on the other hand, is wicked fast.  :)
            __asm__ __volatile__ (
                "roll %%cl, %%eax    \n\t"
@ -254,7 +251,8 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*));
            return(ul);

        #else
-            #error need inline asm for your platform.
+            // DG: according to http://blog.regehr.org/archives/1063 this is fast
+            return (ul<<bits) | (ul>>(-bits&31));
        #endif
    }

--- a/Sources/Engine/Engine.cpp
+++ b/Sources/Engine/Engine.cpp
@ -125,14 +125,10 @@ BOOL APIENTRY DllMain( HANDLE hModule, DWORD  ul_reason_for_call, LPVOID lpReser

 static void DetectCPU(void)
 {
-#if (defined USE_PORTABLE_C)  // rcg10072001
-  CPrintF(TRANSV("  (No CPU detection in this binary.)\n"));
-
-#else
-  char strVendor[12+1];
+  char strVendor[12+1] = { 0 };
  strVendor[12] = 0;
-  ULONG ulTFMS;
-  ULONG ulFeatures;
+  ULONG ulTFMS = 0;
+  ULONG ulFeatures = 0;

  #if (defined __MSVC_INLINE__)
  // test MMX presence and update flag
@ -181,10 +177,13 @@ static void DetectCPU(void)
            : "eax", "ecx", "edx", "memory"
    );

-  #else
-    #error Please implement for your platform or define USE_PORTABLE_C.
  #endif

+  if (ulTFMS == 0) {
+    CPrintF(TRANSV("  (No CPU detection in this binary.)\n"));
+    return;
+  }
+
  INDEX iType     = (ulTFMS>>12)&0x3;
  INDEX iFamily   = (ulTFMS>> 8)&0xF;
  INDEX iModel    = (ulTFMS>> 4)&0xF;
@ -215,8 +214,6 @@ static void DetectCPU(void)
  sys_iCPUMHz = INDEX(_pTimer->tm_llCPUSpeedHZ/1E6);

  if( !bMMX) FatalError( TRANS("MMX support required but not present!"));
-
-#endif  // defined USE_PORTABLE_C
 }

 static void DetectCPUWrapper(void)
--- a/Sources/Engine/Graphics/Color.cpp
+++ b/Sources/Engine/Graphics/Color.cpp
@ -247,30 +247,7 @@ COLOR MulColors( COLOR col1, COLOR col2)
  if( col2==0xFFFFFFFF)   return col1;
  if( col1==0 || col2==0) return 0;

-#if (defined USE_PORTABLE_C)
-  // !!! FIXME: This...is not fast.
-  union
-  {
-    COLOR col;
-    UBYTE bytes[4];
-  } conv1;
-
-  union
-  {
-    COLOR col;
-    UBYTE bytes[4];
-  } conv2;
-
-  conv1.col = col1;
-  conv2.col = col2;
-  conv1.bytes[0] = (UBYTE) ((((DWORD) conv1.bytes[0]) * ((DWORD) conv2.bytes[0])) / 255);
-  conv1.bytes[1] = (UBYTE) ((((DWORD) conv1.bytes[1]) * ((DWORD) conv2.bytes[1])) / 255);
-  conv1.bytes[2] = (UBYTE) ((((DWORD) conv1.bytes[2]) * ((DWORD) conv2.bytes[2])) / 255);
-  conv1.bytes[3] = (UBYTE) ((((DWORD) conv1.bytes[3]) * ((DWORD) conv2.bytes[3])) / 255);
-
-  return(conv1.col);
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  COLOR colRet;
  __asm {
    xor     ebx,ebx
@ -433,20 +410,6 @@ COLOR MulColors( COLOR col1, COLOR col2)

  return colRet;
 #else
-  #error please fill in inline assembly for your platform.
-#endif
-}
-
-
-// fast color additon function - RES = clamp (1ST + 2ND)
-COLOR AddColors( COLOR col1, COLOR col2) 
-{
-  if( col1==0) return col2;
-  if( col2==0) return col1;
-  if( col1==0xFFFFFFFF || col2==0xFFFFFFFF) return 0xFFFFFFFF;
-  COLOR colRet;
-
-#if (defined USE_PORTABLE_C)
  // !!! FIXME: This...is not fast.
  union
  {
@ -459,19 +422,28 @@ COLOR AddColors( COLOR col1, COLOR col2)
    COLOR col;
    UBYTE bytes[4];
  } conv2;
-  #define MINVAL(a, b) ((a)>(b))?(b):(a)

  conv1.col = col1;
  conv2.col = col2;
-  conv1.bytes[0] = (UBYTE) MINVAL((((WORD) conv1.bytes[0]) + ((WORD) conv2.bytes[0])) , 255);
-  conv1.bytes[1] = (UBYTE) MINVAL((((WORD) conv1.bytes[1]) + ((WORD) conv2.bytes[1])) , 255);
-  conv1.bytes[2] = (UBYTE) MINVAL((((WORD) conv1.bytes[2]) + ((WORD) conv2.bytes[2])) , 255);
-  conv1.bytes[3] = (UBYTE) MINVAL((((WORD) conv1.bytes[3]) + ((WORD) conv2.bytes[3])) , 255);
-  #undef MINVAL
+  conv1.bytes[0] = (UBYTE) ((((DWORD) conv1.bytes[0]) * ((DWORD) conv2.bytes[0])) / 255);
+  conv1.bytes[1] = (UBYTE) ((((DWORD) conv1.bytes[1]) * ((DWORD) conv2.bytes[1])) / 255);
+  conv1.bytes[2] = (UBYTE) ((((DWORD) conv1.bytes[2]) * ((DWORD) conv2.bytes[2])) / 255);
+  conv1.bytes[3] = (UBYTE) ((((DWORD) conv1.bytes[3]) * ((DWORD) conv2.bytes[3])) / 255);

-  colRet = conv1.col;
+  return(conv1.col);
+#endif
+}

-#elif (defined __MSVC_INLINE__)
+
+// fast color additon function - RES = clamp (1ST + 2ND)
+COLOR AddColors( COLOR col1, COLOR col2) 
+{
+  if( col1==0) return col2;
+  if( col2==0) return col1;
+  if( col1==0xFFFFFFFF || col2==0xFFFFFFFF) return 0xFFFFFFFF;
+  COLOR colRet;
+
+#if (defined __MSVC_INLINE__)
  __asm {
    xor     ebx,ebx
    mov     esi,255
@ -608,7 +580,29 @@ COLOR AddColors( COLOR col1, COLOR col2)
  );

 #else
-  #error please fill in inline assembly for your platform.
+  // !!! FIXME: This...is not fast.
+  union
+  {
+    COLOR col;
+    UBYTE bytes[4];
+  } conv1;
+
+  union
+  {
+    COLOR col;
+    UBYTE bytes[4];
+  } conv2;
+  #define MINVAL(a, b) ((a)>(b))?(b):(a)
+
+  conv1.col = col1;
+  conv2.col = col2;
+  conv1.bytes[0] = (UBYTE) MINVAL((((WORD) conv1.bytes[0]) + ((WORD) conv2.bytes[0])) , 255);
+  conv1.bytes[1] = (UBYTE) MINVAL((((WORD) conv1.bytes[1]) + ((WORD) conv2.bytes[1])) , 255);
+  conv1.bytes[2] = (UBYTE) MINVAL((((WORD) conv1.bytes[2]) + ((WORD) conv2.bytes[2])) , 255);
+  conv1.bytes[3] = (UBYTE) MINVAL((((WORD) conv1.bytes[3]) + ((WORD) conv2.bytes[3])) , 255);
+  #undef MINVAL
+
+  colRet = conv1.col;
 #endif

  return colRet;
@ -619,14 +613,7 @@ COLOR AddColors( COLOR col1, COLOR col2)
 // multiple conversion from OpenGL color to DirectX color
 extern void abgr2argb( ULONG *pulSrc, ULONG *pulDst, INDEX ct)
 {
-#if (defined USE_PORTABLE_C)
-  //#error write me.
-  for (int i=0; i<ct; i++) {
-    ULONG tmp = pulSrc[i];
-    pulDst[i] = (tmp&0xff00ff00) | ((tmp&0x00ff0000)>>16) | ((tmp&0x000000ff)<<16);
-  }
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    mov   esi,dword ptr [pulSrc]
    mov   edi,dword ptr [pulDst]
@ -678,12 +665,12 @@ colSkip2:
    mov   dword ptr [edi],eax
 colSkip1:
  }
-
-#elif (defined __GNU_INLINE_X86_32__)
-  STUBBED("convert to inline asm.");
-
 #else
-  #error please fill in inline assembly for your platform.
+  for (int i=0; i<ct; i++) {
+    ULONG tmp = pulSrc[i];
+    pulDst[i] = (tmp&0xff00ff00) | ((tmp&0x00ff0000)>>16) | ((tmp&0x000000ff)<<16);
+  }
+
 #endif
 }

--- a/Sources/Engine/Graphics/Color.h
+++ b/Sources/Engine/Graphics/Color.h
@ -204,19 +204,7 @@ ENGINE_API extern COLOR AddColors( COLOR col1, COLOR col2); // fast color addito
 __forceinline ULONG ByteSwap( ULONG ul)
 {
 /* rcg10052001 Platform-wrappers. */
-#if (defined USE_PORTABLE_C)
-	ul = ( ((ul << 24)            ) |
-           ((ul << 8) & 0x00FF0000) |
-           ((ul >> 8) & 0x0000FF00) |
-           ((ul >> 24)            ) );
-
-    #if (defined PLATFORM_BIGENDIAN)
-    BYTESWAP(ul);  // !!! FIXME: May not be right!
-    #endif
-
-    return(ul);
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  ULONG ulRet;
  __asm {
    mov   eax,dword ptr [ul]
@ -234,16 +222,22 @@ __forceinline ULONG ByteSwap( ULONG ul)
  return(ul);

 #else
-  #error please define for your platform.
+  ul = ( ((ul << 24)            ) |
+         ((ul << 8) & 0x00FF0000) |
+         ((ul >> 8) & 0x0000FF00) |
+         ((ul >> 24)            ) );
+
+  #if (defined PLATFORM_BIGENDIAN)
+  BYTESWAP(ul);  // !!! FIXME: May not be right!
+  #endif
+
+  return(ul);
 #endif
 }

 __forceinline ULONG rgba2argb( ULONG ul)
 {
-#if (defined USE_PORTABLE_C)
-	return( (ul << 24) | (ul >> 8) );
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  ULONG ulRet;
  __asm {
    mov   eax,dword ptr [ul]
@ -263,21 +257,14 @@ __forceinline ULONG rgba2argb( ULONG ul)
  return ulRet;

 #else
-  #error please define for your platform.
+  return (ul << 24) | (ul >> 8);
+
 #endif
 }

 __forceinline ULONG abgr2argb( COLOR col)
 {
-#if (defined USE_PORTABLE_C)
-	// this could be simplified, this is just a safe conversion from asm code
-	col = ( ((col << 24)            ) |
-            ((col << 8) & 0x00FF0000) |
-            ((col >> 8) & 0x0000FF00) |
-            ((col >> 24)            ) );
-	return( (col << 24) | (col >> 8) );
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  ULONG ulRet;
  __asm {
    mov   eax,dword ptr [col]
@ -299,7 +286,13 @@ __forceinline ULONG abgr2argb( COLOR col)
  return ulRet;

 #else
-  #error please define for your platform.
+  // this could be simplified, this is just a safe conversion from asm code
+  col = ( ((col << 24)            ) |
+          ((col << 8) & 0x00FF0000) |
+          ((col >> 8) & 0x0000FF00) |
+          ((col >> 24)            ) );
+  return( (col << 24) | (col >> 8) );
+
 #endif
 }

@ -311,10 +304,7 @@ extern void abgr2argb( ULONG *pulSrc, ULONG *pulDst, INDEX ct);
 // fast memory copy of ULONGs
 inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs)
 {
-#if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX))
-  memcpy( pulDst, pulSrc, ctLongs*4);
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    cld
    mov   esi,dword ptr [pulSrc]
@ -322,23 +312,8 @@ inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs)
    mov   ecx,dword ptr [ctLongs]
    rep   movsd
  }
-
-#elif (defined __GNU_INLINE_X86_32__)
-    // I haven't benchmarked it, but in many cases, memcpy() becomes an
-    //  inline (asm?) macro on GNU platforms, so this might not be a
-    //  speed gain at all over the USE_PORTABLE_C version.
-    // You Have Been Warned. --ryan.
-  __asm__ __volatile__ (
-    "cld    \n\t"
-    "rep    \n\t"
-    "movsd  \n\t"
-        : "=S" (pulSrc), "=D" (pulDst), "=c" (ctLongs)
-        : "S" (pulSrc), "D" (pulDst), "c" (ctLongs)
-        : "cc", "memory"
-  );
-
 #else
-# error Please fill this in for your platform.
+  memcpy( pulDst, pulSrc, ctLongs*4);
 #endif
 }

@ -346,11 +321,7 @@ inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs)
 // fast memory set of ULONGs
 inline void StoreLongs( ULONG ulVal, ULONG *pulDst, INDEX ctLongs)
 {
-#if (defined USE_PORTABLE_C)
-  for( INDEX i=0; i<ctLongs; i++)
-    pulDst[i] = ulVal;
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    cld
    mov   eax,dword ptr [ulVal]
@ -370,7 +341,9 @@ inline void StoreLongs( ULONG ulVal, ULONG *pulDst, INDEX ctLongs)
  );

 #else
-# error Please fill this in for your platform.
+  for( INDEX i=0; i<ctLongs; i++)
+    pulDst[i] = ulVal;
+
 #endif
 }

--- a/Sources/Engine/Graphics/DrawPort_RenderScene.cpp
+++ b/Sources/Engine/Graphics/DrawPort_RenderScene.cpp
@ -38,16 +38,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #define W  word ptr
 #define B  byte ptr

-#if (defined USE_PORTABLE_C)
-#define ASMOPT 0
-#elif (defined __MSVC_INLINE__)
-#define ASMOPT 1
-#elif (defined __GNU_INLINE_X86_32__)
-#define ASMOPT 1
-#else
-#define ASMOPT 0
-#endif
-
 #define MAXTEXUNITS   4
 #define SHADOWTEXTURE 3

@ -153,8 +143,7 @@ void AddElements( ScenePolygon *pspo)
  const INDEX ctElems = pspo->spo_ctElements;
  INDEX *piDst = _aiElements.Push(ctElems);

-#if (ASMOPT == 1)
- #if (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    mov     eax,D [pspo]
    mov     ecx,D [ctElems]
@ -184,7 +173,7 @@ elemRest:
    mov     D [edi],eax
 elemDone:
  }
- #elif (defined __GNU_INLINE_X86_32__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "movl    %[ctElems], %%ecx      \n\t"
    "movl    %[piDst], %%edi        \n\t"
@ -219,11 +208,6 @@ elemDone:
          "cc", "memory"
  );

- #else
-   #error Please write inline ASM for your platform.
-
- #endif
-
 #else
  const INDEX iVtx0Pass = pspo->spo_iVtx0Pass;
  const INDEX *piSrc = pspo->spo_piElements;
@ -495,9 +479,7 @@ static void RSBinToGroups( ScenePolygon *pspoFirst)
  // determine maximum used groups
  ASSERT( _ctGroupsCount);

-#if ASMOPT == 1
-
- #if (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    mov     eax,2
    bsr     ecx,D [_ctGroupsCount]
@ -505,7 +487,7 @@ static void RSBinToGroups( ScenePolygon *pspoFirst)
    mov     D [_ctGroupsCount],eax
  }

- #elif (defined __GNU_INLINE_X86_32__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "movl     $2, %%eax          \n\t"
    "bsrl     (%%esi), %%ecx     \n\t"
@ -516,11 +498,6 @@ static void RSBinToGroups( ScenePolygon *pspoFirst)
        : "eax", "ecx", "cc", "memory"
  );

- #else
-   #error Please write inline ASM for your platform.
-
- #endif
-
 #else
  // emulate x86's bsr opcode...not fast.  :/
  register DWORD val = _ctGroupsCount;
@ -858,10 +835,7 @@ static void RSSetTextureCoords( ScenePolygon *pspoGroup, INDEX iLayer, INDEX iUn
      continue;
    }

-// !!! FIXME: rcg11232001 This inline conversion is broken. Use the
-// !!! FIXME: rcg11232001  C version for now with GCC.
-#if ((ASMOPT == 1) && (!defined __GNU_INLINE_X86_32__) && (!defined __INTEL_COMPILER))
-  #if (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
    __asm {
      mov     esi,D [pspo]
      mov     edi,D [iMappingOffset]
@ -915,7 +889,7 @@ vtxLoop:
 /*
    // !!! FIXME: rcg11232001 This inline conversion is broken. Use the
    // !!! FIXME: rcg11232001  C version for now on Linux.
- #elif (defined __GNU_INLINE_X86_32__)
+#elif (defined __GNU_INLINE_X86_32__)
    STUBBED("debug this");
    __asm__ __volatile__ (
      "0:                                  \n\t" // vtxLoop
@ -956,11 +930,6 @@ vtxLoop:
    );
 */

- #else
-   #error Please write inline ASM for your platform.
-
- #endif
-
 #else

    // diffuse mapping
--- a/Sources/Engine/Graphics/Fog.cpp
+++ b/Sources/Engine/Graphics/Fog.cpp
@ -67,18 +67,7 @@ ULONG PrepareTexture( UBYTE *pubTexture, PIX pixSizeI, PIX pixSizeJ)
  // need to upload from RGBA format
  const PIX pixTextureSize = pixSizeI*pixSizeJ;

- #if (defined USE_PORTABLE_C)
-   const UBYTE* src = pubTexture;
-   DWORD* dst = (DWORD*)(pubTexture+pixTextureSize);
-   for (int i=0; i<pixTextureSize; i++) {
-    const DWORD tmp = ((DWORD)*src) | 0xFFFFFF00;
-    *dst = ((tmp << 24) & 0xff000000 ) | ((tmp <<  8) & 0x00ff0000 ) |
-      ((tmp >>  8) & 0x0000ff00 ) | ((tmp >> 24) & 0x000000ff );
-    src++;
-    dst++;
-   }
-
- #elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    mov     esi,D [pubTexture]
    mov     edi,D [pubTexture]
@ -95,7 +84,7 @@ pixLoop:
    jnz     pixLoop
  }

- #elif (defined __GNU_INLINE_X86_32__)
+#elif (defined __GNU_INLINE_X86_32__)
  __asm__ __volatile__ (
    "movl    %[pubTexture], %%esi      \n\t"
    "movl    %[pixTextureSize], %%ecx  \n\t"
@ -115,10 +104,18 @@ pixLoop:
        : "eax", "ecx", "esi", "edi", "cc", "memory"
  );

- #else
-   #error Write inline ASM for your platform.
+#else
+   const UBYTE* src = pubTexture;
+   DWORD* dst = (DWORD*)(pubTexture+pixTextureSize);
+   for (int i=0; i<pixTextureSize; i++) {
+    const DWORD tmp = ((DWORD)*src) | 0xFFFFFF00;
+    *dst = ((tmp << 24) & 0xff000000 ) | ((tmp <<  8) & 0x00ff0000 ) |
+      ((tmp >>  8) & 0x0000ff00 ) | ((tmp >> 24) & 0x000000ff );
+    src++;
+    dst++;
+   }

- #endif
+#endif

  // determine internal format
  extern INDEX gap_bAllowGrayTextures;
--- a/Sources/Engine/Graphics/Gfx_OpenGL_Textures.cpp
+++ b/Sources/Engine/Graphics/Gfx_OpenGL_Textures.cpp
@ -169,32 +169,7 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV,
      if( pixSizeV==0) pixSizeV=1;
      pixSize = pixSizeU*pixSizeV;

-      #if (defined USE_PORTABLE_C)
-      // Basically average every other pixel...
-      UWORD w = 0;
-      UBYTE *dptr = (UBYTE *) pulDst;
-      UBYTE *sptr = (UBYTE *) pulSrc;
-      #if 0
-      pixSize *= 4;
-      for (PIX i = 0; i < pixSize; i++)
-      {
-        *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[1])) >> 1 );
-        dptr++;
-        sptr += 2;
-      }
-      #else
-      for (PIX i = 0; i < pixSize; i++)
-      {
-        for (PIX j = 0; j < 4; j++)
-        {
-          *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[4])) >> 1 );
-          dptr++;
-          sptr++;
-        }
-        sptr += 4;
-      }
-      #endif
-      #elif (defined __MSVC_INLINE__)
+      #if (defined __MSVC_INLINE__)
      __asm {   
        pxor    mm0,mm0
        mov     esi,D [pulSrc]
@ -244,7 +219,30 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV,
      );

      #else
-      #error Please write inline ASM for your platform.
+      // Basically average every other pixel...
+      UWORD w = 0;
+      UBYTE *dptr = (UBYTE *) pulDst;
+      UBYTE *sptr = (UBYTE *) pulSrc;
+      #if 0
+      pixSize *= 4;
+      for (PIX i = 0; i < pixSize; i++)
+      {
+        *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[1])) >> 1 );
+        dptr++;
+        sptr += 2;
+      }
+      #else
+      for (PIX i = 0; i < pixSize; i++)
+      {
+        for (PIX j = 0; j < 4; j++)
+        {
+          *dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[4])) >> 1 );
+          dptr++;
+          sptr++;
+        }
+        sptr += 4;
+      }
+      #endif
      #endif

      // upload mipmap
--- a/Sources/Engine/Graphics/Graphics.cpp
+++ b/Sources/Engine/Graphics/Graphics.cpp
@ -209,58 +209,7 @@ static void MakeOneMipmap( ULONG *pulSrcMipmap, ULONG *pulDstMipmap, PIX pixWidt
  if( bBilinear) // type of filtering?
  { // BILINEAR

-   #if (defined USE_PORTABLE_C)
-	UBYTE *src = (UBYTE *) pulSrcMipmap;
-	UBYTE *dest = (UBYTE *) pulDstMipmap;
-	for (int i = 0 ; i < pixHeight; i++)
-	{
-		for (int j = 0; j < pixWidth; j++)
-		{
-			// Grab pixels from image
-			UWORD upleft[4];
-			UWORD upright[4];
-			UWORD downleft[4];
-			UWORD downright[4];
-			upleft[0] = *(src + 0);
-			upleft[1] = *(src + 1);
-			upleft[2] = *(src + 2);
-			upleft[3] = *(src + 3);
-			upright[0] = *(src + 4);
-			upright[1] = *(src + 5);
-			upright[2] = *(src + 6);
-			upright[3] = *(src + 7);
-
-			downleft[0] = *(src + pixWidth*8 + 0);
-			downleft[1] = *(src + pixWidth*8 + 1);
-			downleft[2] = *(src + pixWidth*8 + 2);
-			downleft[3] = *(src + pixWidth*8 + 3);
-			downright[0] = *(src + pixWidth*8 + 4);
-			downright[1] = *(src + pixWidth*8 + 5);
-			downright[2] = *(src + pixWidth*8 + 6);
-			downright[3] = *(src + pixWidth*8 + 7);
-
-			UWORD answer[4];
-			answer[0] = upleft[0] + upright[0] + downleft[0] + downright[0] + 2;
-			answer[1] = upleft[1] + upright[1] + downleft[1] + downright[1] + 2;
-			answer[2] = upleft[2] + upright[2] + downleft[2] + downright[2] + 2;
-			answer[3] = upleft[3] + upright[3] + downleft[3] + downright[3] + 2;
-			answer[0] /= 4;
-			answer[1] /= 4;
-			answer[2] /= 4;
-			answer[3] /= 4;
-
-			*(dest + 0) = answer[0];
-			*(dest + 1) = answer[1];
-			*(dest + 2) = answer[2];
-			*(dest + 3) = answer[3];
-
-			src += 8;
-			dest += 4;
-		}
-		src += 8*pixWidth;
-    }
-
-   #elif (defined __MSVC_INLINE__)
+   #if (defined __MSVC_INLINE__)
    __asm {
      pxor    mm0,mm0
      mov     ebx,D [pixWidth]
@ -346,43 +295,63 @@ pixLoopN:
    );

   #else
-     #error Write inline asm for your platform.
+	UBYTE *src = (UBYTE *) pulSrcMipmap;
+	UBYTE *dest = (UBYTE *) pulDstMipmap;
+	for (int i = 0 ; i < pixHeight; i++)
+	{
+		for (int j = 0; j < pixWidth; j++)
+		{
+			// Grab pixels from image
+			UWORD upleft[4];
+			UWORD upright[4];
+			UWORD downleft[4];
+			UWORD downright[4];
+			upleft[0] = *(src + 0);
+			upleft[1] = *(src + 1);
+			upleft[2] = *(src + 2);
+			upleft[3] = *(src + 3);
+			upright[0] = *(src + 4);
+			upright[1] = *(src + 5);
+			upright[2] = *(src + 6);
+			upright[3] = *(src + 7);
+
+			downleft[0] = *(src + pixWidth*8 + 0);
+			downleft[1] = *(src + pixWidth*8 + 1);
+			downleft[2] = *(src + pixWidth*8 + 2);
+			downleft[3] = *(src + pixWidth*8 + 3);
+			downright[0] = *(src + pixWidth*8 + 4);
+			downright[1] = *(src + pixWidth*8 + 5);
+			downright[2] = *(src + pixWidth*8 + 6);
+			downright[3] = *(src + pixWidth*8 + 7);
+
+			UWORD answer[4];
+			answer[0] = upleft[0] + upright[0] + downleft[0] + downright[0] + 2;
+			answer[1] = upleft[1] + upright[1] + downleft[1] + downright[1] + 2;
+			answer[2] = upleft[2] + upright[2] + downleft[2] + downright[2] + 2;
+			answer[3] = upleft[3] + upright[3] + downleft[3] + downright[3] + 2;
+			answer[0] /= 4;
+			answer[1] /= 4;
+			answer[2] /= 4;
+			answer[3] /= 4;
+
+			*(dest + 0) = answer[0];
+			*(dest + 1) = answer[1];
+			*(dest + 2) = answer[2];
+			*(dest + 3) = answer[3];
+
+			src += 8;
+			dest += 4;
+		}
+		src += 8*pixWidth;
+    }
+
   #endif
    }
    else
    { // NEAREST-NEIGHBOUR but with border preserving
       ULONG ulRowModulo = pixWidth*2 *BYTES_PER_TEXEL;

-   #if (defined USE_PORTABLE_C)
-
-     PIX offset = 0;
-     ulRowModulo /= 4;
-
-     for (int q = 0; q < 2; q++)
-     {
-         for (PIX i = pixHeight / 2; i > 0; i--)
-         {
-             for (PIX j = pixWidth / 2; j > 0; j--)
-             {
-                 *pulDstMipmap = *(pulSrcMipmap + offset);
-                 pulSrcMipmap += 2;
-                 pulDstMipmap++;
-             }
-
-             for (PIX j = pixWidth / 2; j > 0; j--)
-             {
-                 *pulDstMipmap = *(pulSrcMipmap + offset + 1);
-                 pulSrcMipmap += 2;
-                 pulDstMipmap++;
-             }
-
-             pulSrcMipmap += ulRowModulo;
-        }
-
-        offset = pixWidth * 2;
-     }
-
-   #elif (defined __MSVC_INLINE__)
+   #if (defined __MSVC_INLINE__)
    __asm {
      xor     ebx,ebx
      mov     esi,D [pulSrcMipmap]
@ -493,7 +462,33 @@ fullEnd:
    );

   #else
-     #error Write inline asm for your platform.
+     PIX offset = 0;
+     ulRowModulo /= 4;
+
+     for (int q = 0; q < 2; q++)
+     {
+         for (PIX i = pixHeight / 2; i > 0; i--)
+         {
+             for (PIX j = pixWidth / 2; j > 0; j--)
+             {
+                 *pulDstMipmap = *(pulSrcMipmap + offset);
+                 pulSrcMipmap += 2;
+                 pulDstMipmap++;
+             }
+
+             for (PIX j = pixWidth / 2; j > 0; j--)
+             {
+                 *pulDstMipmap = *(pulSrcMipmap + offset + 1);
+                 pulSrcMipmap += 2;
+                 pulDstMipmap++;
+             }
+
+             pulSrcMipmap += ulRowModulo;
+        }
+
+        offset = pixWidth * 2;
+     }
+
   #endif
  }
 }
@ -649,7 +644,7 @@ __int64 mmShifter = 0;
 __int64 mmMask  = 0;
 ULONG *pulDitherTable;

-#ifdef USE_PORTABLE_C
+#if !(defined __MSVC_INLINE__) && !(defined __GNU_INLINE_X86_32__)
 extern const UBYTE *pubClipByte;
 // increment a byte without overflowing it
 static inline void IncrementByteWithClip( UBYTE &ub, SLONG slAdd)
@ -778,35 +773,7 @@ void DitherBitmap( INDEX iDitherType, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth
 // ------------------------------- ordered matrix dithering routine

 ditherOrder:
-#if (defined USE_PORTABLE_C)
-  union uConv
-  {
-    ULONG val;
-    DWORD dwords[2];
-    UWORD words[4];
-    WORD  iwords[4];
-    UBYTE bytes[8];
-  };
-  for (int i=0; i<pixHeight; i++) {
-    int idx = i&3;
-    uConv dith;
-    dith.val = pulDitherTable[idx];
-    for (int j=0; j<4; j++) { dith.words[j] >>= mmShifter; }
-    dith.val &= mmMask;
-    uConv* src = (uConv*)(pulSrc+i*pixWidth);
-    uConv* dst = (uConv*)(pulDst+i*pixWidth);
-    for (int j=0; j<pixWidth; j+=2) {
-      uConv p=src[0];
-      for (int k=0; k<8; k++) {
-        IncrementByteWithClip(p.bytes[k], dith.bytes[k]);
-      }
-      dst[0] = p;
-      src++;
-      dst++;
-    }
-  }
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    mov     esi,D [pulSrc]
    mov     edi,D [pulDst]
@ -912,7 +879,33 @@ nextRowO:
  );

 #else
-  #error Write inline asm for your platform.
+  union uConv
+  {
+    ULONG val;
+    DWORD dwords[2];
+    UWORD words[4];
+    WORD  iwords[4];
+    UBYTE bytes[8];
+  };
+  for (int i=0; i<pixHeight; i++) {
+    int idx = i&3;
+    uConv dith;
+    dith.val = pulDitherTable[idx];
+    for (int j=0; j<4; j++) { dith.words[j] >>= mmShifter; }
+    dith.val &= mmMask;
+    uConv* src = (uConv*)(pulSrc+i*pixWidth);
+    uConv* dst = (uConv*)(pulDst+i*pixWidth);
+    for (int j=0; j<pixWidth; j+=2) {
+      uConv p=src[0];
+      for (int k=0; k<8; k++) {
+        IncrementByteWithClip(p.bytes[k], dith.bytes[k]);
+      }
+      dst[0] = p;
+      src++;
+      dst++;
+    }
+  }
+
 #endif

  goto theEnd;
@ -924,34 +917,7 @@ ditherError:
  if( pulDst!=pulSrc) memcpy( pulDst, pulSrc, pixCanvasWidth*pixCanvasHeight *BYTES_PER_TEXEL);
  // slModulo+=4;
  // now, dither destination
-#if (defined USE_PORTABLE_C)
-  #if 1 //SEB doesn't works....
-  for (int i=0; i<pixHeight-1; i++) {
-    int step = (i&1)?-4:+4;
-    const UBYTE ubMask = (mmErrDiffMask&0xff);
-    UBYTE *src = ((UBYTE*)pulDst)+i*pixCanvasWidth*4;
-    if(i&1) src+=pixWidth*4;
-    // left to right or right to left
-    for (int j=0; j<pixWidth-1; j++) {
-      uConv p1, p3, p5, p7;
-      src+=step;
-      for (int k=0; k<4; k++) { p1.words[k] = src[k]&ubMask; }
-      //p1.val &= mmErrDiffMask;
-      for (int k=0; k<4; k++) { p3.words[k] = (p1.words[k]*3)>>4;
-                                p5.words[k] = (p1.words[k]*5)>>4;
-                                p7.words[k] = (p1.words[k]*7)>>4; }
-      for (int k=0; k<4; k++) { p1.words[k] -= (p3.words[k] + p5.words[k] + p7.words[k]);}
-      for (int k=0; k<4; k++) { 
-        IncrementByteWithClip( src[k + step]                 , p7.words[k]);
-        IncrementByteWithClip( src[pixCanvasWidth*4 -step +k], p5.words[k]);
-        IncrementByteWithClip( src[pixCanvasWidth*4 +0    +k], p3.words[k]);
-        IncrementByteWithClip( src[pixCanvasWidth*4 +step +k], p1.words[k]);
-      }
-    }
-  }
-  #endif
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    pxor    mm0,mm0
    mov     esi,D [pulDst]
@ -1157,7 +1123,32 @@ allDoneE:
  );

 #else
-  #error Write inline asm for your platform.
+  #if 1 //SEB doesn't works....
+  for (int i=0; i<pixHeight-1; i++) {
+    int step = (i&1)?-4:+4;
+    const UBYTE ubMask = (mmErrDiffMask&0xff);
+    UBYTE *src = ((UBYTE*)pulDst)+i*pixCanvasWidth*4;
+    if(i&1) src+=pixWidth*4;
+    // left to right or right to left
+    for (int j=0; j<pixWidth-1; j++) {
+      uConv p1, p3, p5, p7;
+      src+=step;
+      for (int k=0; k<4; k++) { p1.words[k] = src[k]&ubMask; }
+      //p1.val &= mmErrDiffMask;
+      for (int k=0; k<4; k++) { p3.words[k] = (p1.words[k]*3)>>4;
+                                p5.words[k] = (p1.words[k]*5)>>4;
+                                p7.words[k] = (p1.words[k]*7)>>4; }
+      for (int k=0; k<4; k++) { p1.words[k] -= (p3.words[k] + p5.words[k] + p7.words[k]);}
+      for (int k=0; k<4; k++) { 
+        IncrementByteWithClip( src[k + step]                 , p7.words[k]);
+        IncrementByteWithClip( src[pixCanvasWidth*4 -step +k], p5.words[k]);
+        IncrementByteWithClip( src[pixCanvasWidth*4 +0    +k], p3.words[k]);
+        IncrementByteWithClip( src[pixCanvasWidth*4 +step +k], p1.words[k]);
+      }
+    }
+  }
+  #endif
+
 #endif

  goto theEnd;
@ -1265,7 +1256,7 @@ extern "C" {
 }


-#ifdef USE_PORTABLE_C
+#if !(defined USE_MMX_INTRINSICS) && !(defined __MSVC_INLINE__) && !(defined __GNU_INLINE_X86_32__)
 typedef SWORD ExtPix[4];

 static inline void extpix_fromi64(ExtPix &pix, const __int64 i64)
@ -1632,265 +1623,6 @@ void FilterBitmap( INDEX iFilter, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth, PI
    _mm_empty();  // we're done, clear out the MMX registers!


-#elif (defined USE_PORTABLE_C)
-    slModulo1 /= BYTES_PER_TEXEL;  // C++ handles incrementing by sizeof type
-    slCanvasWidth /= BYTES_PER_TEXEL;  // C++ handles incrementing by sizeof type
-
-    ULONG *src = pulSrc;
-    ULONG *dst = pulDst;
-    ULONG *rowptr = aulRows;
-
-    ExtPix rmm1={0}, rmm2={0}, rmm3={0}, rmm4={0}, rmm5={0}, rmm6={0}, rmm7={0};
-    #define EXTPIXFROMINT64(x) ExtPix r##x; extpix_fromi64(r##x, x);
-    EXTPIXFROMINT64(mmCm);
-    EXTPIXFROMINT64(mmCe);
-    EXTPIXFROMINT64(mmCc);
-    EXTPIXFROMINT64(mmEch);
-    EXTPIXFROMINT64(mmEcl);
-    EXTPIXFROMINT64(mmEe);
-    EXTPIXFROMINT64(mmEm);
-    EXTPIXFROMINT64(mmMm);
-    EXTPIXFROMINT64(mmMe);
-    EXTPIXFROMINT64(mmMc);
-    EXTPIXFROMINT64(mmAdd);
-    EXTPIXFROMINT64(mmInvDiv);
-    #undef EXTPIXFROMINT64
-
-    // ----------------------- process upper left corner
-    extend_pixel(src[0], rmm1);
-    extend_pixel(src[1], rmm2);
-    extend_pixel(src[pixCanvasWidth], rmm3);
-    extend_pixel(src[pixCanvasWidth+1], rmm4);
-
-    extpix_add(rmm2, rmm3);
-    extpix_mul(rmm1, rmmCm);
-    extpix_mul(rmm2, rmmCe);
-    extpix_mul(rmm4, rmmCc);
-    extpix_add(rmm1, rmm2);
-    extpix_add(rmm1, rmm4);
-    extpix_adds(rmm1, rmmAdd);
-    extpix_mulhi(rmm1, rmmInvDiv);
-    *(rowptr++) = unextend_pixel(rmm1);
-    
-    src++;
-
-    // ----------------------- process upper edge pixels
-    for (PIX i = pixWidth - 2; i != 0; i--)
-    {
-        extend_pixel(src[-1], rmm1);
-        extend_pixel(src[0], rmm2);
-        extend_pixel(src[1], rmm3);
-        extend_pixel(src[pixCanvasWidth-1], rmm4);
-        extend_pixel(src[pixCanvasWidth], rmm5);
-        extend_pixel(src[pixCanvasWidth+1], rmm6);
-
-        extpix_add(rmm1, rmm3);
-        extpix_add(rmm4, rmm6);
-        extpix_mul(rmm1, rmmEch);
-        extpix_mul(rmm2, rmmEm);
-        extpix_mul(rmm4, rmmEcl);
-        extpix_mul(rmm5, rmmEe);
-        extpix_add(rmm1, rmm2);
-        extpix_add(rmm1, rmm4);
-        extpix_add(rmm1, rmm5);
-        extpix_adds(rmm1, rmmAdd);
-        extpix_mulhi(rmm1, rmmInvDiv);
-        *(rowptr++) = unextend_pixel(rmm1);
-        src++;
-    }
-
-    // ----------------------- process upper right corner
-
-    extend_pixel(src[-1], rmm1);
-    extend_pixel(src[0], rmm2);
-    extend_pixel(src[pixCanvasWidth-1], rmm3);
-    extend_pixel(src[pixCanvasWidth], rmm4);
-
-    extpix_add(rmm1, rmm4);
-    extpix_mul(rmm1, rmmCe);
-    extpix_mul(rmm2, rmmCm);
-    extpix_mul(rmm3, rmmCc);
-    extpix_add(rmm1, rmm2);
-    extpix_add(rmm1, rmm3);
-    extpix_adds(rmm1, rmmAdd);
-    extpix_mulhi(rmm1, rmmInvDiv);
-    *rowptr = unextend_pixel(rmm1);
-
-// ----------------------- process bitmap middle pixels
-
-    dst += slCanvasWidth;
-    src += slModulo1;
-
-    // for each row
-    for (size_t i = pixHeight-2; i != 0; i--)  // rowLoop
-    {
-        rowptr = aulRows;
-
-        // process left edge pixel
-        extend_pixel(src[-pixCanvasWidth], rmm1);
-        extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
-        extend_pixel(src[0], rmm3);
-        extend_pixel(src[1], rmm4);
-        extend_pixel(src[pixCanvasWidth], rmm5);
-        extend_pixel(src[pixCanvasWidth+1], rmm6);
-
-        extpix_add(rmm1, rmm5);
-        extpix_add(rmm2, rmm6);
-        extpix_mul(rmm1, rmmEch);
-        extpix_mul(rmm2, rmmEcl);
-        extpix_mul(rmm3, rmmEm);
-        extpix_mul(rmm4, rmmEe);
-        extpix_add(rmm1, rmm2);
-        extpix_add(rmm1, rmm3);
-        extpix_add(rmm1, rmm4);
-        extpix_adds(rmm1, rmmAdd);
-        extpix_mulhi(rmm1, rmmInvDiv);
-        dst[-pixCanvasWidth] = *rowptr;
-        *(rowptr++) = unextend_pixel(rmm1);
-        src++;
-        dst++;
-
-        // for each pixel in current row
-        for (size_t j = pixWidth-2; j != 0; j--)  // pixLoop
-        {
-            // prepare upper convolution row
-            extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
-            extend_pixel(src[-pixCanvasWidth], rmm2);
-            extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
-
-            // prepare middle convolution row
-            extend_pixel(src[-1], rmm4);
-            extend_pixel(src[0], rmm5);
-            extend_pixel(src[1], rmm6);
-
-            // free some registers
-            extpix_add(rmm1, rmm3);
-            extpix_add(rmm2, rmm4);
-            extpix_mul(rmm5, rmmMm);
-
-            // prepare lower convolution row
-            extend_pixel(src[pixCanvasWidth-1], rmm3);
-            extend_pixel(src[pixCanvasWidth], rmm4);
-            extend_pixel(src[pixCanvasWidth+1], rmm7);
-
-            // calc weightened value
-            extpix_add(rmm2, rmm6);
-            extpix_add(rmm1, rmm3);
-            extpix_add(rmm2, rmm4);
-            extpix_add(rmm1, rmm7);
-            extpix_mul(rmm2, rmmMe);
-            extpix_mul(rmm1, rmmMc);
-            extpix_add(rmm2, rmm5);
-            extpix_add(rmm1, rmm2);
-
-            // calc and store wightened value
-            extpix_adds(rmm1, rmmAdd);
-            extpix_mulhi(rmm1, rmmInvDiv);
-            dst[-pixCanvasWidth] = *rowptr;
-            *(rowptr++) = unextend_pixel(rmm1);
-
-            // advance to next pixel
-            src++;
-            dst++;
-        }
-
-        // process right edge pixel
-        extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
-        extend_pixel(src[-pixCanvasWidth], rmm2);
-        extend_pixel(src[-1], rmm3);
-        extend_pixel(src[0], rmm4);
-        extend_pixel(src[pixCanvasWidth-1], rmm5);
-        extend_pixel(src[pixCanvasWidth], rmm6);
-
-        extpix_add(rmm1, rmm5);
-        extpix_add(rmm2, rmm6);
-        extpix_mul(rmm1, rmmEcl);
-        extpix_mul(rmm2, rmmEch);
-        extpix_mul(rmm3, rmmEe);
-        extpix_mul(rmm4, rmmEm);
-        extpix_add(rmm1, rmm2);
-        extpix_add(rmm1, rmm3);
-        extpix_add(rmm1, rmm4);
-        extpix_adds(rmm1, rmmAdd);
-        extpix_mulhi(rmm1, rmmInvDiv);
-        dst[-pixCanvasWidth] = *rowptr;
-        *rowptr = unextend_pixel(rmm1);
-
-        // advance to next row
-        src += slModulo1;
-        dst += slModulo1;
-    }
-
-    // ----------------------- process lower left corner
-    rowptr = aulRows;
-    extend_pixel(src[-pixCanvasWidth], rmm1);
-    extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
-    extend_pixel(src[0], rmm3);
-    extend_pixel(src[1], rmm4);
-
-    extpix_add(rmm1, rmm4);
-    extpix_mul(rmm1, rmmCe);
-    extpix_mul(rmm2, rmmCc);
-    extpix_mul(rmm3, rmmCm);
-    extpix_add(rmm1, rmm2);
-    extpix_add(rmm1, rmm3);
-    extpix_adds(rmm1, rmmAdd);
-    extpix_mulhi(rmm1, rmmInvDiv);
-    dst[-pixCanvasWidth] = *rowptr;
-    dst[0] = unextend_pixel(rmm1);
-
-    src++;
-    dst++;
-    rowptr++;
-
-    // ----------------------- process lower edge pixels
-    for (size_t i = pixWidth-2; i != 0; i--)  // lowerLoop
-    {
-        // for each pixel
-        extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
-        extend_pixel(src[-pixCanvasWidth], rmm2);
-        extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
-        extend_pixel(src[-1], rmm4);
-        extend_pixel(src[0], rmm5);
-        extend_pixel(src[1], rmm6);
-
-        extpix_add(rmm1, rmm3);
-        extpix_add(rmm4, rmm6);
-        extpix_mul(rmm1, rmmEcl);
-        extpix_mul(rmm2, rmmEe);
-        extpix_mul(rmm4, rmmEch);
-        extpix_mul(rmm5, rmmEm);
-        extpix_add(rmm1, rmm2);
-        extpix_add(rmm1, rmm4);
-        extpix_add(rmm1, rmm5);
-        extpix_adds(rmm1, rmmAdd);
-        extpix_mulhi(rmm1, rmmInvDiv);
-        dst[-pixCanvasWidth] = *rowptr;
-        dst[0] = unextend_pixel(rmm1);
-
-        // advance to next pixel
-        src++;
-        dst++;
-        rowptr++;
-    }
-
-    // ----------------------- lower right corners
-    extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
-    extend_pixel(src[-pixCanvasWidth], rmm2);
-    extend_pixel(src[-1], rmm3);
-    extend_pixel(src[0], rmm4);
-
-    extpix_add(rmm2, rmm3);
-    extpix_mul(rmm1, rmmCc);
-    extpix_mul(rmm2, rmmCe);
-    extpix_mul(rmm4, rmmCm);
-    extpix_add(rmm1, rmm2);
-    extpix_add(rmm1, rmm4);
-    extpix_adds(rmm1, rmmAdd);
-    extpix_mulhi(rmm1, rmmInvDiv);
-    dst[-pixCanvasWidth] = *rowptr;
-    dst[0] = unextend_pixel(rmm1);
-
 #elif (defined __MSVC_INLINE__)
  __asm {
    cld
@ -2537,7 +2269,264 @@ lowerLoop:
  );

 #else
-  #error Write inline asm for your platform.
+    slModulo1 /= BYTES_PER_TEXEL;  // C++ handles incrementing by sizeof type
+    slCanvasWidth /= BYTES_PER_TEXEL;  // C++ handles incrementing by sizeof type
+
+    ULONG *src = pulSrc;
+    ULONG *dst = pulDst;
+    ULONG *rowptr = aulRows;
+
+    ExtPix rmm1={0}, rmm2={0}, rmm3={0}, rmm4={0}, rmm5={0}, rmm6={0}, rmm7={0};
+    #define EXTPIXFROMINT64(x) ExtPix r##x; extpix_fromi64(r##x, x);
+    EXTPIXFROMINT64(mmCm);
+    EXTPIXFROMINT64(mmCe);
+    EXTPIXFROMINT64(mmCc);
+    EXTPIXFROMINT64(mmEch);
+    EXTPIXFROMINT64(mmEcl);
+    EXTPIXFROMINT64(mmEe);
+    EXTPIXFROMINT64(mmEm);
+    EXTPIXFROMINT64(mmMm);
+    EXTPIXFROMINT64(mmMe);
+    EXTPIXFROMINT64(mmMc);
+    EXTPIXFROMINT64(mmAdd);
+    EXTPIXFROMINT64(mmInvDiv);
+    #undef EXTPIXFROMINT64
+
+    // ----------------------- process upper left corner
+    extend_pixel(src[0], rmm1);
+    extend_pixel(src[1], rmm2);
+    extend_pixel(src[pixCanvasWidth], rmm3);
+    extend_pixel(src[pixCanvasWidth+1], rmm4);
+
+    extpix_add(rmm2, rmm3);
+    extpix_mul(rmm1, rmmCm);
+    extpix_mul(rmm2, rmmCe);
+    extpix_mul(rmm4, rmmCc);
+    extpix_add(rmm1, rmm2);
+    extpix_add(rmm1, rmm4);
+    extpix_adds(rmm1, rmmAdd);
+    extpix_mulhi(rmm1, rmmInvDiv);
+    *(rowptr++) = unextend_pixel(rmm1);
+    
+    src++;
+
+    // ----------------------- process upper edge pixels
+    for (PIX i = pixWidth - 2; i != 0; i--)
+    {
+        extend_pixel(src[-1], rmm1);
+        extend_pixel(src[0], rmm2);
+        extend_pixel(src[1], rmm3);
+        extend_pixel(src[pixCanvasWidth-1], rmm4);
+        extend_pixel(src[pixCanvasWidth], rmm5);
+        extend_pixel(src[pixCanvasWidth+1], rmm6);
+
+        extpix_add(rmm1, rmm3);
+        extpix_add(rmm4, rmm6);
+        extpix_mul(rmm1, rmmEch);
+        extpix_mul(rmm2, rmmEm);
+        extpix_mul(rmm4, rmmEcl);
+        extpix_mul(rmm5, rmmEe);
+        extpix_add(rmm1, rmm2);
+        extpix_add(rmm1, rmm4);
+        extpix_add(rmm1, rmm5);
+        extpix_adds(rmm1, rmmAdd);
+        extpix_mulhi(rmm1, rmmInvDiv);
+        *(rowptr++) = unextend_pixel(rmm1);
+        src++;
+    }
+
+    // ----------------------- process upper right corner
+
+    extend_pixel(src[-1], rmm1);
+    extend_pixel(src[0], rmm2);
+    extend_pixel(src[pixCanvasWidth-1], rmm3);
+    extend_pixel(src[pixCanvasWidth], rmm4);
+
+    extpix_add(rmm1, rmm4);
+    extpix_mul(rmm1, rmmCe);
+    extpix_mul(rmm2, rmmCm);
+    extpix_mul(rmm3, rmmCc);
+    extpix_add(rmm1, rmm2);
+    extpix_add(rmm1, rmm3);
+    extpix_adds(rmm1, rmmAdd);
+    extpix_mulhi(rmm1, rmmInvDiv);
+    *rowptr = unextend_pixel(rmm1);
+
+// ----------------------- process bitmap middle pixels
+
+    dst += slCanvasWidth;
+    src += slModulo1;
+
+    // for each row
+    for (size_t i = pixHeight-2; i != 0; i--)  // rowLoop
+    {
+        rowptr = aulRows;
+
+        // process left edge pixel
+        extend_pixel(src[-pixCanvasWidth], rmm1);
+        extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
+        extend_pixel(src[0], rmm3);
+        extend_pixel(src[1], rmm4);
+        extend_pixel(src[pixCanvasWidth], rmm5);
+        extend_pixel(src[pixCanvasWidth+1], rmm6);
+
+        extpix_add(rmm1, rmm5);
+        extpix_add(rmm2, rmm6);
+        extpix_mul(rmm1, rmmEch);
+        extpix_mul(rmm2, rmmEcl);
+        extpix_mul(rmm3, rmmEm);
+        extpix_mul(rmm4, rmmEe);
+        extpix_add(rmm1, rmm2);
+        extpix_add(rmm1, rmm3);
+        extpix_add(rmm1, rmm4);
+        extpix_adds(rmm1, rmmAdd);
+        extpix_mulhi(rmm1, rmmInvDiv);
+        dst[-pixCanvasWidth] = *rowptr;
+        *(rowptr++) = unextend_pixel(rmm1);
+        src++;
+        dst++;
+
+        // for each pixel in current row
+        for (size_t j = pixWidth-2; j != 0; j--)  // pixLoop
+        {
+            // prepare upper convolution row
+            extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
+            extend_pixel(src[-pixCanvasWidth], rmm2);
+            extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
+
+            // prepare middle convolution row
+            extend_pixel(src[-1], rmm4);
+            extend_pixel(src[0], rmm5);
+            extend_pixel(src[1], rmm6);
+
+            // free some registers
+            extpix_add(rmm1, rmm3);
+            extpix_add(rmm2, rmm4);
+            extpix_mul(rmm5, rmmMm);
+
+            // prepare lower convolution row
+            extend_pixel(src[pixCanvasWidth-1], rmm3);
+            extend_pixel(src[pixCanvasWidth], rmm4);
+            extend_pixel(src[pixCanvasWidth+1], rmm7);
+
+            // calc weighted value
+            extpix_add(rmm2, rmm6);
+            extpix_add(rmm1, rmm3);
+            extpix_add(rmm2, rmm4);
+            extpix_add(rmm1, rmm7);
+            extpix_mul(rmm2, rmmMe);
+            extpix_mul(rmm1, rmmMc);
+            extpix_add(rmm2, rmm5);
+            extpix_add(rmm1, rmm2);
+
+            // calc and store weighted value
+            extpix_adds(rmm1, rmmAdd);
+            extpix_mulhi(rmm1, rmmInvDiv);
+            dst[-pixCanvasWidth] = *rowptr;
+            *(rowptr++) = unextend_pixel(rmm1);
+
+            // advance to next pixel
+            src++;
+            dst++;
+        }
+
+        // process right edge pixel
+        extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
+        extend_pixel(src[-pixCanvasWidth], rmm2);
+        extend_pixel(src[-1], rmm3);
+        extend_pixel(src[0], rmm4);
+        extend_pixel(src[pixCanvasWidth-1], rmm5);
+        extend_pixel(src[pixCanvasWidth], rmm6);
+
+        extpix_add(rmm1, rmm5);
+        extpix_add(rmm2, rmm6);
+        extpix_mul(rmm1, rmmEcl);
+        extpix_mul(rmm2, rmmEch);
+        extpix_mul(rmm3, rmmEe);
+        extpix_mul(rmm4, rmmEm);
+        extpix_add(rmm1, rmm2);
+        extpix_add(rmm1, rmm3);
+        extpix_add(rmm1, rmm4);
+        extpix_adds(rmm1, rmmAdd);
+        extpix_mulhi(rmm1, rmmInvDiv);
+        dst[-pixCanvasWidth] = *rowptr;
+        *rowptr = unextend_pixel(rmm1);
+
+        // advance to next row
+        src += slModulo1;
+        dst += slModulo1;
+    }
+
+    // ----------------------- process lower left corner
+    rowptr = aulRows;
+    extend_pixel(src[-pixCanvasWidth], rmm1);
+    extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
+    extend_pixel(src[0], rmm3);
+    extend_pixel(src[1], rmm4);
+
+    extpix_add(rmm1, rmm4);
+    extpix_mul(rmm1, rmmCe);
+    extpix_mul(rmm2, rmmCc);
+    extpix_mul(rmm3, rmmCm);
+    extpix_add(rmm1, rmm2);
+    extpix_add(rmm1, rmm3);
+    extpix_adds(rmm1, rmmAdd);
+    extpix_mulhi(rmm1, rmmInvDiv);
+    dst[-pixCanvasWidth] = *rowptr;
+    dst[0] = unextend_pixel(rmm1);
+
+    src++;
+    dst++;
+    rowptr++;
+
+    // ----------------------- process lower edge pixels
+    for (size_t i = pixWidth-2; i != 0; i--)  // lowerLoop
+    {
+        // for each pixel
+        extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
+        extend_pixel(src[-pixCanvasWidth], rmm2);
+        extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
+        extend_pixel(src[-1], rmm4);
+        extend_pixel(src[0], rmm5);
+        extend_pixel(src[1], rmm6);
+
+        extpix_add(rmm1, rmm3);
+        extpix_add(rmm4, rmm6);
+        extpix_mul(rmm1, rmmEcl);
+        extpix_mul(rmm2, rmmEe);
+        extpix_mul(rmm4, rmmEch);
+        extpix_mul(rmm5, rmmEm);
+        extpix_add(rmm1, rmm2);
+        extpix_add(rmm1, rmm4);
+        extpix_add(rmm1, rmm5);
+        extpix_adds(rmm1, rmmAdd);
+        extpix_mulhi(rmm1, rmmInvDiv);
+        dst[-pixCanvasWidth] = *rowptr;
+        dst[0] = unextend_pixel(rmm1);
+
+        // advance to next pixel
+        src++;
+        dst++;
+        rowptr++;
+    }
+
+    // ----------------------- lower right corners
+    extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
+    extend_pixel(src[-pixCanvasWidth], rmm2);
+    extend_pixel(src[-1], rmm3);
+    extend_pixel(src[0], rmm4);
+
+    extpix_add(rmm2, rmm3);
+    extpix_mul(rmm1, rmmCc);
+    extpix_mul(rmm2, rmmCe);
+    extpix_mul(rmm4, rmmCm);
+    extpix_add(rmm1, rmm2);
+    extpix_add(rmm1, rmm4);
+    extpix_adds(rmm1, rmmAdd);
+    extpix_mulhi(rmm1, rmmInvDiv);
+    dst[-pixCanvasWidth] = *rowptr;
+    dst[0] = unextend_pixel(rmm1);
+
 #endif

  // all done (finally)
--- a/Sources/Engine/Graphics/OpenGL.h
+++ b/Sources/Engine/Graphics/OpenGL.h
@ -89,13 +89,7 @@ extern void  (__stdcall *pglPNTrianglesfATI)( GLenum pname, GLfloat param);
 inline void glCOLOR( COLOR col)
 {
 /* rcg10052001 Platform-wrappers. */
-#if (defined USE_PORTABLE_C)
-	col = ( ((col << 24)            ) |
-            ((col << 8) & 0x00FF0000) |
-            ((col >> 8) & 0x0000FF00) |
-            ((col >> 24)            ) );
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    mov     eax,dword ptr [col]
    bswap   eax
@ -110,7 +104,11 @@ inline void glCOLOR( COLOR col)
  );

 #else
-  #error please define for your platform.
+  col = ( ((col << 24)            ) |
+          ((col << 8) & 0x00FF0000) |
+          ((col >> 8) & 0x0000FF00) |
+          ((col >> 24)            ) );
+
 #endif

  pglColor4ubv((GLubyte*)&col);
--- a/Sources/Engine/Graphics/TextureEffects.cpp
+++ b/Sources/Engine/Graphics/TextureEffects.cpp
@ -32,9 +32,7 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #define W  word ptr
 #define B  byte ptr

-#if (defined USE_PORTABLE_C)
-#define ASMOPT 0
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
 #define ASMOPT 1
 #elif (defined __GNU_INLINE_X86_32__)
 #define ASMOPT 1
@ -1285,8 +1283,7 @@ static void RenderWater(void)
  { // SUB-SAMPLING
    SLONG slHeightMapStep, slHeightRowStep;

-#if ASMOPT == 1
-  #if (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
    __asm {
      push    ebx
      bsf     ecx,D [_pixTexWidth]
@ -1357,7 +1354,7 @@ pixLoop:
      pop     ebx
    }

-  #elif (defined __GNU_INLINE_X86_32__)
+#elif (defined __GNU_INLINE_X86_32__)
    // rcg12152001 needed extra registers. :(
    _slHeightMapStep_renderWater = slHeightMapStep;
    _pixBaseWidth_renderWater = pixBaseWidth;
@ -1460,10 +1457,6 @@ pixLoop:
          "cc", "memory"
    );

-  #else
-    #error fill in for your platform.
-  #endif
-
 #else

    PIX pixPos, pixDU, pixDV;
--- a/Sources/Engine/Light/LayerMixer.cpp
+++ b/Sources/Engine/Light/LayerMixer.cpp
@ -40,16 +40,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #define W  word ptr
 #define B  byte ptr

-#if (defined USE_PORTABLE_C)
-  #define ASMOPT 0
-#elif (defined __MSVC_INLINE__)
-  #define ASMOPT 1
-#elif (defined __GNU_INLINE_X86_32__)
-  #define ASMOPT 1
-#else
-  #define ASMOPT 0
-#endif
-
 extern INDEX shd_bFineQuality;
 extern INDEX shd_iFiltering;
 extern INDEX shd_iDithering;
@ -290,8 +280,7 @@ void CLayerMixer::AddAmbientPoint(void)
  _slLightMax<<=7;
  _slLightStep>>=1;

-#if (ASMOPT == 1)
- #if (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    // prepare interpolants
    movd    mm0,D [_slL2Row]
@ -364,7 +353,7 @@ skipPixel:
    emms
  }

- #elif (defined __GNU_INLINE_X86_32__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp1, tmp2;
  __asm__ __volatile__ (
    // prepare interpolants
@ -439,10 +428,6 @@ skipPixel:
        : FPU_REGS, MMX_REGS, "eax", "ecx", "edi", "cc", "memory"
  );

- #else
-  #error Write inline asm for your platform.
- #endif
-
 #else

    // !!! FIXME WARNING: I have not checked this code, and it could be
@ -496,8 +481,7 @@ void CLayerMixer::AddAmbientMaskPoint( UBYTE *pubMask, UBYTE ubMask)
  _slLightStep>>=1;


-#if (ASMOPT == 1)
- #if (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    // prepare interpolants
    movd    mm0,D [_slL2Row]
@ -576,7 +560,7 @@ skipPixel:
    emms
  }

- #elif (defined __GNU_INLINE_X86_32__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp1, tmp2;
  __asm__ __volatile__ (
    // prepare interpolants
@ -660,10 +644,6 @@ skipPixel:
          "cc", "memory"
  );

- #else
-  #error Please write inline assembly for your platform.
- #endif
-
 #else   // Portable C version...

  UBYTE* pubLayer = (UBYTE*)_pulLayer;
@ -723,8 +703,7 @@ void CLayerMixer::AddDiffusionPoint(void)
  _slLightMax<<=7;
  _slLightStep>>=1;

-#if ASMOPT == 1
- #if (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    // prepare interpolants
    movd    mm0,D [_slL2Row]
@ -796,7 +775,7 @@ skipPixel:
    emms
  }

- #elif (defined __GNU_INLINE_X86_32__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp1, tmp2;
  __asm__ __volatile__ (
    // prepare interpolants
@ -871,10 +850,6 @@ skipPixel:
        : FPU_REGS, MMX_REGS, "eax", "ecx", "edi", "cc", "memory"
  );

- #else
-  #error Write inline assembly for your platform.
- #endif
-
 #else
  // for each pixel in the shadow map
  UBYTE* pubLayer = (UBYTE*)_pulLayer;
@ -929,8 +904,7 @@ void CLayerMixer::AddDiffusionMaskPoint( UBYTE *pubMask, UBYTE ubMask)
  _slLightMax<<=7;
  _slLightStep>>=1;

-#if (ASMOPT == 1)
- #if (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    // prepare interpolants
    movd    mm0,D [_slL2Row]
@ -1008,7 +982,7 @@ skipPixel:
    emms
  }

- #elif (defined __GNU_INLINE_X86_32__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG tmp1, tmp2;
  __asm__ __volatile__ (
    // prepare interpolants
@ -1091,11 +1065,6 @@ skipPixel:
          "cc", "memory"
  );

- #else
-  #error Write inline ASM for your platform.
-
- #endif
-
 #else

  // for each pixel in the shadow map
@ -1201,8 +1170,7 @@ BOOL CLayerMixer::PrepareOneLayerPoint( CBrushShadowLayer *pbsl, BOOL bNoMask)
  FLOAT fDL2oDV     = fDDL2oDV + 2*(lm_vStepV%v00);
  //_v00 = v00;

-#if ((ASMOPT == 1) && (!defined __GNU_INLINE_X86_32__))
- #if (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    fld     D [fDDL2oDU]
    fadd    D [fDDL2oDU]
@ -1230,12 +1198,6 @@ BOOL CLayerMixer::PrepareOneLayerPoint( CBrushShadowLayer *pbsl, BOOL bNoMask)
    fistp   D [_slDDL2oDV]
    fistp   D [_slDDL2oDU]
  }
- #elif (defined __GNU_INLINE_X86_32__)
-    STUBBED("inline asm.");
- #else
-   #error Please write inline assembly for your platform.
- #endif
-
 #else
  fDDL2oDU     *= 2;
  fDDL2oDV     *= 2;
@ -1321,8 +1283,7 @@ void CLayerMixer::AddOneLayerGradient( CGradientParameters &gp)
  _pulLayer  = lm_pulShadowMap;
  FLOAT fStart = Clamp( fGr00-(fDGroDJ+fDGroDI)*0.5f, 0.0f, 1.0f);

-#if ((ASMOPT == 1) && (!defined __GNU_INLINE_X86_32__))
- #if (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __int64 mmRowAdv;
  SLONG fixGRow  = (fGr00-(fDGroDJ+fDGroDI)*0.5f)*32767.0f; // 16:15
  SLONG slModulo = (lm_pixCanvasSizeU-lm_pixPolygonSizeU) *BYTES_PER_TEXEL;
@ -1436,14 +1397,6 @@ rowNext:
 rowDone:
    emms
  }
- #elif (defined __GNU_INLINE_X86_32__)
-
-    STUBBED("WRITE ME. Argh.");
-
- #else
-  #error Need inline assembly for your platform.
- #endif
-
 #else
  // well, make gradient ...
  SLONG slR0=0,slG0=0,slB0=0;
@ -1528,9 +1481,8 @@ rowDone:
 // apply directional light or ambient to layer
 void CLayerMixer::AddDirectional(void)
 {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
  ULONG ulLight = ByteSwap( lm_colLight);
- #if (defined __MSVC_INLINE__)
  __asm {
    // prepare pointers and variables
    mov     edi,D [_pulLayer]
@ -1565,7 +1517,8 @@ rowNext:
    emms
  }

- #elif (defined __GNU_INLINE_X86_32__)
+#elif (defined __GNU_INLINE_X86_32__)
+  ULONG ulLight = ByteSwap( lm_colLight);
  ULONG tmp;
  __asm__ __volatile__ (
    // prepare pointers and variables
@ -1608,10 +1561,6 @@ rowNext:
        : FPU_REGS, "mm5", "mm6", "ecx", "edi", "cc", "memory"
  );

- #else
-   #error Write inline assembly for your platform.
- #endif
-
 #else
  UBYTE* pubLayer = (UBYTE*)_pulLayer;
  // for each pixel in the shadow map
@ -1631,9 +1580,8 @@ rowNext:
 // apply directional light thru mask to layer
 void CLayerMixer::AddMaskDirectional( UBYTE *pubMask, UBYTE ubMask)
 {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
  ULONG ulLight = ByteSwap( lm_colLight);
- #if (defined __MSVC_INLINE__)
  // prepare some local variables
  __asm {
    // prepare pointers and variables
@ -1665,7 +1613,8 @@ skipLight:
    emms
  }

- #elif (defined __GNU_INLINE_X86_32__)
+#elif (defined __GNU_INLINE_X86_32__)
+  ULONG ulLight = ByteSwap( lm_colLight);
  ULONG tmp;
  __asm__ __volatile__ (
    // prepare pointers and variables
@ -1706,10 +1655,6 @@ skipLight:
          "cc", "memory"
  );

- #else
-  #error Please write inline assembly for your platform.
- #endif
-
 #else
  UBYTE* pubLayer = (UBYTE*)_pulLayer;
  // for each pixel in the shadow map
@ -1832,7 +1777,33 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap)
    }
  } // set initial color

- #if (defined USE_PORTABLE_C)
+#if (defined __MSVC_INLINE__)
+  __asm {
+    cld
+    mov     ebx,D [this]
+    mov     ecx,D [ebx].lm_pixCanvasSizeU
+    imul    ecx,D [ebx].lm_pixCanvasSizeV
+    mov     edi,D [ebx].lm_pulShadowMap
+    mov     eax,D [colAmbient]
+    bswap   eax
+    rep     stosd
+  }
+
+#elif (defined __GNU_INLINE_X86_32__)
+  ULONG clob1, clob2, clob3;
+  __asm__ __volatile__ (
+    "cld                    \n\t"
+    "imull   %%esi, %%ecx   \n\t"
+    "bswapl  %%eax          \n\t"
+    "rep                    \n\t"
+    "stosl                  \n\t"
+        : "=a" (clob1), "=c" (clob2), "=D" (clob3)
+        : "c" (this->lm_pixCanvasSizeU), "S" (this->lm_pixCanvasSizeV),
+          "a" (colAmbient), "D" (this->lm_pulShadowMap)
+        : "cc", "memory"
+  );
+
+#else
  register ULONG count = this->lm_pixCanvasSizeU * this->lm_pixCanvasSizeV;
  #if PLATFORM_LITTLEENDIAN
  // Forces C fallback; BYTESWAP itself is a no-op on little endian.
@ -1850,35 +1821,7 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap)
    ptr++;
  }

- #elif (defined __MSVC_INLINE__)
-  __asm {
-    cld
-    mov     ebx,D [this]
-    mov     ecx,D [ebx].lm_pixCanvasSizeU
-    imul    ecx,D [ebx].lm_pixCanvasSizeV
-    mov     edi,D [ebx].lm_pulShadowMap
-    mov     eax,D [colAmbient]
-    bswap   eax
-    rep     stosd
-  }
-
- #elif (defined __GNU_INLINE_X86_32__)
-  ULONG clob1, clob2, clob3;
-  __asm__ __volatile__ (
-    "cld                    \n\t"
-    "imull   %%esi, %%ecx   \n\t"
-    "bswapl  %%eax          \n\t"
-    "rep                    \n\t"
-    "stosl                  \n\t"
-        : "=a" (clob1), "=c" (clob2), "=D" (clob3)
-        : "c" (this->lm_pixCanvasSizeU), "S" (this->lm_pixCanvasSizeV),
-          "a" (colAmbient), "D" (this->lm_pulShadowMap)
-        : "cc", "memory"
-  );
-
- #else
-  #error Please write inline assembly for your platform.
- #endif
+#endif

  _pfWorldEditingProfile.StopTimer(CWorldEditingProfile::PTI_AMBIENTFILL);

@ -1955,9 +1898,7 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap)
 // copy from static shadow map to dynamic layer
 __forceinline void CLayerMixer::CopyShadowLayer(void)
 {
- #if (defined USE_PORTABLE_C)
-   memcpy(lm_pulShadowMap, lm_pulStaticShadowMap, lm_pixCanvasSizeU*lm_pixCanvasSizeV*4);
- #elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    cld
    mov     ebx,D [this]
@ -1967,7 +1908,7 @@ __forceinline void CLayerMixer::CopyShadowLayer(void)
    mov     edi,D [ebx].lm_pulShadowMap
    rep     movsd
  }
- #elif (defined __GNU_INLINE_X86_32__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG clob1, clob2, clob3;
  __asm__ __volatile__ (
    "cld                    \n\t"
@ -1980,21 +1921,16 @@ __forceinline void CLayerMixer::CopyShadowLayer(void)
        : "cc", "memory"
  );

- #else
-  #error Please write inline assembly for your platform.
- #endif
+#else
+  memcpy(lm_pulShadowMap, lm_pulStaticShadowMap, lm_pixCanvasSizeU*lm_pixCanvasSizeV*4);
+#endif
 }


 // copy from static shadow map to dynamic layer
 __forceinline void CLayerMixer::FillShadowLayer( COLOR col)
 {
- #if (defined USE_PORTABLE_C)
-   DWORD* dst = (DWORD*)lm_pulShadowMap;
-   int n = lm_pixCanvasSizeU*lm_pixCanvasSizeV;   
-   DWORD color = __builtin_bswap32(col);
-   while(n--) {*(dst++)=color;}
- #elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  __asm {
    cld
    mov     ebx,D [this]
@ -2006,7 +1942,7 @@ __forceinline void CLayerMixer::FillShadowLayer( COLOR col)
    rep     stosd
  }

- #elif (defined __GNU_INLINE_X86_32__)
+#elif (defined __GNU_INLINE_X86_32__)
  ULONG clob1, clob2, clob3;
  __asm__ __volatile__ (
    "cld                    \n\t"
@ -2020,9 +1956,12 @@ __forceinline void CLayerMixer::FillShadowLayer( COLOR col)
        : "cc", "memory"
  );

- #else
-  #error Please write inline assembly for your platform.
- #endif
+#else
+   DWORD* dst = (DWORD*)lm_pulShadowMap;  // fill target: this layer's shadow map, written as DWORDs
+   int n = lm_pixCanvasSizeU*lm_pixCanvasSizeV;   // number of DWORDs to write (canvas U size * V size)
+   DWORD color = ByteSwap(col);  // byte-swapped fill value; ByteSwap() replaces the GCC/Clang-only __builtin_bswap32 so this fallback builds on any compiler
+   while(n--) {*(dst++)=color;}  // store the swapped colour into every DWORD of the map
+#endif
 }


--- a/Sources/Engine/Math/Float.cpp
+++ b/Sources/Engine/Math/Float.cpp
@ -24,18 +24,9 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #define _PC_64    0x0300

 // !!! FIXME: I'd like to remove any dependency on the FPU control word from the game, asap.  --ryan.
-#ifdef USE_PORTABLE_C
-// Fake control87 for USE_PORTABLE_C version
-inline ULONG _control87(WORD newcw, WORD mask)
-{
-    static WORD fpw=_PC_64;
-    if (mask != 0)
-    {
-        fpw &= ~mask;
-        fpw |= (newcw & mask);
-    }
-    return(fpw);
-}
+#if (defined _MSC_VER)
+
+// _control87 is provided by the compiler

 #elif (defined __GNU_INLINE_X86_32__)

@ -74,8 +65,20 @@ inline ULONG _control87(WORD newcw, WORD mask)
    return(fpw);
 }

-#elif (!defined _MSC_VER)
-#error Implement for your platform, or add a stub conditional here.
+#else
+
+// Fake _control87 for builds with neither MSVC nor GNU x86-32 inline asm (formerly the USE_PORTABLE_C path)
+inline ULONG _control87(WORD newcw, WORD mask)
+{
+    static WORD fpw=_PC_64;  // emulated control word: kept only in this static, initial value _PC_64
+    if (mask != 0)  // a zero mask changes nothing and just reports the stored word
+    {
+        fpw &= ~mask;  // clear the bits selected by mask...
+        fpw |= (newcw & mask);  // ...then copy those bits in from newcw
+    }
+    return(fpw);  // the (possibly updated) emulated word
+}
+
 #endif

 /* Get current precision setting of FPU. */
--- a/Sources/Engine/Math/Functions.h
+++ b/Sources/Engine/Math/Functions.h
@ -312,12 +312,7 @@ inline FLOAT NormByteToFloat( const ULONG ul)
 // fast float to int conversion
 inline SLONG FloatToInt( FLOAT f)
 {
-#if defined(__arm__) || defined(USE_PORTABLE_C)
-  // round to nearest by adding/subtracting 0.5 (depending on f pos/neg) before converting to SLONG
-  float addToRound = copysignf(0.5f, f); // copy f's signbit to 0.5 => if f<0 then addToRound = -0.5, else 0.5
-  return((SLONG) (f + addToRound));
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  SLONG slRet;
  __asm {
    fld    D [f]
@ -336,16 +331,16 @@ inline SLONG FloatToInt( FLOAT f)
  );
  return(slRet);
 #else
-  #error Fill this in for your platform.
+  // round to nearest by adding/subtracting 0.5 (depending on f pos/neg) before converting to SLONG
+  float addToRound = copysignf(0.5f, f); // copy f's signbit to 0.5 => if f<0 then addToRound = -0.5, else 0.5
+  return((SLONG) (f + addToRound));
+
 #endif
 }

 // log base 2 of any float numero
 inline FLOAT Log2( FLOAT f) {
-#if (defined USE_PORTABLE_C) || defined(__arm__)
-  return log2f(f);
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  FLOAT fRet;
  _asm {
    fld1
@ -368,7 +363,8 @@ inline FLOAT Log2( FLOAT f) {
  );
  return(fRet);
 #else
-  #error Fill this in for your platform.
+  return log2f(f);
+
 #endif
 }

@ -376,25 +372,7 @@ inline FLOAT Log2( FLOAT f) {
 // returns accurate values only for integers that are power of 2
 inline SLONG FastLog2( SLONG x)
 {
-#if (defined USE_PORTABLE_C)
-#ifdef __GNUC__
-  if(x == 0) return 0; // __builtin_clz() is undefined for 0
-  int numLeadingZeros  = __builtin_clz(x);
-  return 31 - numLeadingZeros;
-#else
-  register SLONG val = x;
-  register SLONG retval = 31;
-  while (retval > 0)
-  {
-    if (val & (1 << retval))
-        return retval;
-    retval--;
-  }
-
-  return 0;
-#endif
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  SLONG slRet;
  __asm {
    bsr   eax,D [x]
@ -411,8 +389,21 @@ inline SLONG FastLog2( SLONG x)
        : "memory"
  );
  return(slRet);
+#elif (defined __GNUC__)
+  if(x == 0) return 0; // __builtin_clz() is undefined for 0
+  int numLeadingZeros  = __builtin_clz(x);
+  return 31 - numLeadingZeros;
 #else
-  #error Fill this in for your platform.
+  register SLONG val = x;
+  register SLONG retval = 31;
+  while (retval > 0)
+  {
+    if (val & (1 << retval))
+        return retval;
+    retval--;
+  }
+
+  return 0;
 #endif
 }

@ -420,11 +411,7 @@ inline SLONG FastLog2( SLONG x)
 // returns log2 of first larger value that is a power of 2
 inline SLONG FastMaxLog2( SLONG x)
 { 
-#if (defined USE_PORTABLE_C)
-printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__);
-  return((SLONG) log2((double) x));
-
-#elif (defined __MSVC_INLINE__)
+#if (defined __MSVC_INLINE__)
  SLONG slRet;
  __asm {
    bsr   eax,D [x]
@ -448,7 +435,9 @@ printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__);
  );
  return(slRet);
 #else
-  #error Fill this in for your platform.
+printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__);
+  return((SLONG) log2((double) x));
+
 #endif
 }
 */
--- a/Sources/Engine/Models/RenderModel_View.cpp
+++ b/Sources/Engine/Models/RenderModel_View.cpp
@ -40,14 +40,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
 #define W  word ptr
 #define B  byte ptr

-#if (defined __MSVC_INLINE__)
-#define ASMOPT 1
-#elif (defined __GNU_INLINE_X86_32__)
-#define ASMOPT 0  // !!! FIXME: rcg10112001 Write GCC inline asm versions...
-#else
-#define ASMOPT 0
-#endif
-

 extern BOOL CVA_bModels;
 extern BOOL GFX_bTruform;
@ -663,7 +655,7 @@ static FLOAT   _fHazeAdd;
 // check vertex against fog
 static void GetFogMapInVertex( GFXVertex3 &vtx, GFXTexCoord &tex)
 {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
  __asm {
    mov     esi,D [vtx]
    mov     edi,D [tex]
@ -708,7 +700,7 @@ static void GetFogMapInVertex( GFXVertex3 &vtx, GFXTexCoord &tex)
 // check vertex against haze
 static void GetHazeMapInVertex( GFXVertex3 &vtx, FLOAT &tx1)
 {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
  __asm {
    mov     esi,D [vtx]
    mov     edi,D [tx1]
@ -1080,7 +1072,7 @@ static void UnpackFrame( CRenderModel &rm, BOOL bKeepNormals)
    const ModelFrameVertex16 *pFrame1 = rm.rm_pFrame16_1;
    if( pFrame0==pFrame1)
    {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
      // for each vertex in mip
      const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
      SLONG slTmp1, slTmp2, slTmp3;
@ -1196,7 +1188,7 @@ vtxNext16:
    // if lerping
    else
    {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
      // for each vertex in mip
      const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
      SLONG slTmp1, slTmp2, slTmp3;
@ -1365,7 +1357,7 @@ vtxNext16L:
    // if no lerping
    if( pFrame0==pFrame1)
    {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
      // for each vertex in mip
      const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
      SLONG slTmp1, slTmp2, slTmp3;
@ -1464,7 +1456,7 @@ vtxNext8:
    // if lerping
    else
    {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
      const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
      SLONG slTmp1, slTmp2, slTmp3;
      // re-adjust stretching factors because of fixint lerping (divide by 256)
@ -1610,7 +1602,7 @@ vtxNext8L:
  }

  // generate colors from shades
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
  __asm {
    pxor    mm0,mm0
    // construct 64-bit RGBA light
@ -1974,7 +1966,7 @@ void CModelObject::RenderModel_View( CRenderModel &rm)
    pvtxSrfBase = &_avtxSrfBase[iSrfVx0];
    INDEX iSrfVx;

-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
    __asm {
      push    ebx
      mov     ebx,D [puwSrfToMip]
@ -2074,7 +2066,7 @@ srfVtxLoop:
    const COLOR colD = AdjustColor( ms.ms_colDiffuse, _slTexHueShift, _slTexSaturation);
    colSrfDiff.MultiplyRGBA( colD, colMdlDiff);

-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
    // setup texcoord array
    __asm {
      push    ebx
@ -2134,7 +2126,7 @@ vtxEnd:
      for( INDEX iSrfVx=0; iSrfVx<ctSrfVx; iSrfVx++) pcolSrfBase[iSrfVx] = colSrfDiffAdj;
    }
    else {
-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
      // setup color array
      const COLOR colS = colSrfDiff.ul.abgr;
      __asm {
@ -2335,7 +2327,7 @@ diffColLoop:
    // cache rotation
    const FLOATmatrix3D &m = rm.rm_mObjectRotation;

-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
    __asm {
      push    ebx
      mov     ebx,D [m]
@ -2530,7 +2522,7 @@ reflMipLoop:
    // cache object view rotation
    const FLOATmatrix3D &m = rm.rm_mObjectToView;

-#if ASMOPT == 1
+#if (defined __MSVC_INLINE__)
    __asm {
      push    ebx
      mov     ebx,D [m]
--- a/Sources/Engine/Rendering/RendMisc.cpp
+++ b/Sources/Engine/Rendering/RendMisc.cpp
@ -105,10 +105,7 @@ static SLONG slTmp;

 static inline PIX PIXCoord(FLOAT f) // (f+0.9999f) or (ceil(f))
 {
- #if (defined USE_PORTABLE_C)
-  return((PIX) (f+0.9999f));
-
- #elif (defined __MSVC_INLINE__)
+ #if (defined __MSVC_INLINE__)
  PIX pixRet;
  __asm {
    fld     dword ptr [f]
@ -142,7 +139,8 @@ static inline PIX PIXCoord(FLOAT f) // (f+0.9999f) or (ceil(f))
  return pixRet;

 #else
-  #error Please write inline ASM for your platform.
+  return((PIX) (f+0.9999f));
+
 #endif
 }

--- a/Sources/Engine/Sound/SoundMixer.cpp
+++ b/Sources/Engine/Sound/SoundMixer.cpp
@ -43,17 +43,15 @@ static CSoundData *psd;

 // nasm on MacOS X is getting wrong addresses of external globals, so I have
 //  to define them in the .asm file...lame.
-#ifdef __GNU_INLINE_X86_32__
-#ifdef USE_PORTABLE_C
-#define INASM 
-#else
+#if (defined __GNU_INLINE_X86_32__)
 #define INASM extern
-#endif
-#else
+#elif (defined __MSVC_INLINE__)
 #define INASM static
 static __int64 mmInvFactor   = 0x00007FFF00007FFF;
 static FLOAT f65536 = 65536.0f;
 static FLOAT f4G    = 4294967296.0f;
+#else
+#define INASM static
 #endif

 INASM SLONG slMixerBufferSize;        // size in samples per channel of the destination buffers
@ -81,11 +79,7 @@ void ResetMixer( const SLONG *pslBuffer, const SLONG slBufferSize)
  slMixerBufferSampleRate = _pSound->sl_SwfeFormat.nSamplesPerSec;

  // wipe destination mixer buffer
-  // (Mac OS X uses this path because Apple's memset() is customized for each CPU they support and way faster than this inline asm. --ryan.)
-  #if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX))
-  memset(pvMixerBuffer, 0, slMixerBufferSize * 8);
-
-  #elif (defined __MSVC_INLINE__)
+  #if (defined __MSVC_INLINE__)
  __asm {
    cld
    xor     eax,eax
@ -94,19 +88,8 @@ void ResetMixer( const SLONG *pslBuffer, const SLONG slBufferSize)
    shl     ecx,1 // *2 because of 32-bit src format
    rep     stosd
  }
-  #elif (defined __GNU_INLINE_X86_32__)
-  // !!! FIXME : rcg12172001 Is this REALLY any faster than memset()?
-  ULONG clob1, clob2;
-  __asm__ __volatile__ (
-    "cld                  \n\t"
-    "rep                  \n\t"
-    "stosl                \n\t"
-        : "=D" (clob1), "=c" (clob2)
-        : "a" (0), "D" (pvMixerBuffer), "c" (slMixerBufferSize*2)
-        : "cc", "memory"
-  );
  #else
-    #error please write inline asm for your platform.
+  memset(pvMixerBuffer, 0, slMixerBufferSize * 8);
  #endif
 }

@ -118,10 +101,7 @@ void CopyMixerBuffer_stereo( const SLONG slSrcOffset, void *pDstBuffer, const SL
  ASSERT( slBytes%4==0);
  if( slBytes<4) return;

-  #if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX))
-  // (Mac OS X uses this path because Apple's memset() is customized for each CPU they support and way faster than this inline asm. --ryan.)
-  memcpy(pDstBuffer, ((const char *)pvMixerBuffer) + slSrcOffset, slBytes);
-  #elif (defined __MSVC_INLINE__)
+  #if (defined __MSVC_INLINE__)
  __asm {
    cld
    mov     esi,D [slSrcOffset]
@ -131,21 +111,8 @@ void CopyMixerBuffer_stereo( const SLONG slSrcOffset, void *pDstBuffer, const SL
    shr     ecx,2   // bytes to samples per channel
    rep     movsd
  }
-  #elif (defined __GNU_INLINE_X86_32__)
-  // !!! FIXME : rcg12172001 Is this REALLY any faster than memcpy()?
-  ULONG clob1, clob2, clob3;
-  __asm__ __volatile__ (
-    "cld                 \n\t"
-    "rep                 \n\t"
-    "movsl               \n\t"
-      : "=S" (clob1), "=D" (clob2), "=c" (clob3)
-      : "S" (((char *)pvMixerBuffer) + slSrcOffset),
-        "D" (pDstBuffer),
-        "c" (slBytes >> 2)
-      : "cc", "memory"
-  );
  #else
-  #error please write inline asm for your platform.
+  memcpy(pDstBuffer, ((const char *)pvMixerBuffer) + slSrcOffset, slBytes);
  #endif
 }

@ -157,18 +124,7 @@ void CopyMixerBuffer_mono( const SLONG slSrcOffset, void *pDstBuffer, const SLON
  ASSERT( slBytes%2==0);
  if( slBytes<4) return;

-  #if (defined USE_PORTABLE_C)
-  // (This is untested, currently. --ryan.)
-  WORD *dest = (WORD *) pDstBuffer;
-  WORD *src = (WORD *) ( ((char *) pvMixerBuffer) + slSrcOffset );
-  SLONG max = slBytes / 4;
-  for (SLONG i = 0; i < max; i++) {
-      *dest = *src;
-      dest++;    // move 16 bits.
-      src+=2;    // move 32 bits.
-  }
-
-  #elif (defined __MSVC_INLINE__)
+  #if (defined __MSVC_INLINE__)
  __asm {
    mov     esi,D [slSrcOffset]
    add     esi,D [pvMixerBuffer]
@ -204,7 +160,15 @@ copyLoop:
  );

  #else
-  #error please write inline asm for your platform.
+  // (This is untested, currently. --ryan.)
+  WORD *dest = (WORD *) pDstBuffer;
+  WORD *src = (WORD *) ( ((char *) pvMixerBuffer) + slSrcOffset );
+  SLONG max = slBytes / 4;
+  for (SLONG i = 0; i < max; i++) {
+      *dest = *src;
+      dest++;    // move 16 bits.
+      src+=2;    // move 32 bits.
+  }
  #endif
 }

@ -215,24 +179,7 @@ static void ConvertMixerBuffer( const SLONG slBytes)
  ASSERT( slBytes%4==0);
  if( slBytes<4) return;

-  #if (defined USE_PORTABLE_C)
-  //STUBBED("ConvertMixerBuffer");
-  SWORD *dest = (SWORD *) pvMixerBuffer;
-  SLONG *src = (SLONG *) pvMixerBuffer;
-  SLONG max = slBytes / 2;
-  int tmp;
-  for (SLONG i = 0; i < max; i++) {
-      tmp = *src;
-      if (tmp>32767) tmp=32767;
-      if (tmp<-32767) tmp=-32767;
-      *dest=tmp;
-      dest++;    // move 16 bits.
-      src++;     // move 32 bits.
-  }
-
-
-
-  #elif (defined __MSVC_INLINE__)
+  #if (defined __MSVC_INLINE__)
  __asm {
    cld
    mov     esi,D [pvMixerBuffer]
@ -271,7 +218,20 @@ copyLoop:
  );

  #else
-  #error please write inline asm for your platform.
+
+  SWORD *dest = (SWORD *) pvMixerBuffer;
+  SLONG *src = (SLONG *) pvMixerBuffer;
+  SLONG max = slBytes / 2;
+  int tmp;
+  for (SLONG i = 0; i < max; i++) {
+      tmp = *src;
+      if (tmp>32767) tmp=32767;
+      if (tmp<-32767) tmp=-32767;
+      *dest=tmp;
+      dest++;    // move 16 bits.
+      src++;     // move 32 bits.
+  }
+
  #endif
 }

@ -337,85 +297,7 @@ inline void MixMono( CSoundObject *pso)
 {
  _pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER);

- #if (defined USE_PORTABLE_C)
-  // initialize some local vars
-  SLONG slLeftSample, slRightSample, slNextSample;
-  SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer;
-  fixLeftOfs   = (__int64)(fLeftOfs   * 65536.0);
-  fixRightOfs  = (__int64)(fRightOfs  * 65536.0);
-  __int64 fixLeftStep  = (__int64)(fLeftStep  * 65536.0);
-  __int64 fixRightStep = (__int64)(fRightStep * 65536.0);
-  __int64 fixSoundBufferSize = ((__int64)slSoundBufferSize)<<16;
-  mmSurroundFactor = (__int64)(SWORD)mmSurroundFactor;
-
-  SLONG slLeftVolume_ = slLeftVolume >> 16;
-  SLONG slRightVolume_ = slRightVolume >> 16;
-
-  // loop thru source buffer
-  INDEX iCt = slMixerBufferSize;
-  FOREVER
-  {
-    // if left channel source sample came to end of sample buffer
-    if( fixLeftOfs >= fixSoundBufferSize) {
-      fixLeftOfs -= fixSoundBufferSize;
-      // if has no loop, end it
-      bEndOfSound = bNotLoop;
-    }
-    // if right channel source sample came to end of sample buffer
-    if( fixRightOfs >= fixSoundBufferSize) {
-      fixRightOfs -= fixSoundBufferSize;
-      // if has no loop, end it
-      bEndOfSound = bNotLoop;
-    }
-    // end of buffer?
-    if( iCt<=0 || bEndOfSound) break;
-
-    // fetch one lineary interpolated sample on left channel
-    slLeftSample = pswSrcBuffer[(fixLeftOfs>>16)+0];
-    slNextSample = pswSrcBuffer[(fixLeftOfs>>16)+1];
-    slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16;
-    // fetch one lineary interpolated sample on right channel
-    slRightSample = pswSrcBuffer[(fixRightOfs>>16)+0];
-    slNextSample  = pswSrcBuffer[(fixRightOfs>>16)+1];
-    slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16;
-
-    // filter samples
-    slLastLeftSample  += ((slLeftSample -slLastLeftSample) *slLeftFilter) >>15;
-    slLastRightSample += ((slRightSample-slLastRightSample)*slRightFilter)>>15;
-
-    // apply stereo volume to current sample
-    slLeftSample  = (slLastLeftSample  * slLeftVolume_) >>15;
-    slRightSample = (slLastRightSample * slRightVolume_)>>15;
-
-    slLeftSample  ^= (SLONG)((mmSurroundFactor>> 0)&0xFFFFFFFF);
-    slRightSample ^= (SLONG)((mmSurroundFactor>>32)&0xFFFFFFFF);
-
-    // mix in current sample
-    slLeftSample  += pslDstBuffer[0];
-    slRightSample += pslDstBuffer[1];
-    // upper clamp
-    if( slLeftSample  > MAX_SWORD) slLeftSample  = MAX_SWORD;
-    if( slRightSample > MAX_SWORD) slRightSample = MAX_SWORD;
-    // lower clamp
-    if( slLeftSample  < MIN_SWORD) slLeftSample  = MIN_SWORD;
-    if( slRightSample < MIN_SWORD) slRightSample = MIN_SWORD;
-
-    // store samples (both channels)
-    pslDstBuffer[0] = slLeftSample;
-    pslDstBuffer[1] = slRightSample;
-
-    // modify volume  `
-    slLeftVolume  += (SWORD)((mmVolumeGain>> 0)&0xFFFF);
-    slRightVolume += (SWORD)((mmVolumeGain>>16)&0xFFFF);
-
-    // advance to next sample
-    fixLeftOfs   += fixLeftStep;
-    fixRightOfs  += fixRightStep;
-    pslDstBuffer += 2;
-    iCt--;
-  }
-
- #elif (defined __MSVC_INLINE__)
+ #if (defined __MSVC_INLINE__)
  __asm {
    // convert from floats to fixints 32:16
    fld     D [fLeftOfs]
@ -553,19 +435,6 @@ loopEnd:
   MixMono_asm(pso);

 #else
-   #error please write inline asm for your platform.
- #endif
-
-  _pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER);
-}
-
-
-// mixes one stereo 16-bit signed sound to destination buffer
-inline void MixStereo( CSoundObject *pso)
-{
-  _pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER);
-
- #if (defined USE_PORTABLE_C)
  // initialize some local vars
  SLONG slLeftSample, slRightSample, slNextSample;
  SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer;
@ -599,12 +468,12 @@ inline void MixStereo( CSoundObject *pso)
    if( iCt<=0 || bEndOfSound) break;

    // fetch one lineary interpolated sample on left channel
-    slLeftSample = pswSrcBuffer[(fixLeftOfs>>15)+0];
-    slNextSample = pswSrcBuffer[(fixLeftOfs>>15)+2];
+    slLeftSample = pswSrcBuffer[(fixLeftOfs>>16)+0];
+    slNextSample = pswSrcBuffer[(fixLeftOfs>>16)+1];
    slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16;
    // fetch one lineary interpolated sample on right channel
-    slRightSample = pswSrcBuffer[(fixRightOfs>>15)+0];
-    slNextSample  = pswSrcBuffer[(fixRightOfs>>15)+2];
+    slRightSample = pswSrcBuffer[(fixRightOfs>>16)+0];
+    slNextSample  = pswSrcBuffer[(fixRightOfs>>16)+1];
    slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16;

    // filter samples
@ -643,7 +512,18 @@ inline void MixStereo( CSoundObject *pso)
    iCt--;
  }

- #elif (defined __MSVC_INLINE__)
+ #endif
+
+  _pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER);
+}
+
+
+// mixes one stereo 16-bit signed sound to destination buffer
+inline void MixStereo( CSoundObject *pso)
+{
+  _pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER);
+
+ #if (defined __MSVC_INLINE__)
  __asm {
    // convert from floats to fixints 32:16
    fld     D [fLeftOfs]
@ -783,7 +663,83 @@ loopEnd:
   MixStereo_asm(pso);

 #else
-   #error please write inline asm for your platform.
+  // initialize some local vars
+  SLONG slLeftSample, slRightSample, slNextSample;
+  SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer;
+  fixLeftOfs   = (__int64)(fLeftOfs   * 65536.0);
+  fixRightOfs  = (__int64)(fRightOfs  * 65536.0);
+  __int64 fixLeftStep  = (__int64)(fLeftStep  * 65536.0);
+  __int64 fixRightStep = (__int64)(fRightStep * 65536.0);
+  __int64 fixSoundBufferSize = ((__int64)slSoundBufferSize)<<16;
+  mmSurroundFactor = (__int64)(SWORD)mmSurroundFactor;
+
+  SLONG slLeftVolume_ = slLeftVolume >> 16;
+  SLONG slRightVolume_ = slRightVolume >> 16;
+
+  // loop thru source buffer
+  INDEX iCt = slMixerBufferSize;
+  FOREVER
+  {
+    // if left channel source sample came to end of sample buffer
+    if( fixLeftOfs >= fixSoundBufferSize) {
+      fixLeftOfs -= fixSoundBufferSize;
+      // if has no loop, end it
+      bEndOfSound = bNotLoop;
+    }
+    // if right channel source sample came to end of sample buffer
+    if( fixRightOfs >= fixSoundBufferSize) {
+      fixRightOfs -= fixSoundBufferSize;
+      // if has no loop, end it
+      bEndOfSound = bNotLoop;
+    }
+    // end of buffer?
+    if( iCt<=0 || bEndOfSound) break;
+
+    // fetch one linearly interpolated sample on left channel
+    slLeftSample = pswSrcBuffer[(fixLeftOfs>>15)+0];
+    slNextSample = pswSrcBuffer[(fixLeftOfs>>15)+2];
+    slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16;
+    // fetch one linearly interpolated sample on right channel
+    slRightSample = pswSrcBuffer[(fixRightOfs>>15)+0];
+    slNextSample  = pswSrcBuffer[(fixRightOfs>>15)+2];
+    slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16;
+
+    // filter samples
+    slLastLeftSample  += ((slLeftSample -slLastLeftSample) *slLeftFilter) >>15;
+    slLastRightSample += ((slRightSample-slLastRightSample)*slRightFilter)>>15;
+
+    // apply stereo volume to current sample
+    slLeftSample  = (slLastLeftSample  * slLeftVolume_) >>15;
+    slRightSample = (slLastRightSample * slRightVolume_)>>15;
+
+    slLeftSample  ^= (SLONG)((mmSurroundFactor>> 0)&0xFFFFFFFF);
+    slRightSample ^= (SLONG)((mmSurroundFactor>>32)&0xFFFFFFFF);
+
+    // mix in current sample
+    slLeftSample  += pslDstBuffer[0];
+    slRightSample += pslDstBuffer[1];
+    // upper clamp
+    if( slLeftSample  > MAX_SWORD) slLeftSample  = MAX_SWORD;
+    if( slRightSample > MAX_SWORD) slRightSample = MAX_SWORD;
+    // lower clamp
+    if( slLeftSample  < MIN_SWORD) slLeftSample  = MIN_SWORD;
+    if( slRightSample < MIN_SWORD) slRightSample = MIN_SWORD;
+
+    // store samples (both channels)
+    pslDstBuffer[0] = slLeftSample;
+    pslDstBuffer[1] = slRightSample;
+
+    // modify volume
+    slLeftVolume  += (SWORD)((mmVolumeGain>> 0)&0xFFFF);
+    slRightVolume += (SWORD)((mmVolumeGain>>16)&0xFFFF);
+
+    // advance to next sample
+    fixLeftOfs   += fixLeftStep;
+    fixRightOfs  += fixRightStep;
+    pslDstBuffer += 2;
+    iCt--;
+  }
+
 #endif

  _pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER);