rework asm to always fall back to portable C code

with this there is no need to worry about the x86 asm switches on other
platforms.
notaz, 2016-04-24 20:16:04 +03:00
parent 78b26698ac, commit 1f70d4e242
19 changed files with 790 additions and 1018 deletions
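Every file below follows the same transformation: the explicit USE_PORTABLE_C branch (and the trailing #error for unknown platforms) is removed, and the portable C code moves into the final #else, so the inline-asm paths become mere preferences. A minimal sketch of the resulting shape, using the macro names that appear throughout this commit:

#if (defined __MSVC_INLINE__)
  // MSVC x86 inline asm
#elif (defined __GNU_INLINE_X86_32__)
  // GCC x86 inline asm
#else
  // portable C fallback; previously guarded by USE_PORTABLE_C,
  // now reached automatically on any other platform or compiler
#endif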

View File

@ -65,8 +65,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
#else
#warning "UNKNOWN PLATFORM IDENTIFIED!!!!"
#define PLATFORM_UNKNOWN 1
#warning "USING PORTABLE C!!!"
#define USE_PORTABLE_C
#endif
#if PLATFORM_LINUX || PLATFORM_MACOSX

View File

@ -21,24 +21,13 @@ with this program; if not, write to the Free Software Foundation, Inc.,
template class CStaticArray<CProfileCounter>;
template class CStaticArray<CProfileTimer>;
#if (defined USE_PORTABLE_C)
#if (defined PLATFORM_UNIX) && !defined(__GNU_INLINE_X86_32__)
#include <sys/time.h>
#endif
static inline __int64 ReadTSC_profile(void)
{
#if (defined USE_PORTABLE_C)
#ifdef __arm__
struct timespec tv;
clock_gettime(CLOCK_MONOTONIC, &tv);
return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_nsec) / 1000000) );
#else
struct timeval tv;
gettimeofday(&tv, NULL);
return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_usec) / 1000) );
#endif
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__int64 mmRet;
__asm {
rdtsc
@ -60,7 +49,16 @@ static inline __int64 ReadTSC_profile(void)
return(mmRet);
#else
#error Please implement for your platform/compiler.
#ifdef __arm__
struct timespec tv;
clock_gettime(CLOCK_MONOTONIC, &tv);
return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_nsec) / 1000000) );
#else
struct timeval tv;
gettimeofday(&tv, NULL);
return( (((__int64) tv.tv_sec) * 1000) + (((__int64) tv.tv_usec) / 1000) );
#endif
#endif
}
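For reference, the portable timer path above compiles standalone as follows (a sketch with my own function name; note that clock_gettime may require linking librt on older glibc):

#include <stdint.h>
#include <time.h>
#include <sys/time.h>

static int64_t monotonic_ms(void)
{
#ifdef __arm__
  struct timespec ts;
  clock_gettime(CLOCK_MONOTONIC, &ts);      // monotonic: unaffected by clock steps
  return (int64_t)ts.tv_sec * 1000 + ts.tv_nsec / 1000000;
#else
  struct timeval tv;
  gettimeofday(&tv, NULL);                  // wall-clock time; can jump if adjusted
  return (int64_t)tv.tv_sec * 1000 + tv.tv_usec / 1000;
#endif
}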

View File

@ -29,7 +29,7 @@ with this program; if not, write to the Free Software Foundation, Inc.,
#include <Engine/Base/Priority.inl>
// !!! FIXME: use SDL timer code instead and rdtsc never?
#if (USE_PORTABLE_C)
#if (defined PLATFORM_UNIX) && !defined(__GNU_INLINE_X86_32__)
#define USE_GETTIMEOFDAY 1
#endif

View File

@ -229,10 +229,7 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*));
inline ULONG _rotl(ULONG ul, int bits)
{
#if (defined USE_PORTABLE_C)
// DG: according to http://blog.regehr.org/archives/1063 this is fast
return (ul<<bits) | (ul>>(-bits&31));
#elif (defined __GNU_INLINE_X86_32__)
#if (defined __GNU_INLINE_X86_32__)
// This, on the other hand, is wicked fast. :)
__asm__ __volatile__ (
"roll %%cl, %%eax \n\t"
@ -254,7 +251,8 @@ MY_STATIC_ASSERT(size_tSize, sizeof(size_t) == sizeof(void*));
return(ul);
#else
#error need inline asm for your platform.
// DG: according to http://blog.regehr.org/archives/1063 this is fast
return (ul<<bits) | (ul>>(-bits&31));
#endif
}
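The (-bits & 31) form in the fallback is the rotate idiom from the linked Regehr post: it avoids the undefined 32-bit shift by 32 when bits is 0, and mainstream compilers typically recognize the whole expression and emit a single rotate instruction. A standalone sketch (my name, assuming 0 <= bits <= 31):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, int bits)
{
  // bits == 0: -0 & 31 == 0, so this is x | x rather than the undefined x >> 32;
  // bits in 1..31: -bits & 31 == 32 - bits, the usual complementary shift.
  return (x << bits) | (x >> (-bits & 31));
}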

View File

@ -125,14 +125,10 @@ BOOL APIENTRY DllMain( HANDLE hModule, DWORD ul_reason_for_call, LPVOID lpReser
static void DetectCPU(void)
{
#if (defined USE_PORTABLE_C) // rcg10072001
CPrintF(TRANSV(" (No CPU detection in this binary.)\n"));
#else
char strVendor[12+1];
char strVendor[12+1] = { 0 };
strVendor[12] = 0;
ULONG ulTFMS;
ULONG ulFeatures;
ULONG ulTFMS = 0;
ULONG ulFeatures = 0;
#if (defined __MSVC_INLINE__)
// test MMX presence and update flag
@ -181,10 +177,13 @@ static void DetectCPU(void)
: "eax", "ecx", "edx", "memory"
);
#else
#error Please implement for your platform or define USE_PORTABLE_C.
#endif
if (ulTFMS == 0) {
CPrintF(TRANSV(" (No CPU detection in this binary.)\n"));
return;
}
INDEX iType = (ulTFMS>>12)&0x3;
INDEX iFamily = (ulTFMS>> 8)&0xF;
INDEX iModel = (ulTFMS>> 4)&0xF;
@ -215,8 +214,6 @@ static void DetectCPU(void)
sys_iCPUMHz = INDEX(_pTimer->tm_llCPUSpeedHZ/1E6);
if( !bMMX) FatalError( TRANS("MMX support required but not present!"));
#endif // defined USE_PORTABLE_C
}
static void DetectCPUWrapper(void)
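With this change the portable build prints "(No CPU detection in this binary.)" whenever no asm path filled in ulTFMS. On x86 targets built with GCC or Clang one could populate it without inline asm via the compiler's cpuid helper; a hypothetical sketch, not part of this commit:

#include <cpuid.h>      // GCC/Clang, x86/x86-64 only

static bool get_tfms_features(unsigned &ulTFMS, unsigned &ulFeatures)
{
  unsigned eax, ebx, ecx, edx;
  if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))  // leaf 1: signature + feature flags
    return false;
  ulTFMS     = eax;   // [13:12] type, [11:8] family, [7:4] model, [3:0] stepping
  ulFeatures = edx;   // bit 23 = MMX, matching the checks in DetectCPU()
  return true;
}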

View File

@ -247,30 +247,7 @@ COLOR MulColors( COLOR col1, COLOR col2)
if( col2==0xFFFFFFFF) return col1;
if( col1==0 || col2==0) return 0;
#if (defined USE_PORTABLE_C)
// !!! FIXME: This...is not fast.
union
{
COLOR col;
UBYTE bytes[4];
} conv1;
union
{
COLOR col;
UBYTE bytes[4];
} conv2;
conv1.col = col1;
conv2.col = col2;
conv1.bytes[0] = (UBYTE) ((((DWORD) conv1.bytes[0]) * ((DWORD) conv2.bytes[0])) / 255);
conv1.bytes[1] = (UBYTE) ((((DWORD) conv1.bytes[1]) * ((DWORD) conv2.bytes[1])) / 255);
conv1.bytes[2] = (UBYTE) ((((DWORD) conv1.bytes[2]) * ((DWORD) conv2.bytes[2])) / 255);
conv1.bytes[3] = (UBYTE) ((((DWORD) conv1.bytes[3]) * ((DWORD) conv2.bytes[3])) / 255);
return(conv1.col);
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
COLOR colRet;
__asm {
xor ebx,ebx
@ -433,20 +410,6 @@ COLOR MulColors( COLOR col1, COLOR col2)
return colRet;
#else
#error please fill in inline assembly for your platform.
#endif
}
// fast color addition function - RES = clamp (1ST + 2ND)
COLOR AddColors( COLOR col1, COLOR col2)
{
if( col1==0) return col2;
if( col2==0) return col1;
if( col1==0xFFFFFFFF || col2==0xFFFFFFFF) return 0xFFFFFFFF;
COLOR colRet;
#if (defined USE_PORTABLE_C)
// !!! FIXME: This...is not fast.
union
{
@ -459,19 +422,28 @@ COLOR AddColors( COLOR col1, COLOR col2)
COLOR col;
UBYTE bytes[4];
} conv2;
#define MINVAL(a, b) (((a)>(b))?(b):(a))
conv1.col = col1;
conv2.col = col2;
conv1.bytes[0] = (UBYTE) MINVAL((((WORD) conv1.bytes[0]) + ((WORD) conv2.bytes[0])) , 255);
conv1.bytes[1] = (UBYTE) MINVAL((((WORD) conv1.bytes[1]) + ((WORD) conv2.bytes[1])) , 255);
conv1.bytes[2] = (UBYTE) MINVAL((((WORD) conv1.bytes[2]) + ((WORD) conv2.bytes[2])) , 255);
conv1.bytes[3] = (UBYTE) MINVAL((((WORD) conv1.bytes[3]) + ((WORD) conv2.bytes[3])) , 255);
#undef MINVAL
conv1.bytes[0] = (UBYTE) ((((DWORD) conv1.bytes[0]) * ((DWORD) conv2.bytes[0])) / 255);
conv1.bytes[1] = (UBYTE) ((((DWORD) conv1.bytes[1]) * ((DWORD) conv2.bytes[1])) / 255);
conv1.bytes[2] = (UBYTE) ((((DWORD) conv1.bytes[2]) * ((DWORD) conv2.bytes[2])) / 255);
conv1.bytes[3] = (UBYTE) ((((DWORD) conv1.bytes[3]) * ((DWORD) conv2.bytes[3])) / 255);
colRet = conv1.col;
return(conv1.col);
#endif
}
#elif (defined __MSVC_INLINE__)
// fast color addition function - RES = clamp (1ST + 2ND)
COLOR AddColors( COLOR col1, COLOR col2)
{
if( col1==0) return col2;
if( col2==0) return col1;
if( col1==0xFFFFFFFF || col2==0xFFFFFFFF) return 0xFFFFFFFF;
COLOR colRet;
#if (defined __MSVC_INLINE__)
__asm {
xor ebx,ebx
mov esi,255
@ -608,7 +580,29 @@ COLOR AddColors( COLOR col1, COLOR col2)
);
#else
#error please fill in inline assembly for your platform.
// !!! FIXME: This...is not fast.
union
{
COLOR col;
UBYTE bytes[4];
} conv1;
union
{
COLOR col;
UBYTE bytes[4];
} conv2;
#define MINVAL(a, b) (((a)>(b))?(b):(a))
conv1.col = col1;
conv2.col = col2;
conv1.bytes[0] = (UBYTE) MINVAL((((WORD) conv1.bytes[0]) + ((WORD) conv2.bytes[0])) , 255);
conv1.bytes[1] = (UBYTE) MINVAL((((WORD) conv1.bytes[1]) + ((WORD) conv2.bytes[1])) , 255);
conv1.bytes[2] = (UBYTE) MINVAL((((WORD) conv1.bytes[2]) + ((WORD) conv2.bytes[2])) , 255);
conv1.bytes[3] = (UBYTE) MINVAL((((WORD) conv1.bytes[3]) + ((WORD) conv2.bytes[3])) , 255);
#undef MINVAL
colRet = conv1.col;
#endif
return colRet;
@ -619,14 +613,7 @@ COLOR AddColors( COLOR col1, COLOR col2)
// multiple conversion from OpenGL color to DirectX color
extern void abgr2argb( ULONG *pulSrc, ULONG *pulDst, INDEX ct)
{
#if (defined USE_PORTABLE_C)
//#error write me.
for (int i=0; i<ct; i++) {
ULONG tmp = pulSrc[i];
pulDst[i] = (tmp&0xff00ff00) | ((tmp&0x00ff0000)>>16) | ((tmp&0x000000ff)<<16);
}
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
mov esi,dword ptr [pulSrc]
mov edi,dword ptr [pulDst]
@ -678,12 +665,12 @@ colSkip2:
mov dword ptr [edi],eax
colSkip1:
}
#elif (defined __GNU_INLINE_X86_32__)
STUBBED("convert to inline asm.");
#else
#error please fill in inline assembly for your platform.
for (int i=0; i<ct; i++) {
ULONG tmp = pulSrc[i];
pulDst[i] = (tmp&0xff00ff00) | ((tmp&0x00ff0000)>>16) | ((tmp&0x000000ff)<<16);
}
#endif
}
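The C fallbacks for MulColors and AddColors work one 8-bit channel at a time; the same logic restated with shifts instead of the byte unions (a sketch with my names, equivalent on little-endian targets where the union's byte indexing matches the shift order):

#include <stdint.h>

static uint32_t mul_colors_c(uint32_t a, uint32_t b)
{
  uint32_t r = 0;
  for (int i = 0; i < 32; i += 8) {
    uint32_t ca = (a >> i) & 0xFF, cb = (b >> i) & 0xFF;
    r |= ((ca * cb) / 255u) << i;     // same truncating divide as the union version
  }
  return r;
}

static uint32_t add_colors_c(uint32_t a, uint32_t b)
{
  uint32_t r = 0;
  for (int i = 0; i < 32; i += 8) {
    uint32_t s = ((a >> i) & 0xFF) + ((b >> i) & 0xFF);
    r |= (s > 255u ? 255u : s) << i;  // saturate each channel at 255, like MINVAL
  }
  return r;
}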

View File

@ -204,19 +204,7 @@ ENGINE_API extern COLOR AddColors( COLOR col1, COLOR col2); // fast color addito
__forceinline ULONG ByteSwap( ULONG ul)
{
/* rcg10052001 Platform-wrappers. */
#if (defined USE_PORTABLE_C)
ul = ( ((ul << 24) ) |
((ul << 8) & 0x00FF0000) |
((ul >> 8) & 0x0000FF00) |
((ul >> 24) ) );
#if (defined PLATFORM_BIGENDIAN)
BYTESWAP(ul); // !!! FIXME: May not be right!
#endif
return(ul);
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
ULONG ulRet;
__asm {
mov eax,dword ptr [ul]
@ -234,16 +222,22 @@ __forceinline ULONG ByteSwap( ULONG ul)
return(ul);
#else
#error please define for your platform.
ul = ( ((ul << 24) ) |
((ul << 8) & 0x00FF0000) |
((ul >> 8) & 0x0000FF00) |
((ul >> 24) ) );
#if (defined PLATFORM_BIGENDIAN)
BYTESWAP(ul); // !!! FIXME: May not be right!
#endif
return(ul);
#endif
}
__forceinline ULONG rgba2argb( ULONG ul)
{
#if (defined USE_PORTABLE_C)
return( (ul << 24) | (ul >> 8) );
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
ULONG ulRet;
__asm {
mov eax,dword ptr [ul]
@ -263,21 +257,14 @@ __forceinline ULONG rgba2argb( ULONG ul)
return ulRet;
#else
#error please define for your platform.
return (ul << 24) | (ul >> 8);
#endif
}
__forceinline ULONG abgr2argb( COLOR col)
{
#if (defined USE_PORTABLE_C)
// this could be simplified, this is just a safe conversion from asm code
col = ( ((col << 24) ) |
((col << 8) & 0x00FF0000) |
((col >> 8) & 0x0000FF00) |
((col >> 24) ) );
return( (col << 24) | (col >> 8) );
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
ULONG ulRet;
__asm {
mov eax,dword ptr [col]
@ -299,7 +286,13 @@ __forceinline ULONG abgr2argb( COLOR col)
return ulRet;
#else
#error please define for your platform.
// this could be simplified, this is just a safe conversion from asm code
col = ( ((col << 24) ) |
((col << 8) & 0x00FF0000) |
((col >> 8) & 0x0000FF00) |
((col >> 24) ) );
return( (col << 24) | (col >> 8) );
#endif
}
@ -311,10 +304,7 @@ extern void abgr2argb( ULONG *pulSrc, ULONG *pulDst, INDEX ct);
// fast memory copy of ULONGs
inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs)
{
#if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX))
memcpy( pulDst, pulSrc, ctLongs*4);
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
cld
mov esi,dword ptr [pulSrc]
@ -322,23 +312,8 @@ inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs)
mov ecx,dword ptr [ctLongs]
rep movsd
}
#elif (defined __GNU_INLINE_X86_32__)
// I haven't benchmarked it, but in many cases, memcpy() becomes an
// inline (asm?) macro on GNU platforms, so this might not be a
// speed gain at all over the USE_PORTABLE_C version.
// You Have Been Warned. --ryan.
__asm__ __volatile__ (
"cld \n\t"
"rep \n\t"
"movsd \n\t"
: "=S" (pulSrc), "=D" (pulDst), "=c" (ctLongs)
: "S" (pulSrc), "D" (pulDst), "c" (ctLongs)
: "cc", "memory"
);
#else
# error Please fill this in for your platform.
memcpy( pulDst, pulSrc, ctLongs*4);
#endif
}
@ -346,11 +321,7 @@ inline void CopyLongs( ULONG *pulSrc, ULONG *pulDst, INDEX ctLongs)
// fast memory set of ULONGs
inline void StoreLongs( ULONG ulVal, ULONG *pulDst, INDEX ctLongs)
{
#if (defined USE_PORTABLE_C)
for( INDEX i=0; i<ctLongs; i++)
pulDst[i] = ulVal;
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
cld
mov eax,dword ptr [ulVal]
@ -370,7 +341,9 @@ inline void StoreLongs( ULONG ulVal, ULONG *pulDst, INDEX ctLongs)
);
#else
# error Please fill this in for your platform.
for( INDEX i=0; i<ctLongs; i++)
pulDst[i] = ulVal;
#endif
}
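The open-coded shift/mask swap that ByteSwap now falls back to is exactly what compiler byte-swap intrinsics generate; a dispatch sketch (assumptions: GCC/Clang provide __builtin_bswap32, MSVC provides _byteswap_ulong in <stdlib.h>), should anyone want a single bswap instruction without inline asm:

#include <stdint.h>
#ifdef _MSC_VER
#include <stdlib.h>
#endif

static inline uint32_t byte_swap32(uint32_t ul)
{
#if defined(__GNUC__) || defined(__clang__)
  return __builtin_bswap32(ul);      // single bswap instruction on x86
#elif defined(_MSC_VER)
  return _byteswap_ulong(ul);
#else
  return (ul << 24) | ((ul << 8) & 0x00FF0000) |
         ((ul >> 8) & 0x0000FF00) | (ul >> 24);
#endif
}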

View File

@ -38,16 +38,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
#define W word ptr
#define B byte ptr
#if (defined USE_PORTABLE_C)
#define ASMOPT 0
#elif (defined __MSVC_INLINE__)
#define ASMOPT 1
#elif (defined __GNU_INLINE_X86_32__)
#define ASMOPT 1
#else
#define ASMOPT 0
#endif
#define MAXTEXUNITS 4
#define SHADOWTEXTURE 3
@ -153,7 +143,6 @@ void AddElements( ScenePolygon *pspo)
const INDEX ctElems = pspo->spo_ctElements;
INDEX *piDst = _aiElements.Push(ctElems);
#if (ASMOPT == 1)
#if (defined __MSVC_INLINE__)
__asm {
mov eax,D [pspo]
@ -219,11 +208,6 @@ elemDone:
"cc", "memory"
);
#else
#error Please write inline ASM for your platform.
#endif
#else
const INDEX iVtx0Pass = pspo->spo_iVtx0Pass;
const INDEX *piSrc = pspo->spo_piElements;
@ -495,8 +479,6 @@ static void RSBinToGroups( ScenePolygon *pspoFirst)
// determine maximum used groups
ASSERT( _ctGroupsCount);
#if ASMOPT == 1
#if (defined __MSVC_INLINE__)
__asm {
mov eax,2
@ -516,11 +498,6 @@ static void RSBinToGroups( ScenePolygon *pspoFirst)
: "eax", "ecx", "cc", "memory"
);
#else
#error Please write inline ASM for your platform.
#endif
#else
// emulate x86's bsr opcode...not fast. :/
register DWORD val = _ctGroupsCount;
@ -858,9 +835,6 @@ static void RSSetTextureCoords( ScenePolygon *pspoGroup, INDEX iLayer, INDEX iUn
continue;
}
// !!! FIXME: rcg11232001 This inline conversion is broken. Use the
// !!! FIXME: rcg11232001 C version for now with GCC.
#if ((ASMOPT == 1) && (!defined __GNU_INLINE_X86_32__) && (!defined __INTEL_COMPILER))
#if (defined __MSVC_INLINE__)
__asm {
mov esi,D [pspo]
@ -956,11 +930,6 @@ vtxLoop:
);
*/
#else
#error Please write inline ASM for your platform.
#endif
#else
// diffuse mapping
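The "emulate x86's bsr opcode" loop in RSBinToGroups above scans for the index of the highest set bit; on GCC it can be replaced by __builtin_clz, exactly as this commit does in FastLog2 elsewhere. A sketch (my name; __builtin_clz(0) is undefined, hence the guard):

#include <stdint.h>

static inline int highest_set_bit(uint32_t val)
{
#ifdef __GNUC__
  return val ? 31 - __builtin_clz(val) : 0;  // bsr-equivalent via count-leading-zeros
#else
  int i = 31;
  while (i > 0 && !(val & (1u << i))) i--;   // portable linear scan, like the loop above
  return i;
#endif
}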

View File

@ -67,18 +67,7 @@ ULONG PrepareTexture( UBYTE *pubTexture, PIX pixSizeI, PIX pixSizeJ)
// need to upload from RGBA format
const PIX pixTextureSize = pixSizeI*pixSizeJ;
#if (defined USE_PORTABLE_C)
const UBYTE* src = pubTexture;
DWORD* dst = (DWORD*)(pubTexture+pixTextureSize);
for (int i=0; i<pixTextureSize; i++) {
const DWORD tmp = ((DWORD)*src) | 0xFFFFFF00;
*dst = ((tmp << 24) & 0xff000000 ) | ((tmp << 8) & 0x00ff0000 ) |
((tmp >> 8) & 0x0000ff00 ) | ((tmp >> 24) & 0x000000ff );
src++;
dst++;
}
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
mov esi,D [pubTexture]
mov edi,D [pubTexture]
@ -116,7 +105,15 @@ pixLoop:
);
#else
#error Write inline ASM for your platform.
const UBYTE* src = pubTexture;
DWORD* dst = (DWORD*)(pubTexture+pixTextureSize);
for (int i=0; i<pixTextureSize; i++) {
const DWORD tmp = ((DWORD)*src) | 0xFFFFFF00;
*dst = ((tmp << 24) & 0xff000000 ) | ((tmp << 8) & 0x00ff0000 ) |
((tmp >> 8) & 0x0000ff00 ) | ((tmp >> 24) & 0x000000ff );
src++;
dst++;
}
#endif

View File

@ -169,32 +169,7 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV,
if( pixSizeV==0) pixSizeV=1;
pixSize = pixSizeU*pixSizeV;
#if (defined USE_PORTABLE_C)
// Basically average every other pixel...
UWORD w = 0;
UBYTE *dptr = (UBYTE *) pulDst;
UBYTE *sptr = (UBYTE *) pulSrc;
#if 0
pixSize *= 4;
for (PIX i = 0; i < pixSize; i++)
{
*dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[1])) >> 1 );
dptr++;
sptr += 2;
}
#else
for (PIX i = 0; i < pixSize; i++)
{
for (PIX j = 0; j < 4; j++)
{
*dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[4])) >> 1 );
dptr++;
sptr++;
}
sptr += 4;
}
#endif
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
pxor mm0,mm0
mov esi,D [pulSrc]
@ -244,7 +219,30 @@ void UploadTexture_OGL( ULONG *pulTexture, PIX pixSizeU, PIX pixSizeV,
);
#else
#error Please write inline ASM for your platform.
// Basically average every other pixel...
UWORD w = 0;
UBYTE *dptr = (UBYTE *) pulDst;
UBYTE *sptr = (UBYTE *) pulSrc;
#if 0
pixSize *= 4;
for (PIX i = 0; i < pixSize; i++)
{
*dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[1])) >> 1 );
dptr++;
sptr += 2;
}
#else
for (PIX i = 0; i < pixSize; i++)
{
for (PIX j = 0; j < 4; j++)
{
*dptr = (UBYTE) ( (((UWORD) sptr[0]) + ((UWORD) sptr[4])) >> 1 );
dptr++;
sptr++;
}
sptr += 4;
}
#endif
#endif
// upload mipmap

View File

@ -209,58 +209,7 @@ static void MakeOneMipmap( ULONG *pulSrcMipmap, ULONG *pulDstMipmap, PIX pixWidt
if( bBilinear) // type of filtering?
{ // BILINEAR
#if (defined USE_PORTABLE_C)
UBYTE *src = (UBYTE *) pulSrcMipmap;
UBYTE *dest = (UBYTE *) pulDstMipmap;
for (int i = 0 ; i < pixHeight; i++)
{
for (int j = 0; j < pixWidth; j++)
{
// Grab pixels from image
UWORD upleft[4];
UWORD upright[4];
UWORD downleft[4];
UWORD downright[4];
upleft[0] = *(src + 0);
upleft[1] = *(src + 1);
upleft[2] = *(src + 2);
upleft[3] = *(src + 3);
upright[0] = *(src + 4);
upright[1] = *(src + 5);
upright[2] = *(src + 6);
upright[3] = *(src + 7);
downleft[0] = *(src + pixWidth*8 + 0);
downleft[1] = *(src + pixWidth*8 + 1);
downleft[2] = *(src + pixWidth*8 + 2);
downleft[3] = *(src + pixWidth*8 + 3);
downright[0] = *(src + pixWidth*8 + 4);
downright[1] = *(src + pixWidth*8 + 5);
downright[2] = *(src + pixWidth*8 + 6);
downright[3] = *(src + pixWidth*8 + 7);
UWORD answer[4];
answer[0] = upleft[0] + upright[0] + downleft[0] + downright[0] + 2;
answer[1] = upleft[1] + upright[1] + downleft[1] + downright[1] + 2;
answer[2] = upleft[2] + upright[2] + downleft[2] + downright[2] + 2;
answer[3] = upleft[3] + upright[3] + downleft[3] + downright[3] + 2;
answer[0] /= 4;
answer[1] /= 4;
answer[2] /= 4;
answer[3] /= 4;
*(dest + 0) = answer[0];
*(dest + 1) = answer[1];
*(dest + 2) = answer[2];
*(dest + 3) = answer[3];
src += 8;
dest += 4;
}
src += 8*pixWidth;
}
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
pxor mm0,mm0
mov ebx,D [pixWidth]
@ -346,43 +295,63 @@ pixLoopN:
);
#else
#error Write inline asm for your platform.
UBYTE *src = (UBYTE *) pulSrcMipmap;
UBYTE *dest = (UBYTE *) pulDstMipmap;
for (int i = 0 ; i < pixHeight; i++)
{
for (int j = 0; j < pixWidth; j++)
{
// Grab pixels from image
UWORD upleft[4];
UWORD upright[4];
UWORD downleft[4];
UWORD downright[4];
upleft[0] = *(src + 0);
upleft[1] = *(src + 1);
upleft[2] = *(src + 2);
upleft[3] = *(src + 3);
upright[0] = *(src + 4);
upright[1] = *(src + 5);
upright[2] = *(src + 6);
upright[3] = *(src + 7);
downleft[0] = *(src + pixWidth*8 + 0);
downleft[1] = *(src + pixWidth*8 + 1);
downleft[2] = *(src + pixWidth*8 + 2);
downleft[3] = *(src + pixWidth*8 + 3);
downright[0] = *(src + pixWidth*8 + 4);
downright[1] = *(src + pixWidth*8 + 5);
downright[2] = *(src + pixWidth*8 + 6);
downright[3] = *(src + pixWidth*8 + 7);
UWORD answer[4];
answer[0] = upleft[0] + upright[0] + downleft[0] + downright[0] + 2;
answer[1] = upleft[1] + upright[1] + downleft[1] + downright[1] + 2;
answer[2] = upleft[2] + upright[2] + downleft[2] + downright[2] + 2;
answer[3] = upleft[3] + upright[3] + downleft[3] + downright[3] + 2;
answer[0] /= 4;
answer[1] /= 4;
answer[2] /= 4;
answer[3] /= 4;
*(dest + 0) = answer[0];
*(dest + 1) = answer[1];
*(dest + 2) = answer[2];
*(dest + 3) = answer[3];
src += 8;
dest += 4;
}
src += 8*pixWidth;
}
#endif
}
else
{ // NEAREST-NEIGHBOUR but with border preserving
ULONG ulRowModulo = pixWidth*2 *BYTES_PER_TEXEL;
#if (defined USE_PORTABLE_C)
PIX offset = 0;
ulRowModulo /= 4;
for (int q = 0; q < 2; q++)
{
for (PIX i = pixHeight / 2; i > 0; i--)
{
for (PIX j = pixWidth / 2; j > 0; j--)
{
*pulDstMipmap = *(pulSrcMipmap + offset);
pulSrcMipmap += 2;
pulDstMipmap++;
}
for (PIX j = pixWidth / 2; j > 0; j--)
{
*pulDstMipmap = *(pulSrcMipmap + offset + 1);
pulSrcMipmap += 2;
pulDstMipmap++;
}
pulSrcMipmap += ulRowModulo;
}
offset = pixWidth * 2;
}
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
xor ebx,ebx
mov esi,D [pulSrcMipmap]
@ -493,7 +462,33 @@ fullEnd:
);
#else
#error Write inline asm for your platform.
PIX offset = 0;
ulRowModulo /= 4;
for (int q = 0; q < 2; q++)
{
for (PIX i = pixHeight / 2; i > 0; i--)
{
for (PIX j = pixWidth / 2; j > 0; j--)
{
*pulDstMipmap = *(pulSrcMipmap + offset);
pulSrcMipmap += 2;
pulDstMipmap++;
}
for (PIX j = pixWidth / 2; j > 0; j--)
{
*pulDstMipmap = *(pulSrcMipmap + offset + 1);
pulSrcMipmap += 2;
pulDstMipmap++;
}
pulSrcMipmap += ulRowModulo;
}
offset = pixWidth * 2;
}
#endif
}
}
@ -649,7 +644,7 @@ __int64 mmShifter = 0;
__int64 mmMask = 0;
ULONG *pulDitherTable;
#ifdef USE_PORTABLE_C
#if !(defined __MSVC_INLINE__) && !(defined __GNU_INLINE_X86_32__)
extern const UBYTE *pubClipByte;
// increment a byte without overflowing it
static inline void IncrementByteWithClip( UBYTE &ub, SLONG slAdd)
@ -778,35 +773,7 @@ void DitherBitmap( INDEX iDitherType, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth
// ------------------------------- ordered matrix dithering routine
ditherOrder:
#if (defined USE_PORTABLE_C)
union uConv
{
ULONG val;
DWORD dwords[2];
UWORD words[4];
WORD iwords[4];
UBYTE bytes[8];
};
for (int i=0; i<pixHeight; i++) {
int idx = i&3;
uConv dith;
dith.val = pulDitherTable[idx];
for (int j=0; j<4; j++) { dith.words[j] >>= mmShifter; }
dith.val &= mmMask;
uConv* src = (uConv*)(pulSrc+i*pixWidth);
uConv* dst = (uConv*)(pulDst+i*pixWidth);
for (int j=0; j<pixWidth; j+=2) {
uConv p=src[0];
for (int k=0; k<8; k++) {
IncrementByteWithClip(p.bytes[k], dith.bytes[k]);
}
dst[0] = p;
src++;
dst++;
}
}
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
mov esi,D [pulSrc]
mov edi,D [pulDst]
@ -912,7 +879,33 @@ nextRowO:
);
#else
#error Write inline asm for your platform.
union uConv
{
ULONG val;
DWORD dwords[2];
UWORD words[4];
WORD iwords[4];
UBYTE bytes[8];
};
for (int i=0; i<pixHeight; i++) {
int idx = i&3;
uConv dith;
dith.val = pulDitherTable[idx];
for (int j=0; j<4; j++) { dith.words[j] >>= mmShifter; }
dith.val &= mmMask;
uConv* src = (uConv*)(pulSrc+i*pixWidth);
uConv* dst = (uConv*)(pulDst+i*pixWidth);
for (int j=0; j<pixWidth; j+=2) {
uConv p=src[0];
for (int k=0; k<8; k++) {
IncrementByteWithClip(p.bytes[k], dith.bytes[k]);
}
dst[0] = p;
src++;
dst++;
}
}
#endif
goto theEnd;
@ -924,34 +917,7 @@ ditherError:
if( pulDst!=pulSrc) memcpy( pulDst, pulSrc, pixCanvasWidth*pixCanvasHeight *BYTES_PER_TEXEL);
// slModulo+=4;
// now, dither destination
#if (defined USE_PORTABLE_C)
#if 1 //SEB doesn't work....
for (int i=0; i<pixHeight-1; i++) {
int step = (i&1)?-4:+4;
const UBYTE ubMask = (mmErrDiffMask&0xff);
UBYTE *src = ((UBYTE*)pulDst)+i*pixCanvasWidth*4;
if(i&1) src+=pixWidth*4;
// left to right or right to left
for (int j=0; j<pixWidth-1; j++) {
uConv p1, p3, p5, p7;
src+=step;
for (int k=0; k<4; k++) { p1.words[k] = src[k]&ubMask; }
//p1.val &= mmErrDiffMask;
for (int k=0; k<4; k++) { p3.words[k] = (p1.words[k]*3)>>4;
p5.words[k] = (p1.words[k]*5)>>4;
p7.words[k] = (p1.words[k]*7)>>4; }
for (int k=0; k<4; k++) { p1.words[k] -= (p3.words[k] + p5.words[k] + p7.words[k]);}
for (int k=0; k<4; k++) {
IncrementByteWithClip( src[k + step] , p7.words[k]);
IncrementByteWithClip( src[pixCanvasWidth*4 -step +k], p5.words[k]);
IncrementByteWithClip( src[pixCanvasWidth*4 +0 +k], p3.words[k]);
IncrementByteWithClip( src[pixCanvasWidth*4 +step +k], p1.words[k]);
}
}
}
#endif
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
pxor mm0,mm0
mov esi,D [pulDst]
@ -1157,7 +1123,32 @@ allDoneE:
);
#else
#error Write inline asm for your platform.
#if 1 //SEB doesn't work....
for (int i=0; i<pixHeight-1; i++) {
int step = (i&1)?-4:+4;
const UBYTE ubMask = (mmErrDiffMask&0xff);
UBYTE *src = ((UBYTE*)pulDst)+i*pixCanvasWidth*4;
if(i&1) src+=pixWidth*4;
// left to right or right to left
for (int j=0; j<pixWidth-1; j++) {
uConv p1, p3, p5, p7;
src+=step;
for (int k=0; k<4; k++) { p1.words[k] = src[k]&ubMask; }
//p1.val &= mmErrDiffMask;
for (int k=0; k<4; k++) { p3.words[k] = (p1.words[k]*3)>>4;
p5.words[k] = (p1.words[k]*5)>>4;
p7.words[k] = (p1.words[k]*7)>>4; }
for (int k=0; k<4; k++) { p1.words[k] -= (p3.words[k] + p5.words[k] + p7.words[k]);}
for (int k=0; k<4; k++) {
IncrementByteWithClip( src[k + step] , p7.words[k]);
IncrementByteWithClip( src[pixCanvasWidth*4 -step +k], p5.words[k]);
IncrementByteWithClip( src[pixCanvasWidth*4 +0 +k], p3.words[k]);
IncrementByteWithClip( src[pixCanvasWidth*4 +step +k], p1.words[k]);
}
}
}
#endif
#endif
goto theEnd;
@ -1265,7 +1256,7 @@ extern "C" {
}
#ifdef USE_PORTABLE_C
#if !(defined USE_MMX_INTRINSICS) && !(defined __MSVC_INLINE__) && !(defined __GNU_INLINE_X86_32__)
typedef SWORD ExtPix[4];
static inline void extpix_fromi64(ExtPix &pix, const __int64 i64)
@ -1632,265 +1623,6 @@ void FilterBitmap( INDEX iFilter, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth, PI
_mm_empty(); // we're done, clear out the MMX registers!
#elif (defined USE_PORTABLE_C)
slModulo1 /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type
slCanvasWidth /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type
ULONG *src = pulSrc;
ULONG *dst = pulDst;
ULONG *rowptr = aulRows;
ExtPix rmm1={0}, rmm2={0}, rmm3={0}, rmm4={0}, rmm5={0}, rmm6={0}, rmm7={0};
#define EXTPIXFROMINT64(x) ExtPix r##x; extpix_fromi64(r##x, x);
EXTPIXFROMINT64(mmCm);
EXTPIXFROMINT64(mmCe);
EXTPIXFROMINT64(mmCc);
EXTPIXFROMINT64(mmEch);
EXTPIXFROMINT64(mmEcl);
EXTPIXFROMINT64(mmEe);
EXTPIXFROMINT64(mmEm);
EXTPIXFROMINT64(mmMm);
EXTPIXFROMINT64(mmMe);
EXTPIXFROMINT64(mmMc);
EXTPIXFROMINT64(mmAdd);
EXTPIXFROMINT64(mmInvDiv);
#undef EXTPIXFROMINT64
// ----------------------- process upper left corner
extend_pixel(src[0], rmm1);
extend_pixel(src[1], rmm2);
extend_pixel(src[pixCanvasWidth], rmm3);
extend_pixel(src[pixCanvasWidth+1], rmm4);
extpix_add(rmm2, rmm3);
extpix_mul(rmm1, rmmCm);
extpix_mul(rmm2, rmmCe);
extpix_mul(rmm4, rmmCc);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm4);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
*(rowptr++) = unextend_pixel(rmm1);
src++;
// ----------------------- process upper edge pixels
for (PIX i = pixWidth - 2; i != 0; i--)
{
extend_pixel(src[-1], rmm1);
extend_pixel(src[0], rmm2);
extend_pixel(src[1], rmm3);
extend_pixel(src[pixCanvasWidth-1], rmm4);
extend_pixel(src[pixCanvasWidth], rmm5);
extend_pixel(src[pixCanvasWidth+1], rmm6);
extpix_add(rmm1, rmm3);
extpix_add(rmm4, rmm6);
extpix_mul(rmm1, rmmEch);
extpix_mul(rmm2, rmmEm);
extpix_mul(rmm4, rmmEcl);
extpix_mul(rmm5, rmmEe);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm4);
extpix_add(rmm1, rmm5);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
*(rowptr++) = unextend_pixel(rmm1);
src++;
}
// ----------------------- process upper right corner
extend_pixel(src[-1], rmm1);
extend_pixel(src[0], rmm2);
extend_pixel(src[pixCanvasWidth-1], rmm3);
extend_pixel(src[pixCanvasWidth], rmm4);
extpix_add(rmm1, rmm4);
extpix_mul(rmm1, rmmCe);
extpix_mul(rmm2, rmmCm);
extpix_mul(rmm3, rmmCc);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm3);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
*rowptr = unextend_pixel(rmm1);
// ----------------------- process bitmap middle pixels
dst += slCanvasWidth;
src += slModulo1;
// for each row
for (size_t i = pixHeight-2; i != 0; i--) // rowLoop
{
rowptr = aulRows;
// process left edge pixel
extend_pixel(src[-pixCanvasWidth], rmm1);
extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
extend_pixel(src[0], rmm3);
extend_pixel(src[1], rmm4);
extend_pixel(src[pixCanvasWidth], rmm5);
extend_pixel(src[pixCanvasWidth+1], rmm6);
extpix_add(rmm1, rmm5);
extpix_add(rmm2, rmm6);
extpix_mul(rmm1, rmmEch);
extpix_mul(rmm2, rmmEcl);
extpix_mul(rmm3, rmmEm);
extpix_mul(rmm4, rmmEe);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm3);
extpix_add(rmm1, rmm4);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
dst[-pixCanvasWidth] = *rowptr;
*(rowptr++) = unextend_pixel(rmm1);
src++;
dst++;
// for each pixel in current row
for (size_t j = pixWidth-2; j != 0; j--) // pixLoop
{
// prepare upper convolution row
extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
extend_pixel(src[-pixCanvasWidth], rmm2);
extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
// prepare middle convolution row
extend_pixel(src[-1], rmm4);
extend_pixel(src[0], rmm5);
extend_pixel(src[1], rmm6);
// free some registers
extpix_add(rmm1, rmm3);
extpix_add(rmm2, rmm4);
extpix_mul(rmm5, rmmMm);
// prepare lower convolution row
extend_pixel(src[pixCanvasWidth-1], rmm3);
extend_pixel(src[pixCanvasWidth], rmm4);
extend_pixel(src[pixCanvasWidth+1], rmm7);
// calc weighted value
extpix_add(rmm2, rmm6);
extpix_add(rmm1, rmm3);
extpix_add(rmm2, rmm4);
extpix_add(rmm1, rmm7);
extpix_mul(rmm2, rmmMe);
extpix_mul(rmm1, rmmMc);
extpix_add(rmm2, rmm5);
extpix_add(rmm1, rmm2);
// calc and store weighted value
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
dst[-pixCanvasWidth] = *rowptr;
*(rowptr++) = unextend_pixel(rmm1);
// advance to next pixel
src++;
dst++;
}
// process right edge pixel
extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
extend_pixel(src[-pixCanvasWidth], rmm2);
extend_pixel(src[-1], rmm3);
extend_pixel(src[0], rmm4);
extend_pixel(src[pixCanvasWidth-1], rmm5);
extend_pixel(src[pixCanvasWidth], rmm6);
extpix_add(rmm1, rmm5);
extpix_add(rmm2, rmm6);
extpix_mul(rmm1, rmmEcl);
extpix_mul(rmm2, rmmEch);
extpix_mul(rmm3, rmmEe);
extpix_mul(rmm4, rmmEm);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm3);
extpix_add(rmm1, rmm4);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
dst[-pixCanvasWidth] = *rowptr;
*rowptr = unextend_pixel(rmm1);
// advance to next row
src += slModulo1;
dst += slModulo1;
}
// ----------------------- process lower left corner
rowptr = aulRows;
extend_pixel(src[-pixCanvasWidth], rmm1);
extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
extend_pixel(src[0], rmm3);
extend_pixel(src[1], rmm4);
extpix_add(rmm1, rmm4);
extpix_mul(rmm1, rmmCe);
extpix_mul(rmm2, rmmCc);
extpix_mul(rmm3, rmmCm);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm3);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
dst[-pixCanvasWidth] = *rowptr;
dst[0] = unextend_pixel(rmm1);
src++;
dst++;
rowptr++;
// ----------------------- process lower edge pixels
for (size_t i = pixWidth-2; i != 0; i--) // lowerLoop
{
// for each pixel
extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
extend_pixel(src[-pixCanvasWidth], rmm2);
extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
extend_pixel(src[-1], rmm4);
extend_pixel(src[0], rmm5);
extend_pixel(src[1], rmm6);
extpix_add(rmm1, rmm3);
extpix_add(rmm4, rmm6);
extpix_mul(rmm1, rmmEcl);
extpix_mul(rmm2, rmmEe);
extpix_mul(rmm4, rmmEch);
extpix_mul(rmm5, rmmEm);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm4);
extpix_add(rmm1, rmm5);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
dst[-pixCanvasWidth] = *rowptr;
dst[0] = unextend_pixel(rmm1);
// advance to next pixel
src++;
dst++;
rowptr++;
}
// ----------------------- lower right corners
extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
extend_pixel(src[-pixCanvasWidth], rmm2);
extend_pixel(src[-1], rmm3);
extend_pixel(src[0], rmm4);
extpix_add(rmm2, rmm3);
extpix_mul(rmm1, rmmCc);
extpix_mul(rmm2, rmmCe);
extpix_mul(rmm4, rmmCm);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm4);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
dst[-pixCanvasWidth] = *rowptr;
dst[0] = unextend_pixel(rmm1);
#elif (defined __MSVC_INLINE__)
__asm {
cld
@ -2537,7 +2269,264 @@ lowerLoop:
);
#else
#error Write inline asm for your platform.
slModulo1 /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type
slCanvasWidth /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type
ULONG *src = pulSrc;
ULONG *dst = pulDst;
ULONG *rowptr = aulRows;
ExtPix rmm1={0}, rmm2={0}, rmm3={0}, rmm4={0}, rmm5={0}, rmm6={0}, rmm7={0};
#define EXTPIXFROMINT64(x) ExtPix r##x; extpix_fromi64(r##x, x);
EXTPIXFROMINT64(mmCm);
EXTPIXFROMINT64(mmCe);
EXTPIXFROMINT64(mmCc);
EXTPIXFROMINT64(mmEch);
EXTPIXFROMINT64(mmEcl);
EXTPIXFROMINT64(mmEe);
EXTPIXFROMINT64(mmEm);
EXTPIXFROMINT64(mmMm);
EXTPIXFROMINT64(mmMe);
EXTPIXFROMINT64(mmMc);
EXTPIXFROMINT64(mmAdd);
EXTPIXFROMINT64(mmInvDiv);
#undef EXTPIXFROMINT64
// ----------------------- process upper left corner
extend_pixel(src[0], rmm1);
extend_pixel(src[1], rmm2);
extend_pixel(src[pixCanvasWidth], rmm3);
extend_pixel(src[pixCanvasWidth+1], rmm4);
extpix_add(rmm2, rmm3);
extpix_mul(rmm1, rmmCm);
extpix_mul(rmm2, rmmCe);
extpix_mul(rmm4, rmmCc);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm4);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
*(rowptr++) = unextend_pixel(rmm1);
src++;
// ----------------------- process upper edge pixels
for (PIX i = pixWidth - 2; i != 0; i--)
{
extend_pixel(src[-1], rmm1);
extend_pixel(src[0], rmm2);
extend_pixel(src[1], rmm3);
extend_pixel(src[pixCanvasWidth-1], rmm4);
extend_pixel(src[pixCanvasWidth], rmm5);
extend_pixel(src[pixCanvasWidth+1], rmm6);
extpix_add(rmm1, rmm3);
extpix_add(rmm4, rmm6);
extpix_mul(rmm1, rmmEch);
extpix_mul(rmm2, rmmEm);
extpix_mul(rmm4, rmmEcl);
extpix_mul(rmm5, rmmEe);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm4);
extpix_add(rmm1, rmm5);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
*(rowptr++) = unextend_pixel(rmm1);
src++;
}
// ----------------------- process upper right corner
extend_pixel(src[-1], rmm1);
extend_pixel(src[0], rmm2);
extend_pixel(src[pixCanvasWidth-1], rmm3);
extend_pixel(src[pixCanvasWidth], rmm4);
extpix_add(rmm1, rmm4);
extpix_mul(rmm1, rmmCe);
extpix_mul(rmm2, rmmCm);
extpix_mul(rmm3, rmmCc);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm3);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
*rowptr = unextend_pixel(rmm1);
// ----------------------- process bitmap middle pixels
dst += slCanvasWidth;
src += slModulo1;
// for each row
for (size_t i = pixHeight-2; i != 0; i--) // rowLoop
{
rowptr = aulRows;
// process left edge pixel
extend_pixel(src[-pixCanvasWidth], rmm1);
extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
extend_pixel(src[0], rmm3);
extend_pixel(src[1], rmm4);
extend_pixel(src[pixCanvasWidth], rmm5);
extend_pixel(src[pixCanvasWidth+1], rmm6);
extpix_add(rmm1, rmm5);
extpix_add(rmm2, rmm6);
extpix_mul(rmm1, rmmEch);
extpix_mul(rmm2, rmmEcl);
extpix_mul(rmm3, rmmEm);
extpix_mul(rmm4, rmmEe);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm3);
extpix_add(rmm1, rmm4);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
dst[-pixCanvasWidth] = *rowptr;
*(rowptr++) = unextend_pixel(rmm1);
src++;
dst++;
// for each pixel in current row
for (size_t j = pixWidth-2; j != 0; j--) // pixLoop
{
// prepare upper convolution row
extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
extend_pixel(src[-pixCanvasWidth], rmm2);
extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
// prepare middle convolution row
extend_pixel(src[-1], rmm4);
extend_pixel(src[0], rmm5);
extend_pixel(src[1], rmm6);
// free some registers
extpix_add(rmm1, rmm3);
extpix_add(rmm2, rmm4);
extpix_mul(rmm5, rmmMm);
// prepare lower convolution row
extend_pixel(src[pixCanvasWidth-1], rmm3);
extend_pixel(src[pixCanvasWidth], rmm4);
extend_pixel(src[pixCanvasWidth+1], rmm7);
// calc weighted value
extpix_add(rmm2, rmm6);
extpix_add(rmm1, rmm3);
extpix_add(rmm2, rmm4);
extpix_add(rmm1, rmm7);
extpix_mul(rmm2, rmmMe);
extpix_mul(rmm1, rmmMc);
extpix_add(rmm2, rmm5);
extpix_add(rmm1, rmm2);
// calc and store weighted value
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
dst[-pixCanvasWidth] = *rowptr;
*(rowptr++) = unextend_pixel(rmm1);
// advance to next pixel
src++;
dst++;
}
// process right edge pixel
extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
extend_pixel(src[-pixCanvasWidth], rmm2);
extend_pixel(src[-1], rmm3);
extend_pixel(src[0], rmm4);
extend_pixel(src[pixCanvasWidth-1], rmm5);
extend_pixel(src[pixCanvasWidth], rmm6);
extpix_add(rmm1, rmm5);
extpix_add(rmm2, rmm6);
extpix_mul(rmm1, rmmEcl);
extpix_mul(rmm2, rmmEch);
extpix_mul(rmm3, rmmEe);
extpix_mul(rmm4, rmmEm);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm3);
extpix_add(rmm1, rmm4);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
dst[-pixCanvasWidth] = *rowptr;
*rowptr = unextend_pixel(rmm1);
// advance to next row
src += slModulo1;
dst += slModulo1;
}
// ----------------------- process lower left corner
rowptr = aulRows;
extend_pixel(src[-pixCanvasWidth], rmm1);
extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
extend_pixel(src[0], rmm3);
extend_pixel(src[1], rmm4);
extpix_add(rmm1, rmm4);
extpix_mul(rmm1, rmmCe);
extpix_mul(rmm2, rmmCc);
extpix_mul(rmm3, rmmCm);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm3);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
dst[-pixCanvasWidth] = *rowptr;
dst[0] = unextend_pixel(rmm1);
src++;
dst++;
rowptr++;
// ----------------------- process lower edge pixels
for (size_t i = pixWidth-2; i != 0; i--) // lowerLoop
{
// for each pixel
extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
extend_pixel(src[-pixCanvasWidth], rmm2);
extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
extend_pixel(src[-1], rmm4);
extend_pixel(src[0], rmm5);
extend_pixel(src[1], rmm6);
extpix_add(rmm1, rmm3);
extpix_add(rmm4, rmm6);
extpix_mul(rmm1, rmmEcl);
extpix_mul(rmm2, rmmEe);
extpix_mul(rmm4, rmmEch);
extpix_mul(rmm5, rmmEm);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm4);
extpix_add(rmm1, rmm5);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
dst[-pixCanvasWidth] = *rowptr;
dst[0] = unextend_pixel(rmm1);
// advance to next pixel
src++;
dst++;
rowptr++;
}
// ----------------------- lower right corners
extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
extend_pixel(src[-pixCanvasWidth], rmm2);
extend_pixel(src[-1], rmm3);
extend_pixel(src[0], rmm4);
extpix_add(rmm2, rmm3);
extpix_mul(rmm1, rmmCc);
extpix_mul(rmm2, rmmCe);
extpix_mul(rmm4, rmmCm);
extpix_add(rmm1, rmm2);
extpix_add(rmm1, rmm4);
extpix_adds(rmm1, rmmAdd);
extpix_mulhi(rmm1, rmmInvDiv);
dst[-pixCanvasWidth] = *rowptr;
dst[0] = unextend_pixel(rmm1);
#endif
// all done (finally)
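The ExtPix machinery used by the FilterBitmap fallback emulates one MMX register as four 16-bit lanes, so each extpix_* helper mirrors an MMX instruction; extpix_mulhi, for instance, corresponds to pmulhw (keep the high 16 bits of each signed 16x16 product). A sketch of that lane-wise idea, under assumed semantics matching the helper names:

#include <stdint.h>

typedef int16_t ExtPix4[4];    // hypothetical stand-in for the engine's ExtPix

static void extpix_mulhi_sketch(ExtPix4 r, const ExtPix4 x)
{
  for (int i = 0; i < 4; i++)
    r[i] = (int16_t) (((int32_t) r[i] * x[i]) >> 16);  // per-lane pmulhw
}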

View File

@ -89,13 +89,7 @@ extern void (__stdcall *pglPNTrianglesfATI)( GLenum pname, GLfloat param);
inline void glCOLOR( COLOR col)
{
/* rcg10052001 Platform-wrappers. */
#if (defined USE_PORTABLE_C)
col = ( ((col << 24) ) |
((col << 8) & 0x00FF0000) |
((col >> 8) & 0x0000FF00) |
((col >> 24) ) );
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
mov eax,dword ptr [col]
bswap eax
@ -110,7 +104,11 @@ inline void glCOLOR( COLOR col)
);
#else
#error please define for your platform.
col = ( ((col << 24) ) |
((col << 8) & 0x00FF0000) |
((col >> 8) & 0x0000FF00) |
((col >> 24) ) );
#endif
pglColor4ubv((GLubyte*)&col);

View File

@ -32,9 +32,7 @@ with this program; if not, write to the Free Software Foundation, Inc.,
#define W word ptr
#define B byte ptr
#if (defined USE_PORTABLE_C)
#define ASMOPT 0
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
#define ASMOPT 1
#elif (defined __GNU_INLINE_X86_32__)
#define ASMOPT 1
@ -1285,7 +1283,6 @@ static void RenderWater(void)
{ // SUB-SAMPLING
SLONG slHeightMapStep, slHeightRowStep;
#if ASMOPT == 1
#if (defined __MSVC_INLINE__)
__asm {
push ebx
@ -1460,10 +1457,6 @@ pixLoop:
"cc", "memory"
);
#else
#error fill in for your platform.
#endif
#else
PIX pixPos, pixDU, pixDV;

View File

@ -40,16 +40,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
#define W word ptr
#define B byte ptr
#if (defined USE_PORTABLE_C)
#define ASMOPT 0
#elif (defined __MSVC_INLINE__)
#define ASMOPT 1
#elif (defined __GNU_INLINE_X86_32__)
#define ASMOPT 1
#else
#define ASMOPT 0
#endif
extern INDEX shd_bFineQuality;
extern INDEX shd_iFiltering;
extern INDEX shd_iDithering;
@ -290,7 +280,6 @@ void CLayerMixer::AddAmbientPoint(void)
_slLightMax<<=7;
_slLightStep>>=1;
#if (ASMOPT == 1)
#if (defined __MSVC_INLINE__)
__asm {
// prepare interpolants
@ -439,10 +428,6 @@ skipPixel:
: FPU_REGS, MMX_REGS, "eax", "ecx", "edi", "cc", "memory"
);
#else
#error Write inline asm for your platform.
#endif
#else
// !!! FIXME WARNING: I have not checked this code, and it could be
@ -496,7 +481,6 @@ void CLayerMixer::AddAmbientMaskPoint( UBYTE *pubMask, UBYTE ubMask)
_slLightStep>>=1;
#if (ASMOPT == 1)
#if (defined __MSVC_INLINE__)
__asm {
// prepare interpolants
@ -660,10 +644,6 @@ skipPixel:
"cc", "memory"
);
#else
#error Please write inline assembly for your platform.
#endif
#else // Portable C version...
UBYTE* pubLayer = (UBYTE*)_pulLayer;
@ -723,7 +703,6 @@ void CLayerMixer::AddDiffusionPoint(void)
_slLightMax<<=7;
_slLightStep>>=1;
#if ASMOPT == 1
#if (defined __MSVC_INLINE__)
__asm {
// prepare interpolants
@ -871,10 +850,6 @@ skipPixel:
: FPU_REGS, MMX_REGS, "eax", "ecx", "edi", "cc", "memory"
);
#else
#error Write inline assembly for your platform.
#endif
#else
// for each pixel in the shadow map
UBYTE* pubLayer = (UBYTE*)_pulLayer;
@ -929,7 +904,6 @@ void CLayerMixer::AddDiffusionMaskPoint( UBYTE *pubMask, UBYTE ubMask)
_slLightMax<<=7;
_slLightStep>>=1;
#if (ASMOPT == 1)
#if (defined __MSVC_INLINE__)
__asm {
// prepare interpolants
@ -1091,11 +1065,6 @@ skipPixel:
"cc", "memory"
);
#else
#error Write inline ASM for your platform.
#endif
#else
// for each pixel in the shadow map
@ -1201,7 +1170,6 @@ BOOL CLayerMixer::PrepareOneLayerPoint( CBrushShadowLayer *pbsl, BOOL bNoMask)
FLOAT fDL2oDV = fDDL2oDV + 2*(lm_vStepV%v00);
//_v00 = v00;
#if ((ASMOPT == 1) && (!defined __GNU_INLINE_X86_32__))
#if (defined __MSVC_INLINE__)
__asm {
fld D [fDDL2oDU]
@ -1230,12 +1198,6 @@ BOOL CLayerMixer::PrepareOneLayerPoint( CBrushShadowLayer *pbsl, BOOL bNoMask)
fistp D [_slDDL2oDV]
fistp D [_slDDL2oDU]
}
#elif (defined __GNU_INLINE_X86_32__)
STUBBED("inline asm.");
#else
#error Please write inline assembly for your platform.
#endif
#else
fDDL2oDU *= 2;
fDDL2oDV *= 2;
@ -1321,7 +1283,6 @@ void CLayerMixer::AddOneLayerGradient( CGradientParameters &gp)
_pulLayer = lm_pulShadowMap;
FLOAT fStart = Clamp( fGr00-(fDGroDJ+fDGroDI)*0.5f, 0.0f, 1.0f);
#if ((ASMOPT == 1) && (!defined __GNU_INLINE_X86_32__))
#if (defined __MSVC_INLINE__)
__int64 mmRowAdv;
SLONG fixGRow = (fGr00-(fDGroDJ+fDGroDI)*0.5f)*32767.0f; // 16:15
@ -1436,14 +1397,6 @@ rowNext:
rowDone:
emms
}
#elif (defined __GNU_INLINE_X86_32__)
STUBBED("WRITE ME. Argh.");
#else
#error Need inline assembly for your platform.
#endif
#else
// well, make gradient ...
SLONG slR0=0,slG0=0,slB0=0;
@ -1528,9 +1481,8 @@ rowDone:
// apply directional light or ambient to layer
void CLayerMixer::AddDirectional(void)
{
#if ASMOPT == 1
ULONG ulLight = ByteSwap( lm_colLight);
#if (defined __MSVC_INLINE__)
ULONG ulLight = ByteSwap( lm_colLight);
__asm {
// prepare pointers and variables
mov edi,D [_pulLayer]
@ -1566,6 +1518,7 @@ rowNext:
}
#elif (defined __GNU_INLINE_X86_32__)
ULONG ulLight = ByteSwap( lm_colLight);
ULONG tmp;
__asm__ __volatile__ (
// prepare pointers and variables
@ -1608,10 +1561,6 @@ rowNext:
: FPU_REGS, "mm5", "mm6", "ecx", "edi", "cc", "memory"
);
#else
#error Write inline assembly for your platform.
#endif
#else
UBYTE* pubLayer = (UBYTE*)_pulLayer;
// for each pixel in the shadow map
@ -1631,9 +1580,8 @@ rowNext:
// apply directional light thru mask to layer
void CLayerMixer::AddMaskDirectional( UBYTE *pubMask, UBYTE ubMask)
{
#if ASMOPT == 1
ULONG ulLight = ByteSwap( lm_colLight);
#if (defined __MSVC_INLINE__)
ULONG ulLight = ByteSwap( lm_colLight);
// prepare some local variables
__asm {
// prepare pointers and variables
@ -1666,6 +1614,7 @@ skipLight:
}
#elif (defined __GNU_INLINE_X86_32__)
ULONG ulLight = ByteSwap( lm_colLight);
ULONG tmp;
__asm__ __volatile__ (
// prepare pointers and variables
@ -1706,10 +1655,6 @@ skipLight:
"cc", "memory"
);
#else
#error Please write inline assembly for your platform.
#endif
#else
UBYTE* pubLayer = (UBYTE*)_pulLayer;
// for each pixel in the shadow map
@ -1832,25 +1777,7 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap)
}
} // set initial color
#if (defined USE_PORTABLE_C)
register ULONG count = this->lm_pixCanvasSizeU * this->lm_pixCanvasSizeV;
#if PLATFORM_LITTLEENDIAN
// Forces C fallback; BYTESWAP itself is a no-op on little endian.
register ULONG swapped = BYTESWAP32_unsigned(colAmbient);
#else
STUBBED("actually need byteswap?");
// (uses inline asm on MacOS PowerPC)
register ULONG swapped = colAmbient;
BYTESWAP(swapped);
#endif
for (ULONG *ptr = this->lm_pulShadowMap; count; count--)
{
*ptr = swapped;
ptr++;
}
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
cld
mov ebx,D [this]
@ -1877,7 +1804,23 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap)
);
#else
#error Please write inline assembly for your platform.
register ULONG count = this->lm_pixCanvasSizeU * this->lm_pixCanvasSizeV;
#if PLATFORM_LITTLEENDIAN
// Forces C fallback; BYTESWAP itself is a no-op on little endian.
register ULONG swapped = BYTESWAP32_unsigned(colAmbient);
#else
STUBBED("actually need byteswap?");
// (uses inline asm on MacOS PowerPC)
register ULONG swapped = colAmbient;
BYTESWAP(swapped);
#endif
for (ULONG *ptr = this->lm_pulShadowMap; count; count--)
{
*ptr = swapped;
ptr++;
}
#endif
_pfWorldEditingProfile.StopTimer(CWorldEditingProfile::PTI_AMBIENTFILL);
@ -1955,9 +1898,7 @@ void CLayerMixer::MixOneMipmap(CBrushShadowMap *pbsm, INDEX iMipmap)
// copy from static shadow map to dynamic layer
__forceinline void CLayerMixer::CopyShadowLayer(void)
{
#if (defined USE_PORTABLE_C)
memcpy(lm_pulShadowMap, lm_pulStaticShadowMap, lm_pixCanvasSizeU*lm_pixCanvasSizeV*4);
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
cld
mov ebx,D [this]
@ -1981,7 +1922,7 @@ __forceinline void CLayerMixer::CopyShadowLayer(void)
);
#else
#error Please write inline assembly for your platform.
memcpy(lm_pulShadowMap, lm_pulStaticShadowMap, lm_pixCanvasSizeU*lm_pixCanvasSizeV*4);
#endif
}
@ -1989,12 +1930,7 @@ __forceinline void CLayerMixer::CopyShadowLayer(void)
// copy from static shadow map to dynamic layer
__forceinline void CLayerMixer::FillShadowLayer( COLOR col)
{
#if (defined USE_PORTABLE_C)
DWORD* dst = (DWORD*)lm_pulShadowMap;
int n = lm_pixCanvasSizeU*lm_pixCanvasSizeV;
DWORD color = __builtin_bswap32(col);
while(n--) {*(dst++)=color;}
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
cld
mov ebx,D [this]
@ -2021,7 +1957,10 @@ __forceinline void CLayerMixer::FillShadowLayer( COLOR col)
);
#else
#error Please write inline assembly for your platform.
DWORD* dst = (DWORD*)lm_pulShadowMap;
int n = lm_pixCanvasSizeU*lm_pixCanvasSizeV;
DWORD color = __builtin_bswap32(col);
while(n--) {*(dst++)=color;}
#endif
}
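A portability caveat in FillShadowLayer's new fallback: __builtin_bswap32 is a GCC/Clang builtin, so this #else branch would not compile with a compiler that defines neither __MSVC_INLINE__ nor __GNU_INLINE_X86_32__ yet lacks the builtin (presumably MSVC targeting x64). A compiler-neutral variant would guard it, e.g. (sketch):

#if defined(__GNUC__) || defined(__clang__)
  DWORD color = __builtin_bswap32(col);
#else
  DWORD color = (col << 24) | ((col << 8) & 0x00FF0000) |
                ((col >> 8) & 0x0000FF00) | (col >> 24);
#endif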

View File

@ -24,18 +24,9 @@ with this program; if not, write to the Free Software Foundation, Inc.,
#define _PC_64 0x0300
// !!! FIXME: I'd like to remove any dependency on the FPU control word from the game, asap. --ryan.
#ifdef USE_PORTABLE_C
// Fake control87 for USE_PORTABLE_C version
inline ULONG _control87(WORD newcw, WORD mask)
{
static WORD fpw=_PC_64;
if (mask != 0)
{
fpw &= ~mask;
fpw |= (newcw & mask);
}
return(fpw);
}
#if (defined _MSC_VER)
// _control87 is provided by the compiler
#elif (defined __GNU_INLINE_X86_32__)
@ -74,8 +65,20 @@ inline ULONG _control87(WORD newcw, WORD mask)
return(fpw);
}
#elif (!defined _MSC_VER)
#error Implement for your platform, or add a stub conditional here.
#else
// Fake control87 for USE_PORTABLE_C version
inline ULONG _control87(WORD newcw, WORD mask)
{
static WORD fpw=_PC_64;
if (mask != 0)
{
fpw &= ~mask;
fpw |= (newcw & mask);
}
return(fpw);
}
#endif
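A usage sketch against the fake _control87 above (mask 0x0300 selects the x87 precision-control bits; _PC_64 is defined at the top of this header). With mask 0 the call is a pure query:

ULONG fpwOld = _control87(0, 0);   // read the current (fake) control word, unchanged
_control87(_PC_64, 0x0300);        // request the 64-bit precision bits
// the fake only records the bits; the MSVC/GNU versions also program the FPU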
/* Get current precision setting of FPU. */

View File

@ -312,12 +312,7 @@ inline FLOAT NormByteToFloat( const ULONG ul)
// fast float to int conversion
inline SLONG FloatToInt( FLOAT f)
{
#if defined(__arm__) || defined(USE_PORTABLE_C)
// round to nearest by adding/subtracting 0.5 (depending on f pos/neg) before converting to SLONG
float addToRound = copysignf(0.5f, f); // copy f's signbit to 0.5 => if f<0 then addToRound = -0.5, else 0.5
return((SLONG) (f + addToRound));
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
SLONG slRet;
__asm {
fld D [f]
@ -336,16 +331,16 @@ inline SLONG FloatToInt( FLOAT f)
);
return(slRet);
#else
#error Fill this in for your platform.
// round to nearest by adding/subtracting 0.5 (depending on f pos/neg) before converting to SLONG
float addToRound = copysignf(0.5f, f); // copy f's signbit to 0.5 => if f<0 then addToRound = -0.5, else 0.5
return((SLONG) (f + addToRound));
#endif
}
// log base 2 of any float numero
inline FLOAT Log2( FLOAT f) {
#if (defined USE_PORTABLE_C) || defined(__arm__)
return log2f(f);
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
FLOAT fRet;
_asm {
fld1
@ -368,7 +363,8 @@ inline FLOAT Log2( FLOAT f) {
);
return(fRet);
#else
#error Fill this in for your platform.
return log2f(f);
#endif
}
@ -376,25 +372,7 @@ inline FLOAT Log2( FLOAT f) {
// returns accurate values only for integers that are power of 2
inline SLONG FastLog2( SLONG x)
{
#if (defined USE_PORTABLE_C)
#ifdef __GNUC__
if(x == 0) return 0; // __builtin_clz() is undefined for 0
int numLeadingZeros = __builtin_clz(x);
return 31 - numLeadingZeros;
#else
register SLONG val = x;
register SLONG retval = 31;
while (retval > 0)
{
if (val & (1 << retval))
return retval;
retval--;
}
return 0;
#endif
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
SLONG slRet;
__asm {
bsr eax,D [x]
@ -411,8 +389,21 @@ inline SLONG FastLog2( SLONG x)
: "memory"
);
return(slRet);
#elif (defined __GNUC__)
if(x == 0) return 0; // __builtin_clz() is undefined for 0
int numLeadingZeros = __builtin_clz(x);
return 31 - numLeadingZeros;
#else
#error Fill this in for your platform.
register SLONG val = x;
register SLONG retval = 31;
while (retval > 0)
{
if (val & (1 << retval))
return retval;
retval--;
}
return 0;
#endif
}
@ -420,11 +411,7 @@ inline SLONG FastLog2( SLONG x)
// returns log2 of first larger value that is a power of 2
inline SLONG FastMaxLog2( SLONG x)
{
#if (defined USE_PORTABLE_C)
printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__);
return((SLONG) log2((double) x));
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
SLONG slRet;
__asm {
bsr eax,D [x]
@ -448,7 +435,9 @@ printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__);
);
return(slRet);
#else
#error Fill this in for your platform.
printf("CHECK THIS: %s:%d\n", __FILE__, __LINE__);
return((SLONG) log2((double) x));
#endif
}
*/
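One behavioral note on FloatToInt above: the x87 fistp paths round according to the FPU control word (round-to-nearest-even by default), while the copysignf fallback rounds halfway cases away from zero, so the two can differ by one on exact .5 inputs. A quick illustration, assuming the default FPU mode:

// portable path: (SLONG)(2.5f + 0.5f) ->  3,  (SLONG)(-2.5f - 0.5f) -> -3
// x87 path:      fistp on 2.5         ->  2,  fistp on -2.5         -> -2  (nearest even)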

View File

@ -40,14 +40,6 @@ with this program; if not, write to the Free Software Foundation, Inc.,
#define W word ptr
#define B byte ptr
#if (defined __MSVC_INLINE__)
#define ASMOPT 1
#elif (defined __GNU_INLINE_X86_32__)
#define ASMOPT 0 // !!! FIXME: rcg10112001 Write GCC inline asm versions...
#else
#define ASMOPT 0
#endif
extern BOOL CVA_bModels;
extern BOOL GFX_bTruform;
@ -663,7 +655,7 @@ static FLOAT _fHazeAdd;
// check vertex against fog
static void GetFogMapInVertex( GFXVertex3 &vtx, GFXTexCoord &tex)
{
#if ASMOPT == 1
#if (defined __MSVC_INLINE__)
__asm {
mov esi,D [vtx]
mov edi,D [tex]
@ -708,7 +700,7 @@ static void GetFogMapInVertex( GFXVertex3 &vtx, GFXTexCoord &tex)
// check vertex against haze
static void GetHazeMapInVertex( GFXVertex3 &vtx, FLOAT &tx1)
{
#if ASMOPT == 1
#if (defined __MSVC_INLINE__)
__asm {
mov esi,D [vtx]
mov edi,D [tx1]
@ -1080,7 +1072,7 @@ static void UnpackFrame( CRenderModel &rm, BOOL bKeepNormals)
const ModelFrameVertex16 *pFrame1 = rm.rm_pFrame16_1;
if( pFrame0==pFrame1)
{
#if ASMOPT == 1
#if (defined __MSVC_INLINE__)
// for each vertex in mip
const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
SLONG slTmp1, slTmp2, slTmp3;
@ -1196,7 +1188,7 @@ vtxNext16:
// if lerping
else
{
#if ASMOPT == 1
#if (defined __MSVC_INLINE__)
// for each vertex in mip
const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
SLONG slTmp1, slTmp2, slTmp3;
@ -1365,7 +1357,7 @@ vtxNext16L:
// if no lerping
if( pFrame0==pFrame1)
{
#if ASMOPT == 1
#if (defined __MSVC_INLINE__)
// for each vertex in mip
const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
SLONG slTmp1, slTmp2, slTmp3;
@ -1464,7 +1456,7 @@ vtxNext8:
// if lerping
else
{
#if ASMOPT == 1
#if (defined __MSVC_INLINE__)
const SLONG fixLerpRatio = FloatToInt(fLerpRatio*256.0f); // fix 8:8
SLONG slTmp1, slTmp2, slTmp3;
// re-adjust stretching factors because of fixint lerping (divide by 256)
@ -1610,7 +1602,7 @@ vtxNext8L:
}
// generate colors from shades
#if ASMOPT == 1
#if (defined __MSVC_INLINE__)
__asm {
pxor mm0,mm0
// construct 64-bit RGBA light
@ -1974,7 +1966,7 @@ void CModelObject::RenderModel_View( CRenderModel &rm)
pvtxSrfBase = &_avtxSrfBase[iSrfVx0];
INDEX iSrfVx;
#if ASMOPT == 1
#if (defined __MSVC_INLINE__)
__asm {
push ebx
mov ebx,D [puwSrfToMip]
@ -2074,7 +2066,7 @@ srfVtxLoop:
const COLOR colD = AdjustColor( ms.ms_colDiffuse, _slTexHueShift, _slTexSaturation);
colSrfDiff.MultiplyRGBA( colD, colMdlDiff);
#if ASMOPT == 1
#if (defined __MSVC_INLINE__)
// setup texcoord array
__asm {
push ebx
@ -2134,7 +2126,7 @@ vtxEnd:
for( INDEX iSrfVx=0; iSrfVx<ctSrfVx; iSrfVx++) pcolSrfBase[iSrfVx] = colSrfDiffAdj;
}
else {
#if ASMOPT == 1
#if (defined __MSVC_INLINE__)
// setup color array
const COLOR colS = colSrfDiff.ul.abgr;
__asm {
@ -2335,7 +2327,7 @@ diffColLoop:
// cache rotation
const FLOATmatrix3D &m = rm.rm_mObjectRotation;
#if ASMOPT == 1
#if (defined __MSVC_INLINE__)
__asm {
push ebx
mov ebx,D [m]
@ -2530,7 +2522,7 @@ reflMipLoop:
// cache object view rotation
const FLOATmatrix3D &m = rm.rm_mObjectToView;
#if ASMOPT == 1
#if (defined __MSVC_INLINE__)
__asm {
push ebx
mov ebx,D [m]

View File

@ -105,10 +105,7 @@ static SLONG slTmp;
static inline PIX PIXCoord(FLOAT f) // (f+0.9999f) or (ceil(f))
{
#if (defined USE_PORTABLE_C)
return((PIX) (f+0.9999f));
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
PIX pixRet;
__asm {
fld dword ptr [f]
@ -142,7 +139,8 @@ static inline PIX PIXCoord(FLOAT f) // (f+0.9999f) or (ceil(f))
return pixRet;
#else
#error Please write inline ASM for your platform.
return((PIX) (f+0.9999f));
#endif
}

View File

@ -43,17 +43,15 @@ static CSoundData *psd;
// nasm on MacOS X is getting wrong addresses of external globals, so I have
// to define them in the .asm file...lame.
#ifdef __GNU_INLINE_X86_32__
#ifdef USE_PORTABLE_C
#define INASM
#else
#if (defined __GNU_INLINE_X86_32__)
#define INASM extern
#endif
#else
#elif (defined __MSVC_INLINE__)
#define INASM static
static __int64 mmInvFactor = 0x00007FFF00007FFF;
static FLOAT f65536 = 65536.0f;
static FLOAT f4G = 4294967296.0f;
#else
#define INASM static
#endif
INASM SLONG slMixerBufferSize; // size in samples per channel of the destination buffers
@ -81,11 +79,7 @@ void ResetMixer( const SLONG *pslBuffer, const SLONG slBufferSize)
slMixerBufferSampleRate = _pSound->sl_SwfeFormat.nSamplesPerSec;
// wipe destination mixer buffer
// (Mac OS X uses this path because Apple's memset() is customized for each CPU they support and way faster than this inline asm. --ryan.)
#if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX))
memset(pvMixerBuffer, 0, slMixerBufferSize * 8);
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
cld
xor eax,eax
@ -94,19 +88,8 @@ void ResetMixer( const SLONG *pslBuffer, const SLONG slBufferSize)
shl ecx,1 // *2 because of 32-bit src format
rep stosd
}
#elif (defined __GNU_INLINE_X86_32__)
// !!! FIXME : rcg12172001 Is this REALLY any faster than memset()?
ULONG clob1, clob2;
__asm__ __volatile__ (
"cld \n\t"
"rep \n\t"
"stosl \n\t"
: "=D" (clob1), "=c" (clob2)
: "a" (0), "D" (pvMixerBuffer), "c" (slMixerBufferSize*2)
: "cc", "memory"
);
#else
#error please write inline asm for your platform.
memset(pvMixerBuffer, 0, slMixerBufferSize * 8);
#endif
}
@ -118,10 +101,7 @@ void CopyMixerBuffer_stereo( const SLONG slSrcOffset, void *pDstBuffer, const SL
ASSERT( slBytes%4==0);
if( slBytes<4) return;
#if ((defined USE_PORTABLE_C) || (PLATFORM_MACOSX))
// (Mac OS X uses this path because Apple's memset() is customized for each CPU they support and way faster than this inline asm. --ryan.)
memcpy(pDstBuffer, ((const char *)pvMixerBuffer) + slSrcOffset, slBytes);
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
cld
mov esi,D [slSrcOffset]
@ -131,21 +111,8 @@ void CopyMixerBuffer_stereo( const SLONG slSrcOffset, void *pDstBuffer, const SL
shr ecx,2 // bytes to samples per channel
rep movsd
}
#elif (defined __GNU_INLINE_X86_32__)
// !!! FIXME : rcg12172001 Is this REALLY any faster than memcpy()?
ULONG clob1, clob2, clob3;
__asm__ __volatile__ (
"cld \n\t"
"rep \n\t"
"movsl \n\t"
: "=S" (clob1), "=D" (clob2), "=c" (clob3)
: "S" (((char *)pvMixerBuffer) + slSrcOffset),
"D" (pDstBuffer),
"c" (slBytes >> 2)
: "cc", "memory"
);
#else
#error please write inline asm for your platform.
memcpy(pDstBuffer, ((const char *)pvMixerBuffer) + slSrcOffset, slBytes);
#endif
}
@ -157,18 +124,7 @@ void CopyMixerBuffer_mono( const SLONG slSrcOffset, void *pDstBuffer, const SLON
ASSERT( slBytes%2==0);
if( slBytes<4) return;
#if (defined USE_PORTABLE_C)
// (This is untested, currently. --ryan.)
WORD *dest = (WORD *) pDstBuffer;
WORD *src = (WORD *) ( ((char *) pvMixerBuffer) + slSrcOffset );
SLONG max = slBytes / 4;
for (SLONG i = 0; i < max; i++) {
*dest = *src;
dest++; // move 16 bits.
src+=2; // move 32 bits.
}
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
mov esi,D [slSrcOffset]
add esi,D [pvMixerBuffer]
@ -204,7 +160,15 @@ copyLoop:
);
#else
#error please write inline asm for your platform.
// (This is untested, currently. --ryan.)
WORD *dest = (WORD *) pDstBuffer;
WORD *src = (WORD *) ( ((char *) pvMixerBuffer) + slSrcOffset );
SLONG max = slBytes / 4;
for (SLONG i = 0; i < max; i++) {
*dest = *src;
dest++; // move 16 bits.
src+=2; // move 32 bits.
}
#endif
}
@ -215,24 +179,7 @@ static void ConvertMixerBuffer( const SLONG slBytes)
ASSERT( slBytes%4==0);
if( slBytes<4) return;
#if (defined USE_PORTABLE_C)
//STUBBED("ConvertMixerBuffer");
SWORD *dest = (SWORD *) pvMixerBuffer;
SLONG *src = (SLONG *) pvMixerBuffer;
SLONG max = slBytes / 2;
int tmp;
for (SLONG i = 0; i < max; i++) {
tmp = *src;
if (tmp>32767) tmp=32767;
if (tmp<-32767) tmp=-32767;
*dest=tmp;
dest++; // move 16 bits.
src++; // move 32 bits.
}
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
cld
mov esi,D [pvMixerBuffer]
@ -271,7 +218,20 @@ copyLoop:
);
#else
#error please write inline asm for your platform.
SWORD *dest = (SWORD *) pvMixerBuffer;
SLONG *src = (SLONG *) pvMixerBuffer;
SLONG max = slBytes / 2;
int tmp;
for (SLONG i = 0; i < max; i++) {
tmp = *src;
if (tmp>32767) tmp=32767;
if (tmp<-32767) tmp=-32767;
*dest=tmp;
dest++; // move 16 bits.
src++; // move 32 bits.
}
#endif
}
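ConvertMixerBuffer narrows the 32-bit accumulation buffer to 16-bit samples in place, clamping symmetrically to +/-32767 rather than -32768; the in-place walk is safe because the 16-bit write pointer trails the 32-bit read pointer. A standalone sketch of the same saturation, assuming engine-like typedefs:

    #include <assert.h>

    typedef short SWORD;
    typedef int   SLONG;  /* assumed 32-bit, as in the engine */

    /* narrow 32-bit mixed samples to 16 bits in place; the 16-bit write
       pointer trails the 32-bit read pointer, so no sample is clobbered
       before it is read */
    static void convert(SLONG *buf, int nSamples)
    {
      SWORD *dest = (SWORD *)buf;
      for (int i = 0; i < nSamples; i++) {
        SLONG tmp = buf[i];
        if (tmp >  32767) tmp =  32767;  /* symmetric clamp, like the engine: */
        if (tmp < -32767) tmp = -32767;  /* -32767, never -32768 */
        dest[i] = (SWORD)tmp;
      }
    }

    int main(void)
    {
      SLONG buf[3] = { 40000, -40000, 1234 };
      convert(buf, 3);
      SWORD *out = (SWORD *)buf;
      assert(out[0] == 32767 && out[1] == -32767 && out[2] == 1234);
      return 0;
    }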
@ -337,85 +297,7 @@ inline void MixMono( CSoundObject *pso)
{
_pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER);
#if (defined USE_PORTABLE_C)
// initialize some local vars
SLONG slLeftSample, slRightSample, slNextSample;
SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer;
fixLeftOfs = (__int64)(fLeftOfs * 65536.0);
fixRightOfs = (__int64)(fRightOfs * 65536.0);
__int64 fixLeftStep = (__int64)(fLeftStep * 65536.0);
__int64 fixRightStep = (__int64)(fRightStep * 65536.0);
__int64 fixSoundBufferSize = ((__int64)slSoundBufferSize)<<16;
mmSurroundFactor = (__int64)(SWORD)mmSurroundFactor;
SLONG slLeftVolume_ = slLeftVolume >> 16;
SLONG slRightVolume_ = slRightVolume >> 16;
// loop thru source buffer
INDEX iCt = slMixerBufferSize;
FOREVER
{
// if left channel source sample came to end of sample buffer
if( fixLeftOfs >= fixSoundBufferSize) {
fixLeftOfs -= fixSoundBufferSize;
// if has no loop, end it
bEndOfSound = bNotLoop;
}
// if right channel source sample came to end of sample buffer
if( fixRightOfs >= fixSoundBufferSize) {
fixRightOfs -= fixSoundBufferSize;
// if has no loop, end it
bEndOfSound = bNotLoop;
}
// end of buffer?
if( iCt<=0 || bEndOfSound) break;
// fetch one linearly interpolated sample on left channel
slLeftSample = pswSrcBuffer[(fixLeftOfs>>16)+0];
slNextSample = pswSrcBuffer[(fixLeftOfs>>16)+1];
slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16;
// fetch one linearly interpolated sample on right channel
slRightSample = pswSrcBuffer[(fixRightOfs>>16)+0];
slNextSample = pswSrcBuffer[(fixRightOfs>>16)+1];
slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16;
// filter samples
slLastLeftSample += ((slLeftSample -slLastLeftSample) *slLeftFilter) >>15;
slLastRightSample += ((slRightSample-slLastRightSample)*slRightFilter)>>15;
// apply stereo volume to current sample
slLeftSample = (slLastLeftSample * slLeftVolume_) >>15;
slRightSample = (slLastRightSample * slRightVolume_)>>15;
slLeftSample ^= (SLONG)((mmSurroundFactor>> 0)&0xFFFFFFFF);
slRightSample ^= (SLONG)((mmSurroundFactor>>32)&0xFFFFFFFF);
// mix in current sample
slLeftSample += pslDstBuffer[0];
slRightSample += pslDstBuffer[1];
// upper clamp
if( slLeftSample > MAX_SWORD) slLeftSample = MAX_SWORD;
if( slRightSample > MAX_SWORD) slRightSample = MAX_SWORD;
// lower clamp
if( slLeftSample < MIN_SWORD) slLeftSample = MIN_SWORD;
if( slRightSample < MIN_SWORD) slRightSample = MIN_SWORD;
// store samples (both channels)
pslDstBuffer[0] = slLeftSample;
pslDstBuffer[1] = slRightSample;
// modify volume
slLeftVolume += (SWORD)((mmVolumeGain>> 0)&0xFFFF);
slRightVolume += (SWORD)((mmVolumeGain>>16)&0xFFFF);
// advance to next sample
fixLeftOfs += fixLeftStep;
fixRightOfs += fixRightStep;
pslDstBuffer += 2;
iCt--;
}
#elif (defined __MSVC_INLINE__)
#if (defined __MSVC_INLINE__)
__asm {
// convert from floats to fixints 32:16
fld D [fLeftOfs]
@ -553,19 +435,6 @@ loopEnd:
MixMono_asm(pso);
#else
#error please write inline asm for your platform.
#endif
_pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER);
}
// mixes one stereo 16-bit signed sound to destination buffer
inline void MixStereo( CSoundObject *pso)
{
_pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER);
#if (defined USE_PORTABLE_C)
// initialize some local vars
SLONG slLeftSample, slRightSample, slNextSample;
SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer;
@ -599,12 +468,12 @@ inline void MixStereo( CSoundObject *pso)
if( iCt<=0 || bEndOfSound) break;
// fetch one linearly interpolated sample on left channel
slLeftSample = pswSrcBuffer[(fixLeftOfs>>15)+0];
slNextSample = pswSrcBuffer[(fixLeftOfs>>15)+2];
slLeftSample = pswSrcBuffer[(fixLeftOfs>>16)+0];
slNextSample = pswSrcBuffer[(fixLeftOfs>>16)+1];
slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16;
// fetch one linearly interpolated sample on right channel
slRightSample = pswSrcBuffer[(fixRightOfs>>15)+0];
slNextSample = pswSrcBuffer[(fixRightOfs>>15)+2];
slRightSample = pswSrcBuffer[(fixRightOfs>>16)+0];
slNextSample = pswSrcBuffer[(fixRightOfs>>16)+1];
slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16;
// filter samples
@ -643,7 +512,18 @@ inline void MixStereo( CSoundObject *pso)
iCt--;
}
#elif (defined __MSVC_INLINE__)
#endif
_pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER);
}
// mixes one stereo 16-bit signed sound to destination buffer
inline void MixStereo( CSoundObject *pso)
{
_pfSoundProfile.StartTimer(CSoundProfile::PTI_RAWMIXER);
#if (defined __MSVC_INLINE__)
__asm {
// convert from floats to fixints 32:16
fld D [fLeftOfs]
@ -783,7 +663,83 @@ loopEnd:
MixStereo_asm(pso);
#else
#error please write inline asm for your platform.
// initialize some local vars
SLONG slLeftSample, slRightSample, slNextSample;
SLONG *pslDstBuffer = (SLONG*)pvMixerBuffer;
fixLeftOfs = (__int64)(fLeftOfs * 65536.0);
fixRightOfs = (__int64)(fRightOfs * 65536.0);
__int64 fixLeftStep = (__int64)(fLeftStep * 65536.0);
__int64 fixRightStep = (__int64)(fRightStep * 65536.0);
__int64 fixSoundBufferSize = ((__int64)slSoundBufferSize)<<16;
mmSurroundFactor = (__int64)(SWORD)mmSurroundFactor;
SLONG slLeftVolume_ = slLeftVolume >> 16;
SLONG slRightVolume_ = slRightVolume >> 16;
// loop thru source buffer
INDEX iCt = slMixerBufferSize;
FOREVER
{
// if left channel source sample came to end of sample buffer
if( fixLeftOfs >= fixSoundBufferSize) {
fixLeftOfs -= fixSoundBufferSize;
// if has no loop, end it
bEndOfSound = bNotLoop;
}
// if right channel source sample came to end of sample buffer
if( fixRightOfs >= fixSoundBufferSize) {
fixRightOfs -= fixSoundBufferSize;
// if has no loop, end it
bEndOfSound = bNotLoop;
}
// end of buffer?
if( iCt<=0 || bEndOfSound) break;
// fetch one linearly interpolated sample on left channel
slLeftSample = pswSrcBuffer[(fixLeftOfs>>15)+0];
slNextSample = pswSrcBuffer[(fixLeftOfs>>15)+2];
slLeftSample = (slLeftSample*(65535-(fixLeftOfs&65535)) + slNextSample*(fixLeftOfs&65535)) >>16;
// fetch one linearly interpolated sample on right channel
slRightSample = pswSrcBuffer[(fixRightOfs>>15)+0];
slNextSample = pswSrcBuffer[(fixRightOfs>>15)+2];
slRightSample = (slRightSample*(65535-(fixRightOfs&65535)) + slNextSample*(fixRightOfs&65535)) >>16;
// filter samples
slLastLeftSample += ((slLeftSample -slLastLeftSample) *slLeftFilter) >>15;
slLastRightSample += ((slRightSample-slLastRightSample)*slRightFilter)>>15;
// apply stereo volume to current sample
slLeftSample = (slLastLeftSample * slLeftVolume_) >>15;
slRightSample = (slLastRightSample * slRightVolume_)>>15;
slLeftSample ^= (SLONG)((mmSurroundFactor>> 0)&0xFFFFFFFF);
slRightSample ^= (SLONG)((mmSurroundFactor>>32)&0xFFFFFFFF);
// mix in current sample
slLeftSample += pslDstBuffer[0];
slRightSample += pslDstBuffer[1];
// upper clamp
if( slLeftSample > MAX_SWORD) slLeftSample = MAX_SWORD;
if( slRightSample > MAX_SWORD) slRightSample = MAX_SWORD;
// lower clamp
if( slLeftSample < MIN_SWORD) slLeftSample = MIN_SWORD;
if( slRightSample < MIN_SWORD) slRightSample = MIN_SWORD;
// store samples (both channels)
pslDstBuffer[0] = slLeftSample;
pslDstBuffer[1] = slRightSample;
// modify volume
slLeftVolume += (SWORD)((mmVolumeGain>> 0)&0xFFFF);
slRightVolume += (SWORD)((mmVolumeGain>>16)&0xFFFF);
// advance to next sample
fixLeftOfs += fixLeftStep;
fixRightOfs += fixRightStep;
pslDstBuffer += 2;
iCt--;
}
#endif
_pfSoundProfile.StopTimer(CSoundProfile::PTI_RAWMIXER);
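The `>>15`/`+2` versus `>>16`/`+1` indexing that hunk `@ -599,12 +468,12 @@` swaps is the mono/stereo distinction: offsets are 32:16 fixed point, so `ofs>>16` is the sample index and `ofs&65535` the fraction; as the stereo code reads, interleaved L/R data shifts one bit less (a word index, two words per frame) and fetches the next frame at `+2`. A worked sketch of the mono fetch and linear interpolation, with illustrative values:

    #include <assert.h>

    typedef short SWORD;

    int main(void)
    {
      /* 32:16 fixed point: sample index is ofs>>16, fraction is ofs&65535 */
      SWORD buf[2] = { 0, 1000 };
      long long ofs = 1 << 14;     /* 0.25 of the way between buf[0] and buf[1] */

      long s0 = buf[(ofs >> 16) + 0];
      long s1 = buf[(ofs >> 16) + 1];
      long out = (long)((s0 * (65535 - (ofs & 65535)) + s1 * (ofs & 65535)) >> 16);

      assert(out == 250);          /* one quarter of the way from 0 to 1000 */
      return 0;
    }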