mirror of
https://github.com/ptitSeb/Serious-Engine
synced 2024-12-27 07:54:51 +01:00
24cb244d43
This was a _ton_ of changes, made 15 years ago, so there are probably some problems to work out still. Among others: Engine/Base/Stream.* was mostly abandoned and will need to be re-ported. Still, this is a pretty good start, and probably holds a world record for lines of changes or something. :)
3041 lines
100 KiB
C++
3041 lines
100 KiB
C++
/* Copyright (c) 2002-2012 Croteam Ltd. All rights reserved. */
|
|
|
|
// !!! FIXME: One of the GNU inline asm blocks has a bug that causes the
|
|
// !!! FIXME: title on the main menu to render incorrectly. (Generating an
|
|
// !!! FIXME: incorrect mipmap?) The intel compiler works fine with the
|
|
// !!! FIXME: MSVC inline asm, but GCC and Intel both have the problem when
|
|
// !!! FIXME: using the GNU inline asm.
|
|
|
|
#include "Engine/StdH.h"
|
|
|
|
#include <Engine/Base/Statistics_Internal.h>
|
|
#include <Engine/Graphics/GfxLibrary.h>
|
|
#include <Engine/Graphics/RenderPoly.h>
|
|
#include <Engine/Graphics/Color.h>
|
|
#include <Engine/Graphics/Texture.h>
|
|
#include <Engine/Graphics/GfxProfile.h>
|
|
|
|
#if USE_MMX_INTRINSICS
|
|
#include <mmintrin.h>
|
|
#endif
|
|
|
|
// asm shortcuts
|
|
#define O offset
|
|
#define Q qword ptr
|
|
#define D dword ptr
|
|
#define W word ptr
|
|
#define B byte ptr
|
|
|
|
extern INDEX tex_bProgressiveFilter; // filter mipmaps in creation time (not afterwards)
|
|
|
|
|
|
// returns number of mip-maps to skip from original texture
|
|
INDEX ClampTextureSize( PIX pixClampSize, PIX pixClampDimension, PIX pixSizeU, PIX pixSizeV)
|
|
{
|
|
__int64 pixMaxSize = (__int64)pixSizeU * (__int64)pixSizeV;
|
|
PIX pixMaxDimension = Max( pixSizeU, pixSizeV);
|
|
INDEX ctSkipMips = 0;
|
|
while( (pixMaxSize>pixClampSize || pixMaxDimension>pixClampDimension) && pixMaxDimension>1) {
|
|
ctSkipMips++;
|
|
pixMaxDimension >>=1;
|
|
pixMaxSize >>=2;
|
|
}
|
|
return ctSkipMips;
|
|
}
|
|
|
|
|
|
// retrives memory offset of a specified mip-map or a size of all mip-maps (IN PIXELS!)
|
|
// (zero offset means first, i.e. largest mip-map)
|
|
PIX GetMipmapOffset( INDEX iMipLevel, PIX pixWidth, PIX pixHeight)
|
|
{
|
|
PIX pixTexSize = 0;
|
|
PIX pixMipSize = pixWidth*pixHeight;
|
|
INDEX iMips = GetNoOfMipmaps( pixWidth, pixHeight);
|
|
iMips = Min( iMips, iMipLevel);
|
|
while( iMips>0) {
|
|
pixTexSize +=pixMipSize;
|
|
pixMipSize>>=2;
|
|
iMips--;
|
|
}
|
|
return pixTexSize;
|
|
}
|
|
|
|
|
|
// return offset, pointer and dimensions of mipmap of specified size inside texture or shadowmap mipmaps
|
|
INDEX GetMipmapOfSize( PIX pixWantedSize, ULONG *&pulFrame, PIX &pixWidth, PIX &pixHeight)
|
|
{
|
|
INDEX iMipOffset = 0;
|
|
while( pixWidth>1 && pixHeight>1) {
|
|
const PIX pixCurrentSize = pixWidth*pixHeight;
|
|
if( pixCurrentSize <= pixWantedSize) break; // found
|
|
pulFrame += pixCurrentSize;
|
|
pixWidth >>=1;
|
|
pixHeight>>=1;
|
|
iMipOffset++;
|
|
} // done
|
|
return iMipOffset;
|
|
}
|
|
|
|
|
|
// adds 8-bit opaque alpha channel to 24-bit bitmap (in place supported)
|
|
void AddAlphaChannel( UBYTE *pubSrcBitmap, ULONG *pulDstBitmap, PIX pixSize, UBYTE *pubAlphaBitmap)
|
|
{
|
|
UBYTE ubR,ubG,ubB, ubA=255;
|
|
// loop backwards thru all bitmap pixels
|
|
for( INDEX iPix=(pixSize-1); iPix>=0; iPix--) {
|
|
ubR = pubSrcBitmap[iPix*3 +0];
|
|
ubG = pubSrcBitmap[iPix*3 +1];
|
|
ubB = pubSrcBitmap[iPix*3 +2];
|
|
if( pubAlphaBitmap!=NULL) ubA = pubAlphaBitmap[iPix];
|
|
else ubA = 255; // for the sake of forced RGBA internal formats!
|
|
pulDstBitmap[iPix] = ByteSwap( RGBAToColor( ubR,ubG,ubB, ubA));
|
|
}
|
|
}
|
|
|
|
// removes 8-bit alpha channel from 32-bit bitmap (in place supported)
|
|
void RemoveAlphaChannel( ULONG *pulSrcBitmap, UBYTE *pubDstBitmap, PIX pixSize)
|
|
{
|
|
UBYTE ubR,ubG,ubB;
|
|
// loop thru all bitmap pixels
|
|
for( INDEX iPix=0; iPix<pixSize; iPix++) {
|
|
ColorToRGB( ByteSwap( pulSrcBitmap[iPix]), ubR,ubG,ubB);
|
|
pubDstBitmap[iPix*3 +0] = ubR;
|
|
pubDstBitmap[iPix*3 +1] = ubG;
|
|
pubDstBitmap[iPix*3 +2] = ubB;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// flips 24 or 32-bit bitmap (iType: 1-horizontal, 2-vertical, 3-diagonal) - in place supported
|
|
void FlipBitmap( UBYTE *pubSrc, UBYTE *pubDst, PIX pixWidth, PIX pixHeight, INDEX iFlipType, BOOL bAlphaChannel)
|
|
{
|
|
// safety
|
|
ASSERT( iFlipType>=0 && iFlipType<4);
|
|
// no flipping ?
|
|
PIX pixSize = pixWidth*pixHeight;
|
|
if( iFlipType==0) {
|
|
// copy bitmap only if needed
|
|
INDEX ctBPP = (bAlphaChannel ? 4 : 3);
|
|
if( pubSrc!=pubDst) memcpy( pubDst, pubSrc, pixSize*ctBPP);
|
|
return;
|
|
}
|
|
|
|
// prepare images without alpha channels
|
|
ULONG *pulNew = NULL;
|
|
ULONG *pulNewSrc = (ULONG*)pubSrc;
|
|
ULONG *pulNewDst = (ULONG*)pubDst;
|
|
if( !bAlphaChannel) {
|
|
pulNew = (ULONG*)AllocMemory( pixSize *BYTES_PER_TEXEL);
|
|
AddAlphaChannel( pubSrc, pulNew, pixSize);
|
|
pulNewSrc = pulNew;
|
|
pulNewDst = pulNew;
|
|
}
|
|
|
|
// prepare half-width and half-height rounded
|
|
const PIX pixHalfWidth = (pixWidth+1) /2;
|
|
const PIX pixHalfHeight = (pixHeight+1)/2;
|
|
|
|
// flip horizontal
|
|
if( iFlipType==2 || iFlipType==3)
|
|
{ // for each row
|
|
for( INDEX iRow=0; iRow<pixHeight; iRow++)
|
|
{ // find row pointer
|
|
PIX pixRowOffset = iRow*pixWidth;
|
|
// for each pixel in row
|
|
for( INDEX iPix=0; iPix<pixHalfWidth; iPix++)
|
|
{ // transfer pixels
|
|
PIX pixBeg = pulNewSrc[pixRowOffset+iPix];
|
|
PIX pixEnd = pulNewSrc[pixRowOffset+(pixWidth-1-iPix)];
|
|
pulNewDst[pixRowOffset+iPix] = pixEnd;
|
|
pulNewDst[pixRowOffset+(pixWidth-1-iPix)] = pixBeg;
|
|
}
|
|
}
|
|
}
|
|
|
|
// prepare new pointers
|
|
if( iFlipType==3) pulNewSrc = pulNewDst;
|
|
|
|
// flip vertical/diagonal
|
|
if( iFlipType==1 || iFlipType==3)
|
|
{ // for each row
|
|
for( INDEX iRow=0; iRow<pixHalfHeight; iRow++)
|
|
{ // find row pointers
|
|
PIX pixBegOffset = iRow*pixWidth;
|
|
PIX pixEndOffset = (pixHeight-1-iRow)*pixWidth;
|
|
// for each pixel in row
|
|
for( INDEX iPix=0; iPix<pixWidth; iPix++)
|
|
{ // transfer pixels
|
|
PIX pixBeg = pulNewSrc[pixBegOffset+iPix];
|
|
PIX pixEnd = pulNewSrc[pixEndOffset+iPix];
|
|
pulNewDst[pixBegOffset+iPix] = pixEnd;
|
|
pulNewDst[pixEndOffset+iPix] = pixBeg;
|
|
}
|
|
}
|
|
}
|
|
|
|
// postpare images without alpha channels
|
|
if( !bAlphaChannel) {
|
|
RemoveAlphaChannel( pulNewDst, pubDst, pixSize);
|
|
if( pulNew!=NULL) FreeMemory(pulNew);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// makes one level lower mipmap (bilinear or nearest-neighbour with border preservance)
|
|
#if (defined __GNUC__)
|
|
static __int64 mmRounder = 0x0002000200020002ll;
|
|
#else
|
|
static __int64 mmRounder = 0x0002000200020002;
|
|
#endif
|
|
|
|
static void MakeOneMipmap( ULONG *pulSrcMipmap, ULONG *pulDstMipmap, PIX pixWidth, PIX pixHeight, BOOL bBilinear)
|
|
{
|
|
// some safety checks
|
|
ASSERT( pixWidth>1 && pixHeight>1);
|
|
ASSERT( pixWidth == 1L<<FastLog2(pixWidth));
|
|
ASSERT( pixHeight == 1L<<FastLog2(pixHeight));
|
|
pixWidth >>=1;
|
|
pixHeight>>=1;
|
|
|
|
if( bBilinear) // type of filtering?
|
|
{ // BILINEAR
|
|
|
|
#if (defined USE_PORTABLE_C)
|
|
UBYTE *src = (UBYTE *) pulSrcMipmap;
|
|
UBYTE *dest = (UBYTE *) pulDstMipmap;
|
|
for (int i = 0 ; i < pixHeight; i++)
|
|
{
|
|
for (int j = 0; j < pixWidth; j++)
|
|
{
|
|
// Grab pixels from image
|
|
UWORD upleft[4];
|
|
UWORD upright[4];
|
|
UWORD downleft[4];
|
|
UWORD downright[4];
|
|
upleft[0] = *(src + 0);
|
|
upleft[1] = *(src + 1);
|
|
upleft[2] = *(src + 2);
|
|
upleft[3] = *(src + 3);
|
|
upright[0] = *(src + 4);
|
|
upright[1] = *(src + 5);
|
|
upright[2] = *(src + 6);
|
|
upright[3] = *(src + 7);
|
|
|
|
downleft[0] = *(src + pixWidth*8 + 0);
|
|
downleft[1] = *(src + pixWidth*8 + 1);
|
|
downleft[2] = *(src + pixWidth*8 + 2);
|
|
downleft[3] = *(src + pixWidth*8 + 3);
|
|
downright[0] = *(src + pixWidth*8 + 4);
|
|
downright[1] = *(src + pixWidth*8 + 5);
|
|
downright[2] = *(src + pixWidth*8 + 6);
|
|
downright[3] = *(src + pixWidth*8 + 7);
|
|
|
|
UWORD answer[4];
|
|
answer[0] = upleft[0] + upright[0] + downleft[0] + downright[0] + 2;
|
|
answer[1] = upleft[1] + upright[1] + downleft[1] + downright[1] + 2;
|
|
answer[2] = upleft[2] + upright[2] + downleft[2] + downright[2] + 2;
|
|
answer[3] = upleft[3] + upright[3] + downleft[3] + downright[3] + 2;
|
|
answer[0] /= 4;
|
|
answer[1] /= 4;
|
|
answer[2] /= 4;
|
|
answer[3] /= 4;
|
|
|
|
*(dest + 0) = answer[0];
|
|
*(dest + 1) = answer[1];
|
|
*(dest + 2) = answer[2];
|
|
*(dest + 3) = answer[3];
|
|
|
|
src += 8;
|
|
dest += 4;
|
|
}
|
|
src += 8*pixWidth;
|
|
}
|
|
|
|
#elif (defined __MSVC_INLINE__)
|
|
__asm {
|
|
pxor mm0,mm0
|
|
mov ebx,D [pixWidth]
|
|
mov esi,D [pulSrcMipmap]
|
|
mov edi,D [pulDstMipmap]
|
|
mov edx,D [pixHeight]
|
|
rowLoop:
|
|
mov ecx,D [pixWidth]
|
|
pixLoopN:
|
|
movd mm1,D [esi+ 0] // up-left
|
|
movd mm2,D [esi+ 4] // up-right
|
|
movd mm3,D [esi+ ebx*8 +0] // down-left
|
|
movd mm4,D [esi+ ebx*8 +4] // down-right
|
|
punpcklbw mm1,mm0
|
|
punpcklbw mm2,mm0
|
|
punpcklbw mm3,mm0
|
|
punpcklbw mm4,mm0
|
|
paddw mm1,mm2
|
|
paddw mm1,mm3
|
|
paddw mm1,mm4
|
|
paddw mm1,Q [mmRounder]
|
|
psrlw mm1,2
|
|
packuswb mm1,mm0
|
|
movd D [edi],mm1
|
|
// advance to next pixel
|
|
add esi,4*2
|
|
add edi,4
|
|
dec ecx
|
|
jnz pixLoopN
|
|
// advance to next row
|
|
lea esi,[esi+ ebx*8] // skip one row in source mip-map
|
|
dec edx
|
|
jnz rowLoop
|
|
emms
|
|
}
|
|
|
|
#elif (defined __GNU_INLINE__)
|
|
__asm__ __volatile__ (
|
|
"pushl %%ebx \n\t" // Save GCC's register.
|
|
"movl %%ecx, %%ebx \n\t"
|
|
|
|
"pxor %%mm0, %%mm0 \n\t"
|
|
|
|
"0: \n\t" // rowLoop
|
|
"movl %%ebx, %%ecx \n\t"
|
|
|
|
"1: \n\t" // pixLoopN
|
|
"movd 0(%%esi), %%mm1 \n\t" // up-left
|
|
"movd 4(%%esi), %%mm2 \n\t" // up-right
|
|
"movd 0(%%esi, %%ebx, 8), %%mm3 \n\t" // down-left
|
|
"movd 4(%%esi, %%ebx, 8), %%mm4 \n\t" // down-right
|
|
"punpcklbw %%mm0, %%mm1 \n\t"
|
|
"punpcklbw %%mm0, %%mm2 \n\t"
|
|
"punpcklbw %%mm0, %%mm3 \n\t"
|
|
"punpcklbw %%mm0, %%mm4 \n\t"
|
|
"paddw %%mm2, %%mm1 \n\t"
|
|
"paddw %%mm3, %%mm1 \n\t"
|
|
"paddw %%mm4, %%mm1 \n\t"
|
|
"paddw (%%eax), %%mm1 \n\t"
|
|
"psrlw $2, %%mm1 \n\t"
|
|
"packuswb %%mm0, %%mm1 \n\t"
|
|
"movd %%mm1, (%%edi) \n\t"
|
|
|
|
// advance to next pixel
|
|
"addl $8, %%esi \n\t"
|
|
"addl $4, %%edi \n\t"
|
|
"decl %%ecx \n\t"
|
|
"jnz 1b \n\t" // pixLoopN
|
|
|
|
// advance to next row
|
|
// skip one row in source mip-map
|
|
"leal 0(%%esi, %%ebx, 8), %%esi \n\t"
|
|
"decl %%edx \n\t"
|
|
"jnz 0b \n\t" // rowLoop
|
|
"popl %%ebx \n\t" // restore GCC's register.
|
|
"emms \n\t"
|
|
: // no outputs.
|
|
: "a" (&mmRounder), "c" (pixWidth), "S" (pulSrcMipmap),
|
|
"D" (pulDstMipmap), "d" (pixHeight)
|
|
: "cc", "memory"
|
|
);
|
|
|
|
#else
|
|
#error Write inline asm for your platform.
|
|
#endif
|
|
}
|
|
else
|
|
{ // NEAREST-NEIGHBOUR but with border preserving
|
|
ULONG ulRowModulo = pixWidth*2 *BYTES_PER_TEXEL;
|
|
|
|
#if (defined USE_PORTABLE_C)
|
|
|
|
PIX offset = 0;
|
|
ulRowModulo /= 4;
|
|
|
|
for (int q = 0; q < 2; q++)
|
|
{
|
|
for (PIX i = pixHeight / 2; i > 0; i--)
|
|
{
|
|
for (PIX j = pixWidth / 2; j > 0; j--)
|
|
{
|
|
*pulDstMipmap = *(pulSrcMipmap + offset);
|
|
pulSrcMipmap += 2;
|
|
pulDstMipmap++;
|
|
}
|
|
|
|
for (PIX j = pixWidth / 2; j > 0; j--)
|
|
{
|
|
*pulDstMipmap = *(pulSrcMipmap + offset + 1);
|
|
pulSrcMipmap += 2;
|
|
pulDstMipmap++;
|
|
}
|
|
|
|
pulSrcMipmap += ulRowModulo;
|
|
}
|
|
|
|
offset = pixWidth * 2;
|
|
}
|
|
|
|
#elif (defined __MSVC_INLINE__)
|
|
__asm {
|
|
xor ebx,ebx
|
|
mov esi,D [pulSrcMipmap]
|
|
mov edi,D [pulDstMipmap]
|
|
// setup upper half
|
|
mov edx,D [pixHeight]
|
|
shr edx,1
|
|
halfLoop:
|
|
mov ecx,D [pixWidth]
|
|
shr ecx,1
|
|
leftLoop:
|
|
mov eax,D [esi+ ebx*8+ 0] // upper-left (or lower-left)
|
|
mov D [edi],eax
|
|
// advance to next pixel
|
|
add esi,4*2
|
|
add edi,4
|
|
sub ecx,1
|
|
jg leftLoop
|
|
// do right row half
|
|
mov ecx,D [pixWidth]
|
|
shr ecx,1
|
|
jz halfEnd
|
|
rightLoop:
|
|
mov eax,D [esi+ ebx*8+ 4] // upper-right (or lower-right)
|
|
mov D [edi],eax
|
|
// advance to next pixel
|
|
add esi,4*2
|
|
add edi,4
|
|
sub ecx,1
|
|
jg rightLoop
|
|
halfEnd:
|
|
// advance to next row
|
|
add esi,D [ulRowModulo] // skip one row in source mip-map
|
|
sub edx,1
|
|
jg halfLoop
|
|
// do eventual lower half loop (if not yet done)
|
|
mov edx,D [pixHeight]
|
|
shr edx,1
|
|
jz fullEnd
|
|
cmp ebx,D [pixWidth]
|
|
mov ebx,D [pixWidth]
|
|
jne halfLoop
|
|
fullEnd:
|
|
}
|
|
|
|
#elif (defined __GNU_INLINE__)
|
|
__asm__ __volatile__ (
|
|
"pushl %%ebx \n\t" // Save GCC's register.
|
|
"movl %%ecx, %%ebx \n\t"
|
|
|
|
// setup upper half
|
|
"pushl %%edx \n\t" // pixHeight
|
|
"pushl %%eax \n\t" // ulRowModulo
|
|
"pushl %%ebx \n\t" // pixWidth
|
|
"xorl %%ebx, %%ebx \n\t"
|
|
"shrl $1, %%edx \n\t"
|
|
|
|
"0: \n\t" // halfLoop
|
|
"movl (%%esp), %%ecx \n\t"
|
|
"shrl $1, %%ecx \n\t"
|
|
|
|
"1: \n\t" // leftLoop
|
|
"movl 0(%%esi, %%ebx, 8), %%eax \n\t" // upper-left (or lower-left)
|
|
"movl %%eax, (%%edi) \n\t"
|
|
|
|
// advance to next pixel
|
|
"addl $8, %%esi \n\t"
|
|
"addl $4, %%edi \n\t"
|
|
"subl $1, %%ecx \n\t"
|
|
"jg 1b \n\t" // leftLoop
|
|
|
|
// do right row half
|
|
"movl (%%esp), %%ecx \n\t"
|
|
"shrl $1, %%ecx \n\t"
|
|
"jz 3f \n\t" // halfEnd
|
|
|
|
"2: \n\t" // rightLoop
|
|
"movl 4(%%esi, %%ebx, 8), %%eax \n\t" // upper-right (or lower-right)
|
|
"movl %%eax, (%%edi) \n\t"
|
|
|
|
// advance to next pixel
|
|
"addl $8, %%esi \n\t"
|
|
"addl $4, %%edi \n\t"
|
|
"subl $1, %%ecx \n\t"
|
|
"jg 2b \n\t" // rightLoop
|
|
|
|
"3: \n\t" // halfEnd
|
|
// advance to next row
|
|
"addl 4(%%esp), %%esi \n\t" // skip one row in source mip-map
|
|
"subl $1, %%edx \n\t"
|
|
"jg 0b \n\t" // halfLoop
|
|
|
|
// do eventual lower half loop (if not yet done)
|
|
"movl 8(%%esp), %%edx \n\t"
|
|
"shrl $1, %%edx \n\t"
|
|
"jz 4f \n\t" // fullEnd
|
|
"cmpl (%%esp), %%ebx \n\t"
|
|
"movl (%%esp), %%ebx \n\t"
|
|
"jne 0b \n\t" // halfLoop
|
|
|
|
"4: \n\t" // fullEnd
|
|
"addl $12, %%esp \n\t"
|
|
"popl %%ebx \n\t" // restore GCC's register.
|
|
: // no outputs.
|
|
: "S" (pulSrcMipmap), "D" (pulDstMipmap), "d" (pixHeight),
|
|
"c" (pixWidth), "a" (ulRowModulo)
|
|
: "cc", "memory"
|
|
);
|
|
|
|
#else
|
|
#error Write inline asm for your platform.
|
|
#endif
|
|
}
|
|
}
|
|
|
|
|
|
// makes ALL lower mipmaps (to size of 1x1!) of a specified 32-bit bitmap
|
|
// and returns pointer to newely created and mipmaped image
|
|
// (only first ctFineMips number of mip-maps will be filtered with bilinear subsampling, while
|
|
// all others will be downsampled with nearest-neighbour method)
|
|
void MakeMipmaps( INDEX ctFineMips, ULONG *pulMipmaps, PIX pixWidth, PIX pixHeight, INDEX iFilter/*=NONE*/)
|
|
{
|
|
ASSERT( pixWidth>0 && pixHeight>0);
|
|
_pfGfxProfile.StartTimer( CGfxProfile::PTI_MAKEMIPMAPS);
|
|
|
|
// prepare some variables
|
|
INDEX ctMipmaps = 1;
|
|
PIX pixTexSize = 0;
|
|
PIX pixCurrWidth = pixWidth;
|
|
PIX pixCurrHeight = pixHeight;
|
|
ULONG *pulSrcMipmap, *pulDstMipmap;
|
|
|
|
// determine filtering mode (-1=prefiltering, 0=none, 1=postfiltering)
|
|
INDEX iFilterMode = 0;
|
|
if( iFilter!=0) {
|
|
iFilterMode = -1;
|
|
if( !tex_bProgressiveFilter) iFilterMode = +1;
|
|
}
|
|
|
|
// loop thru mip-map levels
|
|
while( pixCurrWidth>1 && pixCurrHeight>1)
|
|
{ // determine mip size
|
|
PIX pixMipSize = pixCurrWidth*pixCurrHeight;
|
|
pulSrcMipmap = pulMipmaps + pixTexSize;
|
|
pulDstMipmap = pulSrcMipmap + pixMipSize;
|
|
// do pre filter is required
|
|
if( iFilterMode<0) FilterBitmap( iFilter, pulSrcMipmap, pulSrcMipmap, pixCurrWidth, pixCurrHeight);
|
|
// create one mipmap
|
|
MakeOneMipmap( pulSrcMipmap, pulDstMipmap, pixCurrWidth, pixCurrHeight, ctMipmaps<ctFineMips);
|
|
// do post filter if required
|
|
if( iFilterMode>0) FilterBitmap( iFilter, pulSrcMipmap, pulSrcMipmap, pixCurrWidth, pixCurrHeight);
|
|
// advance to next mipmap
|
|
pixTexSize += pixMipSize;
|
|
pixCurrWidth >>=1;
|
|
pixCurrHeight >>=1;
|
|
ctMipmaps++;
|
|
}
|
|
// all done
|
|
_pfGfxProfile.StopTimer( CGfxProfile::PTI_MAKEMIPMAPS);
|
|
}
|
|
|
|
|
|
// mipmap colorization table (from 1024 to 1)
|
|
static COLOR _acolMips[10] = { C_RED, C_GREEN, C_BLUE, C_CYAN, C_MAGENTA, C_YELLOW, C_RED, C_GREEN, C_BLUE, C_WHITE };
|
|
|
|
// colorize mipmaps
|
|
void ColorizeMipmaps( INDEX i1stMipmapToColorize, ULONG *pulMipmaps, PIX pixWidth, PIX pixHeight)
|
|
{
|
|
// prepare ...
|
|
ULONG *pulSrcMipmap = pulMipmaps + GetMipmapOffset( i1stMipmapToColorize, pixWidth, pixHeight);
|
|
ULONG *pulDstMipmap;
|
|
PIX pixCurrWidth = pixWidth >>i1stMipmapToColorize;
|
|
PIX pixCurrHeight = pixHeight>>i1stMipmapToColorize;
|
|
PIX pixMipSize;
|
|
// skip too large textures
|
|
const PIX pixMaxDim = Max( pixCurrWidth, pixCurrHeight);
|
|
if( pixMaxDim>1024) return;
|
|
INDEX iTableOfs = 10-FastLog2(pixMaxDim);
|
|
|
|
// loop thru mip-map levels
|
|
while( pixCurrWidth>1 && pixCurrHeight>1)
|
|
{ // prepare current mip-level
|
|
pixMipSize = pixCurrWidth*pixCurrHeight;
|
|
pulDstMipmap = pulSrcMipmap + pixMipSize;
|
|
// mask mipmap
|
|
const ULONG ulColorMask = ByteSwap( _acolMips[iTableOfs] | 0x3F3F3FFF);
|
|
for( INDEX iPix=0; iPix<pixMipSize; iPix++) pulSrcMipmap[iPix] &= ulColorMask;
|
|
// advance to next mipmap
|
|
pulSrcMipmap += pixMipSize;
|
|
pixCurrWidth >>=1;
|
|
pixCurrHeight >>=1;
|
|
iTableOfs++;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// calculates standard deviation of a bitmap
|
|
DOUBLE CalcBitmapDeviation( ULONG *pulBitmap, PIX pixSize)
|
|
{
|
|
UBYTE ubR,ubG,ubB;
|
|
ULONG ulSumR =0, ulSumG =0, ulSumB =0;
|
|
__int64 mmSumR2=0, mmSumG2=0, mmSumB2=0;
|
|
|
|
// calculate sum and sum^2
|
|
for( INDEX iPix=0; iPix<pixSize; iPix++) {
|
|
ColorToRGB( ByteSwap(pulBitmap[iPix]), ubR,ubG,ubB);
|
|
ulSumR += ubR; ulSumG += ubG; ulSumB += ubB;
|
|
mmSumR2 += ubR*ubR; mmSumG2 += ubG*ubG; mmSumB2 += ubB*ubB;
|
|
}
|
|
|
|
// calculate deviation of each channel
|
|
DOUBLE d1oSize = 1.0 / (DOUBLE) pixSize;
|
|
DOUBLE d1oSizeM1 = 1.0 / (DOUBLE)(pixSize-1);
|
|
DOUBLE dAvgR = (DOUBLE)ulSumR *d1oSize;
|
|
DOUBLE dAvgG = (DOUBLE)ulSumG *d1oSize;
|
|
DOUBLE dAvgB = (DOUBLE)ulSumB *d1oSize;
|
|
DOUBLE dDevR = Sqrt( ((DOUBLE)mmSumR2 - 2*ulSumR*dAvgR + pixSize*dAvgR*dAvgR) *d1oSizeM1);
|
|
DOUBLE dDevG = Sqrt( ((DOUBLE)mmSumG2 - 2*ulSumG*dAvgG + pixSize*dAvgG*dAvgG) *d1oSizeM1);
|
|
DOUBLE dDevB = Sqrt( ((DOUBLE)mmSumB2 - 2*ulSumB*dAvgB + pixSize*dAvgB*dAvgB) *d1oSizeM1);
|
|
|
|
// return maximum deviation
|
|
return Max( Max( dDevR, dDevG), dDevB);
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
// DITHERING ROUTINES
|
|
|
|
// dither tables
|
|
static ULONG ulDither4[4][4] = {
|
|
{ 0x0F0F0F0F, 0x07070707, 0x0D0D0D0D, 0x05050505 },
|
|
{ 0x03030303, 0x0B0B0B0B, 0x01010101, 0x09090909 },
|
|
{ 0x0C0C0C0C, 0x04040404, 0x0E0E0E0E, 0x06060606 },
|
|
{ 0x00000000, 0x08080808, 0x02020202, 0x0A0A0A0A }
|
|
};
|
|
static ULONG ulDither3[4][4] = {
|
|
{ 0x06060606, 0x02020202, 0x06060606, 0x02020202 },
|
|
{ 0x00000000, 0x04040404, 0x00000000, 0x04040404 },
|
|
{ 0x06060606, 0x02020202, 0x06060606, 0x02020202 },
|
|
{ 0x00000000, 0x04040404, 0x00000000, 0x04040404 },
|
|
};
|
|
static ULONG ulDither2[4][4] = {
|
|
{ 0x02020202, 0x06060606, 0x02020202, 0x06060606 },
|
|
{ 0x06060606, 0x02020202, 0x06060606, 0x02020202 },
|
|
{ 0x02020202, 0x06060606, 0x02020202, 0x06060606 },
|
|
{ 0x06060606, 0x02020202, 0x06060606, 0x02020202 },
|
|
};
|
|
|
|
|
|
static __int64 mmErrDiffMask=0;
|
|
#if (defined __GNUC__)
|
|
static __int64 mmW3 = 0x0003000300030003ll;
|
|
static __int64 mmW5 = 0x0005000500050005ll;
|
|
static __int64 mmW7 = 0x0007000700070007ll;
|
|
#else
|
|
static __int64 mmW3 = 0x0003000300030003;
|
|
static __int64 mmW5 = 0x0005000500050005;
|
|
static __int64 mmW7 = 0x0007000700070007;
|
|
#endif
|
|
static __int64 mmShift = 0;
|
|
static __int64 mmMask = 0;
|
|
static ULONG *pulDitherTable;
|
|
|
|
// performs dithering of a 32-bit bipmap (can be in-place)
|
|
void DitherBitmap( INDEX iDitherType, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth, PIX pixHeight,
|
|
PIX pixCanvasWidth, PIX pixCanvasHeight)
|
|
{
|
|
_pfGfxProfile.StartTimer( CGfxProfile::PTI_DITHERBITMAP);
|
|
|
|
// determine row modulo
|
|
if( pixCanvasWidth ==0) pixCanvasWidth = pixWidth;
|
|
if( pixCanvasHeight==0) pixCanvasHeight = pixHeight;
|
|
ASSERT( pixCanvasWidth>=pixWidth && pixCanvasHeight>=pixHeight);
|
|
SLONG slModulo = (pixCanvasWidth-pixWidth) *BYTES_PER_TEXEL;
|
|
SLONG slWidthModulo = pixWidth*BYTES_PER_TEXEL +slModulo;
|
|
|
|
// if bitmap is smaller than 4x2 pixels
|
|
if( pixWidth<4 || pixHeight<2)
|
|
{ // don't dither it at all, rather copy only (if needed)
|
|
if( pulDst!=pulSrc) memcpy( pulDst, pulSrc, pixCanvasWidth*pixCanvasHeight *BYTES_PER_TEXEL);
|
|
goto theEnd;
|
|
}
|
|
|
|
// determine proper dither type
|
|
switch( iDitherType)
|
|
{ // low dithers
|
|
case 1:
|
|
pulDitherTable = &ulDither2[0][0];
|
|
mmShift = 2;
|
|
#ifdef __GNUC__
|
|
mmMask = 0x3F3F3F3F3F3F3F3Fll;
|
|
#else
|
|
mmMask = 0x3F3F3F3F3F3F3F3F;
|
|
#endif
|
|
goto ditherOrder;
|
|
case 2:
|
|
pulDitherTable = &ulDither2[0][0];
|
|
mmShift = 1;
|
|
#ifdef __GNUC__
|
|
mmMask = 0x7F7F7F7F7F7F7F7Fll;
|
|
#else
|
|
mmMask = 0x7F7F7F7F7F7F7F7F;
|
|
#endif
|
|
goto ditherOrder;
|
|
case 3:
|
|
#ifdef __GNUC__
|
|
mmErrDiffMask = 0x0003000300030003ll;
|
|
#else
|
|
mmErrDiffMask = 0x0003000300030003;
|
|
#endif
|
|
goto ditherError;
|
|
// medium dithers
|
|
case 4:
|
|
pulDitherTable = &ulDither2[0][0];
|
|
mmShift = 0;
|
|
#ifdef __GNUC__
|
|
mmMask = 0xFFFFFFFFFFFFFFFFll;
|
|
#else
|
|
mmMask = 0xFFFFFFFFFFFFFFFF;
|
|
#endif
|
|
goto ditherOrder;
|
|
case 5:
|
|
pulDitherTable = &ulDither3[0][0];
|
|
mmShift = 1;
|
|
#ifdef __GNUC__
|
|
mmMask = 0x7F7F7F7F7F7F7F7Fll;
|
|
#else
|
|
mmMask = 0x7F7F7F7F7F7F7F7F;
|
|
#endif
|
|
goto ditherOrder;
|
|
case 6:
|
|
pulDitherTable = &ulDither4[0][0];
|
|
mmShift = 1;
|
|
#ifdef __GNUC__
|
|
mmMask = 0x7F7F7F7F7F7F7F7Fll;
|
|
#else
|
|
mmMask = 0x7F7F7F7F7F7F7F7F;
|
|
#endif
|
|
goto ditherOrder;
|
|
case 7:
|
|
#ifdef __GNUC__
|
|
mmErrDiffMask = 0x0007000700070007ll;
|
|
#else
|
|
mmErrDiffMask = 0x0007000700070007;
|
|
#endif
|
|
goto ditherError;
|
|
// high dithers
|
|
case 8:
|
|
pulDitherTable = &ulDither3[0][0];
|
|
mmShift = 0;
|
|
#ifdef __GNUC__
|
|
mmMask = 0xFFFFFFFFFFFFFFFFll;
|
|
#else
|
|
mmMask = 0xFFFFFFFFFFFFFFFF;
|
|
#endif
|
|
goto ditherOrder;
|
|
case 9:
|
|
pulDitherTable = &ulDither4[0][0];
|
|
mmShift = 0;
|
|
#ifdef __GNUC__
|
|
mmMask = 0xFFFFFFFFFFFFFFFFll;
|
|
#else
|
|
mmMask = 0xFFFFFFFFFFFFFFFF;
|
|
#endif
|
|
goto ditherOrder;
|
|
case 10:
|
|
#ifdef __GNUC__
|
|
mmErrDiffMask = 0x000F000F000F000Fll;
|
|
#else
|
|
mmErrDiffMask = 0x000F000F000F000F;
|
|
#endif
|
|
goto ditherError;
|
|
default:
|
|
// improper dither type
|
|
ASSERTALWAYS( "Improper dithering type.");
|
|
// if bitmap copying is needed
|
|
if( pulDst!=pulSrc) memcpy( pulDst, pulSrc, pixCanvasWidth*pixCanvasHeight *BYTES_PER_TEXEL);
|
|
goto theEnd;
|
|
}
|
|
|
|
// ------------------------------- ordered matrix dithering routine
|
|
|
|
ditherOrder:
|
|
#if (defined USE_PORTABLE_C)
|
|
STUBBED("ordered matrix dithering routine");
|
|
|
|
#elif (defined __MSVC_INLINE__)
|
|
__asm {
|
|
mov esi,D [pulSrc]
|
|
mov edi,D [pulDst]
|
|
mov ebx,D [pulDitherTable]
|
|
// reset dither line offset
|
|
xor eax,eax
|
|
mov edx,D [pixHeight]
|
|
rowLoopO:
|
|
// get horizontal dither patterns
|
|
movq mm4,Q [ebx+ eax*4 +0]
|
|
movq mm5,Q [ebx+ eax*4 +8]
|
|
psrlw mm4,Q [mmShift]
|
|
psrlw mm5,Q [mmShift]
|
|
pand mm4,Q [mmMask]
|
|
pand mm5,Q [mmMask]
|
|
// process row
|
|
mov ecx,D [pixWidth]
|
|
pixLoopO:
|
|
movq mm1,Q [esi +0]
|
|
movq mm2,Q [esi +8]
|
|
paddusb mm1,mm4
|
|
paddusb mm2,mm5
|
|
movq Q [edi +0],mm1
|
|
movq Q [edi +8],mm2
|
|
// advance to next pixel
|
|
add esi,4*4
|
|
add edi,4*4
|
|
sub ecx,4
|
|
jg pixLoopO // !!!! possible memory leak?
|
|
je nextRowO
|
|
// backup couple of pixels
|
|
lea esi,[esi+ ecx*4]
|
|
lea edi,[edi+ ecx*4]
|
|
nextRowO:
|
|
// get next dither line patterns
|
|
add esi,D [slModulo]
|
|
add edi,D [slModulo]
|
|
add eax,1*4
|
|
and eax,4*4-1
|
|
// advance to next row
|
|
dec edx
|
|
jnz rowLoopO
|
|
emms;
|
|
}
|
|
|
|
#elif (defined __GNU_INLINE__)
|
|
__asm__ __volatile__ (
|
|
// reset dither line offset
|
|
"pushl %%ebx \n\t" // save GCC's register.
|
|
"movl (" ASMSYM(pulDitherTable) "), %%ebx \n\t"
|
|
"pushl %%ecx \n\t" // slModulo
|
|
"pushl %%eax \n\t" // pixWidth
|
|
"xorl %%eax, %%eax \n\t"
|
|
|
|
"rowLoopO: \n\t"
|
|
// get horizontal dither patterns
|
|
"movq 0(%%ebx, %%eax, 4), %%mm4 \n\t"
|
|
"movq 8(%%ebx, %%eax, 4), %%mm5 \n\t"
|
|
"psrlw (" ASMSYM(mmShift) "), %%mm4 \n\t"
|
|
"psrlw (" ASMSYM(mmShift) "), %%mm5 \n\t"
|
|
"pand (" ASMSYM(mmMask) "), %%mm4 \n\t"
|
|
"pand (" ASMSYM(mmMask) "), %%mm5 \n\t"
|
|
|
|
// process row
|
|
"movl (%%esp), %%ecx \n\t"
|
|
"pixLoopO: \n\t"
|
|
"movq 0(%%esi), %%mm1 \n\t"
|
|
"movq 8(%%esi), %%mm2 \n\t"
|
|
"paddusb %%mm4, %%mm1 \n\t"
|
|
"paddusb %%mm5, %%mm2 \n\t"
|
|
"movq %%mm1, 0(%%edi) \n\t"
|
|
"movq %%mm2, 8(%%edi) \n\t"
|
|
|
|
// advance to next pixel
|
|
"addl $16, %%esi \n\t"
|
|
"addl $16, %%edi \n\t"
|
|
"subl $4, %%ecx \n\t"
|
|
"jg pixLoopO \n\t" // !!!! possible memory leak?
|
|
"je nextRowO \n\t"
|
|
|
|
// backup couple of pixels
|
|
"leal 0(%%esi, %%ecx, 4), %%esi \n\t"
|
|
"leal 0(%%edi, %%ecx, 4), %%edi \n\t"
|
|
|
|
"nextRowO: \n\t"
|
|
// get next dither line patterns
|
|
"addl 4(%%esp), %%esi \n\t"
|
|
"addl 4(%%esp), %%edi \n\t"
|
|
"addl $4, %%eax \n\t"
|
|
"andl $15, %%eax \n\t"
|
|
|
|
// advance to next row
|
|
"decl %%edx \n\t"
|
|
"jnz rowLoopO \n\t"
|
|
"emms \n\t"
|
|
"addl $8, %%esp \n\t"
|
|
"popl %%ebx \n\t" // restore GCC's register.
|
|
: // no outputs.
|
|
: "S" (pulSrc), "D" (pulDst), "d" (pixHeight),
|
|
"a" (pixWidth), "c" (slModulo)
|
|
: "cc", "memory"
|
|
);
|
|
|
|
#else
|
|
#error Write inline asm for your platform.
|
|
#endif
|
|
|
|
goto theEnd;
|
|
|
|
// ------------------------------- error diffusion dithering routine
|
|
|
|
ditherError:
|
|
// since error diffusion algorithm requires in-place dithering, original bitmap must be copied if needed
|
|
if( pulDst!=pulSrc) memcpy( pulDst, pulSrc, pixCanvasWidth*pixCanvasHeight *BYTES_PER_TEXEL);
|
|
// slModulo+=4;
|
|
// now, dither destination
|
|
#if (defined USE_PORTABLE_C)
|
|
STUBBED("error diffusion dithering routine");
|
|
|
|
#elif (defined __MSVC_INLINE__)
|
|
__asm {
|
|
pxor mm0,mm0
|
|
mov esi,D [pulDst]
|
|
mov ebx,D [pixCanvasWidth]
|
|
mov edx,D [pixHeight]
|
|
dec edx // need not to dither last row
|
|
rowLoopE:
|
|
// left to right
|
|
mov ecx,D [pixWidth]
|
|
dec ecx
|
|
pixLoopEL:
|
|
movd mm1,D [esi]
|
|
punpcklbw mm1,mm0
|
|
pand mm1,Q [mmErrDiffMask]
|
|
// determine errors
|
|
movq mm3,mm1
|
|
movq mm5,mm1
|
|
movq mm7,mm1
|
|
pmullw mm3,Q [mmW3]
|
|
pmullw mm5,Q [mmW5]
|
|
pmullw mm7,Q [mmW7]
|
|
psrlw mm3,4 // *3/16
|
|
psrlw mm5,4 // *5/16
|
|
psrlw mm7,4 // *7/16
|
|
psubw mm1,mm3
|
|
psubw mm1,mm5
|
|
psubw mm1,mm7 // *rest/16
|
|
packuswb mm1,mm0
|
|
packuswb mm3,mm0
|
|
packuswb mm5,mm0
|
|
packuswb mm7,mm0
|
|
// spread errors
|
|
paddusb mm7,Q [esi+ +4]
|
|
paddusb mm3,Q [esi+ ebx*4 -4]
|
|
paddusb mm5,Q [esi+ ebx*4 +0]
|
|
paddusb mm1,Q [esi+ ebx*4 +4] // !!!! possible memory leak?
|
|
movd D [esi+ +4],mm7
|
|
movd D [esi+ ebx*4 -4],mm3
|
|
movd D [esi+ ebx*4 +0],mm5
|
|
movd D [esi+ ebx*4 +4],mm1
|
|
// advance to next pixel
|
|
add esi,4
|
|
dec ecx
|
|
jnz pixLoopEL
|
|
// advance to next row
|
|
add esi,D [slWidthModulo]
|
|
dec edx
|
|
jz allDoneE
|
|
|
|
// right to left
|
|
mov ecx,D [pixWidth]
|
|
dec ecx
|
|
pixLoopER:
|
|
movd mm1,D [esi]
|
|
punpcklbw mm1,mm0
|
|
pand mm1,Q [mmErrDiffMask]
|
|
// determine errors
|
|
movq mm3,mm1
|
|
movq mm5,mm1
|
|
movq mm7,mm1
|
|
pmullw mm3,Q [mmW3]
|
|
pmullw mm5,Q [mmW5]
|
|
pmullw mm7,Q [mmW7]
|
|
psrlw mm3,4 // *3/16
|
|
psrlw mm5,4 // *5/16
|
|
psrlw mm7,4 // *7/16
|
|
psubw mm1,mm3
|
|
psubw mm1,mm5
|
|
psubw mm1,mm7 // *rest/16
|
|
packuswb mm1,mm0
|
|
packuswb mm3,mm0
|
|
packuswb mm5,mm0
|
|
packuswb mm7,mm0
|
|
// spread errors
|
|
paddusb mm7,Q [esi+ -4]
|
|
paddusb mm1,Q [esi+ ebx*4 -4]
|
|
paddusb mm5,Q [esi+ ebx*4 +0]
|
|
paddusb mm3,Q [esi+ ebx*4 +4] // !!!! possible memory leak?
|
|
movd D [esi+ -4],mm7
|
|
movd D [esi+ ebx*4 -4],mm1
|
|
movd D [esi+ ebx*4 +0],mm5
|
|
movd D [esi+ ebx*4 +4],mm3
|
|
// revert to previous pixel
|
|
sub esi,4
|
|
dec ecx
|
|
jnz pixLoopER
|
|
// advance to next row
|
|
lea esi,[esi+ ebx*4]
|
|
dec edx
|
|
jnz rowLoopE
|
|
allDoneE:
|
|
emms;
|
|
}
|
|
|
|
#elif (defined __GNU_INLINE__)
|
|
__asm__ __volatile__ (
|
|
"pushl %%ebx \n\t" // Save GCC's register.
|
|
"movl %%ecx, %%ebx \n\t"
|
|
"pxor %%mm0, %%mm0 \n\t"
|
|
"decl %%edx \n\t" // need not to dither last row
|
|
|
|
"rowLoopE: \n\t"
|
|
// left to right
|
|
"movl %%eax, %%ecx \n\t"
|
|
"decl %%ecx \n\t"
|
|
|
|
"pixLoopEL: \n\t"
|
|
"movd (%%esi), %%mm1 \n\t"
|
|
"punpcklbw %%mm0, %%mm1 \n\t"
|
|
"pand (" ASMSYM(mmErrDiffMask) "), %%mm1 \n\t"
|
|
|
|
// determine errors
|
|
"movq %%mm1, %%mm3 \n\t"
|
|
"movq %%mm1, %%mm5 \n\t"
|
|
"movq %%mm1, %%mm7 \n\t"
|
|
"pmullw (" ASMSYM(mmW3) "), %%mm3 \n\t"
|
|
"pmullw (" ASMSYM(mmW5) "), %%mm5 \n\t"
|
|
"pmullw (" ASMSYM(mmW7) "), %%mm7 \n\t"
|
|
"psrlw $4, %%mm3 \n\t" // *3/16
|
|
"psrlw $4, %%mm5 \n\t" // *5/16
|
|
"psrlw $4, %%mm7 \n\t" // *7/16
|
|
"psubw %%mm3,%%mm1 \n\t"
|
|
"psubw %%mm5,%%mm1 \n\t"
|
|
"psubw %%mm7,%%mm1 \n\t" // *rest/16
|
|
"packuswb %%mm0,%%mm1 \n\t"
|
|
"packuswb %%mm0,%%mm3 \n\t"
|
|
"packuswb %%mm0,%%mm5 \n\t"
|
|
"packuswb %%mm0,%%mm7 \n\t"
|
|
|
|
// spread errors
|
|
"paddusb 4(%%esi), %%mm7 \n\t"
|
|
"paddusb -4(%%esi, %%ebx, 4), %%mm3 \n\t"
|
|
"paddusb 0(%%esi, %%ebx, 4), %%mm5 \n\t"
|
|
"paddusb 4(%%esi, %%ebx, 4), %%mm1 \n\t" // !!!! possible memory leak?
|
|
"movd %%mm7, 4(%%esi) \n\t"
|
|
"movd %%mm3, -4(%%esi, %%ebx, 4) \n\t"
|
|
"movd %%mm5, 0(%%esi, %%ebx, 4) \n\t"
|
|
"movd %%mm1, 4(%%esi, %%ebx, 4) \n\t"
|
|
|
|
// advance to next pixel
|
|
"addl $4, %%esi \n\t"
|
|
"decl %%ecx \n\t"
|
|
"jnz pixLoopEL \n\t"
|
|
|
|
// advance to next row
|
|
"addl %%edi, %%esi \n\t"
|
|
"decl %%edx \n\t"
|
|
"jz allDoneE \n\t"
|
|
|
|
// right to left
|
|
"movl %%eax, %%ecx \n\t"
|
|
"decl %%ecx \n\t"
|
|
|
|
"pixLoopER: \n\t"
|
|
"movd (%%esi), %%mm1 \n\t"
|
|
"punpcklbw %%mm0, %%mm1 \n\t"
|
|
"pand (" ASMSYM(mmErrDiffMask) "), %%mm1 \n\t"
|
|
|
|
// determine errors
|
|
"movq %%mm1, %%mm3 \n\t"
|
|
"movq %%mm1, %%mm5 \n\t"
|
|
"movq %%mm1, %%mm7 \n\t"
|
|
"pmullw (" ASMSYM(mmW3) "), %%mm3 \n\t"
|
|
"pmullw (" ASMSYM(mmW5) "), %%mm5 \n\t"
|
|
"pmullw (" ASMSYM(mmW7) "), %%mm7 \n\t"
|
|
"psrlw $4, %%mm3 \n\t" // *3/16
|
|
"psrlw $4, %%mm5 \n\t" // *5/16
|
|
"psrlw $4, %%mm7 \n\t" // *7/16
|
|
"psubw %%mm3, %%mm1 \n\t"
|
|
"psubw %%mm5, %%mm1 \n\t"
|
|
"psubw %%mm7, %%mm1 \n\t" // *rest/16
|
|
"packuswb %%mm0, %%mm1 \n\t"
|
|
"packuswb %%mm0, %%mm3 \n\t"
|
|
"packuswb %%mm0, %%mm5 \n\t"
|
|
"packuswb %%mm0, %%mm7 \n\t"
|
|
|
|
// spread errors
|
|
"paddusb -4(%%esi), %%mm7 \n\t"
|
|
"paddusb -4(%%esi, %%ebx, 4), %%mm1 \n\t"
|
|
"paddusb 0(%%esi, %%ebx, 4), %%mm5 \n\t"
|
|
"paddusb 4(%%esi, %%ebx, 4), %%mm3 \n\t" // !!!! possible memory leak?
|
|
"movd %%mm7, -4(%%esi) \n\t"
|
|
"movd %%mm1, -4(%%esi, %%ebx, 4) \n\t"
|
|
"movd %%mm5, 0(%%esi, %%ebx, 4) \n\t"
|
|
"movd %%mm3, 4(%%esi, %%ebx, 4) \n\t"
|
|
|
|
// revert to previous pixel
|
|
"subl $4, %%esi \n\t"
|
|
"decl %%ecx \n\t"
|
|
"jnz pixLoopER \n\t"
|
|
|
|
// advance to next row
|
|
"leal 0(%%esi, %%ebx, 4), %%esi \n\t"
|
|
"decl %%edx \n\t"
|
|
"jnz rowLoopE \n\t"
|
|
"allDoneE: \n\t"
|
|
"popl %%ebx \n\t"
|
|
"emms \n\t"
|
|
: // no outputs.
|
|
: "S" (pulDst), "c" (pixCanvasWidth), "d" (pixHeight), "a" (pixWidth),
|
|
"D" (slWidthModulo)
|
|
: "cc", "memory"
|
|
);
|
|
|
|
#else
|
|
#error Write inline asm for your platform.
|
|
#endif
|
|
|
|
goto theEnd;
|
|
|
|
// all done
|
|
theEnd:
|
|
_pfGfxProfile.StopTimer( CGfxProfile::PTI_DITHERBITMAP);
|
|
}
|
|
|
|
|
|
|
|
// performs dithering of a 32-bit mipmaps (can be in-place)
|
|
void DitherMipmaps( INDEX iDitherType, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth, PIX pixHeight)
|
|
{
|
|
// safety check
|
|
ASSERT( pixWidth>0 && pixHeight>0);
|
|
// loop thru mipmaps
|
|
PIX pixMipSize;
|
|
while( pixWidth>0 && pixHeight>0)
|
|
{ // dither one mipmap
|
|
DitherBitmap( iDitherType, pulSrc, pulDst, pixWidth, pixHeight);
|
|
// advance to next mipmap
|
|
pixMipSize = pixWidth*pixHeight;
|
|
pulSrc += pixMipSize;
|
|
pulDst += pixMipSize;
|
|
pixWidth >>=1;
|
|
pixHeight>>=1;
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// Blur/sharpen convolution kernels, one row per filter strength.
// GenerateConvolutionMatrix() selects row Abs(iFilter)-1 and reads the three
// columns as the corner, edge and middle weights of a 3x3 kernel; a negative
// iFilter turns the same row into a sharpen kernel.
|
|
static INDEX aiFilters[6][3] = {
|
|
{ 0, 1, 16 }, // minimum
|
|
{ 0, 2, 8 }, // low
|
|
{ 1, 2, 7 }, // medium
|
|
{ 1, 2, 3 }, // high
|
|
{ 3, 4, 5 }, // maximum
|
|
{ 1, 1, 1 }}; // all nine weights equal
|
|
|
|
// Kernel weights for FilterBitmap(), filled in by GenerateConvolutionMatrix().
// Every value holds one 16-bit weight repeated in all four 16-bit lanes.
// Name prefix = which bitmap pixels use the weight (M = middle pixels,
// E = pixels on a border edge, C = the four corner pixels);
// name suffix = which kernel position it applies to.
// temp for middle pixels, vertical/horizontal edges, and corners
|
|
static __int64 mmMc, mmMe, mmMm; // corner, edge, middle
|
|
static __int64 mmEch, mmEm; // corner-high (corner+edge weight), middle (middle+edge weight)
|
|
#define mmEcl mmMc // corner-low
|
|
#define mmEe mmMe // edge
|
|
static __int64 mmCm; // middle (sum of the mmEch and mmEm weights)
|
|
#define mmCc mmMc // corner
|
|
#define mmCe mmEch // edge
|
|
// ceil(65536 / kernel sum) per lane; FilterBitmap() multiplies the weighted sum
// by this and keeps the high 16 bits of the product
static __int64 mmInvDiv;
|
|
|
|
// 7 in every lane; FilterBitmap() adds it (with signed saturation) to the
// weighted sum just before the multiply by mmInvDiv.
// Both branches define the same constant, the GNU one with an ll suffix.
#if (defined __GNUC__)
|
|
static __int64 mmAdd = 0x0007000700070007ll;
|
|
#else
|
|
static __int64 mmAdd = 0x0007000700070007;
|
|
#endif
|
|
|
|
// temp rows for in-place filtering support:
// FilterBitmap() stores the results of the row it is filtering here and writes
// them to the destination only while processing the next row, so source texels
// that the next row still has to read are not overwritten when pulDst==pulSrc.
// NOTE(review): FilterBitmap() writes one entry per pixel of a row and the code
// visible here does not check pixWidth against this size - confirm that callers
// never pass bitmaps wider than 2048 texels.
|
|
extern "C" { static ULONG aulRows[2048]; }
|
|
|
|
// Each asm_force_*() below only stores the address of one file-scope static.
// Presumably this keeps the compiler from discarding statics that are otherwise
// referenced only by symbol name from the GNU inline asm (via ASMSYM) - confirm.
// NOTE(review): this section defines no such helper for mmCm or mmInvDiv.
static void *force_syms_to_exist = NULL;
|
|
void asm_force_mmAdd() { force_syms_to_exist = &mmAdd; }
|
|
void asm_force_aulRows() { force_syms_to_exist = &aulRows; }
|
|
void asm_force_mmMc() { force_syms_to_exist = &mmMc; }
|
|
void asm_force_mmMe() { force_syms_to_exist = &mmMe; }
|
|
void asm_force_mmMm() { force_syms_to_exist = &mmMm; }
|
|
void asm_force_mmEch() { force_syms_to_exist = &mmEch; }
|
|
void asm_force_mmEm() { force_syms_to_exist = &mmEm; }
|
|
void asm_force_mmW3() { force_syms_to_exist = &mmW3; }
|
|
void asm_force_mmW5() { force_syms_to_exist = &mmW5; }
|
|
void asm_force_mmW7() { force_syms_to_exist = &mmW7; }
|
|
|
|
// FilterBitmap() INTERNAL: generates convolution filter matrix if needed
|
|
static INDEX iLastFilter;
|
|
static void GenerateConvolutionMatrix( INDEX iFilter)
|
|
{
|
|
// same as last?
|
|
if( iLastFilter==iFilter) return;
|
|
// update filter
|
|
iLastFilter = iFilter;
|
|
INDEX iFilterAbs = Abs(iFilter) -1;
|
|
// convert convolution values to MMX format
|
|
INDEX iMc = aiFilters[iFilterAbs][0]; // corner
|
|
INDEX iMe = aiFilters[iFilterAbs][1]; // edge
|
|
INDEX iMm = aiFilters[iFilterAbs][2]; // middle
|
|
// negate values for sharpen filter case
|
|
if( iFilter<0) {
|
|
iMm += (iMe+iMc) *8; // (4*Edge + 4*Corner) *2
|
|
iMe = -iMe;
|
|
iMc = -iMc;
|
|
}
|
|
// find values for edge and corner cases
|
|
INDEX iEch = iMc + iMe;
|
|
INDEX iEm = iMm + iMe;
|
|
INDEX iCm = iEch + iEm;
|
|
// prepare divider
|
|
__int64 mm = ((__int64)ceil(65536.0f/(iMc*4+iMe*4+iMm))) & 0xFFFF;
|
|
mmInvDiv = (mm<<48) | (mm<<32) | (mm<<16) | mm;
|
|
// prepare filter values
|
|
mm = iMc & 0xFFFF; mmMc = (mm<<48) | (mm<<32) | (mm<<16) | mm;
|
|
mm = iMe & 0xFFFF; mmMe = (mm<<48) | (mm<<32) | (mm<<16) | mm;
|
|
mm = iMm & 0xFFFF; mmMm = (mm<<48) | (mm<<32) | (mm<<16) | mm;
|
|
mm = iEch & 0xFFFF; mmEch= (mm<<48) | (mm<<32) | (mm<<16) | mm;
|
|
mm = iEm & 0xFFFF; mmEm = (mm<<48) | (mm<<32) | (mm<<16) | mm;
|
|
mm = iCm & 0xFFFF; mmCm = (mm<<48) | (mm<<32) | (mm<<16) | mm;
|
|
}
|
|
|
|
|
|
// File-scope copies of FilterBitmap()'s arguments and derived byte strides.
// The __GNU_INLINE__ branch of FilterBitmap() assigns all of them on every call
// and its inline asm then loads them by symbol name (ASMSYM) rather than
// through asm operands.
// NOTE(review): being shared statics, they make that branch non-reentrant -
// confirm FilterBitmap() is never run from two threads at once.
extern "C" {
|
|
static ULONG *FB_pulSrc = NULL;
|
|
static ULONG *FB_pulDst = NULL;
|
|
static PIX FB_pixWidth = 0;
|
|
static PIX FB_pixHeight = 0;
|
|
static PIX FB_pixCanvasWidth = 0;
|
|
static SLONG FB_slModulo1 = 0;
|
|
static SLONG FB_slCanvasWidth = 0;
|
|
}
|
|
|
|
|
|
#if USE_PORTABLE_C
|
|
typedef SWORD ExtPix[4];
|
|
|
|
static inline void extpix_fromi64(ExtPix &pix, const __int64 i64)
|
|
{
|
|
//memcpy(pix, i64, sizeof (ExtPix));
|
|
pix[0] = ((i64 >> 0) & 0xFFFF);
|
|
pix[1] = ((i64 >> 16) & 0xFFFF);
|
|
pix[2] = ((i64 >> 32) & 0xFFFF);
|
|
pix[3] = ((i64 >> 48) & 0xFFFF);
|
|
}
|
|
|
|
static inline void extend_pixel(const ULONG ul, ExtPix &pix)
|
|
{
|
|
pix[0] = ((ul >> 0) & 0xFF);
|
|
pix[1] = ((ul >> 8) & 0xFF);
|
|
pix[2] = ((ul >> 16) & 0xFF);
|
|
pix[3] = ((ul >> 24) & 0xFF);
|
|
}
|
|
|
|
// Packs four 16-bit lanes back into one 32-bit texel (lane 0 becomes the
// lowest byte); each lane is clamped to 0..255 first.
static inline ULONG unextend_pixel(const ExtPix &pix)
{
  ULONG ulPacked = 0;
  // walk from the highest lane down so that lane 0 lands in the lowest byte
  for (int iChannel = 3; iChannel >= 0; iChannel--) {
    SLONG slValue = pix[iChannel];
    if (slValue < 0) {
      slValue = 0;
    } else if (slValue > 255) {
      slValue = 255;
    }
    ulPacked = (ulPacked << 8) | ((ULONG) slValue);
  }
  return ulPacked;
}
|
|
|
|
// Lane-wise p1 += p2; every sum is narrowed back to 16 bits without saturation.
static inline void extpix_add(ExtPix &p1, const ExtPix &p2)
{
  for (int iLane = 0; iLane < 4; iLane++) {
    const SLONG slSum = ((SLONG) p1[iLane]) + ((SLONG) p2[iLane]);
    p1[iLane] = (SWORD) slSum;
  }
}
|
|
|
|
// Lane-wise p1 *= p2; every product is computed in 32 bits and then narrowed
// back to 16 bits without saturation.
static inline void extpix_mul(ExtPix &p1, const ExtPix &p2)
{
  for (int iLane = 0; iLane < 4; iLane++) {
    const SLONG slProduct = ((SLONG) p1[iLane]) * ((SLONG) p2[iLane]);
    p1[iLane] = (SWORD) slProduct;
  }
}
|
|
|
|
// Lane-wise saturating add: p1 += p2 with every sum clamped to -32768..32767.
static inline void extpix_adds(ExtPix &p1, const ExtPix &p2)
{
  for (int iLane = 0; iLane < 4; iLane++) {
    SLONG slSum = ((SLONG) p1[iLane]) + ((SLONG) p2[iLane]);
    if (slSum < -32768) {
      slSum = -32768;
    } else if (slSum > 32767) {
      slSum = 32767;
    }
    p1[iLane] = (SWORD) slSum;
  }
}
|
|
|
|
// Lane-wise multiply that keeps the upper half of the result:
// each lane of p1 becomes bits 16..31 of the 32-bit product p1*p2.
static inline void extpix_mulhi(ExtPix &p1, const ExtPix &p2)
{
  for (int iLane = 0; iLane < 4; iLane++) {
    const SLONG slProduct = ((SLONG) p1[iLane]) * ((SLONG) p2[iLane]);
    p1[iLane] = (SWORD) ((slProduct >> 16) & 0xFFFF);
  }
}
|
|
#endif
|
|
|
|
|
|
// applies filter to bitmap
|
|
void FilterBitmap( INDEX iFilter, ULONG *pulSrc, ULONG *pulDst, PIX pixWidth, PIX pixHeight,
|
|
PIX pixCanvasWidth, PIX pixCanvasHeight)
|
|
{
|
|
_pfGfxProfile.StartTimer( CGfxProfile::PTI_FILTERBITMAP);
|
|
ASSERT( iFilter>=-6 && iFilter<=+6);
|
|
|
|
// adjust canvas size
|
|
if( pixCanvasWidth ==0) pixCanvasWidth = pixWidth;
|
|
if( pixCanvasHeight==0) pixCanvasHeight = pixHeight;
|
|
ASSERT( pixCanvasWidth>=pixWidth && pixCanvasHeight>=pixHeight);
|
|
|
|
// if bitmap is smaller than 4x4
|
|
if( pixWidth<4 || pixHeight<4)
|
|
{ // don't blur it at all, but eventually only copy
|
|
if( pulDst!=pulSrc) memcpy( pulDst, pulSrc, pixCanvasWidth*pixCanvasHeight *BYTES_PER_TEXEL);
|
|
_pfGfxProfile.StopTimer( CGfxProfile::PTI_FILTERBITMAP);
|
|
return;
|
|
}
|
|
|
|
// prepare convolution matrix and row modulo
|
|
iFilter = Clamp( iFilter, -6L, +6L);
|
|
GenerateConvolutionMatrix( iFilter);
|
|
SLONG slModulo1 = (pixCanvasWidth-pixWidth+1) *BYTES_PER_TEXEL;
|
|
SLONG slCanvasWidth = pixCanvasWidth *BYTES_PER_TEXEL;
|
|
|
|
// lets roll ...
|
|
#if (defined USE_MMX_INTRINSICS)
|
|
slModulo1 /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type
|
|
slCanvasWidth /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type
|
|
|
|
ULONG *src = pulSrc;
|
|
ULONG *dst = pulDst;
|
|
ULONG *rowptr = aulRows;
|
|
|
|
__m64 rmm0 = _mm_setzero_si64();
|
|
__m64 rmmCm = _mm_set_pi32(((int *)((char*)&mmCm))[0],((int *)((char*)&mmCm))[1]);
|
|
__m64 rmmCe = _mm_set_pi32(((int *)((char*)&mmCe))[0],((int *)((char*)&mmCe))[1]);
|
|
__m64 rmmCc = _mm_set_pi32(((int *)((char*)&mmCc))[0],((int *)((char*)&mmCc))[1]);
|
|
__m64 rmmEch = _mm_set_pi32(((int *)((char*)&mmEch))[0],((int *)((char*)&mmEch))[1]);
|
|
__m64 rmmEcl = _mm_set_pi32(((int *)((char*)&mmEcl))[0],((int *)((char*)&mmEcl))[1]);
|
|
__m64 rmmEe = _mm_set_pi32(((int *)((char*)&mmEe))[0],((int *)((char*)&mmEe))[1]);
|
|
__m64 rmmEm = _mm_set_pi32(((int *)((char*)&mmEm))[0],((int *)((char*)&mmEm))[1]);
|
|
__m64 rmmMm = _mm_set_pi32(((int *)((char*)&mmMm))[0],((int *)((char*)&mmMm))[1]);
|
|
__m64 rmmMe = _mm_set_pi32(((int *)((char*)&mmMe))[0],((int *)((char*)&mmMe))[1]);
|
|
__m64 rmmMc = _mm_set_pi32(((int *)((char*)&mmMc))[0],((int *)((char*)&mmMc))[1]);
|
|
__m64 rmmAdd = _mm_set_pi32(((int *)((char*)&mmAdd))[0],((int *)((char*)&mmAdd))[1]);
|
|
__m64 rmmInvDiv = _mm_set_pi32(((int *)((char*)&mmInvDiv))[0],((int *)((char*)&mmInvDiv))[1]);
|
|
|
|
// ----------------------- process upper left corner
|
|
__m64 rmm1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[0]), rmm0);
|
|
__m64 rmm2 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[1]), rmm0);
|
|
__m64 rmm3 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[pixCanvasWidth]), rmm0);
|
|
__m64 rmm4 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[pixCanvasWidth+1]), rmm0);
|
|
__m64 rmm5 = _mm_setzero_si64();
|
|
__m64 rmm6 = _mm_setzero_si64();
|
|
__m64 rmm7 = _mm_setzero_si64();
|
|
|
|
rmm2 = _mm_add_pi16(rmm2, rmm3);
|
|
rmm1 = _mm_mullo_pi16(rmm1, rmmCm);
|
|
rmm2 = _mm_mullo_pi16(rmm2, rmmCe);
|
|
rmm4 = _mm_mullo_pi16(rmm4, rmmCc);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm2);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm4);
|
|
rmm1 = _mm_adds_pi16(rmm1, rmmAdd);
|
|
rmm1 = _mm_mulhi_pi16(rmm1, rmmInvDiv);
|
|
rmm1 = _mm_packs_pu16(rmm1, rmm0);
|
|
*(rowptr++) = _mm_cvtsi64_si32(rmm1);
|
|
src++;
|
|
|
|
// ----------------------- process upper edge pixels
|
|
for (PIX i = pixWidth - 2; i != 0; i--)
|
|
{
|
|
rmm1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[-1]), rmm0);
|
|
rmm2 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[0]), rmm0);
|
|
rmm3 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[1]), rmm0);
|
|
rmm4 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[pixCanvasWidth-1]), rmm0);
|
|
rmm5 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[pixCanvasWidth]), rmm0);
|
|
rmm6 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[pixCanvasWidth+1]), rmm0);
|
|
|
|
rmm1 = _mm_add_pi16(rmm1, rmm3);
|
|
rmm4 = _mm_add_pi16(rmm4, rmm6);
|
|
rmm1 = _mm_mullo_pi16(rmm1, rmmEch);
|
|
rmm2 = _mm_mullo_pi16(rmm2, rmmEm);
|
|
rmm4 = _mm_mullo_pi16(rmm4, rmmEcl);
|
|
rmm5 = _mm_mullo_pi16(rmm5, rmmEe);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm2);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm4);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm5);
|
|
rmm1 = _mm_adds_pi16(rmm1, rmmAdd);
|
|
rmm1 = _mm_mulhi_pi16(rmm1, rmmInvDiv);
|
|
rmm1 = _mm_packs_pu16(rmm1, rmm0);
|
|
*(rowptr++) = _mm_cvtsi64_si32(rmm1);
|
|
src++;
|
|
}
|
|
|
|
// ----------------------- process upper right corner
|
|
|
|
rmm1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[-1]), rmm0);
|
|
rmm2 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[0]), rmm0);
|
|
rmm3 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[pixCanvasWidth-1]), rmm0);
|
|
rmm4 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[pixCanvasWidth]), rmm0);
|
|
|
|
rmm1 = _mm_add_pi16(rmm1, rmm4);
|
|
rmm1 = _mm_mullo_pi16(rmm1, rmmCe);
|
|
rmm2 = _mm_mullo_pi16(rmm2, rmmCm);
|
|
rmm3 = _mm_mullo_pi16(rmm3, rmmCc);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm2);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm3);
|
|
rmm1 = _mm_adds_pi16(rmm1, rmmAdd);
|
|
rmm1 = _mm_mulhi_pi16(rmm1, rmmInvDiv);
|
|
rmm1 = _mm_packs_pu16(rmm1, rmm0);
|
|
*rowptr = _mm_cvtsi64_si32(rmm1);
|
|
|
|
// ----------------------- process bitmap middle pixels
|
|
|
|
dst += slCanvasWidth;
|
|
src += slModulo1;
|
|
|
|
// for each row
|
|
for (size_t i = pixHeight-2; i != 0; i--) // rowLoop
|
|
{
|
|
rowptr = aulRows;
|
|
|
|
// process left edge pixel
|
|
rmm1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[-pixCanvasWidth]), rmm0);
|
|
rmm2 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[(-pixCanvasWidth)+1]), rmm0);
|
|
rmm3 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[0]), rmm0);
|
|
rmm4 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[1]), rmm0);
|
|
rmm5 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[pixCanvasWidth]), rmm0);
|
|
rmm6 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[pixCanvasWidth+1]), rmm0);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm5);
|
|
rmm2 = _mm_add_pi16(rmm2, rmm6);
|
|
rmm1 = _mm_mullo_pi16(rmm1, rmmEch);
|
|
rmm2 = _mm_mullo_pi16(rmm2, rmmEcl);
|
|
rmm3 = _mm_mullo_pi16(rmm3, rmmEm);
|
|
rmm4 = _mm_mullo_pi16(rmm4, rmmEe);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm2);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm3);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm4);
|
|
rmm1 = _mm_adds_pi16(rmm1, rmmAdd);
|
|
rmm1 = _mm_mulhi_pi16(rmm1, rmmInvDiv);
|
|
rmm1 = _mm_packs_pu16(rmm1, rmm0);
|
|
dst[-pixCanvasWidth] = *rowptr;
|
|
*(rowptr++) = _mm_cvtsi64_si32(rmm1);
|
|
src++;
|
|
dst++;
|
|
|
|
// for each pixel in current row
|
|
for (size_t j = pixWidth-2; j != 0; j--) // pixLoop
|
|
{
|
|
// prepare upper convolution row
|
|
rmm1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[(-pixCanvasWidth)-1]), rmm0);
|
|
rmm2 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[-pixCanvasWidth]), rmm0);
|
|
rmm3 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[(-pixCanvasWidth)+1]), rmm0);
|
|
|
|
// prepare middle convolution row
|
|
rmm4 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[-1]), rmm0);
|
|
rmm5 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[0]), rmm0);
|
|
rmm6 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[1]), rmm0);
|
|
|
|
// free some registers
|
|
rmm1 = _mm_add_pi16(rmm1, rmm3);
|
|
rmm2 = _mm_add_pi16(rmm2, rmm4);
|
|
rmm5 = _mm_mullo_pi16(rmm5, rmmMm);
|
|
|
|
// prepare lower convolution row
|
|
rmm3 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[pixCanvasWidth-1]), rmm0);
|
|
rmm4 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[pixCanvasWidth]), rmm0);
|
|
rmm7 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[pixCanvasWidth+1]), rmm0);
|
|
|
|
// calc weightened value
|
|
rmm2 = _mm_add_pi16(rmm2, rmm6);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm3);
|
|
rmm2 = _mm_add_pi16(rmm2, rmm4);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm7);
|
|
rmm2 = _mm_mullo_pi16(rmm2, rmmMe);
|
|
rmm1 = _mm_mullo_pi16(rmm1, rmmMc);
|
|
rmm2 = _mm_add_pi16(rmm2, rmm5);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm2);
|
|
|
|
// calc and store wightened value
|
|
rmm1 = _mm_adds_pi16(rmm1, rmmAdd);
|
|
rmm1 = _mm_mulhi_pi16(rmm1, rmmInvDiv);
|
|
rmm1 = _mm_packs_pu16(rmm1, rmm0);
|
|
dst[-pixCanvasWidth] = *rowptr;
|
|
*(rowptr++) = _mm_cvtsi64_si32(rmm1);
|
|
|
|
// advance to next pixel
|
|
src++;
|
|
dst++;
|
|
}
|
|
|
|
// process right edge pixel
|
|
rmm1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[(-pixCanvasWidth)-1]), rmm0);
|
|
rmm2 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[-pixCanvasWidth]), rmm0);
|
|
rmm3 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[-1]), rmm0);
|
|
rmm4 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[0]), rmm0);
|
|
rmm5 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[pixCanvasWidth-1]), rmm0);
|
|
rmm6 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[pixCanvasWidth]), rmm0);
|
|
|
|
rmm1 = _mm_add_pi16(rmm1, rmm5);
|
|
rmm2 = _mm_add_pi16(rmm2, rmm6);
|
|
rmm1 = _mm_mullo_pi16(rmm1, rmmEcl);
|
|
rmm2 = _mm_mullo_pi16(rmm2, rmmEch);
|
|
rmm3 = _mm_mullo_pi16(rmm3, rmmEe);
|
|
rmm4 = _mm_mullo_pi16(rmm4, rmmEm);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm2);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm3);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm4);
|
|
rmm1 = _mm_adds_pi16(rmm1, rmmAdd);
|
|
rmm1 = _mm_mulhi_pi16(rmm1, rmmInvDiv);
|
|
rmm1 = _mm_packs_pu16(rmm1, rmm0);
|
|
dst[-pixCanvasWidth] = *rowptr;
|
|
*rowptr = _mm_cvtsi64_si32(rmm1);
|
|
|
|
// advance to next row
|
|
src += slModulo1;
|
|
dst += slModulo1;
|
|
}
|
|
|
|
// ----------------------- process lower left corner
|
|
rowptr = aulRows;
|
|
rmm1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[-pixCanvasWidth]), rmm0);
|
|
rmm2 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[(-pixCanvasWidth)+1]), rmm0);
|
|
rmm3 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[0]), rmm0);
|
|
rmm4 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[1]), rmm0);
|
|
|
|
rmm1 = _mm_add_pi16(rmm1, rmm4);
|
|
rmm1 = _mm_mullo_pi16(rmm1, rmmCe);
|
|
rmm2 = _mm_mullo_pi16(rmm2, rmmCc);
|
|
rmm3 = _mm_mullo_pi16(rmm3, rmmCm);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm2);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm3);
|
|
rmm1 = _mm_adds_pi16(rmm1, rmmAdd);
|
|
rmm1 = _mm_mulhi_pi16(rmm1, rmmInvDiv);
|
|
rmm1 = _mm_packs_pu16(rmm1, rmm0);
|
|
dst[-pixCanvasWidth] = *rowptr;
|
|
dst[0] = _mm_cvtsi64_si32(rmm1);
|
|
|
|
src++;
|
|
dst++;
|
|
rowptr++;
|
|
|
|
// ----------------------- process lower edge pixels
|
|
for (size_t i = pixWidth-2; i != 0; i--) // lowerLoop
|
|
{
|
|
// for each pixel
|
|
rmm1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[(-pixCanvasWidth)-1]), rmm0);
|
|
rmm2 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[-pixCanvasWidth]), rmm0);
|
|
rmm3 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[(-pixCanvasWidth)+1]), rmm0);
|
|
rmm4 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[-1]), rmm0);
|
|
rmm5 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[0]), rmm0);
|
|
rmm6 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[1]), rmm0);
|
|
|
|
rmm1 = _mm_add_pi16(rmm1, rmm3);
|
|
rmm4 = _mm_add_pi16(rmm4, rmm6);
|
|
rmm1 = _mm_mullo_pi16(rmm1, rmmEcl);
|
|
rmm2 = _mm_mullo_pi16(rmm2, rmmEe);
|
|
rmm4 = _mm_mullo_pi16(rmm4, rmmEch);
|
|
rmm5 = _mm_mullo_pi16(rmm5, rmmEm);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm2);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm4);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm5);
|
|
rmm1 = _mm_adds_pi16(rmm1, rmmAdd);
|
|
rmm1 = _mm_mulhi_pi16(rmm1, rmmInvDiv);
|
|
rmm1 = _mm_packs_pu16(rmm1, rmm0);
|
|
dst[-pixCanvasWidth] = *rowptr;
|
|
dst[0] = _mm_cvtsi64_si32(rmm1);
|
|
|
|
// advance to next pixel
|
|
src++;
|
|
dst++;
|
|
rowptr++;
|
|
}
|
|
|
|
// ----------------------- lower right corners
|
|
rmm1 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[(-pixCanvasWidth)-1]), rmm0);
|
|
rmm2 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[-pixCanvasWidth]), rmm0);
|
|
rmm3 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[-1]), rmm0);
|
|
rmm4 = _mm_unpacklo_pi8(_mm_cvtsi32_si64(src[0]), rmm0);
|
|
|
|
rmm2 = _mm_add_pi16(rmm2, rmm3);
|
|
rmm1 = _mm_mullo_pi16(rmm1, rmmCc);
|
|
rmm2 = _mm_mullo_pi16(rmm2, rmmCe);
|
|
rmm4 = _mm_mullo_pi16(rmm4, rmmCm);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm2);
|
|
rmm1 = _mm_add_pi16(rmm1, rmm4);
|
|
rmm1 = _mm_adds_pi16(rmm1, rmmAdd);
|
|
rmm1 = _mm_mulhi_pi16(rmm1, rmmInvDiv);
|
|
rmm1 = _mm_packs_pu16(rmm1, rmm0);
|
|
dst[-pixCanvasWidth] = *rowptr;
|
|
dst[0] = _mm_cvtsi64_si32(rmm1);
|
|
|
|
_mm_empty(); // we're done, clear out the MMX registers!
|
|
|
|
|
|
#elif (defined USE_PORTABLE_C)
|
|
slModulo1 /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type
|
|
slCanvasWidth /= BYTES_PER_TEXEL; // C++ handles incrementing by sizeof type
|
|
|
|
ULONG *src = pulSrc;
|
|
ULONG *dst = pulDst;
|
|
ULONG *rowptr = aulRows;
|
|
|
|
ExtPix rmm1, rmm2, rmm3, rmm4, rmm5, rmm6, rmm7;
|
|
#define EXTPIXFROMINT64(x) ExtPix r##x; extpix_fromi64(r##x, x);
|
|
EXTPIXFROMINT64(mmCm);
|
|
EXTPIXFROMINT64(mmCe);
|
|
EXTPIXFROMINT64(mmCc);
|
|
EXTPIXFROMINT64(mmEch);
|
|
EXTPIXFROMINT64(mmEcl);
|
|
EXTPIXFROMINT64(mmEe);
|
|
EXTPIXFROMINT64(mmEm);
|
|
EXTPIXFROMINT64(mmMm);
|
|
EXTPIXFROMINT64(mmMe);
|
|
EXTPIXFROMINT64(mmMc);
|
|
EXTPIXFROMINT64(mmAdd);
|
|
EXTPIXFROMINT64(mmInvDiv);
|
|
#undef EXTPIXFROMINT64
|
|
|
|
// ----------------------- process upper left corner
|
|
extend_pixel(src[0], rmm1);
|
|
extend_pixel(src[1], rmm2);
|
|
extend_pixel(src[pixCanvasWidth], rmm3);
|
|
extend_pixel(src[pixCanvasWidth+1], rmm4);
|
|
|
|
extpix_add(rmm2, rmm3);
|
|
extpix_mul(rmm1, rmmCm);
|
|
extpix_mul(rmm2, rmmCe);
|
|
extpix_mul(rmm4, rmmCc);
|
|
extpix_add(rmm1, rmm2);
|
|
extpix_add(rmm1, rmm4);
|
|
extpix_adds(rmm1, rmmAdd);
|
|
extpix_mulhi(rmm1, rmmInvDiv);
|
|
*(rowptr++) = unextend_pixel(rmm1);
|
|
|
|
src++;
|
|
|
|
// ----------------------- process upper edge pixels
|
|
for (PIX i = pixWidth - 2; i != 0; i--)
|
|
{
|
|
extend_pixel(src[-1], rmm1);
|
|
extend_pixel(src[0], rmm2);
|
|
extend_pixel(src[1], rmm3);
|
|
extend_pixel(src[pixCanvasWidth-1], rmm4);
|
|
extend_pixel(src[pixCanvasWidth], rmm5);
|
|
extend_pixel(src[pixCanvasWidth+1], rmm6);
|
|
|
|
extpix_add(rmm1, rmm3);
|
|
extpix_add(rmm4, rmm6);
|
|
extpix_mul(rmm1, rmmEch);
|
|
extpix_mul(rmm2, rmmEm);
|
|
extpix_mul(rmm4, rmmEcl);
|
|
extpix_mul(rmm5, rmmEe);
|
|
extpix_add(rmm1, rmm2);
|
|
extpix_add(rmm1, rmm4);
|
|
extpix_add(rmm1, rmm5);
|
|
extpix_adds(rmm1, rmmAdd);
|
|
extpix_mulhi(rmm1, rmmInvDiv);
|
|
*(rowptr++) = unextend_pixel(rmm1);
|
|
src++;
|
|
}
|
|
|
|
// ----------------------- process upper right corner
|
|
|
|
extend_pixel(src[-1], rmm1);
|
|
extend_pixel(src[0], rmm2);
|
|
extend_pixel(src[pixCanvasWidth-1], rmm3);
|
|
extend_pixel(src[pixCanvasWidth], rmm4);
|
|
|
|
extpix_add(rmm1, rmm4);
|
|
extpix_mul(rmm1, rmmCe);
|
|
extpix_mul(rmm2, rmmCm);
|
|
extpix_mul(rmm3, rmmCc);
|
|
extpix_add(rmm1, rmm2);
|
|
extpix_add(rmm1, rmm3);
|
|
extpix_adds(rmm1, rmmAdd);
|
|
extpix_mulhi(rmm1, rmmInvDiv);
|
|
*rowptr = unextend_pixel(rmm1);
|
|
|
|
// ----------------------- process bitmap middle pixels
|
|
|
|
dst += slCanvasWidth;
|
|
src += slModulo1;
|
|
|
|
// for each row
|
|
for (size_t i = pixHeight-2; i != 0; i--) // rowLoop
|
|
{
|
|
rowptr = aulRows;
|
|
|
|
// process left edge pixel
|
|
extend_pixel(src[-pixCanvasWidth], rmm1);
|
|
extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
|
|
extend_pixel(src[0], rmm3);
|
|
extend_pixel(src[1], rmm4);
|
|
extend_pixel(src[pixCanvasWidth], rmm5);
|
|
extend_pixel(src[pixCanvasWidth+1], rmm6);
|
|
|
|
extpix_add(rmm1, rmm5);
|
|
extpix_add(rmm2, rmm6);
|
|
extpix_mul(rmm1, rmmEch);
|
|
extpix_mul(rmm2, rmmEcl);
|
|
extpix_mul(rmm3, rmmEm);
|
|
extpix_mul(rmm4, rmmEe);
|
|
extpix_add(rmm1, rmm2);
|
|
extpix_add(rmm1, rmm3);
|
|
extpix_add(rmm1, rmm4);
|
|
extpix_adds(rmm1, rmmAdd);
|
|
extpix_mulhi(rmm1, rmmInvDiv);
|
|
dst[-pixCanvasWidth] = *rowptr;
|
|
*(rowptr++) = unextend_pixel(rmm1);
|
|
src++;
|
|
dst++;
|
|
|
|
// for each pixel in current row
|
|
for (size_t j = pixWidth-2; j != 0; j--) // pixLoop
|
|
{
|
|
// prepare upper convolution row
|
|
extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
|
|
extend_pixel(src[-pixCanvasWidth], rmm2);
|
|
extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
|
|
|
|
// prepare middle convolution row
|
|
extend_pixel(src[-1], rmm4);
|
|
extend_pixel(src[0], rmm5);
|
|
extend_pixel(src[1], rmm6);
|
|
|
|
// free some registers
|
|
extpix_add(rmm1, rmm3);
|
|
extpix_add(rmm2, rmm4);
|
|
extpix_mul(rmm5, rmmMm);
|
|
|
|
// prepare lower convolution row
|
|
extend_pixel(src[pixCanvasWidth-1], rmm3);
|
|
extend_pixel(src[pixCanvasWidth], rmm4);
|
|
extend_pixel(src[pixCanvasWidth+1], rmm7);
|
|
|
|
// calc weightened value
|
|
extpix_add(rmm2, rmm6);
|
|
extpix_add(rmm1, rmm3);
|
|
extpix_add(rmm2, rmm4);
|
|
extpix_add(rmm1, rmm7);
|
|
extpix_mul(rmm2, rmmMe);
|
|
extpix_mul(rmm1, rmmMc);
|
|
extpix_add(rmm2, rmm5);
|
|
extpix_add(rmm1, rmm2);
|
|
|
|
// calc and store wightened value
|
|
extpix_adds(rmm1, rmmAdd);
|
|
extpix_mulhi(rmm1, rmmInvDiv);
|
|
dst[-pixCanvasWidth] = *rowptr;
|
|
*(rowptr++) = unextend_pixel(rmm1);
|
|
|
|
// advance to next pixel
|
|
src++;
|
|
dst++;
|
|
}
|
|
|
|
// process right edge pixel
|
|
extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
|
|
extend_pixel(src[-pixCanvasWidth], rmm2);
|
|
extend_pixel(src[-1], rmm3);
|
|
extend_pixel(src[0], rmm4);
|
|
extend_pixel(src[pixCanvasWidth-1], rmm5);
|
|
extend_pixel(src[pixCanvasWidth], rmm6);
|
|
|
|
extpix_add(rmm1, rmm5);
|
|
extpix_add(rmm2, rmm6);
|
|
extpix_mul(rmm1, rmmEcl);
|
|
extpix_mul(rmm2, rmmEch);
|
|
extpix_mul(rmm3, rmmEe);
|
|
extpix_mul(rmm4, rmmEm);
|
|
extpix_add(rmm1, rmm2);
|
|
extpix_add(rmm1, rmm3);
|
|
extpix_add(rmm1, rmm4);
|
|
extpix_adds(rmm1, rmmAdd);
|
|
extpix_mulhi(rmm1, rmmInvDiv);
|
|
dst[-pixCanvasWidth] = *rowptr;
|
|
*rowptr = unextend_pixel(rmm1);
|
|
|
|
// advance to next row
|
|
src += slModulo1;
|
|
dst += slModulo1;
|
|
}
|
|
|
|
// ----------------------- process lower left corner
|
|
rowptr = aulRows;
|
|
extend_pixel(src[-pixCanvasWidth], rmm1);
|
|
extend_pixel(src[(-pixCanvasWidth)+1], rmm2);
|
|
extend_pixel(src[0], rmm3);
|
|
extend_pixel(src[1], rmm4);
|
|
|
|
extpix_add(rmm1, rmm4);
|
|
extpix_mul(rmm1, rmmCe);
|
|
extpix_mul(rmm2, rmmCc);
|
|
extpix_mul(rmm3, rmmCm);
|
|
extpix_add(rmm1, rmm2);
|
|
extpix_add(rmm1, rmm3);
|
|
extpix_adds(rmm1, rmmAdd);
|
|
extpix_mulhi(rmm1, rmmInvDiv);
|
|
dst[-pixCanvasWidth] = *rowptr;
|
|
dst[0] = unextend_pixel(rmm1);
|
|
|
|
src++;
|
|
dst++;
|
|
rowptr++;
|
|
|
|
// ----------------------- process lower edge pixels
|
|
for (size_t i = pixWidth-2; i != 0; i--) // lowerLoop
|
|
{
|
|
// for each pixel
|
|
extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
|
|
extend_pixel(src[-pixCanvasWidth], rmm2);
|
|
extend_pixel(src[(-pixCanvasWidth)+1], rmm3);
|
|
extend_pixel(src[-1], rmm4);
|
|
extend_pixel(src[0], rmm5);
|
|
extend_pixel(src[1], rmm6);
|
|
|
|
extpix_add(rmm1, rmm3);
|
|
extpix_add(rmm4, rmm6);
|
|
extpix_mul(rmm1, rmmEcl);
|
|
extpix_mul(rmm2, rmmEe);
|
|
extpix_mul(rmm4, rmmEch);
|
|
extpix_mul(rmm5, rmmEm);
|
|
extpix_add(rmm1, rmm2);
|
|
extpix_add(rmm1, rmm4);
|
|
extpix_add(rmm1, rmm5);
|
|
extpix_adds(rmm1, rmmAdd);
|
|
extpix_mulhi(rmm1, rmmInvDiv);
|
|
dst[-pixCanvasWidth] = *rowptr;
|
|
dst[0] = unextend_pixel(rmm1);
|
|
|
|
// advance to next pixel
|
|
src++;
|
|
dst++;
|
|
rowptr++;
|
|
}
|
|
|
|
// ----------------------- lower right corners
|
|
extend_pixel(src[(-pixCanvasWidth)-1], rmm1);
|
|
extend_pixel(src[-pixCanvasWidth], rmm2);
|
|
extend_pixel(src[-1], rmm3);
|
|
extend_pixel(src[0], rmm4);
|
|
|
|
extpix_add(rmm2, rmm3);
|
|
extpix_mul(rmm1, rmmCc);
|
|
extpix_mul(rmm2, rmmCe);
|
|
extpix_mul(rmm4, rmmCm);
|
|
extpix_add(rmm1, rmm2);
|
|
extpix_add(rmm1, rmm4);
|
|
extpix_adds(rmm1, rmmAdd);
|
|
extpix_mulhi(rmm1, rmmInvDiv);
|
|
dst[-pixCanvasWidth] = *rowptr;
|
|
dst[0] = unextend_pixel(rmm1);
|
|
|
|
#elif (defined __MSVC_INLINE__)
|
|
__asm {
|
|
cld
|
|
mov eax,D [pixCanvasWidth] // EAX = positive row offset
|
|
mov edx,eax
|
|
neg edx // EDX = negative row offset
|
|
pxor mm0,mm0
|
|
mov esi,D [pulSrc]
|
|
mov edi,D [pulDst]
|
|
xor ebx,ebx
|
|
|
|
// ----------------------- process upper left corner
|
|
|
|
movd mm1,D [esi+ +0]
|
|
movd mm2,D [esi+ +4]
|
|
movd mm3,D [esi+ eax*4 +0]
|
|
movd mm4,D [esi+ eax*4 +4]
|
|
punpcklbw mm1,mm0
|
|
punpcklbw mm2,mm0
|
|
punpcklbw mm3,mm0
|
|
punpcklbw mm4,mm0
|
|
paddw mm2,mm3
|
|
pmullw mm1,Q [mmCm]
|
|
pmullw mm2,Q [mmCe]
|
|
pmullw mm4,Q [mmCc]
|
|
paddw mm1,mm2
|
|
paddw mm1,mm4
|
|
paddsw mm1,Q [mmAdd]
|
|
pmulhw mm1,Q [mmInvDiv]
|
|
packuswb mm1,mm0
|
|
movd D [ebx+ aulRows],mm1
|
|
add esi,4
|
|
add ebx,4
|
|
|
|
// ----------------------- process upper edge pixels
|
|
|
|
mov ecx,D [pixWidth]
|
|
sub ecx,2
|
|
// for each pixel
|
|
upperLoop:
|
|
movd mm1,D [esi+ -4]
|
|
movd mm2,D [esi+ +0]
|
|
movd mm3,D [esi+ +4]
|
|
movd mm4,D [esi+ eax*4 -4]
|
|
movd mm5,D [esi+ eax*4 +0]
|
|
movd mm6,D [esi+ eax*4 +4]
|
|
punpcklbw mm1,mm0
|
|
punpcklbw mm2,mm0
|
|
punpcklbw mm3,mm0
|
|
punpcklbw mm4,mm0
|
|
punpcklbw mm5,mm0
|
|
punpcklbw mm6,mm0
|
|
paddw mm1,mm3
|
|
paddw mm4,mm6
|
|
pmullw mm1,Q [mmEch]
|
|
pmullw mm2,Q [mmEm]
|
|
pmullw mm4,Q [mmEcl]
|
|
pmullw mm5,Q [mmEe]
|
|
paddw mm1,mm2
|
|
paddw mm1,mm4
|
|
paddw mm1,mm5
|
|
paddsw mm1,Q [mmAdd]
|
|
pmulhw mm1,Q [mmInvDiv]
|
|
packuswb mm1,mm0
|
|
movd D [ebx+ aulRows],mm1
|
|
// advance to next pixel
|
|
add esi,4
|
|
add ebx,4
|
|
dec ecx
|
|
jnz upperLoop
|
|
|
|
// ----------------------- process upper right corner
|
|
|
|
movd mm1,D [esi+ -4]
|
|
movd mm2,D [esi+ +0]
|
|
movd mm3,D [esi+ eax*4 -4]
|
|
movd mm4,D [esi+ eax*4 +0]
|
|
punpcklbw mm1,mm0
|
|
punpcklbw mm2,mm0
|
|
punpcklbw mm3,mm0
|
|
punpcklbw mm4,mm0
|
|
paddw mm1,mm4
|
|
pmullw mm1,Q [mmCe]
|
|
pmullw mm2,Q [mmCm]
|
|
pmullw mm3,Q [mmCc]
|
|
paddw mm1,mm2
|
|
paddw mm1,mm3
|
|
paddsw mm1,Q [mmAdd]
|
|
pmulhw mm1,Q [mmInvDiv]
|
|
packuswb mm1,mm0
|
|
movd D [ebx+ aulRows],mm1
|
|
|
|
// ----------------------- process bitmap middle pixels
|
|
|
|
add esi,D [slModulo1]
|
|
add edi,D [slCanvasWidth]
|
|
mov ebx,D [pixHeight]
|
|
sub ebx,2
|
|
// for each row
|
|
rowLoop:
|
|
push ebx
|
|
xor ebx,ebx
|
|
// process left edge pixel
|
|
movd mm1,D [esi+ edx*4 +0]
|
|
movd mm2,D [esi+ edx*4 +4]
|
|
movd mm3,D [esi+ +0]
|
|
movd mm4,D [esi+ +4]
|
|
movd mm5,D [esi+ eax*4 +0]
|
|
movd mm6,D [esi+ eax*4 +4]
|
|
punpcklbw mm1,mm0
|
|
punpcklbw mm2,mm0
|
|
punpcklbw mm3,mm0
|
|
punpcklbw mm4,mm0
|
|
punpcklbw mm5,mm0
|
|
punpcklbw mm6,mm0
|
|
paddw mm1,mm5
|
|
paddw mm2,mm6
|
|
pmullw mm1,Q [mmEch]
|
|
pmullw mm2,Q [mmEcl]
|
|
pmullw mm3,Q [mmEm]
|
|
pmullw mm4,Q [mmEe]
|
|
paddw mm1,mm2
|
|
paddw mm1,mm3
|
|
paddw mm1,mm4
|
|
paddsw mm1,Q [mmAdd]
|
|
pmulhw mm1,Q [mmInvDiv]
|
|
packuswb mm1,mm0
|
|
movd mm2,D [ebx+ aulRows]
|
|
movd D [ebx+ aulRows],mm1
|
|
movd D [edi+ edx*4],mm2
|
|
add esi,4
|
|
add edi,4
|
|
add ebx,4
|
|
|
|
// for each pixel in current row
|
|
mov ecx,D [pixWidth]
|
|
sub ecx,2
|
|
pixLoop:
|
|
// prepare upper convolution row
|
|
movd mm1,D [esi+ edx*4 -4]
|
|
movd mm2,D [esi+ edx*4 +0]
|
|
movd mm3,D [esi+ edx*4 +4]
|
|
punpcklbw mm1,mm0
|
|
punpcklbw mm2,mm0
|
|
punpcklbw mm3,mm0
|
|
// prepare middle convolution row
|
|
movd mm4,D [esi+ -4]
|
|
movd mm5,D [esi+ +0]
|
|
movd mm6,D [esi+ +4]
|
|
punpcklbw mm4,mm0
|
|
punpcklbw mm5,mm0
|
|
punpcklbw mm6,mm0
|
|
// free some registers
|
|
paddw mm1,mm3
|
|
paddw mm2,mm4
|
|
pmullw mm5,Q [mmMm]
|
|
// prepare lower convolution row
|
|
movd mm3,D [esi+ eax*4 -4]
|
|
movd mm4,D [esi+ eax*4 +0]
|
|
movd mm7,D [esi+ eax*4 +4]
|
|
punpcklbw mm3,mm0
|
|
punpcklbw mm4,mm0
|
|
punpcklbw mm7,mm0
|
|
// calc weightened value
|
|
paddw mm2,mm6
|
|
paddw mm1,mm3
|
|
paddw mm2,mm4
|
|
paddw mm1,mm7
|
|
pmullw mm2,Q [mmMe]
|
|
pmullw mm1,Q [mmMc]
|
|
paddw mm2,mm5
|
|
paddw mm1,mm2
|
|
// calc and store wightened value
|
|
paddsw mm1,Q [mmAdd]
|
|
pmulhw mm1,Q [mmInvDiv]
|
|
packuswb mm1,mm0
|
|
movd mm2,D [ebx+ aulRows]
|
|
movd D [ebx+ aulRows],mm1
|
|
movd D [edi+ edx*4],mm2
|
|
// advance to next pixel
|
|
add esi,4
|
|
add edi,4
|
|
add ebx,4
|
|
dec ecx
|
|
jnz pixLoop
|
|
|
|
// process right edge pixel
|
|
movd mm1,D [esi+ edx*4 -4]
|
|
movd mm2,D [esi+ edx*4 +0]
|
|
movd mm3,D [esi+ -4]
|
|
movd mm4,D [esi+ +0]
|
|
movd mm5,D [esi+ eax*4 -4]
|
|
movd mm6,D [esi+ eax*4 +0]
|
|
punpcklbw mm1,mm0
|
|
punpcklbw mm2,mm0
|
|
punpcklbw mm3,mm0
|
|
punpcklbw mm4,mm0
|
|
punpcklbw mm5,mm0
|
|
punpcklbw mm6,mm0
|
|
paddw mm1,mm5
|
|
paddw mm2,mm6
|
|
pmullw mm1,Q [mmEcl]
|
|
pmullw mm2,Q [mmEch]
|
|
pmullw mm3,Q [mmEe]
|
|
pmullw mm4,Q [mmEm]
|
|
paddw mm1,mm2
|
|
paddw mm1,mm3
|
|
paddw mm1,mm4
|
|
paddsw mm1,Q [mmAdd]
|
|
pmulhw mm1,Q [mmInvDiv]
|
|
packuswb mm1,mm0
|
|
movd mm2,D [ebx+ aulRows]
|
|
movd D [ebx+ aulRows],mm1
|
|
movd D [edi+ edx*4],mm2
|
|
// advance to next row
|
|
add esi,D [slModulo1]
|
|
add edi,D [slModulo1]
|
|
pop ebx
|
|
dec ebx
|
|
jnz rowLoop
|
|
|
|
// ----------------------- process lower left corner
|
|
|
|
xor ebx,ebx
|
|
movd mm1,D [esi+ edx*4 +0]
|
|
movd mm2,D [esi+ edx*4 +4]
|
|
movd mm3,D [esi+ +0]
|
|
movd mm4,D [esi+ +4]
|
|
punpcklbw mm1,mm0
|
|
punpcklbw mm2,mm0
|
|
punpcklbw mm3,mm0
|
|
punpcklbw mm4,mm0
|
|
paddw mm1,mm4
|
|
pmullw mm1,Q [mmCe]
|
|
pmullw mm2,Q [mmCc]
|
|
pmullw mm3,Q [mmCm]
|
|
paddw mm1,mm2
|
|
paddw mm1,mm3
|
|
paddsw mm1,Q [mmAdd]
|
|
pmulhw mm1,Q [mmInvDiv]
|
|
packuswb mm1,mm0
|
|
movd mm2,D [ebx+ aulRows]
|
|
movd D [edi],mm1
|
|
movd D [edi+ edx*4],mm2
|
|
add esi,4
|
|
add edi,4
|
|
add ebx,4
|
|
|
|
// ----------------------- process lower edge pixels
|
|
|
|
mov ecx,D [pixWidth]
|
|
sub ecx,2
|
|
// for each pixel
|
|
lowerLoop:
|
|
movd mm1,D [esi+ edx*4 -4]
|
|
movd mm2,D [esi+ edx*4 +0]
|
|
movd mm3,D [esi+ edx*4 +4]
|
|
movd mm4,D [esi+ -4]
|
|
movd mm5,D [esi+ +0]
|
|
movd mm6,D [esi+ +4]
|
|
punpcklbw mm1,mm0
|
|
punpcklbw mm2,mm0
|
|
punpcklbw mm3,mm0
|
|
punpcklbw mm4,mm0
|
|
punpcklbw mm5,mm0
|
|
punpcklbw mm6,mm0
|
|
paddw mm1,mm3
|
|
paddw mm4,mm6
|
|
pmullw mm1,Q [mmEcl]
|
|
pmullw mm2,Q [mmEe]
|
|
pmullw mm4,Q [mmEch]
|
|
pmullw mm5,Q [mmEm]
|
|
paddw mm1,mm2
|
|
paddw mm1,mm4
|
|
paddw mm1,mm5
|
|
paddsw mm1,Q [mmAdd]
|
|
pmulhw mm1,Q [mmInvDiv]
|
|
packuswb mm1,mm0
|
|
movd mm2,D [ebx+ aulRows]
|
|
movd D [edi],mm1
|
|
movd D [edi+ edx*4],mm2
|
|
// advance to next pixel
|
|
add esi,4
|
|
add edi,4
|
|
add ebx,4
|
|
dec ecx
|
|
jnz lowerLoop
|
|
|
|
// ----------------------- lower right corners
|
|
|
|
movd mm1,D [esi+ edx*4 -4]
|
|
movd mm2,D [esi+ edx*4 +0]
|
|
movd mm3,D [esi+ -4]
|
|
movd mm4,D [esi+ +0]
|
|
punpcklbw mm1,mm0
|
|
punpcklbw mm2,mm0
|
|
punpcklbw mm3,mm0
|
|
punpcklbw mm4,mm0
|
|
paddw mm2,mm3
|
|
pmullw mm1,Q [mmCc]
|
|
pmullw mm2,Q [mmCe]
|
|
pmullw mm4,Q [mmCm]
|
|
paddw mm1,mm2
|
|
paddw mm1,mm4
|
|
paddsw mm1,Q [mmAdd]
|
|
pmulhw mm1,Q [mmInvDiv]
|
|
packuswb mm1,mm0
|
|
movd mm2,D [ebx+ aulRows]
|
|
movd D [edi],mm1
|
|
movd D [edi+ edx*4],mm2
|
|
emms
|
|
}
|
|
|
|
#elif (defined __GNU_INLINE__)
|
|
|
|
FB_pulSrc = pulSrc;
|
|
FB_pulDst = pulDst;
|
|
FB_pixWidth = pixWidth;
|
|
FB_pixHeight = pixHeight;
|
|
FB_pixCanvasWidth = pixCanvasWidth;
|
|
FB_slModulo1 = slModulo1;
|
|
FB_slCanvasWidth = slCanvasWidth;
|
|
|
|
__asm__ __volatile__ (
|
|
"pushl %%ebx \n\t"
|
|
"cld \n\t"
|
|
"movl (" ASMSYM(FB_pixCanvasWidth) "), %%eax \n\t" // EAX = positive row offset
|
|
"movl %%eax, %%edx \n\t"
|
|
"negl %%edx \n\t" // EDX = negative row offset
|
|
"pxor %%mm0, %%mm0 \n\t"
|
|
"movl (" ASMSYM(FB_pulSrc) "), %%esi \n\t"
|
|
"movl (" ASMSYM(FB_pulDst) "), %%edi \n\t"
|
|
"xorl %%ebx, %%ebx \n\t"
|
|
|
|
// ----------------------- process upper left corner
|
|
|
|
"movd 0(%%esi), %%mm1 \n\t"
|
|
"movd 4(%%esi), %%mm2 \n\t"
|
|
"movd 0(%%esi, %%eax, 4), %%mm3 \n\t"
|
|
"movd 4(%%esi, %%eax, 4), %%mm4 \n\t"
|
|
"punpcklbw %%mm0, %%mm1 \n\t"
|
|
"punpcklbw %%mm0, %%mm2 \n\t"
|
|
"punpcklbw %%mm0, %%mm3 \n\t"
|
|
"punpcklbw %%mm0, %%mm4 \n\t"
|
|
"paddw %%mm3, %%mm2 \n\t"
|
|
"pmullw (" ASMSYM(mmCm) "), %%mm1 \n\t"
|
|
"pmullw (" ASMSYM(mmEch) "), %%mm2 \n\t"
|
|
"pmullw (" ASMSYM(mmMc) "), %%mm4 \n\t"
|
|
"paddw %%mm2, %%mm1 \n\t"
|
|
"paddw %%mm4, %%mm1 \n\t"
|
|
"paddsw (" ASMSYM(mmAdd) "), %%mm1 \n\t"
|
|
"pmulhw (" ASMSYM(mmInvDiv) "), %%mm1 \n\t"
|
|
"packuswb %%mm0, %%mm1 \n\t"
|
|
"movd %%mm1, " ASMSYM(aulRows) "(%%ebx) \n\t"
|
|
"add $4, %%esi \n\t"
|
|
"add $4, %%ebx \n\t"
|
|
|
|
// ----------------------- process upper edge pixels
|
|
|
|
"movl (" ASMSYM(FB_pixWidth) "), %%ecx \n\t"
|
|
"subl $2, %%ecx \n\t"
|
|
|
|
// for each pixel
|
|
"0: \n\t" // upperLoop
|
|
"movd -4(%%esi), %%mm1 \n\t"
|
|
"movd 0(%%esi), %%mm2 \n\t"
|
|
"movd 4(%%esi), %%mm3 \n\t"
|
|
"movd -4(%%esi, %%eax, 4), %%mm4 \n\t"
|
|
"movd 0(%%esi, %%eax, 4), %%mm5 \n\t"
|
|
"movd 4(%%esi, %%eax, 4), %%mm6 \n\t"
|
|
"punpcklbw %%mm0, %%mm1 \n\t"
|
|
"punpcklbw %%mm0, %%mm2 \n\t"
|
|
"punpcklbw %%mm0, %%mm3 \n\t"
|
|
"punpcklbw %%mm0, %%mm4 \n\t"
|
|
"punpcklbw %%mm0, %%mm5 \n\t"
|
|
"punpcklbw %%mm0, %%mm6 \n\t"
|
|
"paddw %%mm3, %%mm1 \n\t"
|
|
"paddw %%mm6, %%mm4 \n\t"
|
|
"pmullw (" ASMSYM(mmEch) "), %%mm1 \n\t"
|
|
"pmullw (" ASMSYM(mmEm) "), %%mm2 \n\t"
|
|
"pmullw (" ASMSYM(mmMc) "), %%mm4 \n\t"
|
|
"pmullw (" ASMSYM(mmMe) "), %%mm5 \n\t"
|
|
"paddw %%mm2, %%mm1 \n\t"
|
|
"paddw %%mm4, %%mm1 \n\t"
|
|
"paddw %%mm5, %%mm1 \n\t"
|
|
"paddsw (" ASMSYM(mmAdd) "), %%mm1 \n\t"
|
|
"pmulhw (" ASMSYM(mmInvDiv) "), %%mm1 \n\t"
|
|
"packuswb %%mm0, %%mm1 \n\t"
|
|
"movd %%mm1, " ASMSYM(aulRows) "(%%ebx) \n\t"
|
|
|
|
// advance to next pixel
|
|
"addl $4, %%esi \n\t"
|
|
"addl $4, %%ebx \n\t"
|
|
"decl %%ecx \n\t"
|
|
"jnz 0b \n\t" // upperLoop
|
|
|
|
// ----------------------- process upper right corner
|
|
|
|
"movd -4(%%esi), %%mm1 \n\t"
|
|
"movd 0(%%esi), %%mm2 \n\t"
|
|
"movd -4(%%esi, %%eax, 4), %%mm3 \n\t"
|
|
"movd 0(%%esi, %%eax, 4), %%mm4 \n\t"
|
|
"punpcklbw %%mm0, %%mm1 \n\t"
|
|
"punpcklbw %%mm0, %%mm2 \n\t"
|
|
"punpcklbw %%mm0, %%mm3 \n\t"
|
|
"punpcklbw %%mm0, %%mm4 \n\t"
|
|
"paddw %%mm4, %%mm1 \n\t"
|
|
"pmullw (" ASMSYM(mmEch) "), %%mm1 \n\t"
|
|
"pmullw (" ASMSYM(mmCm) "), %%mm2 \n\t"
|
|
"pmullw (" ASMSYM(mmMc) "), %%mm3 \n\t"
|
|
"paddw %%mm2, %%mm1 \n\t"
|
|
"paddw %%mm3, %%mm1 \n\t"
|
|
"paddsw (" ASMSYM(mmAdd) "), %%mm1 \n\t"
|
|
"pmulhw (" ASMSYM(mmInvDiv) "), %%mm1 \n\t"
|
|
"packuswb %%mm0, %%mm1 \n\t"
|
|
"movd %%mm1, " ASMSYM(aulRows) "(%%ebx) \n\t"
|
|
|
|
// ----------------------- process bitmap middle pixels
|
|
|
|
"addl (" ASMSYM(FB_slModulo1) "), %%esi \n\t"
|
|
"addl (" ASMSYM(FB_slCanvasWidth) "), %%edi \n\t"
|
|
"movl (" ASMSYM(FB_pixHeight) "), %%ebx \n\t"
|
|
"subl $2, %%ebx \n\t"
|
|
|
|
// for each row
|
|
"1: \n\t" // rowLoop
|
|
"pushl %%ebx \n\t"
|
|
"xorl %%ebx, %%ebx \n\t"
|
|
// process left edge pixel
|
|
"movd 0(%%esi, %%edx, 4), %%mm1 \n\t"
|
|
"movd 4(%%esi, %%edx, 4), %%mm2 \n\t"
|
|
"movd 0(%%esi), %%mm3 \n\t"
|
|
"movd 4(%%esi), %%mm4 \n\t"
|
|
"movd 0(%%esi, %%eax, 4), %%mm5 \n\t"
|
|
"movd 4(%%esi, %%eax, 4), %%mm6 \n\t"
|
|
"punpcklbw %%mm0, %%mm1 \n\t"
|
|
"punpcklbw %%mm0, %%mm2 \n\t"
|
|
"punpcklbw %%mm0, %%mm3 \n\t"
|
|
"punpcklbw %%mm0, %%mm4 \n\t"
|
|
"punpcklbw %%mm0, %%mm5 \n\t"
|
|
"punpcklbw %%mm0, %%mm6 \n\t"
|
|
"paddw %%mm5, %%mm1 \n\t"
|
|
"paddw %%mm6, %%mm2 \n\t"
|
|
"pmullw (" ASMSYM(mmEch) "), %%mm1 \n\t"
|
|
"pmullw (" ASMSYM(mmMc) "), %%mm2 \n\t"
|
|
"pmullw (" ASMSYM(mmEm) "), %%mm3 \n\t"
|
|
"pmullw (" ASMSYM(mmMe) "), %%mm4 \n\t"
|
|
"paddw %%mm2, %%mm1 \n\t"
|
|
"paddw %%mm3, %%mm1 \n\t"
|
|
"paddw %%mm4, %%mm1 \n\t"
|
|
"paddsw (" ASMSYM(mmAdd) "), %%mm1 \n\t"
|
|
"pmulhw (" ASMSYM(mmInvDiv) "), %%mm1 \n\t"
|
|
"packuswb %%mm0, %%mm1 \n\t"
|
|
"movd " ASMSYM(aulRows) "(%%ebx), %%mm2 \n\t"
|
|
"movd %%mm1, " ASMSYM(aulRows) "(%%ebx) \n\t"
|
|
"movd %%mm2, 0(%%edi, %%edx, 4) \n\t"
|
|
"add $4, %%esi \n\t"
|
|
"add $4, %%edi \n\t"
|
|
"add $4, %%ebx \n\t"
|
|
|
|
// for each pixel in current row
|
|
"mov (" ASMSYM(FB_pixWidth) "), %%ecx \n\t"
|
|
"sub $2, %%ecx \n\t"
|
|
"2: \n\t" // pixLoop
|
|
// prepare upper convolution row
|
|
"movd -4(%%esi, %%edx, 4), %%mm1 \n\t"
|
|
"movd 0(%%esi, %%edx, 4), %%mm2 \n\t"
|
|
"movd 4(%%esi, %%edx, 4), %%mm3 \n\t"
|
|
"punpcklbw %%mm0, %%mm1 \n\t"
|
|
"punpcklbw %%mm0, %%mm2 \n\t"
|
|
"punpcklbw %%mm0, %%mm3 \n\t"
|
|
// prepare middle convolution row
|
|
"movd -4(%%esi), %%mm4 \n\t"
|
|
"movd 0(%%esi), %%mm5 \n\t"
|
|
"movd 4(%%esi), %%mm6 \n\t"
|
|
"punpcklbw %%mm0, %%mm4 \n\t"
|
|
"punpcklbw %%mm0, %%mm5 \n\t"
|
|
"punpcklbw %%mm0, %%mm6 \n\t"
|
|
// free some registers
|
|
"paddw %%mm3, %%mm1 \n\t"
|
|
"paddw %%mm4, %%mm2 \n\t"
|
|
"pmullw (" ASMSYM(mmMm) "), %%mm5 \n\t"
|
|
// prepare lower convolution row
|
|
"movd -4(%%esi, %%eax, 4), %%mm3 \n\t"
|
|
"movd 0(%%esi, %%eax, 4), %%mm4 \n\t"
|
|
"movd 4(%%esi, %%eax, 4), %%mm7 \n\t"
|
|
"punpcklbw %%mm0, %%mm3 \n\t"
|
|
"punpcklbw %%mm0, %%mm4 \n\t"
|
|
"punpcklbw %%mm0, %%mm7 \n\t"
|
|
// calc weighted value
|
|
"paddw %%mm6, %%mm2 \n\t"
|
|
"paddw %%mm3, %%mm1 \n\t"
|
|
"paddw %%mm4, %%mm2 \n\t"
|
|
"paddw %%mm7, %%mm1 \n\t"
|
|
"pmullw (" ASMSYM(mmMe) "), %%mm2 \n\t"
|
|
"pmullw (" ASMSYM(mmMc) "), %%mm1 \n\t"
|
|
"paddw %%mm5, %%mm2 \n\t"
|
|
"paddw %%mm2, %%mm1 \n\t"
|
|
// calc and store weighted value
|
|
"paddsw (" ASMSYM(mmAdd) "), %%mm1 \n\t"
|
|
"pmulhw (" ASMSYM(mmInvDiv) "), %%mm1 \n\t"
|
|
"packuswb %%mm0, %%mm1 \n\t"
|
|
"movd " ASMSYM(aulRows) "(%%ebx), %%mm2 \n\t"
|
|
"movd %%mm1, " ASMSYM(aulRows) "(%%ebx) \n\t"
|
|
"movd %%mm2, (%%edi, %%edx, 4) \n\t"
|
|
// advance to next pixel
|
|
"addl $4, %%esi \n\t"
|
|
"addl $4, %%edi \n\t"
|
|
"addl $4, %%ebx \n\t"
|
|
"decl %%ecx \n\t"
|
|
"jnz 2b \n\t" // pixLoop
|
|
|
|
// process right edge pixel
|
|
"movd -4(%%esi, %%edx, 4), %%mm1 \n\t"
|
|
"movd 0(%%esi, %%edx, 4), %%mm2 \n\t"
|
|
"movd -4(%%esi), %%mm3 \n\t"
|
|
"movd 0(%%esi), %%mm4 \n\t"
|
|
"movd -4(%%esi, %%eax, 4), %%mm5 \n\t"
|
|
"movd 0(%%esi, %%eax, 4), %%mm6 \n\t"
|
|
"punpcklbw %%mm0, %%mm1 \n\t"
|
|
"punpcklbw %%mm0, %%mm2 \n\t"
|
|
"punpcklbw %%mm0, %%mm3 \n\t"
|
|
"punpcklbw %%mm0, %%mm4 \n\t"
|
|
"punpcklbw %%mm0, %%mm5 \n\t"
|
|
"punpcklbw %%mm0, %%mm6 \n\t"
|
|
"paddw %%mm5, %%mm1 \n\t"
|
|
"paddw %%mm6, %%mm2 \n\t"
|
|
"pmullw (" ASMSYM(mmMc) "), %%mm1 \n\t"
|
|
"pmullw (" ASMSYM(mmEch) "), %%mm2 \n\t"
|
|
"pmullw (" ASMSYM(mmMe) "), %%mm3 \n\t"
|
|
"pmullw (" ASMSYM(mmEm) "), %%mm4 \n\t"
|
|
"paddw %%mm2, %%mm1 \n\t"
|
|
"paddw %%mm3, %%mm1 \n\t"
|
|
"paddw %%mm4, %%mm1 \n\t"
|
|
"paddsw (" ASMSYM(mmAdd) "), %%mm1 \n\t"
|
|
"pmulhw (" ASMSYM(mmInvDiv) "), %%mm1 \n\t"
|
|
"packuswb %%mm0, %%mm1 \n\t"
|
|
"movd " ASMSYM(aulRows) "(%%ebx), %%mm2 \n\t"
|
|
"movd %%mm1, " ASMSYM(aulRows) "(%%ebx) \n\t"
|
|
"movd %%mm2, 0(%%edi, %%edx, 4) \n\t"
|
|
|
|
// advance to next row
|
|
"addl (" ASMSYM(FB_slModulo1) "), %%esi \n\t" // slModulo1
|
|
"addl (" ASMSYM(FB_slModulo1) "), %%edi \n\t" // slModulo1
|
|
"popl %%ebx \n\t"
|
|
"decl %%ebx \n\t"
|
|
"jnz 1b \n\t" // rowLoop
|
|
|
|
// ----------------------- process lower left corner
|
|
|
|
"xorl %%ebx, %%ebx \n\t"
|
|
"movd 0(%%esi, %%edx, 4), %%mm1 \n\t"
|
|
"movd 4(%%esi, %%edx, 4), %%mm2 \n\t"
|
|
"movd 0(%%esi), %%mm3 \n\t"
|
|
"movd 4(%%esi), %%mm4 \n\t"
|
|
"punpcklbw %%mm0, %%mm1 \n\t"
|
|
"punpcklbw %%mm0, %%mm2 \n\t"
|
|
"punpcklbw %%mm0, %%mm3 \n\t"
|
|
"punpcklbw %%mm0, %%mm4 \n\t"
|
|
"paddw %%mm4, %%mm1 \n\t"
|
|
"pmullw (" ASMSYM(mmEch) "), %%mm1 \n\t"
|
|
"pmullw (" ASMSYM(mmMc) "), %%mm2 \n\t"
|
|
"pmullw (" ASMSYM(mmCm) "), %%mm3 \n\t"
|
|
"paddw %%mm2, %%mm1 \n\t"
|
|
"paddw %%mm3, %%mm1 \n\t"
|
|
"paddsw (" ASMSYM(mmAdd) "), %%mm1 \n\t"
|
|
"pmulhw (" ASMSYM(mmInvDiv) "), %%mm1 \n\t"
|
|
"packuswb %%mm0, %%mm1 \n\t"
|
|
"movd " ASMSYM(aulRows) "(%%ebx), %%mm2 \n\t"
|
|
"movd %%mm1, (%%edi) \n\t"
|
|
"movd %%mm2, 0(%%edi, %%edx, 4) \n\t"
|
|
"add $4, %%esi \n\t"
|
|
"add $4, %%edi \n\t"
|
|
"add $4, %%ebx \n\t"
|
|
|
|
// ----------------------- process lower edge pixels
|
|
|
|
"movl (" ASMSYM(FB_pixWidth) "), %%ecx \n\t" // pixWidth
|
|
"subl $2, %%ecx \n\t"
|
|
// for each pixel
|
|
"3: \n\t" // lowerLoop
|
|
"movd -4(%%esi, %%edx, 4), %%mm1 \n\t"
|
|
"movd 0(%%esi, %%edx, 4), %%mm2 \n\t"
|
|
"movd 4(%%esi, %%edx, 4), %%mm3 \n\t"
|
|
"movd -4(%%esi), %%mm4 \n\t"
|
|
"movd 0(%%esi), %%mm5 \n\t"
|
|
"movd 4(%%esi), %%mm6 \n\t"
|
|
"punpcklbw %%mm0, %%mm1 \n\t"
|
|
"punpcklbw %%mm0, %%mm2 \n\t"
|
|
"punpcklbw %%mm0, %%mm3 \n\t"
|
|
"punpcklbw %%mm0, %%mm4 \n\t"
|
|
"punpcklbw %%mm0, %%mm5 \n\t"
|
|
"punpcklbw %%mm0, %%mm6 \n\t"
|
|
"paddw %%mm3, %%mm1 \n\t"
|
|
"paddw %%mm6, %%mm4 \n\t"
|
|
"pmullw (" ASMSYM(mmMc) "), %%mm1 \n\t"
|
|
"pmullw (" ASMSYM(mmMe) "), %%mm2 \n\t"
|
|
"pmullw (" ASMSYM(mmEch) "), %%mm4 \n\t"
|
|
"pmullw (" ASMSYM(mmEm) "), %%mm5 \n\t"
|
|
"paddw %%mm2, %%mm1 \n\t"
|
|
"paddw %%mm4, %%mm1 \n\t"
|
|
"paddw %%mm5, %%mm1 \n\t"
|
|
"paddsw (" ASMSYM(mmAdd) "), %%mm1 \n\t"
|
|
"pmulhw (" ASMSYM(mmInvDiv) "), %%mm1 \n\t"
|
|
"packuswb %%mm0, %%mm1 \n\t"
|
|
"movd " ASMSYM(aulRows) "(%%ebx), %%mm2 \n\t"
|
|
"movd %%mm1, (%%edi) \n\t"
|
|
"movd %%mm2, 0(%%edi, %%edx, 4) \n\t"
|
|
// advance to next pixel
|
|
"addl $4, %%esi \n\t"
|
|
"addl $4, %%edi \n\t"
|
|
"addl $4, %%ebx \n\t"
|
|
"decl %%ecx \n\t"
|
|
"jnz 3b \n\t" // lowerLoop
|
|
|
|
// ----------------------- lower right corners
|
|
|
|
"movd -4(%%esi, %%edx, 4), %%mm1 \n\t"
|
|
"movd 0(%%esi, %%edx, 4), %%mm2 \n\t"
|
|
"movd -4(%%esi), %%mm3 \n\t"
|
|
"movd 0(%%esi), %%mm4 \n\t"
|
|
"punpcklbw %%mm0, %%mm1 \n\t"
|
|
"punpcklbw %%mm0, %%mm2 \n\t"
|
|
"punpcklbw %%mm0, %%mm3 \n\t"
|
|
"punpcklbw %%mm0, %%mm4 \n\t"
|
|
"paddw %%mm3, %%mm2 \n\t"
|
|
"pmullw (" ASMSYM(mmMc) "), %%mm1 \n\t"
|
|
"pmullw (" ASMSYM(mmEch) "), %%mm2 \n\t"
|
|
"pmullw (" ASMSYM(mmCm) "), %%mm4 \n\t"
|
|
"paddw %%mm2, %%mm1 \n\t"
|
|
"paddw %%mm4, %%mm1 \n\t"
|
|
"paddsw (" ASMSYM(mmAdd) "), %%mm1 \n\t"
|
|
"pmulhw (" ASMSYM(mmInvDiv) "), %%mm1 \n\t"
|
|
"packuswb %%mm0, %%mm1 \n\t"
|
|
"movd " ASMSYM(aulRows) "(%%ebx), %%mm2 \n\t"
|
|
"movd %%mm1, (%%edi) \n\t"
|
|
"movd %%mm2, 0(%%edi, %%edx, 4) \n\t"
|
|
"emms \n\t"
|
|
"popl %%ebx \n\t"
|
|
: // no outputs.
|
|
: // inputs are all globals.
|
|
: "eax", "ecx", "edx", "edi", "esi", "cc", "memory"
|
|
);
|
|
|
|
#else
|
|
#error Write inline asm for your platform.
|
|
#endif
|
|
|
|
// all done (finally)
|
|
_pfGfxProfile.StopTimer( CGfxProfile::PTI_FILTERBITMAP);
|
|
}
|
|
|
|
|
|
|
|
// saturate color of bitmap
|
|
void AdjustBitmapColor( ULONG *pulSrc, ULONG *pulDst, PIX pixWidth, PIX pixHeight,
|
|
SLONG const slHueShift, SLONG const slSaturation)
|
|
{
|
|
for( INDEX i=0; i<(pixWidth*pixHeight); i++) {
|
|
pulDst[i] = ByteSwap( AdjustColor( ByteSwap(pulSrc[i]), slHueShift, slSaturation));
|
|
}
|
|
}
|
|
|
|
|
|
// create mip-map table for texture or shadow of given dimensions
|
|
void MakeMipmapTable( PIX pixU, PIX pixV, MipmapTable &mmt)
|
|
{
|
|
mmt.mmt_pixU = pixU;
|
|
mmt.mmt_pixV = pixV;
|
|
// start at first mip map
|
|
PIX pixCurrentU = mmt.mmt_pixU;
|
|
PIX pixCurrentV = mmt.mmt_pixV;
|
|
INDEX iMipmapCurrent = 0;
|
|
SLONG slOffsetCurrent = 0;
|
|
// while the mip-map is not zero-sized
|
|
while (pixCurrentU>0 && pixCurrentV>0) {
|
|
// remember its offset
|
|
mmt.mmt_aslOffsets[iMipmapCurrent] = slOffsetCurrent;
|
|
// go to next mip map
|
|
slOffsetCurrent+=pixCurrentU*pixCurrentV;
|
|
iMipmapCurrent++;
|
|
pixCurrentU>>=1;
|
|
pixCurrentV>>=1;
|
|
}
|
|
// remember number of mip maps and total size
|
|
mmt.mmt_ctMipmaps = iMipmapCurrent;
|
|
mmt.mmt_slTotalSize = slOffsetCurrent;
|
|
}
|
|
|
|
|
|
|
|
// TRIANGLE MASK RENDERING (FOR MODEL CLUSTER SHADOWS) ROUTINES
|
|
|
|
static ULONG *_pulTexture;
|
|
static PIX _pixTexWidth, _pixTexHeight;
|
|
BOOL _bSomeDarkExists = FALSE;
|
|
|
|
|
|
// set texture that will be used for all subsequent triangles
|
|
void SetTriangleTexture( ULONG *pulCurrentMipmap, PIX pixMipWidth, PIX pixMipHeight)
|
|
{
|
|
_pulTexture = pulCurrentMipmap;
|
|
_pixTexWidth = pixMipWidth;
|
|
_pixTexHeight = pixMipHeight;
|
|
}
|
|
|
|
// render one triangle to mask plane for shadow casting purposes
|
|
void DrawTriangle_Mask( UBYTE *pubMaskPlane, SLONG slMaskWidth, SLONG slMaskHeight,
|
|
struct PolyVertex2D *ppv2Vtx1, struct PolyVertex2D *ppv2Vtx2,
|
|
struct PolyVertex2D *ppv2Vtx3, BOOL bTransparency)
|
|
{
|
|
struct PolyVertex2D *pUpper = ppv2Vtx1;
|
|
struct PolyVertex2D *pMiddle = ppv2Vtx2;
|
|
struct PolyVertex2D *pLower = ppv2Vtx3;
|
|
struct PolyVertex2D *pTmp;
|
|
|
|
// sort vertices by J position
|
|
if( pUpper->pv2_fJ > pMiddle->pv2_fJ) {
|
|
pTmp = pUpper; pUpper = pMiddle; pMiddle = pTmp;
|
|
}
|
|
if( pUpper->pv2_fJ > pLower->pv2_fJ) {
|
|
pTmp = pUpper; pUpper = pLower; pLower = pTmp;
|
|
}
|
|
if( pMiddle->pv2_fJ > pLower->pv2_fJ) {
|
|
pTmp = pMiddle; pMiddle = pLower; pLower = pTmp;
|
|
}
|
|
|
|
// determine vertical deltas
|
|
FLOAT fDJShort1 = pMiddle->pv2_fJ - pUpper->pv2_fJ;
|
|
FLOAT fDJShort2 = pLower->pv2_fJ - pMiddle->pv2_fJ;
|
|
FLOAT fDJLong = pLower->pv2_fJ - pUpper->pv2_fJ;
|
|
if( fDJLong == 0) return;
|
|
|
|
// determine horizontal deltas
|
|
FLOAT fDIShort1 = pMiddle->pv2_fI - pUpper->pv2_fI;
|
|
FLOAT fDIShort2 = pLower->pv2_fI - pMiddle->pv2_fI;
|
|
FLOAT fDILong = pLower->pv2_fI - pUpper->pv2_fI;
|
|
|
|
// determine U/K, V/K and 1/K deltas
|
|
FLOAT fD1oKShort1 = pMiddle->pv2_f1oK - pUpper->pv2_f1oK;
|
|
FLOAT fD1oKShort2 = pLower->pv2_f1oK - pMiddle->pv2_f1oK;
|
|
FLOAT fD1oKLong = pLower->pv2_f1oK - pUpper->pv2_f1oK;
|
|
FLOAT fDUoKShort1 = pMiddle->pv2_fUoK - pUpper->pv2_fUoK;
|
|
FLOAT fDUoKShort2 = pLower->pv2_fUoK - pMiddle->pv2_fUoK;
|
|
FLOAT fDUoKLong = pLower->pv2_fUoK - pUpper->pv2_fUoK;
|
|
FLOAT fDVoKShort1 = pMiddle->pv2_fVoK - pUpper->pv2_fVoK;
|
|
FLOAT fDVoKShort2 = pLower->pv2_fVoK - pMiddle->pv2_fVoK;
|
|
FLOAT fDVoKLong = pLower->pv2_fVoK - pUpper->pv2_fVoK;
|
|
|
|
// determine stepping factors;
|
|
FLOAT f1oDJShort1, f1oDJShort2, f1oDJLong;
|
|
if( fDJShort1 != 0) f1oDJShort1 = 1 / fDJShort1; else f1oDJShort1 = 0;
|
|
if( fDJShort2 != 0) f1oDJShort2 = 1 / fDJShort2; else f1oDJShort2 = 0;
|
|
if( fDJLong != 0) f1oDJLong = 1 / fDJLong; else f1oDJLong = 0;
|
|
|
|
FLOAT fDIoDJShort1 = fDIShort1 * f1oDJShort1;
|
|
FLOAT fDIoDJShort2 = fDIShort2 * f1oDJShort2;
|
|
FLOAT fDIoDJLong = fDILong * f1oDJLong;
|
|
FLOAT fMaxWidth = fDIoDJLong*fDJShort1 + pUpper->pv2_fI - pMiddle->pv2_fI;
|
|
|
|
// determine drawing direction and factors by direction
|
|
SLONG direction = +1;
|
|
if( fMaxWidth > 0) direction = -1;
|
|
|
|
// find start and end values for J
|
|
PIX pixUpJ = FloatToInt(pUpper->pv2_fJ +0.5f);
|
|
PIX pixMdJ = FloatToInt(pMiddle->pv2_fJ +0.5f);
|
|
PIX pixDnJ = FloatToInt(pLower->pv2_fJ +0.5f);
|
|
|
|
// clip vertically
|
|
if( pixDnJ<0 || pixUpJ>=slMaskHeight) return;
|
|
if( pixUpJ<0) pixUpJ=0;
|
|
if( pixDnJ>slMaskHeight) pixDnJ=slMaskHeight;
|
|
if( pixMdJ<0) pixMdJ=0;
|
|
if( pixMdJ>slMaskHeight) pixMdJ=slMaskHeight;
|
|
SLONG fixWidth = slMaskWidth<<11;
|
|
|
|
// find prestepped I
|
|
FLOAT fPrestepUp = (FLOAT)pixUpJ - pUpper->pv2_fJ;
|
|
FLOAT fPrestepMd = (FLOAT)pixMdJ - pMiddle->pv2_fJ;
|
|
SLONG fixILong = FloatToInt((pUpper->pv2_fI + fPrestepUp * fDIoDJLong )*2048.0f) +fixWidth*pixUpJ;
|
|
SLONG fixIShort1 = FloatToInt((pUpper->pv2_fI + fPrestepUp * fDIoDJShort1)*2048.0f) +fixWidth*pixUpJ;
|
|
SLONG fixIShort2 = FloatToInt((pMiddle->pv2_fI + fPrestepMd * fDIoDJShort2)*2048.0f) +fixWidth*pixMdJ;
|
|
|
|
// convert steps from floats to fixints (21:11)
|
|
SLONG fixDIoDJLong = FloatToInt(fDIoDJLong *2048.0f) +fixWidth;
|
|
SLONG fixDIoDJShort1 = FloatToInt(fDIoDJShort1*2048.0f) +fixWidth;
|
|
SLONG fixDIoDJShort2 = FloatToInt(fDIoDJShort2*2048.0f) +fixWidth;
|
|
|
|
// find row counter and max delta J
|
|
SLONG ctJShort1 = pixMdJ - pixUpJ;
|
|
SLONG ctJShort2 = pixDnJ - pixMdJ;
|
|
SLONG ctJLong = pixDnJ - pixUpJ;
|
|
|
|
FLOAT currK, curr1oK, currUoK, currVoK;
|
|
PIX pixJ = pixUpJ;
|
|
|
|
// if model has texture and texture has alpha channel, do complex mapping thru texture's alpha channel
|
|
if( _pulTexture!=NULL && bTransparency)
|
|
{
|
|
// calculate some texture variables
|
|
FLOAT fD1oKoDJShort1 = fD1oKShort1 * f1oDJShort1;
|
|
FLOAT fD1oKoDJShort2 = fD1oKShort2 * f1oDJShort2;
|
|
FLOAT fD1oKoDJLong = fD1oKLong * f1oDJLong;
|
|
FLOAT fDUoKoDJShort1 = fDUoKShort1 * f1oDJShort1;
|
|
FLOAT fDUoKoDJShort2 = fDUoKShort2 * f1oDJShort2;
|
|
FLOAT fDUoKoDJLong = fDUoKLong * f1oDJLong;
|
|
FLOAT fDVoKoDJShort1 = fDVoKShort1 * f1oDJShort1;
|
|
FLOAT fDVoKoDJShort2 = fDVoKShort2 * f1oDJShort2;
|
|
FLOAT fDVoKoDJLong = fDVoKLong * f1oDJLong;
|
|
;// FactOverDI = (DFoDJ * (J2-J1) + fact1 - fact2) * 1/width
|
|
FLOAT f1oMaxWidth = 1 / fMaxWidth;
|
|
FLOAT fD1oKoDI = (fD1oKoDJLong * fDJShort1 + pUpper->pv2_f1oK - pMiddle->pv2_f1oK) * f1oMaxWidth;
|
|
FLOAT fDUoKoDI = (fDUoKoDJLong * fDJShort1 + pUpper->pv2_fUoK - pMiddle->pv2_fUoK) * f1oMaxWidth;
|
|
FLOAT fDVoKoDI = (fDVoKoDJLong * fDJShort1 + pUpper->pv2_fVoK - pMiddle->pv2_fVoK) * f1oMaxWidth;
|
|
if( direction == -1) {
|
|
fD1oKoDI = -fD1oKoDI;
|
|
fDUoKoDI = -fDUoKoDI;
|
|
fDVoKoDI = -fDVoKoDI;
|
|
}
|
|
// find prestepped U/K, V/K, 1/K
|
|
FLOAT f1oKLong = pUpper->pv2_f1oK + fPrestepUp * fD1oKoDJLong;
|
|
FLOAT f1oKShort1 = pUpper->pv2_f1oK + fPrestepUp * fD1oKoDJShort1;
|
|
FLOAT f1oKShort2 = pMiddle->pv2_f1oK + fPrestepMd * fD1oKoDJShort2;
|
|
FLOAT fUoKLong = pUpper->pv2_fUoK + fPrestepUp * fDUoKoDJLong;
|
|
FLOAT fUoKShort1 = pUpper->pv2_fUoK + fPrestepUp * fDUoKoDJShort1;
|
|
FLOAT fUoKShort2 = pMiddle->pv2_fUoK + fPrestepMd * fDUoKoDJShort2;
|
|
FLOAT fVoKLong = pUpper->pv2_fVoK + fPrestepUp * fDVoKoDJLong;
|
|
FLOAT fVoKShort1 = pUpper->pv2_fVoK + fPrestepUp * fDVoKoDJShort1;
|
|
FLOAT fVoKShort2 = pMiddle->pv2_fVoK + fPrestepMd * fDVoKoDJShort2;
|
|
|
|
// render upper triangle part
|
|
PIX pixTexU, pixTexV;
|
|
while( ctJShort1>0) {
|
|
SLONG currI = fixILong>>11;
|
|
SLONG countI = abs( currI - (fixIShort1>>11));
|
|
if( countI==0) goto nextLine1;
|
|
curr1oK = f1oKLong;
|
|
currUoK = fUoKLong;
|
|
currVoK = fVoKLong;
|
|
if( direction == -1) currI--;
|
|
if( countI>0) _bSomeDarkExists = TRUE;
|
|
while( countI>0) {
|
|
currK = 1.0f/curr1oK;
|
|
pixTexU = (FloatToInt(currUoK*currK)) & (_pixTexWidth -1);
|
|
pixTexV = (FloatToInt(currVoK*currK)) & (_pixTexHeight-1);
|
|
if( _pulTexture[pixTexV*_pixTexWidth+pixTexU] & ((CT_rAMASK<<7)&CT_rAMASK)) pubMaskPlane[currI] = 0;
|
|
curr1oK += fD1oKoDI;
|
|
currUoK += fDUoKoDI;
|
|
currVoK += fDVoKoDI;
|
|
currI += direction;
|
|
countI--;
|
|
}
|
|
nextLine1:
|
|
pixJ++;
|
|
f1oKLong += fD1oKoDJLong;
|
|
f1oKShort1 += fD1oKoDJShort1;
|
|
fUoKLong += fDUoKoDJLong;
|
|
fUoKShort1 += fDUoKoDJShort1;
|
|
fVoKLong += fDVoKoDJLong;
|
|
fVoKShort1 += fDVoKoDJShort1;
|
|
fixILong += fixDIoDJLong;
|
|
fixIShort1 += fixDIoDJShort1;
|
|
ctJShort1--;
|
|
}
|
|
|
|
// render lower triangle part
|
|
while( ctJShort2>0) {
|
|
SLONG currI = fixILong>>11;
|
|
SLONG countI = abs( currI - (fixIShort2>>11));
|
|
if( countI==0) goto nextLine2;
|
|
curr1oK = f1oKLong;
|
|
currUoK = fUoKLong;
|
|
currVoK = fVoKLong;
|
|
if( direction == -1) currI--;
|
|
if( countI>0) _bSomeDarkExists = TRUE;
|
|
while( countI>0) {
|
|
currK = 1.0f/curr1oK;
|
|
pixTexU = (FloatToInt(currUoK*currK)) & (_pixTexWidth -1);
|
|
pixTexV = (FloatToInt(currVoK*currK)) & (_pixTexHeight-1);
|
|
if( _pulTexture[pixTexV*_pixTexWidth+pixTexU] & CT_rAMASK) pubMaskPlane[currI] = 0;
|
|
curr1oK += fD1oKoDI;
|
|
currUoK += fDUoKoDI;
|
|
currVoK += fDVoKoDI;
|
|
currI += direction;
|
|
countI--;
|
|
}
|
|
nextLine2:
|
|
pixJ++;
|
|
f1oKLong += fD1oKoDJLong;
|
|
f1oKShort2 += fD1oKoDJShort2;
|
|
fUoKLong += fDUoKoDJLong;
|
|
fUoKShort2 += fDUoKoDJShort2;
|
|
fVoKLong += fDVoKoDJLong;
|
|
fVoKShort2 += fDVoKoDJShort2;
|
|
fixILong += fixDIoDJLong;
|
|
fixIShort2 += fixDIoDJShort2;
|
|
ctJShort2--;
|
|
}
|
|
}
|
|
// simple flat mapping (no texture at all)
|
|
else
|
|
{
|
|
// render upper triangle part
|
|
while( ctJShort1>0) {
|
|
SLONG currI = fixILong>>11;
|
|
SLONG countI = abs( currI - (fixIShort1>>11));
|
|
if( direction == -1) currI--;
|
|
if( countI>0) _bSomeDarkExists = TRUE;
|
|
while( countI>0) {
|
|
pubMaskPlane[currI] = 0;
|
|
currI += direction;
|
|
countI--;
|
|
}
|
|
pixJ++;
|
|
fixILong += fixDIoDJLong;
|
|
fixIShort1 += fixDIoDJShort1;
|
|
ctJShort1--;
|
|
}
|
|
// render lower triangle part
|
|
while( ctJShort2>0) {
|
|
SLONG currI = fixILong>>11;
|
|
SLONG countI = abs( currI - (fixIShort2>>11));
|
|
if( countI>0) _bSomeDarkExists = TRUE;
|
|
if( direction == -1) currI--;
|
|
while( countI>0) {
|
|
pubMaskPlane[currI] = 0;
|
|
currI += direction;
|
|
countI--;
|
|
}
|
|
pixJ++;
|
|
fixILong += fixDIoDJLong;
|
|
fixIShort2 += fixDIoDJShort2;
|
|
ctJShort2--;
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
// ---------------------------------------------------------------------------------------------
|
|
|
|
|
|
#if 0
|
|
|
|
// bilinear filtering of lower mipmap
|
|
|
|
// row loop
|
|
UBYTE r,g,b,a;
|
|
for( PIX v=0; v<pixHeight; v++)
|
|
{ // column loop
|
|
for( PIX u=0; u<pixWidth; u++)
|
|
{ // read four neighbour pixels
|
|
COLOR colUL = pulSrcMipmap[((v*2+0)*pixCurrWidth*2+u*2) +0];
|
|
COLOR colUR = pulSrcMipmap[((v*2+0)*pixCurrWidth*2+u*2) +1];
|
|
COLOR colDL = pulSrcMipmap[((v*2+1)*pixCurrWidth*2+u*2) +0];
|
|
COLOR colDR = pulSrcMipmap[((v*2+1)*pixCurrWidth*2+u*2) +1];
|
|
// separate and add channels
|
|
ULONG rRes=0, gRes=0, bRes=0, aRes=0;
|
|
ColorToRGBA( colUL, r,g,b,a); rRes += r; gRes += g; bRes += b; aRes += a;
|
|
ColorToRGBA( colUR, r,g,b,a); rRes += r; gRes += g; bRes += b; aRes += a;
|
|
ColorToRGBA( colDL, r,g,b,a); rRes += r; gRes += g; bRes += b; aRes += a;
|
|
ColorToRGBA( colDR, r,g,b,a); rRes += r; gRes += g; bRes += b; aRes += a;
|
|
// round, average and store
|
|
rRes += 2; gRes += 2; bRes += 2; aRes += 2;
|
|
rRes >>=2; gRes >>=2; bRes >>=2; aRes >>=2;
|
|
pulDstMipmap[v*pixCurrWidth+u] = RGBAToColor( rRes,gRes,bRes,aRes);
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// nearest-neighbour sampling of lower mipmap (with border preservation)
|
|
|
|
// row loop
|
|
PIX u,v;
|
|
for( v=0; v<pixCurrHeight/2; v++) {
|
|
for( u=0; u<pixCurrWidth/2; u++) { // mipmap upper left pixel
|
|
pulDstMipmap[v*pixCurrWidth+u] = pulSrcMipmap[((v*2+0)*pixCurrWidth*2+u*2) +0];
|
|
}
|
|
for( u=pixCurrWidth/2; u<pixCurrWidth; u++) { // mipmap upper right pixel
|
|
pulDstMipmap[v*pixCurrWidth+u] = pulSrcMipmap[((v*2+0)*pixCurrWidth*2+u*2) +1];
|
|
}
|
|
}
|
|
for( v=pixCurrHeight/2; v<pixCurrHeight; v++) {
|
|
for( u=0; u<pixCurrWidth/2; u++) { // mipmap upper left pixel
|
|
pulDstMipmap[v*pixCurrWidth+u] = pulSrcMipmap[((v*2+1)*pixCurrWidth*2+u*2) +0];
|
|
}
|
|
for( u=pixCurrWidth/2; u<pixCurrWidth; u++) { // mipmap upper right pixel
|
|
pulDstMipmap[v*pixCurrWidth+u] = pulSrcMipmap[((v*2+1)*pixCurrWidth*2+u*2) +1];
|
|
}
|
|
}
|
|
|
|
|
|
|
|
// left to right error diffusion dithering
|
|
|
|
__asm {
|
|
pxor mm0,mm0
|
|
mov esi,D [pulDst]
|
|
mov ebx,D [pixCanvasWidth]
|
|
mov edx,D [pixHeight]
|
|
dec edx // need not to dither last row
|
|
rowLoopE:
|
|
mov ecx,D [pixWidth]
|
|
dec ecx
|
|
pixLoopE:
|
|
movd mm1,D [esi]
|
|
punpcklbw mm1,mm0
|
|
pand mm1,Q [mmErrDiffMask]
|
|
// determine errors
|
|
movq mm3,mm1
|
|
paddw mm3,mm3 // *2
|
|
movq mm5,mm3
|
|
paddw mm5,mm5 // *4
|
|
movq mm7,mm5
|
|
paddw mm7,mm7 // *8
|
|
paddw mm3,mm1 // *3
|
|
paddw mm5,mm1 // *5
|
|
psubw mm7,mm1 // *7
|
|
psrlw mm1,4
|
|
psrlw mm3,4
|
|
psrlw mm5,4
|
|
psrlw mm7,4
|
|
packuswb mm1,mm0
|
|
packuswb mm3,mm0
|
|
packuswb mm5,mm0
|
|
packuswb mm7,mm0
|
|
// spread errors
|
|
movd mm2,D [esi+ ebx*4 +4]
|
|
paddusb mm1,mm2
|
|
paddusb mm3,Q [esi+ ebx*4 -4]
|
|
paddusb mm5,Q [esi+ ebx*4 +0]
|
|
paddusb mm7,Q [esi+ +4]
|
|
movd D [esi+ ebx*4 +4],mm1
|
|
movd D [esi+ ebx*4 -4],mm3
|
|
movd D [esi+ ebx*4 +0],mm5
|
|
movd D [esi+ +4],mm7
|
|
// advance to next pixel
|
|
add esi,4
|
|
dec ecx
|
|
jnz pixLoopE
|
|
// advance to next row
|
|
add esi,D [slModulo]
|
|
dec edx
|
|
jnz rowLoopE
|
|
emms
|
|
}
|
|
|
|
|
|
// left to right and right to left error diffusion dithering
|
|
|
|
__asm {
|
|
pxor mm0,mm0
|
|
mov esi,D [pulDst]
|
|
mov ebx,D [pixCanvasWidth]
|
|
mov edx,D [pixHeight]
|
|
dec edx // need not to dither last row
|
|
rowLoopE:
|
|
// left to right
|
|
mov ecx,D [pixWidth]
|
|
dec ecx
|
|
pixLoopEL:
|
|
movd mm1,D [esi]
|
|
punpcklbw mm1,mm0
|
|
pand mm1,Q [mmErrDiffMask]
|
|
// determine errors
|
|
movq mm3,mm1
|
|
paddw mm3,mm3 // *2
|
|
movq mm5,mm3
|
|
paddw mm5,mm5 // *4
|
|
movq mm7,mm5
|
|
paddw mm7,mm7 // *8
|
|
paddw mm3,mm1 // *3
|
|
paddw mm5,mm1 // *5
|
|
psubw mm7,mm1 // *7
|
|
psrlw mm1,4
|
|
psrlw mm3,4
|
|
psrlw mm5,4
|
|
psrlw mm7,4
|
|
packuswb mm1,mm0
|
|
packuswb mm3,mm0
|
|
packuswb mm5,mm0
|
|
packuswb mm7,mm0
|
|
// spread errors
|
|
movd mm2,D [esi+ ebx*4 +4]
|
|
paddusb mm1,mm2
|
|
paddusb mm3,Q [esi+ ebx*4 -4]
|
|
paddusb mm5,Q [esi+ ebx*4 +0]
|
|
paddusb mm7,Q [esi+ +4]
|
|
movd D [esi+ ebx*4 +4],mm1
|
|
movd D [esi+ ebx*4 -4],mm3
|
|
movd D [esi+ ebx*4 +0],mm5
|
|
movd D [esi+ +4],mm7
|
|
// advance to next pixel
|
|
add esi,4
|
|
dec ecx
|
|
jnz pixLoopEL
|
|
// advance to next row
|
|
add esi,D [slWidthModulo]
|
|
dec edx
|
|
jz allDoneE
|
|
|
|
// right to left
|
|
mov ecx,D [pixWidth]
|
|
dec ecx
|
|
pixLoopER:
|
|
movd mm1,D [esi]
|
|
punpcklbw mm1,mm0
|
|
pand mm1,Q [mmErrDiffMask]
|
|
// determine errors
|
|
movq mm3,mm1
|
|
paddw mm3,mm3 // *2
|
|
movq mm5,mm3
|
|
paddw mm5,mm5 // *4
|
|
movq mm7,mm5
|
|
paddw mm7,mm7 // *8
|
|
paddw mm3,mm1 // *3
|
|
paddw mm5,mm1 // *5
|
|
psubw mm7,mm1 // *7
|
|
psrlw mm1,4
|
|
psrlw mm3,4
|
|
psrlw mm5,4
|
|
psrlw mm7,4
|
|
packuswb mm1,mm0
|
|
packuswb mm3,mm0
|
|
packuswb mm5,mm0
|
|
packuswb mm7,mm0
|
|
// spread errors
|
|
paddusb mm1,Q [esi+ ebx*4 -4]
|
|
paddusb mm3,Q [esi+ ebx*4 +4]
|
|
paddusb mm5,Q [esi+ ebx*4 +0]
|
|
paddusb mm7,Q [esi+ -4]
|
|
movd D [esi+ ebx*4 -4],mm1
|
|
movd D [esi+ ebx*4 +4],mm3
|
|
movd D [esi+ ebx*4 +0],mm5
|
|
movd D [esi+ -4],mm7
|
|
// revert to previous pixel
|
|
sub esi,4
|
|
dec ecx
|
|
jnz pixLoopER
|
|
// advance to next row
|
|
add esi,D [slCanvasWidth]
|
|
dec edx
|
|
jnz rowLoopE
|
|
allDoneE:
|
|
emms
|
|
}
|
|
|
|
|
|
|
|
// bicubic
|
|
|
|
static INDEX aiWeights[4][4] = {
|
|
{ -1, 9, 9, -1, },
|
|
{ 9, 47, 47, 9, },
|
|
{ 9, 47, 47, 9, },
|
|
{ -1, 9, 9, -1 }
|
|
};
|
|
|
|
|
|
const SLONG slMaskU=pixWidth *2 -1;
|
|
const SLONG slMaskV=pixHeight*2 -1;
|
|
|
|
// bicubic?
|
|
if( pixWidth>4 && pixHeight>4 /*&& tex_bBicubicMipmaps*/)
|
|
{
|
|
for( INDEX j=0; j<pixHeight; j++) {
|
|
for( INDEX i=0; i<pixWidth; i++) {
|
|
COLOR col;
|
|
UBYTE ubR, ubG, ubB, ubA;
|
|
SLONG slR=0, slG=0, slB=0, slA=0;
|
|
for( INDEX v=0; v<4; v++) {
|
|
const INDEX iRowSrc = ((v-1)+j*2) & slMaskV;
|
|
for( INDEX u=0; u<4; u++) {
|
|
const INDEX iColSrc = ((u-1)+i*2) & slMaskU;
|
|
const INDEX iWeight = aiWeights[u][v];
|
|
col = ByteSwap( pulSrcMipmap[iRowSrc*(slMaskU+1)+iColSrc]);
|
|
ColorToRGBA( col, ubR,ubG,ubB,ubA);
|
|
slR += ubR*iWeight;
|
|
slG += ubG*iWeight;
|
|
slB += ubB*iWeight;
|
|
slA += ubA*iWeight;
|
|
}
|
|
}
|
|
col = RGBAToColor( slR>>8, slG>>8, slB>>8, slA>>8);
|
|
pulDstMipmap[j*pixWidth+i] = ByteSwap(col);
|
|
}
|
|
}
|
|
}
|
|
// bilinear!
|
|
else
|
|
{
|
|
|
|
|
|
}
|
|
|
|
#endif
|