モノクロ(SIMD版・toMono)

 

pgm.c

//

// pgm.c

//

// (c)Copyright Spacesoft corp., 2007 All rights reserved.

//                               Hiro KITAYAMA

//

#include <stdio.h>

#include <conio.h>                  // kbhit(), getch()

#include <stdlib.h>                 // _splitpath()

#include <windows.h>

 

// Free p and reset it to NULL. free(NULL) is a no-op, so no guard is needed.
// Wrapped in do { } while( 0 ) so that "SAFE_FREE( p ) ;" is exactly one
// statement and can be used as the unbraced body of an if/else.
#define SAFE_FREE( p )      do { free( p ) ; (p) = NULL ; } while( 0 )

#define SZCLASSNAME         "TEST"

 

// Global variables (set up by main, read by the window procedure when painting)
PBITMAPINFOHEADER   pDib ;      // whole DIB buffer: the info header followed by the rest of the file
LPBYTE              pBitmap ;   // start of the pixel data inside the pDib buffer

// Image-processing entry point (implemented in effect.c)
void effect( LPBYTE pSrc, LPBYTE pDst, LONG width, LONG height );

 

 

//---------------------------------------------------------------------------

//

// ファイル名の取得

//

BOOL getFname( char* inFname )

{

    OPENFILENAME fName ;

    char fileName[256] ;

    const char filefilter[] = "BMPファイル(*.bmp)\0*.bmp\0\0" ;

 

    fileName[0] = '\0' ;

 

    memset( &fName, 0, sizeof(OPENFILENAME) ) ;

    fName.lStructSize    = sizeof(OPENFILENAME) ;

    fName.lpstrFilter    = filefilter ;

    fName.nFilterIndex   = 1 ;

    fName.lpstrFile      = fileName ;

    fName.nMaxFile       = sizeof(fileName) ;

    fName.Flags          = OFN_FILEMUSTEXIST | OFN_HIDEREADONLY ;

 

    // 「ファイルを開く」ダイアログ

    if( GetOpenFileName( &fName ) == 0 )

        return FALSE ;

 

    strcpy( inFname, fileName ) ;

 

    return TRUE ;

}

 

 

//---------------------------------------------------------------------------
//
// Read the bitmap file header.
//
//  fp    : open stream to read from
//  bmHdr : receives sizeof(BITMAPFILEHEADER) bytes read from fp
//
// Returns TRUE when the header was read and its type field is "BM";
// otherwise prints an error to stderr and returns FALSE.
//
BOOL readHeader( FILE* fp, PBITMAPFILEHEADER bmHdr )
{
    // 'B' is the low byte and 'M' the high byte of bfType
    const WORD bmpSignature = (WORD)( ( 'M' << 8 ) | 'B' ) ;
    size_t     itemsRead    = fread( bmHdr, sizeof(*bmHdr), 1, fp ) ;

    if( itemsRead != 1 )
    {
        fprintf( stderr, "エラー:ファイル読み込み.\n" ) ;
        return FALSE ;
    }

    if( bmHdr->bfType == bmpSignature )
        return TRUE ;

    // Not a bitmap file
    fprintf( stderr, "エラー:BMPフォーマットでは無い.\n" );
    return FALSE ;
}

 

 

//---------------------------------------------------------------------------
//
// Read the rest of the bitmap file (info header plus everything after it).
//
//  fp    : open stream; main calls this right after readHeader(), so the
//          stream is expected to sit just past the BITMAPFILEHEADER
//  bmHdr : file header already filled in from the file
//  pDib  : caller-allocated buffer of at least
//          bmHdr->bfSize - sizeof(BITMAPFILEHEADER) bytes
//
// Returns TRUE only when the data was read and describes a 32-bit bitmap
// whose width (in pixels) is a multiple of 4; otherwise prints an error to
// stderr and returns FALSE.
//
BOOL readBody( FILE* fp, PBITMAPFILEHEADER bmHdr, PBITMAPINFOHEADER pDib )
{
    size_t bitmapSize ;

    // The caller's allocation may have failed; never read into NULL
    if( pDib == NULL )
    {
        fprintf( stderr, "エラー:ファイル読み込み.\n" ) ;
        return FALSE ;
    }

    // bfSize comes from the file. It must cover at least the info header,
    // otherwise the subtraction below underflows or the header fields
    // examined afterwards would lie outside the data actually read.
    if( bmHdr -> bfSize < sizeof(BITMAPFILEHEADER) + sizeof(BITMAPINFOHEADER) )
    {
        fprintf( stderr, "エラー:BMPフォーマットでは無い.\n" );
        return FALSE ;
    }

    bitmapSize = bmHdr -> bfSize - sizeof(BITMAPFILEHEADER) ;   // bytes after the file header

    // Read info header and pixel data in one go
    if( fread( pDib , bitmapSize, 1, fp ) !=  1 )
    {
        fprintf( stderr, "エラー:ファイル読み込み.\n" ) ;
        return FALSE ;
    }

    // Only 32 bits per pixel is supported
    if( pDib -> biBitCount != 32 )
    {
        fprintf( stderr, "エラー:32ビット ビットッマプではない.\n");
        return FALSE ;
    }

    // The width in pixels must be a multiple of 4
    // (the message text says "bytes", but biWidth is a pixel count)
    if( pDib -> biWidth % 4 )
    {
        fprintf( stderr, "エラー:幅が4 バイトの整数倍でない.\n");
        return FALSE ;
    }

    return TRUE ;
}

 

 

//---------------------------------------------------------------------------
//
// Window procedure.
//
// WM_PAINT   : draws the global DIB (pDib / pBitmap) at the window origin
// WM_DESTROY : posts the quit message that ends the message loop
// others     : forwarded to DefWindowProc
//
LRESULT CALLBACK WindProc( HWND hWnd, UINT uMessage, WPARAM wParam, LPARAM lParam )
{
    if( uMessage == WM_PAINT )
    {
        PAINTSTRUCT paint ;

        BeginPaint( hWnd, &paint ) ;

        // Copy every scan line of the bitmap to the window
        SetDIBitsToDevice( paint.hdc,
                           0, 0,                                // destination x, y
                           pDib->biWidth, pDib->biHeight,       // size
                           0, 0,                                // source x, y
                           0, pDib->biHeight,                   // first scan line, line count
                           pBitmap,
                           (BITMAPINFO*)pDib, DIB_RGB_COLORS ) ;

        EndPaint( hWnd, &paint ) ;
        return 0 ;
    }

    if( uMessage == WM_DESTROY )
    {
        PostQuitMessage( 0 ) ;
        return 0 ;
    }

    return DefWindowProc( hWnd, uMessage, wParam, lParam ) ;
}

 

 

//---------------------------------------------------------------------------
//
// Register the window class and create a visible window whose outer size
// is the bitmap size plus the caption and fixed-frame metrics.
//
//  pDib : bitmap info header supplying biWidth / biHeight
//
// Returns the window handle, or NULL when pDib is NULL, class registration
// fails or the window cannot be created.
//
HWND createWindow( PBITMAPINFOHEADER pDib )
{
    HINSTANCE   hInstanse ;
    WNDCLASSEX  wcx ;
    int  ttlHeight, frameWidth, frameHeight ;
    int  windowWidth, windowHeight ;

    if( pDib == NULL )
        return NULL ;

    // Instance handle of the running executable. (GetWindowLong on a NULL
    // window handle, as used previously, always fails and yields 0.)
    hInstanse = GetModuleHandle( NULL ) ;

    memset( &wcx, 0, sizeof(WNDCLASSEX) ) ;
    wcx.cbSize        = sizeof(WNDCLASSEX) ;
    wcx.lpfnWndProc   = WindProc ;                      // window procedure
    wcx.hInstance     = hInstanse ;
    wcx.hCursor       = LoadCursor( NULL, IDC_ARROW ) ;
    wcx.lpszClassName = SZCLASSNAME ;                   // window class name
    if( !RegisterClassEx( &wcx ) )                      // register the class
        return NULL ;                                   // failed

    ttlHeight   = GetSystemMetrics( SM_CYCAPTION ) ;    // caption height
    frameWidth  = GetSystemMetrics( SM_CXFIXEDFRAME ) ; // frame width
    frameHeight = GetSystemMetrics( SM_CYFIXEDFRAME ) ; // frame height

    windowWidth  = pDib -> biWidth  + (frameWidth * 2) ;            // outer width
    windowHeight = pDib -> biHeight + ttlHeight + (frameHeight * 2) ; // outer height

    return CreateWindow( SZCLASSNAME,                   // window class name
                "表示ウィンドウ",                       // window title
                WS_OVERLAPPED | WS_SYSMENU | WS_VISIBLE,// window style
                CW_USEDEFAULT, CW_USEDEFAULT,           // window position
                windowWidth, windowHeight,              // window size
                HWND_DESKTOP, NULL, hInstanse, NULL ) ;
}

 

 

//---------------------------------------------------------------------------

//

// main

//

int main( int argc, char* argv[] )

{

    char  fName[_MAX_PATH], dfFile[_MAX_PATH], dfName[_MAX_FNAME], dfExt[_MAX_EXT ] ;

    BITMAPFILEHEADER bmHdr ;

    FILE*  fp ;

    HWND   hWnd ;

    MSG    msg ;

    int    bitmapSize, loop, imgSize ;

    LPBYTE pSrc, pDst ;                     // SIMD用バッファ

    UINT64 begin, finish, elapsed ;         // 性能表示

 

    if( argc < 2 )

        getFname( fName ) ;                 // 引数なし

    else

        strcpy( fName, argv[1] ) ;          // 引数あり

 

 

    // ビットマップファイルのオープン

    if( (fp = fopen( fName, "rb" )) == NULL )

    {

        fprintf( stderr, "エラー:ファイルのオープンに失敗.\n" ) ;

        return -1;

    }

 

    // ファイル名の表示

    _splitpath( fName, NULL, NULL, dfName, dfExt ) ;

    strcpy( dfFile, dfName ) ;

    strcat( dfFile, dfExt ) ;

    fprintf( stdout, "ファイル名 = [%s]\n", dfFile ) ;

 

    // ヘッダの読み込み

    if( !readHeader( fp, &bmHdr ) )

    {

        fclose( fp ) ;      // ファイルクローズ

        return -1;

    }

 

 

    // ビットマップ本体のサイズ

    bitmapSize = bmHdr.bfSize - sizeof(BITMAPFILEHEADER) ;

 

    // メモリ確保

    pDib = (BITMAPINFOHEADER *)malloc( bitmapSize ) ;

 

    // 本体読み込み

    if( !readBody( fp, &bmHdr, pDib ) )

    {

        SAFE_FREE( pDib ) ;     // メモリ解放

        fclose( fp ) ;          // ファイルクローズ

        return -1;

    }

 

    fclose( fp ) ;              // ファイルクローズ

 

 

    // 画像サイズの表示

    fprintf( stdout, "ビットマップサイズ= %d x %d\n",

                                pDib -> biWidth, pDib -> biHeight ) ;

 

    // ポインタをビットマップ本体位置へ移動

    pBitmap = (BYTE *)(pDib) + bmHdr.bfOffBits - sizeof(BITMAPFILEHEADER) ;

 

    imgSize=pDib->biWidth * pDib->biHeight * 4;

 

    // alignされたメモリを確保

    pSrc = (LPBYTE)_aligned_malloc(imgSize, 16);

    pDst = (LPBYTE)_aligned_malloc(imgSize, 16);

 

    // alignされたエリアにコピー

    memcpy(pSrc, pBitmap, imgSize);

 

    _asm

    {

        rdtsc

        mov     dword ptr [begin], eax

        mov     dword ptr [begin+4], edx

    }

 

    //画像処理

    for(loop=0;loop<100;loop++) //キャッシュなどの影響を排除するため100回実行

        effect( pSrc, pDst, pDib -> biWidth, pDib -> biHeight ) ;

 

    _asm

    {

        rdtsc

        mov     dword ptr [finish], eax

        mov     dword ptr [finish+4], edx

    }

    elapsed = finish > begin ? finish - begin : begin - finish ;

    printf( "counter= %I64u\n", elapsed ) ;

 

 

    // alignされたエリアからbitmapへコピー

    memcpy(pBitmap, pDst, imgSize);

 

    hWnd = createWindow( pDib ) ;           // ウィンドウ表示

 

    fprintf( stdout, "Enterキーを押せば終わります..." ) ;

 

    // メッセージループ

    msg.message = WM_CREATE ;               // dummy for while

    while( msg.message != WM_QUIT )

    {

        if( PeekMessage( &msg, NULL, 0, 0, PM_REMOVE ) )

        {

            TranslateMessage( &msg ) ;

            DispatchMessage( &msg ) ;

        }

 

        // Enterキーを押されたらQuit

        if( kbhit() )

        {

            PostMessage( hWnd, (UINT)WM_DESTROY, (UINT)0, (LONG)0 ) ;

            getch() ;

        }

    }

 

    SAFE_FREE( pDib ) ;         // メモリ解放

    _aligned_free(pSrc);        // メモリ解放

    _aligned_free(pDst);        // メモリ解放

 

    return 0;

}

 

effect.c

//

// effect.c

//

// (c)Copyright Spacesoft corp., 2007 All rights reserved.

//                               Hiro KITAYAMA

//

#include <windows.h>

 

void toMono32Bmp( LPBYTE pInSrc, LPBYTE pInDst, LONG width, LONG height );

 

//---------------------------------------------------------------------------
//
// Effect entry point.
//
// Forwards all four arguments unchanged to toMono32Bmp(); to apply a
// different conversion routine, change the call below.
//
//  pInSrc        : source pixel buffer
//  pInDst        : destination pixel buffer
//  width, height : image size
//
void effect( LPBYTE pInSrc, LPBYTE pInDst, LONG width, LONG height )
{
    toMono32Bmp( pInSrc, pInDst, width, height );
}

 

 

 

以下のどれかを使用する(effect.c は toMono32Bmp を呼び出しているので、他の版を使う場合は effect() 内の呼び出し先を変更する).

 

toMono32Bmp.c

//

// toMono32Bmp.c

//

// (c)Copyright Spacesoft corp., 2007 All rights reserved.

//                               Hiro KITAYAMA

//

#include <windows.h>

 

//---------------------------------------------------------------------------

//

// 32 bit RGB to 32 bit monochrome bmp

//

void toMono32Bmp( LPBYTE pInSrc, LPBYTE pInDst, LONG width, LONG height )

{

    int loopCounter = (height*width)/4;

    float monoConst[]={0.114478f,0.586611f,0.298912f,0.000000f};

 

    _asm

    {

        xor         ebx, ebx

        mov         esi, pInSrc         // esi = src

        mov         edi, pInDst         // edi = dst

        mov         ecx, loopCounter    // loop counter

 

 

        pxor        xmm0,xmm0           // fixed zero

        movdqu      xmm7, monoConst

 

 

                                        // xmm0 = zero(integer)

                                        // xmm2 = source

                                        // xmm5 = destination

                                        // xmm7 = mono constant

    loopLbl:

        movdqa      xmm1, [esi+ebx]     // src

        movdqa      xmm2, xmm1          // save src

 

 

 

                                        // p3, p2=================================

        pshufd      xmm1, xmm2, 0eh     // p3, p2を下位へコピー,----1110b

                                        //

                                        // xmm2:   p3   p2   p1  p0

                                        // xmm1:   --   --   p3  p2

                                        //                                       p3           p2

                                        // xmm1:   -- -- -- --  -- -- -- --  aa bb gg rr  aa bb gg rr

        punpcklbw   xmm1, xmm0          // B -> W

                                        //                  p3                   p2

                                        // xmm1:   00aa 00bb 00gg 00rr  00aa 00bb 00gg 00rr

 

        movdqa      xmm3, xmm1          //                                       p3

        punpckhqdq  xmm3, xmm1          // xmm3:   ---- ---- ---- ----  00aa 00bb 00gg 00rr

 

        punpcklwd   xmm1, xmm0          // W -> D

                                        // xmm1:   000000aa  000000bb  000000gg  000000rr, pixcel2

        punpcklwd   xmm3, xmm0          // W -> D

                                        // xmm3:   000000aa  000000bb  000000gg  000000rr, pixcel3

 

                                        // Pixel 2==========

        cvtdq2ps    xmm1, xmm1          // Dword -> Float

        mulps       xmm1, xmm7          // mono= B * 0.114478f

                                        //          + G * 0.586611f

                                        //              + R * 0.298912f);

                                        // xmm1:   00  B2  G2  R2, pixcel2(float)

 

                                        // Pixel 3==========

        cvtdq2ps    xmm3, xmm3          // Dword -> Float

        mulps       xmm3, xmm7          // mono= B * 0.114478f

                                        //          + G * 0.586611f

                                        //              + R * 0.298912f);

                                        // xmm3:   00  B3  G3  R3, pixcel3(float)

 

 

                                        // pixel 3-2の処理

        haddps      xmm1, xmm3          // G + R

                                        // xmm1:   ---B3---  G3+R3   ---B2---  G2+R2  ,  (float)

 

        haddps      xmm1, xmm1          // G + R + B

                                        // xmm1:   B3+G3+R3 B2+G2+R2 B3+G3+R3 B2+G2+R2,  (float)

 

        cvtps2dq    xmm5, xmm1          // float -> Dword

                                        // xmm5:   000000M3 000000M2 000000M3 000000M2,  (DWORD)

 

 

 

        movdqa      xmm1, xmm2          // p1, p0=================================

                                        // xmm1:   p3   p2   p1  p0

                                        //                                       p1           p0

                                        // xmm1:   -- -- -- --  -- -- -- --  aa gg bb rr  aa gg bb rr

        punpcklbw   xmm1, xmm0          // B -> W

                                        //                  p1                   p0

                                        // xmm1:   00aa 00gg 00bb 00rr  00aa 00gg 00bb 00rr

                                        //

                                        //                                       p1

        movdqa      xmm3, xmm1

        punpckhqdq  xmm3, xmm1          // xmm3:   ---- ---- ---- ----  00aa 00gg 00bb 00rr

                                        //

        punpcklwd   xmm1, xmm0          // W -> D

                                        // xmm1:   000000aa  000000gg  000000bb  000000rr, pixcel0

        punpcklwd   xmm3, xmm0          // W -> D

                                        // xmm3:   000000aa  000000gg  000000bb  000000rr, pixcel1

                                       

 

                                        // Pixel 0==========

        cvtdq2ps    xmm1, xmm1          // Dword -> Float

        mulps       xmm1, xmm7          // mono= B * 0.114478f

                                        //          + G * 0.586611f

                                        //              + R * 0.298912f);

                                        // xmm1:   00  B0  G0  R0, pixcel0(float)

 

 

                                        // Pixel 1==========

        cvtdq2ps    xmm3, xmm3          // Dword -> Float

        mulps       xmm3, xmm7          // mono= B * 0.114478f

                                        //          + G * 0.586611f

                                        //              + R * 0.298912f);

                                        // xmm3:   00  B1  G1  R1, pixcel1(float)

 

 

                                        // pixel 1-0の処理

        haddps      xmm1, xmm3          // G + R

                                        // xmm1:   ---B1---  G1+R1   ---B0---  G0+R0  ,  (float)

        haddps      xmm1, xmm1          // G + R + B

                                        // xmm1:   B1+G1+R1 B0+G0+R0 B1+G1+R1 B0+G0+R0,  (float)

 

        cvtps2dq    xmm1, xmm1          // float -> Dword

                                        // xmm5:   000000M1 000000M0 000000M1 000000M0,  (DWORD)

 

 

        movsd       xmm5, xmm1          //  4 pixelを統合

                                        // xmm5:   000000M3 000000M2 000000M1 000000M0,  (DWORD)

 

        movdqa      xmm1, xmm5

        pslldq      xmm1, 1             // xmm1:   0000M300 0000M200 0000M100 0000M000,  (DWORD)

        por         xmm1, xmm5          // xmm1:   0000M3M3 0000M2M2 0000M1M1 0000M0M0,  (DWORD)

 

        movdqa      xmm5, xmm1          // xmm5:   0000M3M3 0000M2M2 0000M1M1 0000M0M0,  (DWORD)

        pslldq      xmm1, 1             // xmm1:   00M3M300 00M2M200 00M1M100 00M0M000,  (DWORD)

        por         xmm5, xmm1          // xmm5:   00M3M3M3 00M2M2M2 00M1M1M1 00M0M0M0,  (DWORD)

 

 

        movdqa  [edi+ebx], xmm5         // write to dest

        lea     ebx,  [ebx+16]          // next address

 

        dec     ecx

        jecxz   exit_x

        jmp     loopLbl

    exit_x:

    }

}

 

toMonoDword.c

//

// toMonoDword.c

//

// (c)Copyright Spacesoft corp., 2007 All rights reserved.

//                               Hiro KITAYAMA

//

#include <windows.h>

 

 

//---------------------------------------------------------------------------

//

// 32 bit RGB to DWRD array

//

void toMonoDword( LPBYTE pInSrc, LPBYTE pInDst, LONG width, LONG height )

{

    int loopCounter = (height*width)/4;

    float monoConst[]={0.114478f,0.586611f,0.298912f,0.000000f};

 

    _asm

    {

        xor         ebx, ebx

        mov         esi, pInSrc         // esi = src

        mov         edi, pInDst         // edi = dst

        mov         ecx, loopCounter    // loop counter

 

 

        pxor        xmm0,xmm0           // fixed zero

        movdqu      xmm7, monoConst

 

 

                                        // xmm0 = zero(integer)

                                        // xmm2 = source

                                        // xmm5 = destination

                                        // xmm7 = mono constant

    loopLbl:

        movdqa      xmm1, [esi+ebx]     // src

        movdqa      xmm2, xmm1          // save src

 

 

 

                                        // p3, p2=================================

        pshufd      xmm1, xmm2, 0eh     // p3, p2を下位へコピー,----1110b

                                        //

                                        // xmm2:   p3   p2   p1  p0

                                        // xmm1:   --   --   p3  p2

                                        //                                       p3           p2

                                        // xmm1:   -- -- -- --  -- -- -- --  aa bb gg rr  aa bb gg rr

        punpcklbw   xmm1, xmm0          // B -> W

                                        //                  p3                   p2

                                        // xmm1:   00aa 00bb 00gg 00rr  00aa 00bb 00gg 00rr

 

        movdqa      xmm3, xmm1          //                                       p3

        punpckhqdq  xmm3, xmm1          // xmm3:   ---- ---- ---- ----  00aa 00bb 00gg 00rr

 

        punpcklwd   xmm1, xmm0          // W -> D

                                        // xmm1:   000000aa  000000bb  000000gg  000000rr, pixcel2

        punpcklwd   xmm3, xmm0          // W -> D

                                        // xmm3:   000000aa  000000bb  000000gg  000000rr, pixcel3

 

                                        // Pixel 2==========

        cvtdq2ps    xmm1, xmm1          // Dword -> Float

        mulps       xmm1, xmm7          // mono= B * 0.114478f

                                        //          + G * 0.586611f

                                        //              + R * 0.298912f);

                                        // xmm1:   00  B2  G2  R2, pixcel2(float)

 

                                        // Pixel 3==========

        cvtdq2ps    xmm3, xmm3          // Dword -> Float

        mulps       xmm3, xmm7          // mono= B * 0.114478f

                                        //          + G * 0.586611f

                                        //              + R * 0.298912f);

                                        // xmm3:   00  B3  G3  R3, pixcel3(float)

 

 

                                        // pixel 3-2の処理

        haddps      xmm1, xmm3          // G + R

                                        // xmm1:   ---B3---  G3+R3   ---B2---  G2+R2  ,  (float)

 

        haddps      xmm1, xmm1          // G + R + B

                                        // xmm1:   B3+G3+R3 B2+G2+R2 B3+G3+R3 B2+G2+R2,  (float)

 

        cvtps2dq    xmm5, xmm1          // float -> Dword

                                        // xmm5:   000000M3 000000M2 000000M3 000000M2,  (DWORD)

 

 

 

        movdqa      xmm1, xmm2          // p1, p0=================================

                                        // xmm1:   p3   p2   p1  p0

                                        //                                       p1           p0

                                        // xmm1:   -- -- -- --  -- -- -- --  aa gg bb rr  aa gg bb rr

        punpcklbw   xmm1, xmm0          // B -> W

                                        //                  p1                   p0

                                        // xmm1:   00aa 00gg 00bb 00rr  00aa 00gg 00bb 00rr

                                        //

                                        //                                       p1

        movdqa      xmm3, xmm1

        punpckhqdq  xmm3, xmm1          // xmm3:   ---- ---- ---- ----  00aa 00gg 00bb 00rr

                                        //

        punpcklwd   xmm1, xmm0          // W -> D

                                        // xmm1:   000000aa  000000gg  000000bb  000000rr, pixcel0

        punpcklwd   xmm3, xmm0          // W -> D

                                        // xmm3:   000000aa  000000gg  000000bb  000000rr, pixcel1

                                       

 

                                        // Pixel 0==========

        cvtdq2ps    xmm1, xmm1          // Dword -> Float

        mulps       xmm1, xmm7          // mono= B * 0.114478f

                                        //          + G * 0.586611f

                                        //              + R * 0.298912f);

                                        // xmm1:   00  B0  G0  R0, pixcel0(float)

 

 

                                        // Pixel 1==========

        cvtdq2ps    xmm3, xmm3          // Dword -> Float

        mulps       xmm3, xmm7          // mono= B * 0.114478f

                                        //          + G * 0.586611f

                                        //              + R * 0.298912f);

                                        // xmm3:   00  B1  G1  R1, pixcel1(float)

 

 

                                        // pixel 1-0の処理

        haddps      xmm1, xmm3          // G + R

                                        // xmm1:   ---B1---  G1+R1   ---B0---  G0+R0  ,  (float)

        haddps      xmm1, xmm1          // G + R + B

                                        // xmm1:   B1+G1+R1 B0+G0+R0 B1+G1+R1 B0+G0+R0,  (float)

 

        cvtps2dq    xmm1, xmm1          // float -> Dword

                                        // xmm5:   000000M1 000000M0 000000M1 000000M0,  (DWORD)

 

 

        movsd       xmm5, xmm1          //  4 pixelを統合

                                        // xmm5:   000000M3 000000M2 000000M1 000000M0,  (DWORD)

 

 

        movdqa  [edi+ebx], xmm5         // write to dest

        lea     ebx,  [ebx+16]          // next address

 

        loop    loopLbl

    }

}

 

toMonoSingle.c

//

// toMonoSingle.c

//

// (c)Copyright Spacesoft corp., 2007 All rights reserved.

//                               Hiro KITAYAMA

//

#include <windows.h>

 

//---------------------------------------------------------------------------

//

// 32 bit RGB to Single(float) array

//

void toMonoSingle( LPBYTE pInSrc, LPBYTE pInDst, LONG width, LONG height )

{

    int loopCounter = (height*width)/4;

    float monoConst[]={0.114478f,0.586611f,0.298912f,0.000000f};

 

    _asm

    {

        xor         ebx, ebx

        mov         esi, pInSrc         // esi = src

        mov         edi, pInDst         // edi = dst

        mov         ecx, loopCounter    // loop counter

 

 

        pxor        xmm0,xmm0           // fixed zero

        movdqu      xmm7, monoConst

 

 

                                        // xmm0 = zero(integer)

                                        // xmm2 = source

                                        // xmm5 = destination

                                        // xmm7 = mono constant

    loopLbl:

        movdqa      xmm1, [esi+ebx]     // src

        movdqa      xmm2, xmm1          // save src

 

 

 

                                        // p3, p2=================================

        pshufd      xmm1, xmm2, 0eh     // p3, p2を下位へコピー,----1110b

                                        //

                                        // xmm2:   p3   p2   p1  p0

                                        // xmm1:   --   --   p3  p2

                                        //                                       p3           p2

                                        // xmm1:   -- -- -- --  -- -- -- --  aa bb gg rr  aa bb gg rr

        punpcklbw   xmm1, xmm0          // B -> W

                                        //                  p3                   p2

                                        // xmm1:   00aa 00bb 00gg 00rr  00aa 00bb 00gg 00rr

 

        movdqa      xmm3, xmm1          //                                       p3

        punpckhqdq  xmm3, xmm1          // xmm3:   ---- ---- ---- ----  00aa 00bb 00gg 00rr

 

        punpcklwd   xmm1, xmm0          // W -> D

                                        // xmm1:   000000aa  000000bb  000000gg  000000rr, pixcel2

        punpcklwd   xmm3, xmm0          // W -> D

                                        // xmm3:   000000aa  000000bb  000000gg  000000rr, pixcel3

 

                                        // Pixel 2==========

        cvtdq2ps    xmm1, xmm1          // Dword -> Float

        mulps       xmm1, xmm7          // mono= B * 0.114478f

                                        //          + G * 0.586611f

                                        //              + R * 0.298912f);

                                        // xmm1:   00  B2  G2  R2, pixcel2(float)

 

                                        // Pixel 3==========

        cvtdq2ps    xmm3, xmm3          // Dword -> Float

        mulps       xmm3, xmm7          // mono= B * 0.114478f

                                        //          + G * 0.586611f

                                        //              + R * 0.298912f);

                                        // xmm3:   00  B3  G3  R3, pixcel3(float)

 

 

                                        // pixel 3-2の処理

        haddps      xmm1, xmm3          // G + R

                                        // xmm1:   ---B3---  G3+R3   ---B2---  G2+R2  ,  (float)

 

        haddps      xmm1, xmm1          // G + R + B

                                        // xmm1:   B3+G3+R3 B2+G2+R2 B3+G3+R3 B2+G2+R2,  (float)

 

 

 

 

        movdqa      xmm1, xmm2          // p1, p0=================================

                                        // xmm1:   p3   p2   p1  p0

                                        //                                       p1           p0

                                        // xmm1:   -- -- -- --  -- -- -- --  aa gg bb rr  aa gg bb rr

        punpcklbw   xmm1, xmm0          // B -> W

                                        //                  p1                   p0

                                        // xmm1:   00aa 00gg 00bb 00rr  00aa 00gg 00bb 00rr

                                        //

                                        //                                       p1

        movdqa      xmm3, xmm1

        punpckhqdq  xmm3, xmm1          // xmm3:   ---- ---- ---- ----  00aa 00gg 00bb 00rr

                                        //

        punpcklwd   xmm1, xmm0          // W -> D

                                        // xmm1:   000000aa  000000gg  000000bb  000000rr, pixcel0

        punpcklwd   xmm3, xmm0          // W -> D

                                        // xmm3:   000000aa  000000gg  000000bb  000000rr, pixcel1

                                       

 

                                        // Pixel 0==========

        cvtdq2ps    xmm1, xmm1          // Dword -> Float

        mulps       xmm1, xmm7          // mono= B * 0.114478f

                                        //          + G * 0.586611f

                                        //              + R * 0.298912f);

                                        // xmm1:   00  B0  G0  R0, pixcel0(float)

 

 

                                        // Pixel 1==========

        cvtdq2ps    xmm3, xmm3          // Dword -> Float

        mulps       xmm3, xmm7          // mono= B * 0.114478f

                                        //          + G * 0.586611f

                                        //              + R * 0.298912f);

                                        // xmm3:   00  B1  G1  R1, pixcel1(float)

 

 

                                        // pixel 1-0の処理

        haddps      xmm1, xmm3          // G + R

                                        // xmm1:   ---B1---  G1+R1   ---B0---  G0+R0  ,  (float)

        haddps      xmm1, xmm1          // G + R + B

                                        // xmm1:   B1+G1+R1 B0+G0+R0 B1+G1+R1 B0+G0+R0,  (float)

 

 

        movsd       xmm5, xmm1          //  4 pixelを統合

                                        // xmm5:   000000M3 000000M2 000000M1 000000M0,  (float)

 

        loop    loopLbl

    }

}