/****************************************************************************
 *
 *   Module Title :     newLoopTest_asm.c 
 *
 *   Description  :     Codec specific functions
 *
 *   AUTHOR       :     Yaowu Xu
 *
 *****************************************************************************
 *   Revision History
 *
 *   1.02 YWX 03-Nov-00 Changed confusing variable name
 *   1.01 YWX 02-Nov-00 Added the set of functions
 *   1.00 YWX 19-Oct-00 configuration baseline
 *****************************************************************************
 */ 

/****************************************************************************
 *  Header Files
 *****************************************************************************
 */


#define STRICT              /* Strict type checking. */
#include "codec_common.h"
#include <math.h>

 /****************************************************************************
 *  Module constants.
 *****************************************************************************
 */        

/* Two-argument minimum.  Function-like macro: the selected argument is
   evaluated twice, so do not pass expressions with side effects. */
#define MIN(a, b)  (((a) < (b)) ? (a) : (b))
/* Fixed-point value representing a tap weight of 1.0 (1 << FILTER_SHIFT).
   The pass-through rows of the filter tables below use this value. */
#define FILTER_WEIGHT 128
/* Right shift applied by the filter routines (psraw) after the rounding
   term has been added, i.e. a divide by 128. */
#define FILTER_SHIFT  7

/* Declared here, defined in another translation unit. */
extern void UnpackBlock_MMX( UINT8 *ReconPtr, INT16 *ReconRefPtr, UINT32 ReconPixelsPerLine);

/* Rounding term added to every filter sum before the shift by FILTER_SHIFT:
   64 is half of 128.  The MMX routines in this file add the first four
   words; eight words are provided, presumably so that a 128-bit variant can
   use the same vector -- confirm against the other users. */
static __declspec(align(16)) short rd[]={64,64,64,64,64,64,64,64};


/*
 * Two-tap (bilinear) filter weights, one row per position k = 0..7.
 * Row k holds the weight (128 - 16*k) followed by the weight (16*k), each
 * stored eight times in a row so that a whole vector of identical words can
 * be loaded directly.  The two weights of every row add up to 128.
 */
#define BILINEAR_X8(v)  (v),(v),(v),(v),(v),(v),(v),(v)

__declspec(align(16)) INT16  BilinearFilters_mmx[8][16] =
{
    { BILINEAR_X8(128), BILINEAR_X8(  0) },
    { BILINEAR_X8(112), BILINEAR_X8( 16) },
    { BILINEAR_X8( 96), BILINEAR_X8( 32) },
    { BILINEAR_X8( 80), BILINEAR_X8( 48) },
    { BILINEAR_X8( 64), BILINEAR_X8( 64) },
    { BILINEAR_X8( 48), BILINEAR_X8( 80) },
    { BILINEAR_X8( 32), BILINEAR_X8( 96) },
    { BILINEAR_X8( 16), BILINEAR_X8(112) }
};

#undef BILINEAR_X8

/*
 * Four-tap filter table: 17 tap sets, 8 rows per set, 4 taps per row.
 *
 * Every tap is stored eight times in a row, so one row is 32 words laid out
 * as  tap0 x8, tap1 x8, tap2 x8, tap3 x8  (the taps start at byte offsets
 * 0, 16, 32 and 48).  The MMX routines in this file load four words from
 * each of those offsets.
 *
 * Taps are fixed point with 128 standing for 1.0; row 0 of every set is the
 * pass-through filter { 0, 128, 0, 0 }.
 *
 * NOTE(review): the first index presumably selects the filter sharpness and
 * the second the fractional sample position -- confirm against the callers.
 * The values are table data; several rows are deliberately not mirror images
 * of their counterparts (e.g. sets 3 and 10), so do not "tidy" them.
 */
#define BICUBIC_X8(v)   (v),(v),(v),(v),(v),(v),(v),(v)
#define BICUBIC_ROW(t0, t1, t2, t3) \
    { BICUBIC_X8(t0), BICUBIC_X8(t1), BICUBIC_X8(t2), BICUBIC_X8(t3) }

__declspec(align(16)) INT16  BicubicFilters_mmx[17][8][32] =
{
    /* set 0 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW(  -3, 122,   9,   0 ),
        BICUBIC_ROW(  -4, 109,  24,  -1 ),
        BICUBIC_ROW(  -5,  91,  45,  -3 ),
        BICUBIC_ROW(  -4,  68,  68,  -4 ),
        BICUBIC_ROW(  -3,  45,  91,  -5 ),
        BICUBIC_ROW(  -1,  24, 109,  -4 ),
        BICUBIC_ROW(   0,   9, 122,  -3 )
    },

    /* set 1 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW(  -4, 124,   9,  -1 ),
        BICUBIC_ROW(  -5, 110,  25,  -2 ),
        BICUBIC_ROW(  -6,  91,  46,  -3 ),
        BICUBIC_ROW(  -5,  69,  69,  -5 ),
        BICUBIC_ROW(  -3,  46,  91,  -6 ),
        BICUBIC_ROW(  -2,  25, 110,  -5 ),
        BICUBIC_ROW(  -1,   9, 124,  -4 )
    },

    /* set 2 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW(  -4, 123,  10,  -1 ),
        BICUBIC_ROW(  -6, 110,  26,  -2 ),
        BICUBIC_ROW(  -7,  92,  47,  -4 ),
        BICUBIC_ROW(  -6,  70,  70,  -6 ),
        BICUBIC_ROW(  -4,  47,  92,  -7 ),
        BICUBIC_ROW(  -2,  26, 110,  -6 ),
        BICUBIC_ROW(  -1,  10, 123,  -4 )
    },

    /* set 3 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW(  -5, 124,  10,  -1 ),
        BICUBIC_ROW(  -7, 110,  27,  -2 ),
        BICUBIC_ROW(  -7,  91,  48,  -4 ),
        BICUBIC_ROW(  -6,  70,  70,  -6 ),
        BICUBIC_ROW(  -4,  48,  92,  -8 ),
        BICUBIC_ROW(  -2,  27, 110,  -7 ),
        BICUBIC_ROW(  -1,  10, 124,  -5 )
    },

    /* set 4 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW(  -6, 124,  11,  -1 ),
        BICUBIC_ROW(  -8, 111,  28,  -3 ),
        BICUBIC_ROW(  -8,  92,  49,  -5 ),
        BICUBIC_ROW(  -7,  71,  71,  -7 ),
        BICUBIC_ROW(  -5,  49,  92,  -8 ),
        BICUBIC_ROW(  -3,  28, 111,  -8 ),
        BICUBIC_ROW(  -1,  11, 124,  -6 )
    },

    /* set 5 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW(  -6, 123,  12,  -1 ),
        BICUBIC_ROW(  -9, 111,  29,  -3 ),
        BICUBIC_ROW(  -9,  93,  50,  -6 ),
        BICUBIC_ROW(  -8,  72,  72,  -8 ),
        BICUBIC_ROW(  -6,  50,  93,  -9 ),
        BICUBIC_ROW(  -3,  29, 111,  -9 ),
        BICUBIC_ROW(  -1,  12, 123,  -6 )
    },

    /* set 6 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW(  -7, 124,  12,  -1 ),
        BICUBIC_ROW( -10, 111,  30,  -3 ),
        BICUBIC_ROW( -10,  93,  51,  -6 ),
        BICUBIC_ROW(  -9,  73,  73,  -9 ),
        BICUBIC_ROW(  -6,  51,  93, -10 ),
        BICUBIC_ROW(  -3,  30, 111, -10 ),
        BICUBIC_ROW(  -1,  12, 124,  -7 )
    },

    /* set 7 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW(  -7, 123,  13,  -1 ),
        BICUBIC_ROW( -11, 112,  31,  -4 ),
        BICUBIC_ROW( -11,  94,  52,  -7 ),
        BICUBIC_ROW( -10,  74,  74, -10 ),
        BICUBIC_ROW(  -7,  52,  94, -11 ),
        BICUBIC_ROW(  -4,  31, 112, -11 ),
        BICUBIC_ROW(  -1,  13, 123,  -7 )
    },

    /* set 8 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW(  -8, 124,  13,  -1 ),
        BICUBIC_ROW( -12, 112,  32,  -4 ),
        BICUBIC_ROW( -12,  94,  53,  -7 ),
        BICUBIC_ROW( -10,  74,  74, -10 ),
        BICUBIC_ROW(  -7,  53,  94, -12 ),
        BICUBIC_ROW(  -4,  32, 112, -12 ),
        BICUBIC_ROW(  -1,  13, 124,  -8 )
    },

    /* set 9 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW(  -9, 124,  14,  -1 ),
        BICUBIC_ROW( -13, 112,  33,  -4 ),
        BICUBIC_ROW( -13,  95,  54,  -8 ),
        BICUBIC_ROW( -11,  75,  75, -11 ),
        BICUBIC_ROW(  -8,  54,  95, -13 ),
        BICUBIC_ROW(  -4,  33, 112, -13 ),
        BICUBIC_ROW(  -1,  14, 124,  -9 )
    },

    /* set 10 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW(  -9, 123,  15,  -1 ),
        BICUBIC_ROW( -14, 113,  34,  -5 ),
        BICUBIC_ROW( -14,  95,  55,  -8 ),
        BICUBIC_ROW( -12,  76,  76, -12 ),
        BICUBIC_ROW(  -8,  55,  95, -14 ),
        BICUBIC_ROW(  -5,  34, 112, -13 ),
        BICUBIC_ROW(  -1,  15, 123,  -9 )
    },

    /* set 11 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW( -10, 124,  15,  -1 ),
        BICUBIC_ROW( -14, 113,  34,  -5 ),
        BICUBIC_ROW( -15,  96,  56,  -9 ),
        BICUBIC_ROW( -13,  77,  77, -13 ),
        BICUBIC_ROW(  -9,  56,  96, -15 ),
        BICUBIC_ROW(  -5,  34, 113, -14 ),
        BICUBIC_ROW(  -1,  15, 124, -10 )
    },

    /* set 12 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW( -10, 123,  16,  -1 ),
        BICUBIC_ROW( -15, 113,  35,  -5 ),
        BICUBIC_ROW( -16,  98,  56, -10 ),
        BICUBIC_ROW( -14,  78,  78, -14 ),
        BICUBIC_ROW( -10,  56,  98, -16 ),
        BICUBIC_ROW(  -5,  35, 113, -15 ),
        BICUBIC_ROW(  -1,  16, 123, -10 )
    },

    /* set 13 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW( -11, 124,  17,  -2 ),
        BICUBIC_ROW( -16, 113,  36,  -5 ),
        BICUBIC_ROW( -17,  98,  57, -10 ),
        BICUBIC_ROW( -14,  78,  78, -14 ),
        BICUBIC_ROW( -10,  57,  98, -17 ),
        BICUBIC_ROW(  -5,  36, 113, -16 ),
        BICUBIC_ROW(  -2,  17, 124, -11 )
    },

    /* set 14 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW( -12, 125,  17,  -2 ),
        BICUBIC_ROW( -17, 114,  37,  -6 ),
        BICUBIC_ROW( -18,  99,  58, -11 ),
        BICUBIC_ROW( -15,  79,  79, -15 ),
        BICUBIC_ROW( -11,  58,  99, -18 ),
        BICUBIC_ROW(  -6,  37, 114, -17 ),
        BICUBIC_ROW(  -2,  17, 125, -12 )
    },

    /* set 15 */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW( -12, 124,  18,  -2 ),
        BICUBIC_ROW( -18, 114,  38,  -6 ),
        BICUBIC_ROW( -19,  99,  59, -11 ),
        BICUBIC_ROW( -16,  80,  80, -16 ),
        BICUBIC_ROW( -11,  59,  99, -19 ),
        BICUBIC_ROW(  -6,  38, 114, -18 ),
        BICUBIC_ROW(  -2,  18, 124, -12 )
    },

    /* set 16: dummy entry for VP61 support */
    {
        BICUBIC_ROW(   0, 128,   0,   0 ),
        BICUBIC_ROW(  -4, 118,  16,  -2 ),
        BICUBIC_ROW(  -7, 106,  34,  -5 ),
        BICUBIC_ROW(  -8,  90,  53,  -7 ),
        BICUBIC_ROW(  -8,  72,  72,  -8 ),
        BICUBIC_ROW(  -7,  53,  90,  -8 ),
        BICUBIC_ROW(  -5,  34, 106,  -7 ),
        BICUBIC_ROW(  -2,  16, 118,  -4 )
    }

};

#undef BICUBIC_ROW
#undef BICUBIC_X8



/****************************************************************************
 *
 *  ROUTINE       : FilterBlock1d_h_mmx
 *
 *  INPUTS        : SrcPtr            First source pixel (p0) of the first row.
 *                                    Bytes SrcPtr-1 .. SrcPtr+11 of every row
 *                                    are read.
 *                  SrcPixelsPerLine  Added to the source pointer after each row.
 *                  PixelStep         Not used by this routine.
 *                  OutputHeight      Number of rows to produce.
 *                  OutputWidth       Added to the output pointer after each
 *                                    row, i.e. it is used as the destination
 *                                    stride.  Eight bytes are written per row
 *                                    regardless of this value.
 *                  Filter            Four taps; the four words at byte offsets
 *                                    0, 16, 32 and 48 are used (the layout of
 *                                    one BicubicFilters_mmx row).
 *
 *  OUTPUTS       : OutputPtr         Receives 8 filtered bytes per row.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Horizontal 4-tap filter over an 8 pixel wide block.  For
 *                  x = 0..7 each output byte is
 *                    ( t0*p[x-1] + t1*p[x] + t2*p[x+1] + t3*p[x+2] + 64 ) >> 7
 *                  clamped to 0..255 by packuswb.
 *
 *  SPECIAL NOTES : The products are accumulated with saturating adds
 *                  (paddsw) in the order tap0, tap3, tap1, tap2.  The order
 *                  decides where saturation can occur, so do not reorder it.
 *                  MMX registers are modified and no emms is executed here.
 *                  NOTE(review): callers are presumably expected to clear
 *                  the MMX state before any x87 code runs -- confirm.
 *
 ****************************************************************************/
void FilterBlock1d_h_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
    /* An empty block must not enter the loop: dec/jnz would wrap ecx and
       keep filtering far beyond both buffers. */
    if ( OutputHeight == 0 )
        return;

    __asm
    {

        mov         edi, Filter
        movq        mm1, [edi]            ; mm1 = tap 0, four copies
        movq        mm2, [edi + 16]       ; mm2 = tap 1, four copies
        movq        mm6, [edi + 32]       ; mm6 = tap 2, four copies
        movq        mm7, [edi + 48]       ; mm7 = tap 3, four copies

        mov         edi, OutputPtr
        mov         esi, SrcPtr
        dec         esi                   ; esi -> p-1
        mov         ecx, DWORD PTR OutputHeight
        mov         eax, OutputWidth      ; used as the destination stride
        pxor        mm0, mm0              ; mm0 = 0, for byte to word unpacking

nextrow:
        // ---- outputs 0..3 ----
        movq        mm3, [esi]            ; mm3 = p-1..p6
        movq        mm4, mm3              ; mm4 = p-1..p6
        punpcklbw   mm3, mm0              ; mm3 = p-1..p2 as words
        pmullw      mm3, mm1              ; mm3 *= tap 0

        psrlq       mm4, 24               ; mm4 = p2..p6
        movq        mm5, mm4              ; mm5 = p2..p6
        punpcklbw   mm5, mm0              ; mm5 = p2..p5 as words
        pmullw      mm5, mm7              ; mm5 *= tap 3
        paddsw      mm3, mm5              ; mm3 += mm5 (saturating)

        movq        mm4, [esi+1]          ; mm4 = p0..p7
        movq        mm5, mm4              ; mm5 = p0..p7
        punpcklbw   mm5, mm0              ; mm5 = p0..p3 as words
        pmullw      mm5, mm2              ; mm5 *= tap 1
        paddsw      mm3, mm5              ; mm3 += mm5 (saturating)

        psrlq       mm4, 8                ; mm4 = p1..p7
        movq        mm5, mm4              ; mm5 = p1..p7
        punpcklbw   mm5, mm0              ; mm5 = p1..p4 as words
        pmullw      mm5, mm6              ; mm5 *= tap 2
        paddsw      mm3, mm5              ; mm3 += mm5 (saturating)

        paddsw      mm3, rd               ; mm3 += rounding term
        psraw       mm3, FILTER_SHIFT     ; mm3 /= 128
        packuswb    mm3, mm0              ; clamp to 0..255 and pack to bytes

        movd        [edi], mm3            ; store outputs 0..3

        // ---- outputs 4..7: same sequence, source offset by 4 ----
        movq        mm3, [esi+4]          ; mm3 = p3..p10
        movq        mm4, mm3              ; mm4 = p3..p10
        punpcklbw   mm3, mm0              ; mm3 = p3..p6 as words
        pmullw      mm3, mm1              ; mm3 *= tap 0

        psrlq       mm4, 24               ; mm4 = p6..p10
        movq        mm5, mm4              ; mm5 = p6..p10
        punpcklbw   mm5, mm0              ; mm5 = p6..p9 as words
        pmullw      mm5, mm7              ; mm5 *= tap 3
        paddsw      mm3, mm5              ; mm3 += mm5 (saturating)

        movq        mm4, [esi+5]          ; mm4 = p4..p11
        movq        mm5, mm4              ; mm5 = p4..p11
        punpcklbw   mm5, mm0              ; mm5 = p4..p7 as words
        pmullw      mm5, mm2              ; mm5 *= tap 1
        paddsw      mm3, mm5              ; mm3 += mm5 (saturating)

        psrlq       mm4, 8                ; mm4 = p5..p11
        movq        mm5, mm4              ; mm5 = p5..p11
        punpcklbw   mm5, mm0              ; mm5 = p5..p8 as words
        pmullw      mm5, mm6              ; mm5 *= tap 2
        paddsw      mm3, mm5              ; mm3 += mm5 (saturating)

        paddsw      mm3, rd               ; mm3 += rounding term
        psraw       mm3, FILTER_SHIFT     ; mm3 /= 128
        packuswb    mm3, mm0              ; clamp to 0..255 and pack to bytes

        movd        [edi+4], mm3          ; store outputs 4..7

        add         esi, SrcPixelsPerLine ; next source row
        add         edi, eax              ; next output row

        dec         ecx                   ; one row done
        jnz         nextrow               ; loop until OutputHeight rows written
    }
}


void FilterBlock1d_v_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
    __asm
    {

        mov         edi, Filter
        movq      mm1, [edi]          ; mm3 *= kernel 0 modifiers.
        movq      mm2, [edi + 16]     ; mm3 *= kernel 0 modifiers.
        movq      mm6, [edi + 32]     ; mm3 *= kernel 0 modifiers.
        movq      mm7, [edi + 48]     ; mm3 *= kernel 0 modifiers.

        mov         edx, PixelsPerLine
        mov         edi, OutputPtr
		mov			esi, SrcPtr
        sub         esi, PixelsPerLine
        mov         ecx, DWORD PTR OutputHeight
        mov         eax, OutputWidth      ; destination pitch?
		pxor		mm0, mm0              ; mm0 = 00000000


nextrow:
        movq		mm3, [esi]            ; mm3 = p0..p8
        punpcklbw   mm3, mm0              ; mm3 = p0..p3
        pmullw      mm3, mm1              ; mm3 *= kernel 0 modifiers.

        add         esi, edx              ; move source forward 1 line to avoid 3 * pitch

        movq		mm4, [esi+2*edx]      ; mm4 = p0..p8
        punpcklbw   mm4, mm0              ; mm4 = p0..p3
        pmullw      mm4, mm7              ; mm4 *= kernel 3 modifiers.
        paddsw      mm3, mm4              ; mm3 += mm4

        movq		mm4, [esi ]           ; mm4 = p0..p8
        punpcklbw   mm4, mm0              ; mm4 = p0..p3
        pmullw      mm4, mm2              ; mm4 *= kernel 1 modifiers.
        paddsw      mm3, mm4              ; mm3 += mm4

        movq		mm4, [esi +edx]       ; mm4 = p0..p8
        punpcklbw   mm4, mm0              ; mm4 = p0..p3
        pmullw      mm4, mm6              ; mm4 *= kernel 2 modifiers.
        paddsw      mm3, mm4              ; mm3 += mm4


        paddsw      mm3, rd               ; mm3 += round value
        psraw       mm3, FILTER_SHIFT     ; mm3 /= 128
        packuswb    mm3, mm0              ; pack and saturate

        movd        [edi],mm3             ; store the results in the destination
        
        sub         esi, edx              ;  subtract edx to get back to -1 column

        movq		mm3, [esi+4]          ; mm3 = p4..p12
        punpcklbw   mm3, mm0              ; mm3 = p4..p7
        pmullw      mm3, mm1              ; mm3 *= kernel 0 modifiers.

        add         esi, edx              ; move source forward 1 line to avoid 3 * pitch

        movq		mm4, [esi+2*edx+4]    ; mm4 = p0..p8
        punpcklbw   mm4, mm0              ; mm4 = p0..p3
        pmullw      mm4, mm7              ; mm4 *= kernel 3 modifiers.
        paddsw      mm3, mm4              ; mm3 += mm4

        movq		mm4, [esi +4]         ; mm4 = p0..p8
        punpcklbw   mm4, mm0              ; mm4 = p0..p3
        pmullw      mm4, mm2              ; mm4 *= kernel 1 modifiers.
        paddsw      mm3, mm4              ; mm3 += mm4

        movq		mm4, [esi +edx+4]     ; mm4 = p0..p8
        punpcklbw   mm4, mm0              ; mm4 = p0..p3
        pmullw      mm4, mm6              ; mm4 *= kernel 2 modifiers.
        paddsw      mm3, mm4              ; mm3 += mm4


        paddsw      mm3, rd               ; mm3 += round value
        psraw       mm3, FILTER_SHIFT     ; mm3 /= 128
        packuswb    mm3, mm0              ; pack and saturate

        movd        [edi+4],mm3           ; store the results in the destination



        // the subsequent iterations repeat 3 out of 4 of these reads.  Since the 
        // recon block should be in cache this shouldn't cost much.  Its obviously 
        // avoidable!!!. 
        add         edi,eax; 

        dec         ecx                   ; decrement count
        jnz         nextrow               ; next row

    }
}


/****************************************************************************
 *
 *  ROUTINE       : FilterBlock1d_h_mmxa
 *
 *  INPUTS        : SrcPtr            First source pixel (p0) of the first row.
 *                                    Bytes SrcPtr-1 .. SrcPtr+10 of every row
 *                                    are read.
 *                  SrcPixelsPerLine  Added to the source pointer after each row.
 *                  PixelStep         Not used by this routine.
 *                  OutputHeight      Number of rows to produce.
 *                  OutputWidth       Added to the output pointer after each
 *                                    row, i.e. it is used as the destination
 *                                    stride.  Eight bytes are written per row
 *                                    regardless of this value.
 *                  Filter            Four taps; the four words at byte offsets
 *                                    0, 16, 32 and 48 are used (the layout of
 *                                    one BicubicFilters_mmx row).
 *
 *  OUTPUTS       : OutputPtr         Receives 8 filtered bytes per row.
 *
 *  RETURNS       : void
 *
 *  FUNCTION      : Horizontal 4-tap filter over an 8 pixel wide block.  For
 *                  x = 0..7 each output byte is
 *                    ( t0*p[x-1] + t1*p[x] + t2*p[x+1] + t3*p[x+2] + 64 ) >> 7
 *                  clamped to 0..255 by packuswb.
 *
 *  SPECIAL NOTES : Unlike FilterBlock1d_h_mmx this variant does one 8-byte
 *                  load per group of four outputs, shifts it a byte at a time
 *                  to line up the taps, and accumulates with wrapping adds
 *                  (paddw).  A sum that leaves the signed 16-bit range wraps
 *                  instead of saturating.
 *                  NOTE(review): presumably only used with tap sets whose
 *                  sums cannot overflow -- confirm against the callers.
 *                  MMX registers are modified and no emms is executed here.
 *                  NOTE(review): callers are presumably expected to clear
 *                  the MMX state before any x87 code runs -- confirm.
 *
 ****************************************************************************/
void FilterBlock1d_h_mmxa( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
    /* An empty block must not enter the loop: dec/jnz would wrap ecx and
       keep filtering far beyond both buffers. */
    if ( OutputHeight == 0 )
        return;

    __asm
    {

        mov         edi, Filter
        movq        mm1, [edi]            ; mm1 = tap 0, four copies
        movq        mm2, [edi + 16]       ; mm2 = tap 1, four copies
        movq        mm6, [edi + 32]       ; mm6 = tap 2, four copies
        movq        mm7, [edi + 48]       ; mm7 = tap 3, four copies

        mov         edi, OutputPtr
        mov         esi, SrcPtr
        dec         esi                   ; esi -> p-1
        mov         ecx, DWORD PTR OutputHeight
        mov         eax, OutputWidth      ; used as the destination stride
        pxor        mm0, mm0              ; mm0 = 0, for byte to word unpacking

nextrow:
        // ---- outputs 0..3 ----
        movq        mm3, [esi]            ; mm3 = p-1..p6
        movq        mm4, mm3              ; mm4 = p-1..p6
        punpcklbw   mm3, mm0              ; mm3 = p-1..p2 as words
        pmullw      mm3, mm1              ; mm3 *= tap 0

        psrlq       mm4, 8                ; mm4 = p0..p6
        movq        mm5, mm4              ; mm5 = p0..p6
        punpcklbw   mm5, mm0              ; mm5 = p0..p3 as words
        pmullw      mm5, mm2              ; mm5 *= tap 1
        paddw       mm3, mm5              ; mm3 += mm5 (wrapping)

        psrlq       mm4, 8                ; mm4 = p1..p6
        movq        mm5, mm4              ; mm5 = p1..p6
        punpcklbw   mm5, mm0              ; mm5 = p1..p4 as words
        pmullw      mm5, mm6              ; mm5 *= tap 2
        paddw       mm3, mm5              ; mm3 += mm5 (wrapping)

        psrlq       mm4, 8                ; mm4 = p2..p6
        movq        mm5, mm4              ; mm5 = p2..p6
        punpcklbw   mm5, mm0              ; mm5 = p2..p5 as words
        pmullw      mm5, mm7              ; mm5 *= tap 3
        paddw       mm3, mm5              ; mm3 += mm5 (wrapping)

        paddw       mm3, rd               ; mm3 += rounding term
        psraw       mm3, FILTER_SHIFT     ; mm3 /= 128
        packuswb    mm3, mm0              ; clamp to 0..255 and pack to bytes

        movd        [edi], mm3            ; store outputs 0..3

        // ---- outputs 4..7: same sequence, source offset by 4 ----
        movq        mm3, [esi+4]          ; mm3 = p3..p10
        movq        mm4, mm3              ; mm4 = p3..p10
        punpcklbw   mm3, mm0              ; mm3 = p3..p6 as words
        pmullw      mm3, mm1              ; mm3 *= tap 0

        psrlq       mm4, 8                ; mm4 = p4..p10
        movq        mm5, mm4              ; mm5 = p4..p10
        punpcklbw   mm5, mm0              ; mm5 = p4..p7 as words
        pmullw      mm5, mm2              ; mm5 *= tap 1
        paddw       mm3, mm5              ; mm3 += mm5 (wrapping)

        psrlq       mm4, 8                ; mm4 = p5..p10
        movq        mm5, mm4              ; mm5 = p5..p10
        punpcklbw   mm5, mm0              ; mm5 = p5..p8 as words
        pmullw      mm5, mm6              ; mm5 *= tap 2
        paddw       mm3, mm5              ; mm3 += mm5 (wrapping)

        psrlq       mm4, 8                ; mm4 = p6..p10
        movq        mm5, mm4              ; mm5 = p6..p10
        punpcklbw   mm5, mm0              ; mm5 = p6..p9 as words
        pmullw      mm5, mm7              ; mm5 *= tap 3
        paddw       mm3, mm5              ; mm3 += mm5 (wrapping)

        paddw       mm3, rd               ; mm3 += rounding term
        psraw       mm3, FILTER_SHIFT     ; mm3 /= 128
        packuswb    mm3, mm0              ; clamp to 0..255 and pack to bytes

        movd        [edi+4], mm3          ; store outputs 4..7

        add         esi, SrcPixelsPerLine ; next source row
        add         edi, eax              ; next output row

        dec         ecx                   ; one row done
        jnz         nextrow               ; loop until OutputHeight rows written
    }
}


/****************************************************************************
 * 
 *  ROUTINE       :     FilterBlock1d_v_mmxa
 *  
 *  INPUTS        :     SrcPtr
 *							Source position of the first output row. The filter
 *							window starts one row above it (SrcPtr - PixelsPerLine)
 *							and spans four rows.
 *						PixelsPerLine
 *							Byte distance between vertically adjacent source pixels.
 *						PixelStep
 *							Not referenced by this routine.
 *						OutputHeight
 *							Number of output rows. The count is tested only after a
 *							row has been produced, so it must be non-zero.
 *						OutputWidth
 *							Added to the destination pointer after every row, i.e. it
 *							acts as the destination pitch in bytes. 8 bytes are written
 *							per row regardless of its value.
 *						Filter
 *							Four taps; 4 INT16 lanes are read for each tap from byte
 *							offsets 0, 16, 32 and 48.
 *
 *  OUTPUTS       :     OutputPtr
 *							OutputHeight rows of 8 filtered UINT8 pixels.
 *
 *  RETURNS       :     None.
 *
 *  FUNCTION      :     Four-tap vertical filter. For each output pixel the four
 *						vertically adjacent source bytes (rows -1..+2) are multiplied
 *						lane-wise by the taps, summed in 16-bit arithmetic, rounded
 *						with rd, shifted right by FILTER_SHIFT and saturated to a byte.
 *
 *  SPECIAL NOTES :     Every movq loads 8 source bytes although only the low 4 are
 *						used, so bytes 0..11 of each source row are touched.
 *						No EMMS is executed in this routine.
 *
 *  ERRORS        :     None.
 *
 ****************************************************************************/
void FilterBlock1d_v_mmxa( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
    __asm
    {

        // Load the four taps into mm1, mm2, mm6 and mm7 (4 lanes each).
        // NOTE(review): the trailing notes on the four loads below are stale; 
        // these instructions only load the tap values, nothing is multiplied yet.
        mov         edi, Filter
        movq      mm1, [edi]          ; mm3 *= kernel 0 modifiers.
        movq      mm2, [edi + 16]     ; mm3 *= kernel 0 modifiers.
        movq      mm6, [edi + 32]     ; mm3 *= kernel 0 modifiers.
        movq      mm7, [edi + 48]     ; mm3 *= kernel 0 modifiers.

        // edx = source pitch, edi = destination, esi = one row above SrcPtr,
        // ecx = row counter, eax = destination pitch, mm0 = zero for byte->word unpacking.
        mov         edx, PixelsPerLine
        mov         edi, OutputPtr
		mov			esi, SrcPtr
        sub         esi, PixelsPerLine
        mov         ecx, DWORD PTR OutputHeight
        mov         eax, OutputWidth      ; destination pitch?
		pxor		mm0, mm0              ; mm0 = 00000000


nextrow:
        // ---- output pixels 0..3 of this row ----
        // Row -1 times tap 0.
        movq		mm3, [esi]            ; mm3 = p0..p8
        punpcklbw   mm3, mm0              ; mm3 = p0..p3
        pmullw      mm3, mm1              ; mm3 *= kernel 0 modifiers.

        // Row 0 times tap 1.
        movq		mm4, [esi +edx ]      ; mm4 = p0..p8
        punpcklbw   mm4, mm0              ; mm4 = p0..p3
        pmullw      mm4, mm2              ; mm4 *= kernel 1 modifiers.
        paddw       mm3, mm4              ; mm3 += mm4

        // Row +1 times tap 2.
        movq		mm4, [esi +2*edx]     ; mm4 = p0..p8
        punpcklbw   mm4, mm0              ; mm4 = p0..p3
        pmullw      mm4, mm6              ; mm4 *= kernel 2 modifiers.
        paddw       mm3, mm4              ; mm3 += mm4

        // Row +2 is at 3 * pitch, which cannot be encoded as a scaled index, 
        // so esi is advanced one row temporarily and 2 * pitch is used instead.
        add         esi, edx              ; move source forward 1 line to avoid 3 * pitch

        movq		mm4, [esi+2*edx]      ; mm4 = p0..p8
        punpcklbw   mm4, mm0              ; mm4 = p0..p3
        pmullw      mm4, mm7              ; mm4 *= kernel 3 modifiers.
        paddw       mm3, mm4              ; mm3 += mm4

        // Round, scale back by FILTER_SHIFT and saturate the 4 words to unsigned bytes.
        paddw       mm3, rd               ; mm3 += round value
        psraw       mm3, FILTER_SHIFT     ; mm3 /= 128
        packuswb    mm3, mm0              ; pack and saturate

        movd        [edi],mm3             ; store the results in the destination
        
        // Undo the temporary one-row advance so esi points at row -1 again 
        // (this restores the row position, not a column).
        sub         esi, edx              ;  subtract edx to get back to -1 column

        // ---- output pixels 4..7 of this row: same sequence at byte offset +4 ----
        movq		mm3, [esi+4]          ; mm3 = p4..p12
        punpcklbw   mm3, mm0              ; mm3 = p4..p7
        pmullw      mm3, mm1              ; mm3 *= kernel 0 modifiers.

        movq		mm4, [esi +edx +4]      ; mm4 = p0..p8
        punpcklbw   mm4, mm0              ; mm4 = p0..p3
        pmullw      mm4, mm2              ; mm4 *= kernel 1 modifiers.
        paddw       mm3, mm4              ; mm3 += mm4

        movq		mm4, [esi +2*edx+4]   ; mm4 = p0..p8
        punpcklbw   mm4, mm0              ; mm4 = p0..p3
        pmullw      mm4, mm6              ; mm4 *= kernel 2 modifiers.
        paddw       mm3, mm4              ; mm3 += mm4

        // This advance is NOT undone afterwards: besides avoiding 3 * pitch it 
        // is also what steps esi to the next source row for the following iteration.
        add         esi, edx              ; move source forward 1 line to avoid 3 * pitch

        movq		mm4, [esi+2*edx+4]    ; mm4 = p0..p8
        punpcklbw   mm4, mm0              ; mm4 = p0..p3
        pmullw      mm4, mm7              ; mm4 *= kernel 3 modifiers.
        paddw       mm3, mm4              ; mm3 += mm4

        paddw       mm3, rd               ; mm3 += round value
        psraw       mm3, FILTER_SHIFT     ; mm3 /= 128
        packuswb    mm3, mm0              ; pack and saturate

        movd        [edi+4],mm3           ; store the results in the destination



        // the subsequent iterations repeat 3 out of 4 of these reads.  Since the 
        // recon block should be in cache this shouldn't cost much.  Its obviously 
        // avoidable!!!. 
        // Step the destination by its pitch (eax = OutputWidth).
        add         edi,eax; 

        dec         ecx                   ; decrement count
        jnz         nextrow               ; next row

    }
}


/****************************************************************************
 * 
 *  ROUTINE       :     FilterBlock1d_hb8_mmx
 *  
 *  INPUTS        :     SrcPtr
 *							Left-most source pixel of the first row.
 *						SrcPixelsPerLine
 *							Source pitch in bytes (added to the source pointer per row).
 *						PixelStep
 *							Not referenced by this routine.
 *						OutputHeight
 *							Number of output rows. The count is tested only after a
 *							row has been produced, so it must be non-zero.
 *						OutputWidth
 *							Added to the destination pointer after every row, i.e. it
 *							acts as the destination pitch in bytes. 8 bytes are written
 *							per row regardless of its value.
 *						Filter
 *							Two taps; 4 INT16 lanes are read for each tap from byte
 *							offsets 0 and 16.
 *
 *  OUTPUTS       :     OutputPtr
 *							OutputHeight rows of 8 filtered UINT8 pixels.
 *
 *  RETURNS       :     None.
 *
 *  FUNCTION      :     Two-tap horizontal filter. Per output pixel x:
 *						    (src[x] * tap0 + src[x+1] * tap1 + rd) >> FILTER_SHIFT
 *						saturated to an unsigned byte.
 *
 *  SPECIAL NOTES :     Each row is handled as two groups of 4 pixels. Every movq 
 *						loads 8 bytes, so bytes 0..11 of each source row are touched 
 *						although only bytes 0..8 contribute to the result.
 *						No EMMS is executed in this routine.
 *
 *  ERRORS        :     None.
 *
 ****************************************************************************/
void FilterBlock1d_hb8_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
    __asm
    {

        // Load the two taps into mm1 and mm2 (4 lanes each).
        // NOTE(review): the trailing notes on the two loads below are stale; 
        // nothing is multiplied here.
        mov         edi, Filter
        movq        mm1, [edi]            ; mm3 *= kernel 0 modifiers.
        movq        mm2, [edi + 16]       ; mm3 *= kernel 0 modifiers.

        // edi = destination, esi = source, ecx = row counter, 
        // eax = destination pitch, mm0 = zero for byte->word unpacking.
        mov         edi,OutputPtr
		mov			esi,SrcPtr
        mov         ecx, DWORD PTR OutputHeight
        mov         eax, OutputWidth      ; destination pitch?
		pxor		mm0, mm0              ; mm0 = 00000000

nextrow:
        // ---- output pixels 0..3: source bytes 0..3 times tap 0 ----
        movq		mm3, [esi]            ; mm3 = p-1..p14    
        movq        mm4, mm3                ; mm4 = p-1..p14
        punpcklbw   mm3, mm0              ; mm3 = p-1..p6
        pmullw      mm3, mm1              ; mm3 *= kernel 0 modifiers.

        // Shifting the 64-bit copy right by 8 bits drops the first byte, so the 
        // low four bytes become source bytes 1..4 (the right-hand neighbours).
        psrlq       mm4, 8                 ; mm4 = p0..p13
        movq        mm5, mm4              ; mm5 = p0..p13
        punpcklbw   mm5, mm0              ; mm5 = p0..p7
        pmullw      mm5, mm2              ; mm5 *= kernel 1 modifiers
        paddw       mm3, mm5              ; mm3 += mm5

        // Round, scale back by FILTER_SHIFT and saturate the 4 words to unsigned bytes.
        paddw       mm3, rd                ; mm3 += round value
        psraw       mm3, FILTER_SHIFT      ; mm3 /= 128
        packuswb    mm3, mm0              ; pack and unpack to saturate

        movd        [edi],mm3               ; store the results in the destination

        // ---- output pixels 4..7: same sequence starting at source byte 4 ----
        movq		mm3, [esi+4]            ; mm3 = p-1..p14    
        movq        mm4, mm3                ; mm4 = p-1..p14
        punpcklbw   mm3, mm0              ; mm3 = p-1..p6
        pmullw      mm3, mm1              ; mm3 *= kernel 0 modifiers.

        psrlq       mm4, 8                 ; mm4 = p0..p13
        movq        mm5, mm4              ; mm5 = p0..p13
        punpcklbw   mm5, mm0              ; mm5 = p0..p7
        pmullw      mm5, mm2              ; mm5 *= kernel 1 modifiers
        paddw       mm3, mm5              ; mm3 += mm5

        paddw       mm3, rd                ; mm3 += round value
        psraw       mm3, FILTER_SHIFT      ; mm3 /= 128
        packuswb    mm3, mm0              ; pack and unpack to saturate

        movd        [edi+4],mm3               ; store the results in the destination


        // Step source and destination to their next rows.
        add         esi,SrcPixelsPerLine    ; next line
        add         edi,eax; 

        dec         ecx                     ; decrement count
        jnz         nextrow                 ; next row
    }
}


/****************************************************************************
 * 
 *  ROUTINE       :     FilterBlock1d_vb8_mmx
 *  
 *  INPUTS        :     SrcPtr
 *							Left-most source pixel of the first (upper) row.
 *						PixelsPerLine
 *							Byte distance between vertically adjacent source pixels.
 *						PixelStep
 *							Not referenced by this routine.
 *						OutputHeight
 *							Number of output rows. The count is tested only after a
 *							row has been produced, so it must be non-zero.
 *						OutputWidth
 *							Added to the destination pointer after every row, i.e. it
 *							acts as the destination pitch in bytes. 8 bytes are written
 *							per row regardless of its value.
 *						Filter
 *							Two taps; 4 INT16 lanes are read for each tap from byte
 *							offsets 0 and 16.
 *
 *  OUTPUTS       :     OutputPtr
 *							OutputHeight rows of 8 filtered UINT8 pixels.
 *
 *  RETURNS       :     None.
 *
 *  FUNCTION      :     Two-tap vertical filter. Per output pixel x:
 *						    (row0[x] * tap0 + row1[x] * tap1 + rd) >> FILTER_SHIFT
 *						saturated to an unsigned byte, where row1 is the source row
 *						PixelsPerLine bytes after row0.
 *
 *  SPECIAL NOTES :     Each row is handled as two groups of 4 pixels. Every movq 
 *						loads 8 bytes although only the low 4 are used, so bytes 
 *						0..11 of each source row are touched.
 *						No EMMS is executed in this routine.
 *
 *  ERRORS        :     None.
 *
 ****************************************************************************/
void FilterBlock1d_vb8_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 PixelsPerLine, UINT32 PixelStep, UINT32 OutputHeight, UINT32 OutputWidth, INT16 * Filter )
{
    __asm
    {

        // Load the two taps into mm1 and mm2 (4 lanes each).
        // NOTE(review): the trailing notes on the two loads below are stale; 
        // nothing is multiplied here.
        mov         edi, Filter
        movq      mm1, [edi]          ; mm3 *= kernel 0 modifiers.
        movq      mm2, [edi + 16]     ; mm3 *= kernel 0 modifiers.
        // edx = source pitch, edi = destination, esi = source, ecx = row counter,
        // eax = destination pitch, mm0 = zero for byte->word unpacking.
        mov         edx, PixelsPerLine
        mov         edi, OutputPtr
		mov			esi, SrcPtr
        mov         ecx, DWORD PTR OutputHeight
        mov         eax, OutputWidth        ; destination pitch?
		pxor		mm0, mm0              ; mm0 = 00000000


nextrow:
        // ---- output pixels 0..3: upper row times tap 0 ----
        movq		mm3, [esi]            ; mm3 = p0..p16
        punpcklbw   mm3, mm0              ; mm3 = p0..p8
        pmullw      mm3, mm1              ; mm3 *= kernel 0 modifiers.

        // Lower row (one pitch further on) times tap 1.
        movq		mm4, [esi +edx ]      ; mm4 = p0..p16
        punpcklbw   mm4, mm0              ; mm4 = p0..p8
        pmullw      mm4, mm2              ; mm4 *= kernel 1 modifiers.
        paddw       mm3, mm4              ; mm3 += mm4

        // Round, scale back by FILTER_SHIFT and saturate the 4 words to unsigned bytes.
        paddw       mm3, rd               ; mm3 += round value
        psraw       mm3, FILTER_SHIFT     ; mm3 /= 128
        packuswb    mm3, mm0              ; pack and unpack to saturate

        movd        [edi],mm3             ; store the results in the destination

        // ---- output pixels 4..7: same sequence at byte offset +4 ----
        movq		mm3, [esi+4]          ; mm3 = p0..p16
        punpcklbw   mm3, mm0              ; mm3 = p0..p8
        pmullw      mm3, mm1              ; mm3 *= kernel 0 modifiers.

        movq		mm4, [esi +edx +4]    ; mm4 = p0..p16
        punpcklbw   mm4, mm0              ; mm4 = p0..p8
        pmullw      mm4, mm2              ; mm4 *= kernel 1 modifiers.
        paddw       mm3, mm4              ; mm3 += mm4

        paddw       mm3, rd               ; mm3 += round value
        psraw       mm3, FILTER_SHIFT     ; mm3 /= 128
        packuswb    mm3, mm0              ; pack and unpack to saturate

        movd        [edi+4],mm3           ; store the results in the destination

        // The next iteration re-reads this iteration's lower row as its upper 
        // row, so every source row except the first and last is loaded twice.  
        // Since the recon block should be in cache this shouldn't cost much, 
        // but it is avoidable. 
        add         esi,edx
        add         edi,eax 

        dec         ecx                     ; decrement count
        jnz         nextrow                 ; next row

    }
}
 
/****************************************************************************
 * 
 *  ROUTINE       :     FilterBlock2dBil_mmx
 *  
 *  INPUTS        :     SrcPtr
 *							Top-left source pixel of the 9 row x 9 column region
 *							that is read.
 *						SrcPixelsPerLine
 *							Source pitch in bytes.
 *						HFilter, VFilter
 *							Two-tap horizontal / vertical filters. HFilter is read as
 *							4 INT16 lanes per tap from byte offsets 0 and 16; VFilter
 *							is used directly as a pmullw memory operand at the same
 *							two offsets.
 *						
 *  OUTPUTS       :     OutputPtr
 *							8 rows of 8 filtered UINT8 pixels, stored contiguously
 *							(64 bytes, fixed pitch of 8).
 *
 *  RETURNS       :     None.
 *
 *  FUNCTION      :     Applies a bilinear filter on the input data to produce
 *						a predictor block (UINT8). Each source row is filtered
 *						horizontally once; the vertical pass then combines the
 *						previous and the current horizontally filtered rows.
 *
 *  SPECIAL NOTES :     No EMMS is executed in this routine.
 *
 *  ERRORS        :     None.
 *
 ****************************************************************************/
_inline
void FilterBlock2dBil_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
{

    __asm
    {
        // edi = destination, esi = source, edx = source pitch.
        // ecx = one past the end of the 64-byte output block; the loop ends 
        // when edi reaches it, i.e. after 8 stores of 8 bytes.
        mov         eax,        HFilter             ; 
        mov         edi,        OutputPtr           ; 
        mov         esi,        SrcPtr              ;
        lea         ecx,        [edi+64]            ;
        mov         edx,        SrcPixelsPerLine    ;
               
        // mm1 / mm2 = horizontal taps 0 / 1.
        movq        mm1,        [eax]               ;
        movq        mm2,        [eax+16]            ;
        
        // eax is re-used for the vertical taps; mm0 = zero for byte->word unpacking.
        mov         eax,        VFilter             ;       
        pxor        mm0,        mm0                 ;

        // Prologue: horizontally filter the first source row. 
        // mm3/mm4 = low/high 4 pixels times tap 0, mm5/mm6 = their right-hand 
        // neighbours (loaded from esi+1) times tap 1.
        // get the first horizontal line done       ;
        movq        mm3,        [esi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        movq        mm4,        mm3                 ; make a copy of current line
        
        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
        punpckhbw   mm4,        mm0                 ;

        pmullw      mm3,        mm1                 ;
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [esi+1]             ;
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0                 ;

        pmullw      mm5,        mm2                 ;
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;
        
        paddw       mm3,        rd                  ; xmm3 += round value
        psraw       mm3,        FILTER_SHIFT        ; xmm3 /= 128

        paddw       mm4,        rd                  ;
        psraw       mm4,        FILTER_SHIFT        ;
        
        // mm7 carries the horizontally filtered row, packed to 8 bytes, into 
        // the loop where it serves as the "previous row" of the vertical filter.
        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ;

        add         esi,        edx                 ; next line
NextRow:
        // Horizontal pass on the current row (same sequence as the prologue).
        movq        mm3,        [esi]               ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14
        movq        mm4,        mm3                 ; make a copy of current line
        
        punpcklbw   mm3,        mm0                 ; xx 00 01 02 03 04 05 06
        punpckhbw   mm4,        mm0                 ;

        pmullw      mm3,        mm1                 ;
        pmullw      mm4,        mm1                 ;

        movq        mm5,        [esi+1]             ;
        movq        mm6,        mm5                 ;

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0                 ;

        pmullw      mm5,        mm2                 ;
        pmullw      mm6,        mm2                 ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;
        
        // Vertical pass, first half: unpack the previous filtered row from mm7 
        // and multiply it by vertical tap 0.
        movq        mm5,        mm7                 ;
        movq        mm6,        mm7                 ;                

        punpcklbw   mm5,        mm0                 ;
        punpckhbw   mm6,        mm0

        pmullw      mm5,        [eax]               ;
        pmullw      mm6,        [eax]               ;
        
        // Finish the horizontal pass on the current row (round and scale).
        paddw       mm3,        rd                  ; xmm3 += round value
        psraw       mm3,        FILTER_SHIFT        ; xmm3 /= 128

        paddw       mm4,        rd                  ;
        psraw       mm4,        FILTER_SHIFT        ;
        
        // Save the current filtered row (packed) as the previous row for the 
        // next iteration; mm3/mm4 keep the word-sized values for use below.
        movq        mm7,        mm3                 ;
        packuswb    mm7,        mm4                 ;    
        

        // Vertical pass, second half: current row times vertical tap 1, plus 
        // the previous-row products computed above.
        pmullw      mm3,        [eax+16]            ;
        pmullw      mm4,        [eax+16]            ;

        paddw       mm3,        mm5                 ;
        paddw       mm4,        mm6                 ;
        
        
        paddw       mm3,        rd                  ; xmm3 += round value
        psraw       mm3,        FILTER_SHIFT        ; xmm3 /= 128

        paddw       mm4,        rd                  ;
        psraw       mm4,        FILTER_SHIFT        ;
               
        // Saturate the 8 words to unsigned bytes and store one output row.
        packuswb    mm3,        mm4                                         

        movq        [edi],      mm3                 ; store the results in the destination

        add         esi,        edx                 ; next line
        add         edi,        8                   ; 

        cmp         edi,        ecx                 ;
        jne         NextRow                         

    }

    // First filter 1d Horizontal
	//FilterBlock1d_hb8_wmt(SrcPtr, Intermediate, SrcPixelsPerLine, 1, 9, 8, HFilter );
	// Now filter Verticaly
	//FilterBlock1d_vb8_wmt(Intermediate, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter);


}

 


/****************************************************************************
 * 
 *  ROUTINE       :     FilterBlockBil_8
 *  
 *  INPUTS        :     ReconPtr1, ReconPtr2
 *							Two pointers into the block of data to be filtered.
 *							These pointers bound the fractional pel position; they
 *							may be supplied in either order.
 *						PixelsPerLine
 *							Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr2
 *						ModX, ModY
 *							The fractional pel bits used to select a filter
 *							(index into BilinearFilters_mmx).
 *
 *				
 *  OUTPUTS       :     ReconRefPtr
 *							A pointer to an 8x8 buffer into which UINT8 filtered data is written.
 *
 *  RETURNS       :     None.
 *
 *  FUNCTION      :     Produces a bilinear filtered fractional pel prediction block
 *						with UINT8 output. The distance between the two pointers
 *						selects a horizontal, vertical or 2d filter.
 *
 *  SPECIAL NOTES :     If the distance between the pointers is none of 1,
 *						PixelsPerLine, PixelsPerLine-1 or PixelsPerLine+1 (this
 *						includes equal pointers) nothing is written.
 *
 *  ERRORS        :     None.
 *
 ****************************************************************************/
void FilterBlockBil_8_mmx( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT8 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY )
{
	UINT8 *Lower  = ReconPtr1;
	UINT8 *Higher = ReconPtr2;
	int    Offset = (int)(Higher - Lower);
	int    TopRight;

	// ModX and ModY are the bottom three bits of the signed 1/8th pel motion
	// vector components:  X%8 for a +ve component, 8+(X%8) for a -ve one
	// (X%8 then being in the range -7 to -1).  That is the filter index we want
	// even though the pointers may get exchanged below.

	// Order the pointers so that Lower is the one above and/or to the left.
	if ( Offset < 0 )
	{
		Lower  = ReconPtr2;
		Higher = ReconPtr1;
		Offset = (int)(Higher - Lower);
	}

	// Fractional pixel in horizontal only
	if ( Offset == 1 )
	{
		FilterBlock1d_hb8_mmx( Lower, ReconRefPtr, PixelsPerLine, 1, 8, 8, BilinearFilters_mmx[ModX] );
		return;
	}

	// Fractional pixel in vertical only
	if ( Offset == (int)PixelsPerLine )
	{
		FilterBlock1d_vb8_mmx( Lower, ReconRefPtr, PixelsPerLine, PixelsPerLine, 8, 8, BilinearFilters_mmx[ModY] );
		return;
	}

	// Diagonal cases.  When Lower is the top right pixel of the pair the 2d
	// filter has to start one pixel to its left; when it is the top left
	// pixel it is already the start position.
	TopRight = ( Offset == (int)(PixelsPerLine - 1) );

	if ( TopRight || Offset == (int)(PixelsPerLine + 1) )
	{
		FilterBlock2dBil_mmx( TopRight ? Lower - 1 : Lower, ReconRefPtr, PixelsPerLine, BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] );
	}
}

/****************************************************************************
 * 
 *  ROUTINE       :     FilterBlock2d
 *  
 *  INPUTS        :     SrcPtr
 *							Pointer to source data
 *						SrcPixelsPerLine
 *							Pitch of the source data
 *						HFilter, VFilter
 *							Horizontal and vertical 4 tap filters
 *						
 *  OUTPUTS       :     OutputPtr
 *							Filtered data (UINT8)
 *
 *  RETURNS       :     None.
 *
 *  FUNCTION      :     Applies a 2d 4 tap filter on the input data to produce
 *						a predictor block: a horizontal pass into a local buffer
 *						followed by a vertical pass over that buffer.
 *
 *  SPECIAL NOTES :     The horizontal pass writes with a literal pitch of 8 while
 *						the vertical pass reads with BLOCK_HEIGHT_WIDTH, so the two
 *						are presumably equal -- TODO confirm against codec_common.h.
 *
 *  ERRORS        :     None.
 *
 ****************************************************************************/
void FilterBlock2d_mmx( UINT8 *SrcPtr, UINT8 *OutputPtr, UINT32 SrcPixelsPerLine, INT16 * HFilter, INT16 * VFilter )
{
	UINT8  HPass[256];                                  // horizontally filtered rows
	UINT8 *FirstSrcRow  = SrcPtr-SrcPixelsPerLine;      // start one source row above the block
	UINT8 *VPassStart   = &HPass[BLOCK_HEIGHT_WIDTH];   // second row of the horizontal result

	// Pass 1: horizontal filter, 11 rows beginning one row above SrcPtr
	FilterBlock1d_h_mmx( FirstSrcRow, HPass, SrcPixelsPerLine, 1, 11, 8, HFilter );

	// Pass 2: vertical filter over the intermediate rows, 8 rows out
	FilterBlock1d_v_mmx( VPassStart, OutputPtr, BLOCK_HEIGHT_WIDTH, BLOCK_HEIGHT_WIDTH, 8, 8, VFilter );
}
 

/****************************************************************************
 * 
 *  ROUTINE       :     FilterBlock
 *  
 *  INPUTS        :     ReconPtr1, ReconPtr2
 *							Two pointers into the block of data to be filtered
 *							These pointers bound the fractional pel position; they
 *							may be supplied in either order.
 *						PixelsPerLine
 *							Pixels per line in the buffer pointed to by ReconPtr1 & ReconPtr2
 *						ModX, ModY
 *							The fractional pel bits used to select a filter.
 *						UseBicubic
 *							Whether to use the bicubic filter set or the bilinear set
 *						BicubicAlpha
 *							First index into BicubicFilters_mmx (selects the bicubic
 *							filter family); only used when UseBicubic is set.
 *
 *				
 *  OUTPUTS       :     ReconRefPtr
 *							A pointer to an 8x8 buffer into which the filtered data is written.
 *
 *  RETURNS       :     None.
 *
 *  FUNCTION      :     Produces a filtered fractional pel prediction block
 *						using bilinear or bicubic filters. The block is filtered
 *						into a local UINT8 buffer and then handed to UnpackBlock_MMX,
 *						which writes ReconRefPtr.
 *
 *  SPECIAL NOTES :     ReconRefPtr is left untouched when the two pointers are equal
 *						or when their distance is none of 1, PixelsPerLine,
 *						PixelsPerLine-1 or PixelsPerLine+1.
 *						NOTE(review): UnpackBlock_MMX is declared as taking INT16 *
 *						while ReconRefPtr is UINT16 * -- confirm this is intended.
 *
 *  ERRORS        :     None.
 *
 ****************************************************************************/
void FilterBlock_mmx( UINT8 *ReconPtr1, UINT8 *ReconPtr2, UINT16 *ReconRefPtr, UINT32 PixelsPerLine, INT32 ModX, INT32 ModY, BOOL UseBicubic, UINT8 BicubicAlpha )
{
	int diff;
    UINT8 Intermediate[256];

	// swap pointers so ReconPtr1 smaller (above, left, above-right or above-left )
	diff= (int)(ReconPtr2-ReconPtr1);

	// The ModX and ModY arguments are the bottom three bits of the signed motion vector components (at 1/8th pel precision).
	// This works out to be what we want... despite the pointer swapping that goes on below.
	// For example... if the X component of the vector is a +ve ModX = X%8.
	//                if the X component of the vector is a -ve ModX = 8+(X%8) where X%8 is in the range -7 to -1.

	if(diff<0) 
	{											// swap pointers so ReconPtr1 smaller
		UINT8 *temp=ReconPtr1;
		ReconPtr1=ReconPtr2;
		ReconPtr2=temp;
		diff= (int)(ReconPtr2-ReconPtr1);
	}

	// Identical pointers: no fractional offset, nothing to filter.
    if(!diff)
    {
        return;
    }
	if( diff==1 )
	{											        // Fractional pixel in horizontal only
		if ( UseBicubic )
			FilterBlock1d_h_mmx(ReconPtr1, Intermediate, PixelsPerLine, 1, 8, 8, BicubicFilters_mmx[BicubicAlpha][ModX] );
		else
			FilterBlock1d_hb8_mmx(ReconPtr1, Intermediate, PixelsPerLine, 1, 8, 8, BilinearFilters_mmx[ModX] );
	}
	else if (diff == (int)(PixelsPerLine) )				// Fractional pixel in vertical only
	{
		if ( UseBicubic )
			FilterBlock1d_v_mmx(ReconPtr1, Intermediate, PixelsPerLine, PixelsPerLine, 8, 8, BicubicFilters_mmx[BicubicAlpha][ModY]);
		else
			FilterBlock1d_vb8_mmx(ReconPtr1, Intermediate, PixelsPerLine, PixelsPerLine, 8, 8, BilinearFilters_mmx[ModY]);
	}
	else if(diff == (int)(PixelsPerLine - 1))			// ReconPtr1 is Top right
	{										
		// The 2d filters start from the top left pixel, one to the left of ReconPtr1.
		if ( UseBicubic )
			FilterBlock2d_mmx( ReconPtr1-1, Intermediate, PixelsPerLine, BicubicFilters_mmx[BicubicAlpha][ModX], BicubicFilters_mmx[BicubicAlpha][ModY] );
		else
			FilterBlock2dBil_mmx( ReconPtr1-1, Intermediate, PixelsPerLine, BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] );
	}
	else if(diff == (int)(PixelsPerLine + 1) )			// ReconPtr1 is Top left
	{	
		if ( UseBicubic )
			FilterBlock2d_mmx( ReconPtr1, Intermediate, PixelsPerLine, BicubicFilters_mmx[BicubicAlpha][ModX], BicubicFilters_mmx[BicubicAlpha][ModY] );
		else
			FilterBlock2dBil_mmx( ReconPtr1, Intermediate, PixelsPerLine, BilinearFilters_mmx[ModX], BilinearFilters_mmx[ModY] );
	}
	else
	{
		// Unrecognised pointer distance: no filter has run, so Intermediate is
		// still uninitialised.  Do not unpack stack garbage into the caller's
		// block; leave ReconRefPtr untouched, as for the diff == 0 case.
		return;
	}

	// Hand the filtered 8x8 UINT8 block (pitch 8) to UnpackBlock_MMX to fill ReconRefPtr.
    UnpackBlock_MMX( Intermediate, ReconRefPtr, 8 );
}


