 /****************************************************************************
 *
 *   Module Title :     WmtOptFunctions.c
 *
 *   Description  :     willamette processor specific 
 *                      optimised versions of functions
 *
 *   AUTHOR      :		Yaowu Xu
 *
 *	 Special Note:		
 *
 *****************************************************************************
 *   Revision History
 *
 *
 *  1.04 JBB 13 Jun 01 VP4 Code Clean Out
 *   1.03 YWX 07-Dec-00 Removed constants and functions that are not in use
 * 			Added push and pop ebx in WmtReconIntra
 *   1.02 YWX 30 Aug 00 changed to be compatible with Microsoft compiler
 *   1.01 YWX 13 JUL 00 New Willamette Optimized Functions
 *   1.00 YWX 14/06/00  Configuration baseline from OptFunctions.c
 *
 *****************************************************************************
 */
 
/* 
    Use Tim's optimized version.
*/

/****************************************************************************
 *  Header Files
 *****************************************************************************
 */

#define STRICT              // Strict type checking. 

#include "codec_common.h"

#include "pbdll.h"

/****************************************************************************
 *  Module constants.
 *****************************************************************************
 */        

/**************************************************************************** 
 *  Imports.
 *****************************************************************************
 */   


/****************************************************************************
 *  Exported Global Variables
 *****************************************************************************
 */

/****************************************************************************
 *  Exported Functions 
 *****************************************************************************
 */              

/****************************************************************************
 *  Module Statics
 *****************************************************************************
 */  



_declspec(align(16)) static  UINT8 Eight128s[8] =  {128,128,128,128,128,128,128,128};

#pragma warning( disable : 4799 )  // Disable no emms instruction warning!
                                      
/****************************************************************************
*  Forward References
*****************************************************************************
*/  


/****************************************************************************
 * 
 *  ROUTINE       :     WmtReconIntra
 *
 *  INPUTS        :     INT16 *  idct
 *                               Pointer to the output from the idct for this block
 *
 *                      UINT32   stride
 *                               Line Length in pixels in recon and reference images
 *                               
 *
 *                     
 *
 *  OUTPUTS       :     UINT8 *  dest
 *                               The reconstruction buffer
 *
 *  RETURNS       :     None
 *
 *  FUNCTION      :     Reconstructs an intra block - wmt version
 *
 *
 *  ERRORS        :     None.
 *
 ****************************************************************************/
void WmtReconIntra( PB_INSTANCE *pbi, UINT8 * dest, INT16 * idct, INT32 stride )
{
    __asm
    {
	
		push		ebx

        mov         eax,[idct]						; Signed 16 bit inputs
        mov         edx,[dest]						; Unsigned 8 bit outputs

        movq		xmm0,QWORD PTR [Eight128s]		; Set xmm0 to 0x000000000000008080808080808080
		pxor		xmm3, xmm3						; set xmm3 to 0
													;
        mov         ebx,[stride]					; Line stride in output buffer
        lea         ecx,[eax+128]					; Endpoint in input buffer

loop_label:                                 

        movdqa		xmm2,XMMWORD PTR [eax]			; Read the eight inputs
		packsswb	xmm2,xmm3						;		
		
		pxor        xmm2,xmm0						; Convert result to unsigned (same as add 128)
        lea         eax,[eax + 16]					; Step source buffer

        cmp         eax,ecx							; are we done
        movq		QWORD PTR [edx],xmm2			; store results

        lea         edx,[edx+ebx]					; Step output buffer
        jc          loop_label						; Loop back if we are not done

		pop			ebx
    }

}

/****************************************************************************
 * 
 *  ROUTINE       :     WmtReconInter
 *
 *  INPUTS        :     UINT8 *  RefPtr
 *                               The last frame reference
 *
 *                      INT16 *  ChangePtr
 *                               Pointer to the change data
 *
 *                      UINT32   LineStep
 *                               Line Length in pixels in recon and ref images
 *
 *  OUTPUTS       :     UINT8 *  ReconPtr
 *                               The reconstruction
 *
 *  RETURNS       :     None
 *
 *  FUNCTION      :     Reconstructs data from last data and change
 *
 *  SPECIAL NOTES :     
 *
 *
 *  ERRORS        :     None.
 *
 ****************************************************************************/
void WmtReconInter( PB_INSTANCE *pbi, UINT8 * ReconPtr, UINT8 * RefPtr, INT16 * ChangePtr, UINT32 LineStep )
{
    (void) pbi;

 _asm {
		push	edi
		
		mov		ebx, [RefPtr]
		mov		ecx, [ChangePtr]

		mov		eax, [ReconPtr]
		mov		edx, [LineStep]

		pxor	xmm0, xmm0
		lea		edi, [ecx + 128]
  L:
		movq	xmm2, QWORD ptr [ebx]		; (+3 misaligned) 8 reference pixels
		movdqa	xmm4, XMMWORD ptr [ecx]		; 8 changes
		
		punpcklbw xmm2, xmm0				; 

		add	ebx, edx						; next row of reference pixels
		paddsw	xmm2, xmm4					; add in first 4 changes

		lea		ecx, [ecx + 16]				; next row of changes
		packuswb xmm2, xmm0					; pack result to unsigned 8-bit values

		cmp		ecx, edi					; are we done?
		movq	QWORD PTR [eax], xmm2		; store result

		lea		eax, [eax+edx]				; next row of output
		jc		L							; 12c / 8 elts = 18c / 8 pixels = 2.25 c/pix

		pop		edi
 }

}




