In this new version I have tried to optimize the code, but I am not very pleased with it because the first lines cannot be executed in parallel.
I have removed the "SFENCE" instruction; it was there for old Athlon processors.
I compute the address to jump to at the beginning of the subroutine; this way I do not have to recompute data that I already had.
Now I only have "JMP [RAX]"
Data and code are aligned on 16-byte boundaries because the linker can rearrange them, in which case the alignment would be wrong.
.Model flat,fastcall
OPTION CSTRINGS:ON
; ---------------------------------------------------------------------------
; ---------------------------------------------------------------------------
; ---------------------------------------------------------------------------
.Data
ALIGN 16
; Dispatch table for the tail of Fusion: entry N is the address of the code
; that handles N leftover 4-byte elements (N = image size mod 4).
TableOfJumps	QWORD	OFFSET @Modulo_0
				QWORD	OFFSET @Modulo_1
				QWORD	OFFSET @Modulo_2
				QWORD	OFFSET @Modulo_3
.Code
ALIGN 16
; -----------------------------------------------------------------------------
; Fusion - byte-wise unsigned saturating add of an image and a mask
;
;	Result[i] = min(255, Image[i] + Mask[i])		for every byte
;
; In   : ECX = __dwInputImageSize	Width * Height, counted in 4-byte elements
;		 RDX = __lpImage			source image
;		 R8  = __lpMask				mask
;		 R9  = __lpResult			destination
; Out  : nothing meaningful (RAX is scratch)
; Clobb: RAX, RCX, RDX, R8, R9, R10, XMM1, XMM2, flags
;
; The three buffers must be 16-byte aligned whenever at least one full
; 16-byte group is processed: MOVDQA, PADDUSB with a memory operand and
; MOVNTDQ all fault on unaligned addresses.
;
; NOTE(review): MOVNTDQ stores are weakly ordered. If the result is consumed
; by another thread or device, an SFENCE before returning may be required -
; confirm against the callers.
; -----------------------------------------------------------------------------
Fusion PROC __dwInputImageSize:DWORD,__lpImage:QWORD,__lpMask:QWORD,__lpResult:QWORD
		; --------------------------------
		; ---  Result = Source + Mask  ---
		; --------------------------------
		MOV		EAX,ECX							; EAX = element count (upper half of RAX zeroed)
		LEA		R10,TableOfJumps				; R10 = base of the tail dispatch table
		AND		EAX,3							; EAX = leftover elements (0, 1, 2 or 3)
		SHR		ECX,2							; ECX = number of full 16-byte groups
		MOV		RAX,QWORD PTR [R10 + RAX * 8]	; RAX = address of the matching tail handler

		TEST	ECX,ECX							; Fewer than 4 elements : there is no full group,
		JZ		@Tail							; skip the loop or the counter would wrap around

ALIGN 16
@Loop :
		MOVDQA	XMM1,XMMWORD PTR [RDX]			; Load 16 bytes of the image
		PADDUSB	XMM1,XMMWORD PTR [R8]			; Unsigned saturating add (sums above 255 clamp to 255)
		MOVNTDQ	XMMWORD PTR [R9],XMM1			; Non temporal store : the result bypasses the cache

		ADD		RDX,16							; Next 16 bytes of the image
		ADD		R9,16							; Next 16 bytes of the destination
		ADD		R8,16							; Next 16 bytes of the mask

		SUB		ECX,1
		JNZ		@Loop							; Loop while groups remain

		; ----------------------------------------------------
		; -------------- Loop 128 bits finished --------------
		; ----------------------------------------------------
@Tail :
		JMP		RAX								; Dispatch on the number of leftover elements

		; The @Modulo_n labels use "::" so that they stay visible to
		; TableOfJumps, which is defined outside of this PROC.

@Modulo_3::
		; -----------------------------------------------------------------------------
		; --- 12 bytes left : handle the DWORD at offset 8, then fall through to    ---
		; --- @Modulo_2 for the QWORD at offset 0                                   ---
		; -----------------------------------------------------------------------------
		MOVD	XMM1,DWORD PTR [RDX + 8]		; Load exactly 4 bytes (no read past the buffer)
		MOVD	XMM2,DWORD PTR [R8 + 8]
		PADDUSB	XMM1,XMM2						; Add and saturate
		MOVD	DWORD PTR [R9 + 8],XMM1			; Store the computed DWORD

@Modulo_2::
		; --------------------------------------------------
		; -------------- Just one extra QWORD --------------
		; --------------------------------------------------
		MOVQ	XMM1,QWORD PTR [RDX]			; Load exactly 8 bytes
		MOVQ	XMM2,QWORD PTR [R8]
		PADDUSB	XMM1,XMM2						; Add and saturate
		MOVQ	QWORD PTR [R9],XMM1				; Store the result
		JMP		@Modulo_0

@Modulo_1::
		; --------------------------------------------------
		; -------------- Just one extra DWORD --------------
		; --------------------------------------------------
		MOVD	XMM1,DWORD PTR [RDX]			; Load exactly 4 bytes
		MOVD	XMM2,DWORD PTR [R8]
		PADDUSB	XMM1,XMM2						; Add and saturate
		MOVD	DWORD PTR [R9],XMM1				; Store the result

@Modulo_0::
		; No EMMS here : no MMX register is used, and EMMS would only reset
		; the x87 tag word.
		ret										; Bye
Fusion ENDP
END