Most of this is explained in the PDF attached to this post.
Here I would just like to show the asm code with comments.
; ---------------------------------------------------------------------------
; File preamble (MASM-style syntax, 64-bit, fastcall register convention).
; ---------------------------------------------------------------------------
.Model flat,fastcall
OPTION CSTRINGS:ON
INCLUDE TestSSE.inc
; ---------------------------------------------------------------------------
; Data
; ---------------------------------------------------------------------------
.Data
ALIGN 16
; Dispatch table for the leftover elements handled after the 16-byte loop.
; It is indexed by (image size MOD 4): entry N is the address of the label
; Modulo_N inside StartXmm, which processes N remaining 4-byte elements.
TableOfJumps QWORD OFFSET Modulo_0, Modulo_1, Modulo_2, Modulo_3
.Code
; __dwInputImageSize = Width * Height
StartXmm PROC USES RBX RDI RSI, __dwInputImageSize:DWORD,__lpImage:LPBYTE,__lpMask:LPBYTE,__lpResult:LPBYTE PARMAREA=4*QWORD
; ---------------------------------------------------------------------------
; StartXmm : Result = Image + Mask, byte by byte, with unsigned saturation
;            (PADDUSB : a sum above 255 is clamped to 255).
;
; In  : ECX = __dwInputImageSize  element count (Width * Height). The code
;                                 handles 4 elements per 16-byte XMMWORD, so
;                                 one element is 4 bytes (presumably one
;                                 32-bit pixel - confirm with the caller).
;       RDX = __lpImage           source image
;       R8  = __lpMask            mask added to the image
;       R9  = __lpResult          destination
; Out : nothing meaningful in RAX.
; Uses: RAX, RCX, RDX, XMM1-XMM4, flags (RBX, RDI, RSI saved by USES).
;
; Alignment: the mask is read as a PADDUSB memory operand and the result is
; written with MOVNTDQ; both instructions require 16-byte aligned addresses.
; The source image is read with LDDQU and may be unaligned.
;
; Worked examples for the split into full blocks + leftover elements:
;   1 080 x 1 301 = 1 405 080  ->  351 270 blocks, 0 leftover
;   1 085 x 1 301 = 1 411 585  ->  352 896 blocks, 1 leftover
;   1 087 x 1 301 = 1 414 187  ->  353 546 blocks, 3 leftover
; ---------------------------------------------------------------------------
    MOV     RSI,RDX                         ; RSI = source image
    MOV     RBX,R8                          ; RBX = mask
    MOV     RDI,R9                          ; RDI = destination

    MOV     EAX,ECX
    AND     EAX,3                           ; RAX = size MOD 4 (0..3 leftover elements)
    SHR     ECX,2                           ; ECX = size / 4 = number of full XMMWORDs
    JZ      @Tail                           ; fewer than 4 elements: skip the loop,
                                            ; otherwise ECX would wrap to 0FFFFFFFFh

    ALIGN 16
@Loop :
    LDDQU   XMM1,XMMWORD PTR [RSI]          ; Load 16 source bytes (alignment not required)
    PADDUSB XMM1,XMMWORD PTR [RBX]          ; Add the mask, clamp each byte to 255
    MOVNTDQ XMMWORD PTR [RDI],XMM1          ; Non-temporal store of the result
    ADD     RSI,16                          ; Next source block
    ADD     RBX,16                          ; Next mask block
    ADD     RDI,16                          ; Next destination block
    SUB     ECX,1
    JNZ     @Loop                           ; Loop while blocks remain

; ----------------------------------------------------
; -------------- Loop 128 bits finished --------------
; ----------------------------------------------------
@Tail :
    LEA     RDX,TableOfJumps                ; Table address
    JMP     QWORD PTR [RDX + 8 * RAX]       ; Dispatch on the leftover count

Modulo_3 :
; -----------------------------------------------------------------------------
; --- 3 leftover elements : one QWORD and one DWORD (8 + 4 = 12 bytes) --------
; -----------------------------------------------------------------------------
; MOVQ / MOVD read exactly 8 / 4 bytes (LDDQU would always read 16 bytes and
; run past the end of the buffers). The unused upper lanes are zero.
    MOVQ    XMM1,QWORD PTR [RSI]            ; Load 8 source bytes
    MOVQ    XMM2,QWORD PTR [RBX]            ; Load 8 mask bytes
    MOVD    XMM3,DWORD PTR [RSI + 8]        ; Load 4 source bytes
    MOVD    XMM4,DWORD PTR [RBX + 8]        ; Load 4 mask bytes
    PADDUSB XMM1,XMM2                       ; Add and saturate the QWORD
    PADDUSB XMM3,XMM4                       ; Add and saturate the DWORD
    MOVQ    QWORD PTR [RDI],XMM1            ; Store 8 result bytes
    MOVD    DWORD PTR [RDI + 8],XMM3        ; Store 4 result bytes
    JMP     Modulo_0

Modulo_2 :
; --------------------------------------------------
; -------- 2 leftover elements : one QWORD ---------
; --------------------------------------------------
    MOVQ    XMM1,QWORD PTR [RSI]            ; Load 8 source bytes
    MOVQ    XMM2,QWORD PTR [RBX]            ; Load 8 mask bytes
    PADDUSB XMM1,XMM2                       ; Add and saturate
    MOVQ    QWORD PTR [RDI],XMM1            ; Store 8 result bytes
    JMP     Modulo_0

Modulo_1 :
; --------------------------------------------------
; -------- 1 leftover element : one DWORD ----------
; --------------------------------------------------
    MOVD    XMM1,DWORD PTR [RSI]            ; Load 4 source bytes
    MOVD    XMM2,DWORD PTR [RBX]            ; Load 4 mask bytes
    PADDUSB XMM1,XMM2                       ; Add and saturate
    MOVD    DWORD PTR [RDI],XMM1            ; Store 4 result bytes
                                            ; Falls through to Modulo_0

Modulo_0 :
; No EMMS here: it only resets the MMX/x87 state and no MMX instruction is used.
    SFENCE                                  ; Order the non-temporal stores before returning
    ret
StartXmm ENDP
END
In this new version I have tried to optimize the code, but I am not very pleased because the first lines cannot be executed in parallel.
I have removed the "SFENCE" instruction; it was for old ATHLON processors.
I compute the address to jump to at the beginning of the subroutine, so that I do not have to recompute data that I already have.
Now I only have "JMP [RAX]".
Data and code are aligned on 16-byte boundaries because the linker can rearrange them, in which case the alignment is wrong.
; ---------------------------------------------------------------------------
; File preamble (MASM-style syntax, 64-bit, fastcall register convention).
; ---------------------------------------------------------------------------
.Model flat,fastcall
OPTION CSTRINGS:ON
; ---------------------------------------------------------------------------
; Data
; ---------------------------------------------------------------------------
.Data
ALIGN 16
; Dispatch table for the leftover elements handled after the 16-byte loop.
; It is indexed by (image size MOD 4): entry N is the address of the label
; @Modulo_N inside Fusion, the entry point used when N 4-byte elements remain.
TableOfJumps QWORD OFFSET @Modulo_0, @Modulo_1, @Modulo_2, @Modulo_3
.Code
ALIGN 16
; __dwInputImageSize = Width * Height
Fusion PROC __dwInputImageSize:DWORD,__lpImage:QWORD,__lpMask:QWORD,__lpResult:QWORD
; ---------------------------------------------------------------------------
; Fusion : Result = Image + Mask, byte by byte, with unsigned saturation
;          (PADDUSB : a sum above 255 is clamped to 255).
;
; In  : ECX = __dwInputImageSize  element count (Width * Height). The code
;                                 handles 4 elements per 16-byte XMMWORD, so
;                                 one element is 4 bytes (presumably one
;                                 32-bit pixel - confirm with the caller).
;       RDX = __lpImage           source image
;       R8  = __lpMask            mask added to the image
;       R9  = __lpResult          destination
; Out : nothing meaningful in RAX.
; Uses: RAX, RCX, RDX, R8, R9, R10, XMM1, XMM2, flags.
;
; Alignment: MOVDQA, the PADDUSB memory operand and MOVNTDQ all require
; 16-byte aligned addresses, so the three buffers must be 16-byte aligned.
; ---------------------------------------------------------------------------
    MOV     EAX,ECX
    AND     EAX,3                           ; RAX = size MOD 4 (0..3 leftover elements)
    LEA     R10,TableOfJumps                ; Table of tail entry points
    LEA     RAX,[R10 + 8 * RAX]             ; RAX = address of the table entry to use
    SHR     ECX,2                           ; ECX = size / 4 = number of full XMMWORDs
    JZ      @Tail                           ; fewer than 4 elements: skip the loop,
                                            ; otherwise ECX would wrap to 0FFFFFFFFh

    ALIGN 16
@Loop :
    MOVDQA  XMM1,XMMWORD PTR [RDX]          ; Load 16 source bytes (aligned)
    PADDUSB XMM1,XMMWORD PTR [R8]           ; Add the mask, clamp each byte to 255
    MOVNTDQ XMMWORD PTR [R9],XMM1           ; Non-temporal store of the result
    ADD     RDX,16                          ; Next source block
    ADD     R8,16                           ; Next mask block
    ADD     R9,16                           ; Next destination block
    SUB     ECX,1
    JNZ     @Loop                           ; Loop while blocks remain

; ----------------------------------------------------
; -------------- Loop 128 bits finished --------------
; ----------------------------------------------------
@Tail :
    JMP     QWORD PTR [RAX]                 ; Dispatch on the leftover count

@Modulo_3 :
; -----------------------------------------------------------------------------
; --- 3 leftover elements : the DWORD at offset 8 here, then the QWORD at -----
; --- offset 0 by falling through to @Modulo_2 (8 + 4 = 12 bytes) -------------
; -----------------------------------------------------------------------------
; MOVD / MOVQ read exactly 4 / 8 bytes (LDDQU would always read 16 bytes and
; run past the end of the buffers). The unused upper lanes are zero.
    MOVD    XMM1,DWORD PTR [RDX + 8]        ; Load 4 source bytes
    MOVD    XMM2,DWORD PTR [R8 + 8]         ; Load 4 mask bytes
    PADDUSB XMM1,XMM2                       ; Add and saturate the DWORD
    MOVD    DWORD PTR [R9 + 8],XMM1         ; Store the sum (it is in XMM1)
                                            ; Falls through to @Modulo_2

@Modulo_2 :
; --------------------------------------------------
; -------- 2 leftover elements : one QWORD ---------
; --------------------------------------------------
    MOVQ    XMM1,QWORD PTR [RDX]            ; Load 8 source bytes
    MOVQ    XMM2,QWORD PTR [R8]             ; Load 8 mask bytes
    PADDUSB XMM1,XMM2                       ; Add and saturate
    MOVQ    QWORD PTR [R9],XMM1             ; Store 8 result bytes
    JMP     @Modulo_0

@Modulo_1 :
; --------------------------------------------------
; -------- 1 leftover element : one DWORD ----------
; --------------------------------------------------
    MOVD    XMM1,DWORD PTR [RDX]            ; Load 4 source bytes
    MOVD    XMM2,DWORD PTR [R8]             ; Load 4 mask bytes
    PADDUSB XMM1,XMM2                       ; Add and saturate
    MOVD    DWORD PTR [R9],XMM1             ; Store 4 result bytes
                                            ; Falls through to @Modulo_0

@Modulo_0 :
; No EMMS here: it only resets the MMX/x87 state and no MMX instruction is used.
; NOTE(review): MOVNTDQ stores are weakly ordered. If another thread may read
; the result right after this routine returns, an SFENCE is needed here -
; confirm with the callers before leaving it out.
    ret
Fusion ENDP
END