I have the following code
Image BYTE 1424124 dup(?)
Masque BYTE 1424124 dup(?)
Result BYTE 1424124 dup(?)
.Code
Start PROC USES RBX RDI RSI PARMAREA=4*QWORD
LOCAL _dwDelta_8:DWORD
LOCAL _dwDelta_4:DWORD
LOCAL __dwInputImageSize:DWORD
MOV RDI,OFFSET Result
MOV RSI,OFFSET Image
MOV RBX,OFFSET Masque
; Image Width : 661...668 pixels
; Image Height : 537 pixels
; A x 4 B x H C / 16 D * 16 C - E F / 8 E + (8 x G) C - H I / 4 H + (4 x J)
; A B C D E F G H I J K L
; 661 2644 1419828 88739 1419824 4 0 1419824 4 1 1419828 DWORD
; 662 2648 1421976 88873 1421968 8 1 1421976 0 0 1421976 QWORD
; 663 2652 1424124 89007 1424112 12 1 1424120 4 1 1424124 QWORD + DWORD
; 664 2656 1426272 89142 1426272 0 0 1426272 0 0 1426272
; 665 2660 1428420 89276 1428416 4 0 1428416 4 1 1428420 DWORD
; 666 2664 1430568 89410 1430560 8 1 1430568 0 0 1430568 QWORD
; 667 2668 1432716 89544 1432704 12 1 1432712 4 1 1432716 QWORD + DWORD
; 668 2672 1434864 89679 1434864 0 0 1434864 0 0 1434864
; Result = Source + Masque
MOV _dwDelta_8,0
MOV _dwDelta_4,0
MOV EAX,__dwInputImageSize ; 663 * 4 * 537 = 1424124
MOV EDX,EAX
SHR EAX,4 ; 1424124 Divided by 16 = 89007
MOV ECX,EAX ; 89007 XMMWORDS
SHL EAX,4 ; 89007 Multiplied by 16 = 1424112
SUB EDX,EAX ; 1424124 - 1424112 = 12 (Possible values : 0, 4, 8, 12)
JZ Loop_0 ; The number (W * 4) * H is multiple of 16
CMP EDX,8
JL @Delta_4
SUB EDX,8 ; 12 - 8 = 4 => 1 DWORD
MOV _dwDelta_8,1 ; 1 QWORD
@Delta_4 :
CMP EDX,4
JNE @Loop_0
MOV _dwDelta_4,1 ; 1 DWORD
Loop_0 :
MOV RDX,16
ALIGN 16
@Loop :
LDDQU XMM1,XMMWORD PTR [RSI] ; Load 16 bytes
PADDUSB XMM1,XMMWORD PTR [RBX] ; Saturate (IF < 0 Result = 0) AND (IF > 255 Result = 255)
MOVNTDQ XMMWORD PTR [RDI],XMM1 ; Store result
ADD RSI,RDX ; Next color
ADD RDI,RDX ; Next destination
ADD RBX,RDX ; Next mask
SUB ECX,1
JNZ @Loop ; Next loop if ECX > 0
; -------------- Loop 128 bits finished --------------
CMP _dwDelta_8,1
JNE @Loop_4
; -------------- Get 8 Bytes --------------
SHR RDX,1 ; RDX = 8
XORPD XMM1,XMM1 ; XMM1 = 0
LDDQU XMM1,QWORD PTR [RSI] ; Load 8 bytes
PADDUSB XMM1,QWORD PTR [RBX] ; Add and Saturate
MOVDQ2Q MM1,XMM1 ; Move XMM1 to MM1 POASM says 'error: Invalid instruction operand.'
MOVNTQ QWORD PTR [RDI],MM1 ; Store the result : MM1 => Memory
ADD RSI,RDX ; Next color
ADD RDI,RDX ; Next destination
ADD RBX,RDX ; Next mask
@Loop_4 :
CMP _dwDelta_4,1
JNE @Loop_0 ; If it is equal to 0, it is finished
; -------------- Traitement de 4 octets --------------
LDDQU XMM1,DWORD PTR [RSI]
; PADDUSB XMM1,DWORD PTR [RBX]
; MOVD DWORD PTR [RBX],XMM1
@Loop_0 :
EMMS ; Release XMM registers
SFENCE ; All writes are done
ret ; Bye
My problem is to move 64 bits from XMM1 to MM1.
But PoAsm does not want!
MOVDQ2QMM1,XMM1; Move XMM1 to MM1POASM says 'error: Invalid instruction operand.'
If someone could help me I would be very happy
I continue to search and I continue to not find the solution. I can't make a move with a QWORD or a DWORD using XMM registers!
I am learning SSE... Not very easy.
Here is my new code:
Quote.Model flat,fastcall
OPTION CSTRINGS:ON
INCLUDE Win_Constantes.inc
INCLUDE Win_Typedefs.inc
INCLUDE Win_Structures.inc
INCLUDE Win_Functions.inc
; ---------------------------------------------------------------------------
; ---------------------------------------------------------------------------
; ---------------------------------------------------------------------------
.Data?
lpImage LPBYTE ? ; = QWORD
lpMask LPBYTE ? ; = QWORD
lpResult LPBYTE ? ; = QWORD
.Code
Start PROC USES RBX RDI RSI, __dwInputImageSize:DWORD PARMAREA=4*QWORD
MOV RDI,[lpResult + RIP] ; Has just been allocated. Should contain 0.
MOV RSI,[lpImage + RIP] ; I suppose the buffers are filled in
MOV RBX,[lpMask + RIP] ; With image data
; Examples of values for __dwInputImageSize:DWORD
; Image Width : 661...668 pixels
; Image Height : 537 pixels
; A x 4 B x H C / 16 D * 16 C - E F / 8 E + (8 x G) C - H I / 4 H + (4 x J)
; A B C D E F G H I J K L
; 661 2644 1419828 88739 1419824 4 0 1419824 4 1 1419828 DWORD
; 662 2648 1421976 88873 1421968 8 1 1421976 0 0 1421976 QWORD
; 663 2652 1424124 89007 1424112 12 1 1424120 4 1 1424124 QWORD + DWORD
; 664 2656 1426272 89142 1426272 0 0 1426272 0 0 1426272
; 665 2660 1428420 89276 1428416 4 0 1428416 4 1 1428420 DWORD
; 666 2664 1430568 89410 1430560 8 1 1430568 0 0 1430568 QWORD
; 667 2668 1432716 89544 1432704 12 1 1432712 4 1 1432716 QWORD + DWORD
; 668 2672 1434864 89679 1434864 0 0 1434864 0 0 1434864
; Result = Source + Masque
MOV EDX,__dwInputImageSize ; 663 * 4 * 537 = 1424124
PUSH RDX
SHR RDX,4 ; 1424124 Divided by 16 = 89007
MOV ECX,EDX ; 89007 XMMWORDS for the LOOP
SHL RDX,4 ; 89007 Multiplied by 16 = 1424112
POP RAX
SUB RAX,RDX ; 1424124 - 1424112 = 12 (Possible values : 0, 4, 8, 12)
MOV EDX,16
ALIGN 16 ; Generated NOPs are not executed, no need JUMP
@Loop :
LDDQU XMM1,XMMWORD PTR [RSI] ; Load 16 bytes
PADDUSB XMM1,XMMWORD PTR [RBX] ; Saturate (IF < 0 Result = 0) AND (IF > 255 Result = 255)
MOVNTDQ XMMWORD PTR [RDI],XMM1 ; Store result
ADD RSI,RDX ; Next color
ADD RDI,RDX ; Next destination
ADD RBX,RDX ; Next mask
SUB ECX,1
JNZ @Loop ; Next loop if ECX > 0
; -------------- Loop 128 bits finished --------------
TEST EAX,EAX ; No extra QWORD and DWORD
JZ @Extra_0
SUB EAX,4
JZ @Extra_4
SUB EAX,8
JZ @Extra_8
; --- We have to add one extra QWORD and one extra DWORD (8 + 4 = 12 bytes) ---
LDDQU XMM1,QWORD PTR [RSI] ; Load 8 bytes
LDDQU XMM2,QWORD PTR [RBX] ; Load 8 bytes
LDDQU XMM3,DWORD PTR [RSI + 8] ; Load 4 bytes
LDDQU XMM4,DWORD PTR [RBX + 8] ; Load 4 bytes
PADDUSB XMM1,XMM2 ; Add and Saturate
PADDUSB XMM3,XMM4 ; Add and saturates the DWORD
MOVQ QWORD PTR [RDI],XMM1 ; Store the result
MOVD DWORD PTR [RDI + 8],XMM3 ; error: Invalid instruction operand.
JMP @Extra_0
ALIGN 16
@Extra_8 :
; -------------- Just one extra QWORD --------------
LDDQU XMM1,QWORD PTR [RSI] ; Load 8 bytes
PADDUSB XMM1,QWORD PTR [RBX] ; Add and Saturate
MOVQ QWORD PTR [RDI],XMM1 ; Store the result --- error: Invalid instruction operand.
JMP @Extra_0
ALIGN 16 ; Generated NOPs are not executed, no need JUMP
@Extra_4 :
; -------------- Just one extra DWORD --------------
LDDQU XMM1,DWORD PTR [RSI] ; Load 4 bytes
PADDUSB XMM1,DWORD PTR [RBX] ; Add and saturates the DWORD
MOVD DWORD PTR [RDI],XMM1 ; error: Invalid instruction operand.
ALIGN 16 ; Generated NOPs are not executed, no need JUMP
@Extra_0 :
EMMS ; Release XMM registers
SFENCE ; All writes are done
ret ; Bye
Start ENDP
; Finished
; Finished
END
I really need some help.
Now the problem is solved
Here is the solution
Quote .Model flat,fastcall
OPTION CSTRINGS:ON
INCLUDE TestSSE.inc
; ---------------------------------------------------------------------------
; ---------------------------------------------------------------------------
; ---------------------------------------------------------------------------
.Data?
lpImage LPBYTE ? ; = QWORD
ALIGN 16
lpMask LPBYTE ? ; = QWORD
ALIGN 16
lpResult LPBYTE ? ; = QWORD
.Code
StartXmm PROC USES RBX RDI RSI, __dwInputImageSize:DWORD,__lpImage:LPBYTE,__lpMask:LPBYTE,__lpResult:LPBYTE PARMAREA=4*QWORD
MOV RSI,RDX
MOV RBX,R8
MOV RDI,R9
; Examples of values for __dwInputImageSize:DWORD
; Image Width : 661...668 pixels
; Image Height : 537 pixels
; A x 4 B x H C / 16 D * 16 C - E F / 8 E + (8 x G) C - H I / 4 H + (4 x J)
; A B C D E F G H I J K L
; 661 2644 1419828 88739 1419824 4 0 1419824 4 1 1419828 DWORD
; 662 2648 1421976 88873 1421968 8 1 1421976 0 0 1421976 QWORD
; 663 2652 1424124 89007 1424112 12 1 1424120 4 1 1424124 QWORD + DWORD
; 664 2656 1426272 89142 1426272 0 0 1426272 0 0 1426272
; 665 2660 1428420 89276 1428416 4 0 1428416 4 1 1428420 DWORD
; 666 2664 1430568 89410 1430560 8 1 1430568 0 0 1430568 QWORD
; 667 2668 1432716 89544 1432704 12 1 1432712 4 1 1432716 QWORD + DWORD
; 668 2672 1434864 89679 1434864 0 0 1434864 0 0 1434864
; --------------------------------
; --- Result = Source + Masque ---
; --------------------------------
MOV EDX,ECX ; 663 * 4 * 537 = 1424124
PUSH RDX
SHR RDX,4 ; 1424124 Divided by 16 = 89007
MOV ECX,EDX ; 89007 XMMWORDS for the LOOP
SHL RDX,4 ; 89007 Multiplied by 16 = 1424112
POP RAX
SUB RAX,RDX ; 1424124 - 1424112 = 12 (Possible values : 0, 4, 8, 12)
MOV EDX,16
ALIGN 16 ; Generated NOPs are not executed, no need JUMP
@Loop :
LDDQU XMM1,XMMWORD PTR [RSI] ; Load 16 bytes
PADDUSB XMM1,XMMWORD PTR [RBX] ; Saturate (IF < 0 Result = 0) AND (IF > 255 Result = 255)
MOVNTDQ XMMWORD PTR [RDI],XMM1 ; Store result
ADD RSI,RDX ; Next color
ADD RDI,RDX ; Next destination
ADD RBX,RDX ; Next mask
SUB ECX,1
JNZ @Loop ; Next loop if ECX > 0
; ----------------------------------------------------
; -------------- Loop 128 bits finished --------------
; ----------------------------------------------------
TEST EAX,EAX ; No extra QWORD and DWORD
JZ @Extra_0
CMP EAX,4
JE @Extra_4
CMP EAX,8
JE @Extra_8
; -----------------------------------------------------------------------------
; --- We have to add one extra QWORD and one extra DWORD (8 + 4 = 12 bytes) ---
; -----------------------------------------------------------------------------
LDDQU XMM1,QWORD PTR [RSI] ; Load 8 bytes
LDDQU XMM2,QWORD PTR [RBX] ; Load 8 bytes
LDDQU XMM3,DWORD PTR [RSI + 8] ; Load 4 bytes
LDDQU XMM4,DWORD PTR [RBX + 8] ; Load 4 bytes
PADDUSB XMM1,XMM2 ; Add and Saturate
PADDUSB XMM3,XMM4 ; Add and saturates the DWORD
MOVQ QWORD PTR [RDI],XMM1 ; Store the result
MOVD DWORD PTR [RDI + 8],XMM3
JMP @Extra_0
ALIGN 16
@Extra_8 :
; --------------------------------------------------
; -------------- Just one extra QWORD --------------
; --------------------------------------------------
LDDQU XMM1,QWORD PTR [RSI] ; Load 8 bytes
LDDQU XMM2,QWORD PTR [RBX] ; Load 8 bytes
PADDUSB XMM1,XMM2 ; Add and Saturate
MOVQ QWORD PTR [RDI],XMM1 ; Store the result
JMP @Extra_0
ALIGN 16 ; Generated NOPs are not executed, no need JUMP
@Extra_4 :
; --------------------------------------------------
; -------------- Just one extra DWORD --------------
; --------------------------------------------------
LDDQU XMM1,DWORD PTR [RSI] ; Load 4 bytes
LDDQU XMM2,DWORD PTR [RBX] ; Add and saturates the DWORD
PADDUSB XMM1,XMM2 ; Add and saturates the DWORD
MOVD DWORD PTR [RDI],XMM1
ALIGN 16 ; Generated NOPs are not executed, no need JUMP
@Extra_0 :
EMMS ; Release XMM registers
SFENCE ; All writes are done
ret ; Bye
StartXmm ENDP
END
Here is the text image : F0.jpg
It must be in the same folder than the program