What do you think about these functions:
.Const
INCLUDE "AsmView.inc"
; ----------------------------------------------------------------------------------
; ----------------------------------------------------------------------------------
; ----------------------------------------------------------------------------------
.Code
; __________________________________________________________________________________
; _______________________ memcpy ___________________________________________________
; __________________________________________________________________________________
comment @
memcpy - copy a block of bytes from the source buffer to the destination buffer.
ABI     Microsoft x64 register usage; leaf routine (no stack use, no
        callee-saved register is touched).
INPUT   RCX = Destination
        RDX = Source
        R8  = Number of bytes to copy (0 is allowed)
OUTPUT  RAX = Destination (the value RCX had on entry)
CLOBBER RCX, RDX, R9, XMM0, flags
NOTE    Bytes are copied front to back. Overlapping buffers where the
        destination starts inside the source are not supported.
@
memcpy PROC
        mov     rax,rcx                 ; return value = destination
        mov     r9,r8
        shr     r9,4                    ; R9 = number of whole 16-byte blocks
        jz      @memcpy_Tail            ; fewer than 16 bytes: tail only
        jmp     @memcpy_Loop_128        ; jump over the alignment padding
; ------------------> Copy 128 bits at a time
ALIGN 16
@memcpy_Loop_128 :
        movdqu  xmm0,[rdx]              ; unaligned-safe load ...
        movdqu  [rcx],xmm0              ; ... and store
        add     rdx,16
        add     rcx,16
        dec     r9
        jnz     @memcpy_Loop_128
; ------------------> Tail: bits 3..0 of the count say which pieces remain
@memcpy_Tail :
        test    r8b,8
        jz      @memcpy_Tail_4
        mov     r9,[rdx]                ; R9 is free again: reuse it as the temp
        mov     [rcx],r9
        add     rdx,8
        add     rcx,8
@memcpy_Tail_4 :
        test    r8b,4
        jz      @memcpy_Tail_2
        mov     r9d,[rdx]
        mov     [rcx],r9d
        add     rdx,4
        add     rcx,4
@memcpy_Tail_2 :
        test    r8b,2
        jz      @memcpy_Tail_1
        movzx   r9d,word ptr [rdx]      ; movzx avoids a partial-register write
        mov     [rcx],r9w
        add     rdx,2
        add     rcx,2
@memcpy_Tail_1 :
        test    r8b,1
        jz      @Finished_memcpy
        movzx   r9d,byte ptr [rdx]
        mov     [rcx],r9b
@Finished_memcpy :
        ret
memcpy ENDP
; __________________________________________________________________________________
; _______________________ memset ___________________________________________________
; __________________________________________________________________________________
comment ^
memset - fill a buffer with one byte value.
ABI     Microsoft x64 register usage; leaf routine (no stack use, no
        callee-saved register is touched).
INPUT   RCX = Destination buffer
        RDX = Number of bytes to fill (0 is allowed)
        R8  = Fill byte (only the low 8 bits are used)
OUTPUT  RAX = Destination buffer (the value RCX had on entry)
CLOBBER RCX, R8, R10, XMM0, flags
NOTE    The count and fill-byte arguments are in the opposite order to the
        C library memset(dest, c, count); callers must use the order above.
^
PUBLIC memset
memset :
        mov     r10,rcx                 ; R10 = destination, kept for the return value
        movzx   eax,r8b                 ; RAX = 00000000000000@@
        mov     r8,0101010101010101h
        imul    rax,r8                  ; RAX = @@@@@@@@@@@@@@@@ (byte replicated 8 times)
        movd    xmm0,rax                ; XMM0 = 0000000000000000@@@@@@@@@@@@@@@@ (64-bit move)
        shufpd  xmm0,xmm0,0             ; XMM0 = fill byte in all 16 lanes
        mov     r8,rdx
        shr     r8,4                    ; R8 = number of whole 16-byte blocks
        jz      @memset_Tail            ; fewer than 16 bytes: tail only
        jmp     @memset_Loop_128        ; jump over the alignment padding
; ------------------> Store 128 bits at a time
ALIGN 16
@memset_Loop_128 :
        movdqu  [rcx],xmm0              ; destination may be unaligned
        add     rcx,16
        dec     r8
        jnz     @memset_Loop_128
; ------------------> Tail: bits 3..0 of the count say which pieces remain.
; Each piece is stored from RAX/EAX/AX/AL, so the store width always
; matches the amount the write pointer advances.
@memset_Tail :
        test    dl,8
        jz      @memset_Tail_4
        mov     [rcx],rax               ; 8 bytes
        add     rcx,8
@memset_Tail_4 :
        test    dl,4
        jz      @memset_Tail_2
        mov     [rcx],eax               ; 4 bytes
        add     rcx,4
@memset_Tail_2 :
        test    dl,2
        jz      @memset_Tail_1
        mov     [rcx],ax                ; 2 bytes
        add     rcx,2
@memset_Tail_1 :
        test    dl,1
        jz      @Finished_memset
        mov     [rcx],al                ; last byte
@Finished_memset :
        mov     rax,r10                 ; return the original destination
        ret
; __________________________________________________________________________________
; _______________________ memset0 __________________________________________________
; __________________________________________________________________________________
comment ^
memset0 - fill a buffer with zero bytes.
ABI     Microsoft x64 register usage; leaf routine (no stack use, no
        callee-saved register is touched).
INPUT   RCX = Destination buffer
        RDX = Number of bytes to clear (0 is allowed)
OUTPUT  RAX = Destination buffer (the value RCX had on entry)
CLOBBER RCX, R10, XMM0, flags
^
PUBLIC memset0
memset0 :
        mov     r10,rcx                 ; R10 = destination, kept for the return value
        pxor    xmm0,xmm0               ; XMM0 = 16 zero bytes
        mov     rax,rdx
        shr     rax,4                   ; RAX = number of whole 16-byte blocks
        jz      @memset0_Tail           ; fewer than 16 bytes: tail only
        jmp     @memset0_Loop_128       ; jump over the alignment padding
; ------------------> Clear 128 bits at a time
ALIGN 16
@memset0_Loop_128 :
        movdqu  [rcx],xmm0              ; destination may be unaligned
        add     rcx,16
        dec     rax
        jnz     @memset0_Loop_128
; ------------------> Tail: bits 3..0 of the count say which pieces remain.
; Immediate zero stores are used so the tail does not depend on the
; contents of any scratch register.
@memset0_Tail :
        test    dl,8
        jz      @memset0_Tail_4
        mov     qword ptr [rcx],0       ; 8 bytes
        add     rcx,8
@memset0_Tail_4 :
        test    dl,4
        jz      @memset0_Tail_2
        mov     dword ptr [rcx],0       ; 4 bytes
        add     rcx,4
@memset0_Tail_2 :
        test    dl,2
        jz      @memset0_Tail_1
        mov     word ptr [rcx],0        ; 2 bytes
        add     rcx,2
@memset0_Tail_1 :
        test    dl,1
        jz      @Finished_memset0
        mov     byte ptr [rcx],0        ; last byte
@Finished_memset0 :
        mov     rax,r10                 ; return the original destination
        ret