/*********************************************************************************
* libs/libc/machine/x86_64/gnu/arch_memset_sse2.S
*
* Copyright (c) 2014, Intel Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
*   * Redistributions of source code must retain the above copyright notice,
*     this list of conditions and the following disclaimer.
*
*   * Redistributions in binary form must reproduce the above copyright notice,
*     this list of conditions and the following disclaimer in the documentation
*     and/or other materials provided with the distribution.
*
*   * Neither the name of Intel Corporation nor the names of its contributors
*     may be used to endorse or promote products derived from this software
*     without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
*********************************************************************************/

/*********************************************************************************
 * Included Files
 *********************************************************************************/

#include "cache.h"
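
/* cache.h is expected to provide SHARED_CACHE_SIZE; when it does not, the
 * cutoff for switching to non-temporal stores is read at run time from the
 * __x86_64_shared_cache_size variable referenced below.
 */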

/*********************************************************************************
 * Pre-processor Definitions
 *********************************************************************************/

#ifndef L
# define L(label) .L##label
#endif

#ifndef ALIGN
# define ALIGN(n) .p2align n
#endif

/* ENTRY/END emit the directives that open and close a global function:
 * section, alignment, symbol type and size, and CFI start/end markers.
 */

#define ENTRY(__f) \
  .text; \
  .global __f; \
  .balign 16; \
  .type __f, @function; \
  __f: \
  .cfi_startproc;

#define END(__f) \
  .cfi_endproc; \
  .size __f, .- __f;

/*********************************************************************************
* Public Functions
*********************************************************************************/
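
/* memset - fill a block of memory with a constant byte
 *
 *   void *memset(void *s, int c, size_t n);
 *
 * Per the System V AMD64 ABI the arguments arrive in %rdi (s), %rsi (c) and
 * %rdx (n), and the destination pointer is returned in %rax.
 *
 * Strategy, as implemented below:
 *   n < 16      scalar stores of 8/4/2/1 bytes selected by the bits of n
 *   16..128     overlapping unaligned 16-byte SSE2 stores from both ends
 *   n > 128     the 64-byte-aligned middle is filled 64 bytes at a time,
 *               switching to non-temporal stores once n exceeds the shared
 *               cache size
 */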

  .section .text.sse2,"ax",@progbits

ENTRY(memset)
  /* The destination pointer is also the return value */

  movq  %rdi, %rax

  /* Replicate the fill byte into all eight bytes of %rcx:
   * (c & 0xff) * 0x0101010101010101
   */

  and   $0xff, %rsi
  mov   $0x0101010101010101, %rcx
  imul  %rsi, %rcx

  cmpq  $16, %rdx
  jae   L(16bytesormore)

  /* n < 16: pick the store width from the low bits of n */

  testb $8, %dl
  jnz   L(8_15bytes)
  testb $4, %dl
  jnz   L(4_7bytes)
  testb $2, %dl
  jnz   L(2_3bytes)
  testb $1, %dl
  jz    L(return)
  movb  %cl, (%rdi)

L(return):
  ret
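
/* The %rcx pattern computed at the top of memset replicates the fill byte
 * into every byte of a 64-bit word.  A minimal C sketch of the same trick
 * (illustrative only; fill_pattern64 is not a symbol defined in this file):
 *
 *   #include <stdint.h>
 *
 *   static uint64_t fill_pattern64(int c)
 *   {
 *     // (c & 0xff) * 0x0101010101010101 copies the byte into all eight
 *     // lanes, e.g. c = 0xab yields 0xabababababababab.
 *     return (uint64_t)(c & 0xff) * 0x0101010101010101ull;
 *   }
 */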

  /* 8..15 bytes: two overlapping 8-byte stores (head and tail) */

L(8_15bytes):
  movq  %rcx, (%rdi)
  movq  %rcx, -8(%rdi, %rdx)
  ret

  /* 4..7 bytes: two overlapping 4-byte stores */

L(4_7bytes):
  movl  %ecx, (%rdi)
  movl  %ecx, -4(%rdi, %rdx)
  ret

  /* 2..3 bytes: two overlapping 2-byte stores */

L(2_3bytes):
  movw  %cx, (%rdi)
  movw  %cx, -2(%rdi, %rdx)
  ret
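
/* In each case above the two overlapping stores cover the whole range: for
 * n = 10, for example, the 8-byte stores write bytes 0-7 and 2-9.
 */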

  ALIGN (4)

  /* 16..128 bytes: broadcast the pattern to %xmm0 and cover the buffer with
   * overlapping unaligned 16-byte stores issued from both ends.
   */

L(16bytesormore):
  movd   %rcx, %xmm0
  pshufd $0, %xmm0, %xmm0
  movdqu %xmm0, (%rdi)
  movdqu %xmm0, -16(%rdi, %rdx)
  cmpq   $32, %rdx
  jbe    L(32bytesless)
  movdqu %xmm0, 16(%rdi)
  movdqu %xmm0, -32(%rdi, %rdx)
  cmpq   $64, %rdx
  jbe    L(64bytesless)
  movdqu %xmm0, 32(%rdi)
  movdqu %xmm0, 48(%rdi)
  movdqu %xmm0, -64(%rdi, %rdx)
  movdqu %xmm0, -48(%rdi, %rdx)
  cmpq   $128, %rdx
  ja     L(128bytesmore)

L(32bytesless):
L(64bytesless):
  ret

  ALIGN (4)

  /* More than 128 bytes: the head and tail are already covered by the
   * unaligned stores above, so only the 64-byte-aligned middle remains.
   * %rcx is rounded up to the first 64-byte boundary past the start and
   * %rdx down to the last boundary at or before the end; %r8 keeps n.
   */

L(128bytesmore):
  leaq  64(%rdi), %rcx
  andq  $-64, %rcx
  movq  %rdx, %r8
  addq  %rdi, %rdx
  andq  $-64, %rdx
  cmpq  %rcx, %rdx
  je    L(return)

  /* Choose non-temporal stores when n exceeds the shared cache size */

#ifdef SHARED_CACHE_SIZE
  cmp   $SHARED_CACHE_SIZE, %r8
#else
  cmp   __x86_64_shared_cache_size(%rip), %r8
#endif
  ja    L(128bytesmore_nt)

  ALIGN (4)

  /* Cached path: four aligned 16-byte stores per 64-byte iteration */

L(128bytesmore_normal):
  movdqa %xmm0, (%rcx)
  movaps %xmm0, 0x10(%rcx)
  movaps %xmm0, 0x20(%rcx)
  movaps %xmm0, 0x30(%rcx)
  addq   $64, %rcx
  cmpq   %rcx, %rdx
  jne    L(128bytesmore_normal)
  ret
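
/* For very large fills the loop below uses non-temporal (movntdq) stores,
 * which bypass the cache so the fill does not evict useful data.  They are
 * weakly ordered, so an sfence is issued before returning.
 */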

  ALIGN (4)

L(128bytesmore_nt):
  movntdq %xmm0, (%rcx)
  movntdq %xmm0, 0x10(%rcx)
  movntdq %xmm0, 0x20(%rcx)
  movntdq %xmm0, 0x30(%rcx)
  leaq    64(%rcx), %rcx
  cmpq    %rcx, %rdx
  jne     L(128bytesmore_nt)
  sfence
  ret

END(memset)