/****************************************************************************
* libs/libc/machine/x86_64/gnu/arch_memset_avx2.S
*
* Copyright (c) 2014, Intel Corporation
* All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions are met:
*
* * Redistributions of source code must retain the above copyright notice,
* * this list of conditions and the following disclaimer.
*
* * Redistributions in binary form must reproduce the above copyright notice,
* * this list of conditions and the following disclaimer in the documentation
* * and/or other materials provided with the distribution.
*
* * Neither the name of Intel Corporation nor the names of its contributors
* * may be used to endorse or promote products derived from this software
* * without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*
 ****************************************************************************/

/****************************************************************************
 * Included Files
 ****************************************************************************/

#include "cache.h"

/****************************************************************************
 * Pre-processor Definitions
 ****************************************************************************/

#ifndef L
# define L(label) .L##label
#endif

#ifndef ALIGN
# define ALIGN(n) .p2align n
#endif

#define ENTRY(__f) \
  .text; \
  .global __f; \
  .balign 16; \
  .type __f, @function; \
__f: \
  .cfi_startproc;

#define END(__f) \
  .cfi_endproc; \
  .size __f, .- __f;

/****************************************************************************
 * Public Functions
 ****************************************************************************/

    .section .text.avx2,"ax",@progbits
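
/* void *memset(void *dest, int c, size_t n)
 *
 * SysV AMD64 ABI: dest in %rdi, c in %esi, n in %rdx; the original dest
 * pointer is returned in %rax.  The fill byte is first replicated into
 * every byte of %rcx by multiplying it with 0x0101010101010101.
 */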
ENTRY(memset)
    movq    %rdi, %rax
    and     $0xff, %rsi
    mov     $0x0101010101010101, %rcx
    imul    %rsi, %rcx
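
    /* Lengths below 16 bytes: use the individual bits of the length to
     * pick a pair of overlapping 8/4/2-byte stores (or a single byte),
     * so each case covers its whole range without a loop.
     */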
    cmpq    $16, %rdx
    jae     L(16bytesormore)

    testb   $8, %dl
    jnz     L(8_15bytes)
    testb   $4, %dl
    jnz     L(4_7bytes)
    testb   $2, %dl
    jnz     L(2_3bytes)
    testb   $1, %dl
    jz      1f
    movb    %cl, (%rdi)
1:  ret

L(8_15bytes):
    movq    %rcx, (%rdi)
    movq    %rcx, -8(%rdi, %rdx)
    ret

L(4_7bytes):
    movl    %ecx, (%rdi)
    movl    %ecx, -4(%rdi, %rdx)
    ret

L(2_3bytes):
    movw    %cx, (%rdi)
    movw    %cx, -2(%rdi, %rdx)
    ret
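
    /* 16 bytes and up: broadcast the pattern into %xmm0 and fill with
     * overlapping unaligned stores from both ends of the buffer.  Fills
     * up to 256 bytes are handled completely here; for larger fills
     * these stores also cover the unaligned head and tail around the
     * aligned loops below.
     */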
    ALIGN (4)
L(16bytesormore):
    movd    %rcx, %xmm0
    pshufd  $0, %xmm0, %xmm0
    movdqu  %xmm0, (%rdi)
    movdqu  %xmm0, -16(%rdi, %rdx)
    cmpq    $32, %rdx
    jbe     L(done)

    movdqu  %xmm0, 16(%rdi)
    movdqu  %xmm0, -32(%rdi, %rdx)
    cmpq    $64, %rdx
    jbe     L(done)

    movdqu  %xmm0, 32(%rdi)
    movdqu  %xmm0, 48(%rdi)
    movdqu  %xmm0, -64(%rdi, %rdx)
    movdqu  %xmm0, -48(%rdi, %rdx)
    cmpq    $128, %rdx
    jbe     L(done)

    vpbroadcastb %xmm0, %ymm0
    vmovdqu %ymm0, 64(%rdi)
    vmovdqu %ymm0, 96(%rdi)
    vmovdqu %ymm0, -128(%rdi, %rdx)
    vmovdqu %ymm0, -96(%rdi, %rdx)
    cmpq    $256, %rdx
    jbe     L(done)
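
    /* Carve out the 128-byte-aligned middle region [%rcx, %rdx).  The
     * unaligned head and tail around it were already written by the
     * stores above.  Keep the original length in %r8 for the cache-size
     * test below.
     */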
    ALIGN (4)
    leaq    128(%rdi), %rcx
    andq    $-128, %rcx
    movq    %rdx, %r8
    addq    %rdi, %rdx
    andq    $-128, %rdx
    cmpq    %rcx, %rdx
    je      L(done)
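
    /* Fills larger than the shared cache use non-temporal stores so the
     * buffer does not evict more useful cache lines.
     */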
#ifdef SHARED_CACHE_SIZE
    cmp     $SHARED_CACHE_SIZE, %r8
#else
    cmp     __x86_64_shared_cache_size(%rip), %r8
#endif
    ja      L(non_temporal_loop)
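
    /* Cached path: four aligned 32-byte AVX stores, 128 bytes per
     * iteration.
     */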
    ALIGN (4)
L(normal_loop):
    vmovdqa %ymm0, (%rcx)
    vmovdqa %ymm0, 32(%rcx)
    vmovdqa %ymm0, 64(%rcx)
    vmovdqa %ymm0, 96(%rcx)
    addq    $128, %rcx
    cmpq    %rcx, %rdx
    jne     L(normal_loop)
    jmp     L(done)
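
    /* Streaming path: eight 16-byte non-temporal stores per 128-byte
     * iteration.  %xmm0 (the low half of %ymm0) still holds the fill
     * pattern.
     */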
    ALIGN (4)
L(non_temporal_loop):
    movntdq %xmm0, (%rcx)
    movntdq %xmm0, 16(%rcx)
    movntdq %xmm0, 32(%rcx)
    movntdq %xmm0, 48(%rcx)
    movntdq %xmm0, 64(%rcx)
    movntdq %xmm0, 80(%rcx)
    movntdq %xmm0, 96(%rcx)
    movntdq %xmm0, 112(%rcx)
    leaq    128(%rcx), %rcx
    cmpq    %rcx, %rdx
    jne     L(non_temporal_loop)

    /* We used non-temporal stores, so we need a fence here. */

    sfence
L(done):
    /* We touched the ymm registers; clear their upper halves to avoid
     * the AVX-SSE transition penalty in any SSE code that runs later.
     */

    vzeroupper
    ret

END(memset)