e6553eee5a
port optimized string functions for x86_64 from Bionic (BSD licensed) Signed-off-by: p-szafonimateusz <p-szafonimateusz@xiaomi.com>
180 lines
4.7 KiB
ArmAsm
180 lines
4.7 KiB
ArmAsm
/*********************************************************************************
|
|
* libs/libc/machine/x86_64/gnu/arch_memset_avx2.S
|
|
*
|
|
* Copyright (c) 2014, Intel Corporation
|
|
* All rights reserved.
|
|
*
|
|
* Redistribution and use in source and binary forms, with or without
|
|
* modification, are permitted provided that the following conditions are met:
|
|
*
|
|
* * Redistributions of source code must retain the above copyright notice,
|
|
* * this list of conditions and the following disclaimer.
|
|
*
|
|
* * Redistributions in binary form must reproduce the above copyright notice,
|
|
* * this list of conditions and the following disclaimer in the documentation
|
|
* * and/or other materials provided with the distribution.
|
|
*
|
|
* * Neither the name of Intel Corporation nor the names of its contributors
|
|
* * may be used to endorse or promote products derived from this software
|
|
* * without specific prior written permission.
|
|
*
|
|
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
|
|
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
|
|
* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
|
* DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
|
|
* ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
|
|
* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
|
|
* LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
|
|
* ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
|
* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
|
* SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*
|
|
*********************************************************************************/
|
|
|
|
/********************************************************************************
|
|
* Included Files
|
|
*********************************************************************************/
|
|
|
|
#include "cache.h"
|
|
|
|
/*********************************************************************************
|
|
* Pre-processor Definitions
|
|
*********************************************************************************/
|
|
|
|
/* L(name): spell a label with the ".L" prefix.  Only defined here when no
 * earlier definition of L exists.
 */

#if !defined(L)
#  define L(name) .L##name
#endif
|
|
|
|
/* ALIGN(log2): emit a .p2align directive with the given power-of-two
 * exponent (ALIGN(4) => 16-byte boundary).  Only defined here when no
 * earlier definition of ALIGN exists.
 */

#if !defined(ALIGN)
#  define ALIGN(log2) .p2align log2
#endif
|
|
|
|
/* ENTRY(sym): open the definition of the global function `sym`.
 *
 * In order: switch to the .text section, export the symbol, align to a
 * 16-byte boundary, give the symbol ELF type "function", place the label
 * and start a CFI region.  Pair every ENTRY(sym) with END(sym).
 */

#define ENTRY(sym)                \
        .text;                    \
        .global sym;              \
        .balign 16;               \
        .type sym, @function;     \
        sym:                      \
        .cfi_startproc;
|
|
|
|
/* END(sym): close the function opened by ENTRY(sym).
 *
 * Ends the CFI region and records the symbol size as the distance from
 * the `sym` label to the current location.
 */

#define END(sym)                  \
        .cfi_endproc;             \
        .size sym, . - sym;
|
|
|
|
/*********************************************************************************
|
|
* Public Functions
|
|
*********************************************************************************/
|
|
|
|
/* NOTE(review): ENTRY() itself begins with a ".text" directive, so the
 * .section request below is superseded before the memset label is placed.
 * Confirm whether placement in .text.avx2 was intended.
 */

        .section .text.avx2,"ax",@progbits

/* memset
 *
 * Standard C prototype: void *memset(void *s, int c, size_t n)
 *
 * Register use as seen in the code below:
 *   In:   rdi = s  (start of the region to fill)
 *         rsi = c  (only the low 8 bits are used)
 *         rdx = n  (number of bytes to fill)
 *   Out:  rax = s  (copied from rdi before anything else happens)
 *   Clobbers: rcx, rdx, rsi, r8, xmm0/ymm0, flags
 *
 * Strategy:
 *   n < 16     Dispatch on the bits of n and issue at most two scalar
 *              stores (one at the start, one ending at s + n); the two
 *              stores may overlap.
 *   16..256    Store vectors forward from s and backward from s + n; the
 *              two halves overlap in the middle when n is not an exact
 *              multiple of the store width.
 *   n > 256    The first and last 128 bytes are written as above, then
 *              a loop fills the 128-byte-aligned span in between, using
 *              non-temporal stores when n exceeds the shared cache size.
 */

ENTRY(memset)
        movq    %rdi, %rax                      /* Return value = s */
        andq    $0xff, %rsi                     /* Discard all but the fill byte */
        movabsq $0x0101010101010101, %rcx
        imulq   %rsi, %rcx                      /* rcx = fill byte repeated 8 times */
        cmpq    $16, %rdx
        jae     L(fill_16_or_more)

        /* Here n < 16.  Test bits 3, 2, 1, 0 of n in that order; the
         * highest set bit selects the store width.
         */

        testb   $8, %dl
        jnz     L(fill_8_to_15)
        testb   $4, %dl
        jnz     L(fill_4_to_7)
        testb   $2, %dl
        jnz     L(fill_2_to_3)
        testb   $1, %dl
        jz      L(fill_small_ret)               /* n == 0: nothing to store */
        movb    %cl, (%rdi)                     /* n == 1 */
L(fill_small_ret):
        ret

L(fill_8_to_15):
        movq    %rcx, (%rdi)                    /* Bytes [0, 8) */
        movq    %rcx, -8(%rdi, %rdx)            /* Bytes [n - 8, n) */
        ret

L(fill_4_to_7):
        movl    %ecx, (%rdi)                    /* Bytes [0, 4) */
        movl    %ecx, -4(%rdi, %rdx)            /* Bytes [n - 4, n) */
        ret

L(fill_2_to_3):
        movw    %cx, (%rdi)                     /* Bytes [0, 2) */
        movw    %cx, -2(%rdi, %rdx)             /* Bytes [n - 2, n) */
        ret

        ALIGN (4)
L(fill_16_or_more):
        movq    %rcx, %xmm0
        pshufd  $0, %xmm0, %xmm0                /* xmm0 = fill byte in all 16 lanes */

        /* 16 <= n <= 32: first 16 and last 16 bytes */

        movdqu  %xmm0, (%rdi)
        movdqu  %xmm0, -16(%rdi, %rdx)
        cmpq    $32, %rdx
        jbe     L(finish)

        /* 32 < n <= 64: first 32 and last 32 bytes are now covered */

        movdqu  %xmm0, 16(%rdi)
        movdqu  %xmm0, -32(%rdi, %rdx)
        cmpq    $64, %rdx
        jbe     L(finish)

        /* 64 < n <= 128: first 64 and last 64 bytes are now covered */

        movdqu  %xmm0, 32(%rdi)
        movdqu  %xmm0, 48(%rdi)
        movdqu  %xmm0, -64(%rdi, %rdx)
        movdqu  %xmm0, -48(%rdi, %rdx)
        cmpq    $128, %rdx
        jbe     L(finish)

        /* 128 < n <= 256: widen the pattern to 32 bytes, after which the
         * first 128 and last 128 bytes are covered.
         */

        vpbroadcastb %xmm0, %ymm0
        vmovdqu %ymm0, 64(%rdi)
        vmovdqu %ymm0, 96(%rdi)
        vmovdqu %ymm0, -128(%rdi, %rdx)
        vmovdqu %ymm0, -96(%rdi, %rdx)
        cmpq    $256, %rdx
        jbe     L(finish)

        /* n > 256.  The unaligned head [s, s + 128) and tail
         * [s + n - 128, s + n) are already written, so only whole
         * 128-byte-aligned chunks between them remain:
         *
         *   rcx = lowest multiple of 128 strictly above s
         *   rdx = (s + n) rounded down to a multiple of 128
         *   r8  = n, kept for the cache-size comparison
         */

        ALIGN (4)
        leaq    128(%rdi), %rcx
        andq    $-128, %rcx
        movq    %rdx, %r8
        addq    %rdi, %rdx
        andq    $-128, %rdx
        cmpq    %rcx, %rdx
        je      L(finish)                       /* Empty aligned span */

        /* Fills larger than the shared cache size take the non-temporal
         * loop.  The threshold is the SHARED_CACHE_SIZE constant when it is
         * defined, otherwise the __x86_64_shared_cache_size variable
         * (neither definition is visible in this file).
         */

#ifdef SHARED_CACHE_SIZE
        cmpq    $SHARED_CACHE_SIZE, %r8
#else
        cmpq    __x86_64_shared_cache_size(%rip), %r8
#endif
        ja      L(streaming_loop)

        /* 128 bytes per iteration.  rcx is a multiple of 128, so the
         * alignment-checking vmovdqa form is valid for all four stores.
         */

        ALIGN (4)
L(cached_loop):
        vmovdqa %ymm0, (%rcx)
        vmovdqa %ymm0, 32(%rcx)
        vmovdqa %ymm0, 64(%rcx)
        vmovdqa %ymm0, 96(%rcx)
        addq    $128, %rcx
        cmpq    %rcx, %rdx
        jne     L(cached_loop)
        jmp     L(finish)

        /* 128 bytes per iteration using 16-byte non-temporal stores.
         *
         * NOTE(review): these are legacy-SSE encoded stores executed after
         * ymm0 has been written by vpbroadcastb; a VEX-encoded vmovntdq may
         * be preferable on some CPUs -- confirm before changing.
         */

        ALIGN (4)
L(streaming_loop):
        movntdq %xmm0, (%rcx)
        movntdq %xmm0, 16(%rcx)
        movntdq %xmm0, 32(%rcx)
        movntdq %xmm0, 48(%rcx)
        movntdq %xmm0, 64(%rcx)
        movntdq %xmm0, 80(%rcx)
        movntdq %xmm0, 96(%rcx)
        movntdq %xmm0, 112(%rcx)
        leaq    128(%rcx), %rcx
        cmpq    %rcx, %rdx
        jne     L(streaming_loop)

        /* Non-temporal stores were issued above, so fence them before
         * returning.
         */

        sfence

L(finish):

        /* Common exit for every n >= 16 path.  The ymm registers may have
         * been used; clear their upper halves so that SSE2 code running
         * afterwards is not slowed down.
         */

        vzeroupper
        ret

END(memset)
|