diff --git a/LICENSE b/LICENSE index 8913ecfb81..b24ee0ff1e 100644 --- a/LICENSE +++ b/LICENSE @@ -8634,3 +8634,43 @@ HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +libs/libc/machine/x86_64/gnu/arch_memcmp.S +libs/libc/machine/x86_64/gnu/arch_memset_avx2.S +libs/libc/machine/x86_64/gnu/arch_memset_sse2.S +libs/libc/machine/x86_64/gnu/arch_stpcpy.S +libs/libc/machine/x86_64/gnu/arch_stpncpy.S +libs/libc/machine/x86_64/gnu/arch_strcat.S +libs/libc/machine/x86_64/gnu/arch_strcpy.S +libs/libc/machine/x86_64/gnu/arch_strlen.S +libs/libc/machine/x86_64/gnu/arch_strncmp.S +libs/libc/machine/x86_64/gnu/arch_strncpy.S +libs/libc/machine/x86_64/gnu/cache.h +====================== +Copyright (c) 2014, Intel Corporation +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + + * Neither the name of Intel Corporation nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON +ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/arch/Kconfig b/arch/Kconfig index c297605e08..074067a333 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -138,6 +138,7 @@ config ARCH_X86_64 select ARCH_HAVE_INTERRUPTSTACK select ARCH_HAVE_CUSTOMOPT select LIBC_ARCH_ELF_64BIT if LIBC_ARCH_ELF + select ARCH_TOOLCHAIN_GNU ---help--- x86-64 architectures. 
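Background for the build changes that follow: each LIBC_ARCH_* symbol selected in libs/libc/machine/Kconfig tells the NuttX libc build to drop its generic C implementation of that function and link the architecture-specific assembly version instead. As a reference point, here is a minimal sketch of the portable memcmp() behaviour that the SSE4.1 arch_memcmp.S below must reproduce exactly (illustrative only; generic_memcmp is a made-up name, not the NuttX source):

#include <stddef.h>

/* Compare n bytes as unsigned chars; return negative, zero, or positive
 * as ISO C requires.  Any optimized replacement must match this result
 * for every input, including n == 0 (which must return 0).
 */

int generic_memcmp(const void *s1, const void *s2, size_t n)
{
  const unsigned char *p1 = s1;
  const unsigned char *p2 = s2;

  while (n-- > 0)
    {
      if (*p1 != *p2)
        {
          return (int)*p1 - (int)*p2;
        }

      p1++;
      p2++;
    }

  return 0;
}

The assembly reaches the same answer faster: it XORs 16-byte blocks with pxor, tests for any set bit with ptest (the SSE4.1 instruction the Kconfig dependency guards), and only drops to scalar compares to locate the first differing byte.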
diff --git a/libs/libc/machine/Kconfig b/libs/libc/machine/Kconfig
index 2c45e16a10..1c1c601186 100644
--- a/libs/libc/machine/Kconfig
+++ b/libs/libc/machine/Kconfig
@@ -128,6 +128,14 @@ config LIBC_ARCH_STRNCASECMP
 	bool
 	default n
 
+config LIBC_ARCH_STPCPY
+	bool
+	default n
+
+config LIBC_ARCH_STPNCPY
+	bool
+	default n
+
 config LIBC_ARCH_ELF
 	bool
 	default n
@@ -788,6 +796,9 @@ endif
 if ARCH_X86
 source "libs/libc/machine/x86/Kconfig"
 endif
+if ARCH_X86_64
+source "libs/libc/machine/x86_64/Kconfig"
+endif
 if ARCH_XTENSA
 source "libs/libc/machine/xtensa/Kconfig"
 endif
diff --git a/libs/libc/machine/Make.defs b/libs/libc/machine/Make.defs
index 207500be9e..2acd957e52 100644
--- a/libs/libc/machine/Make.defs
+++ b/libs/libc/machine/Make.defs
@@ -37,6 +37,9 @@ endif
 ifeq ($(CONFIG_ARCH_X86),y)
 include $(TOPDIR)/libs/libc/machine/x86/Make.defs
 endif
+ifeq ($(CONFIG_ARCH_X86_64),y)
+include $(TOPDIR)/libs/libc/machine/x86_64/Make.defs
+endif
 ifeq ($(CONFIG_ARCH_XTENSA),y)
 include $(TOPDIR)/libs/libc/machine/xtensa/Make.defs
 endif
diff --git a/libs/libc/machine/x86_64/CMakeLists.txt b/libs/libc/machine/x86_64/CMakeLists.txt
index 9d6e098238..1fcd9ef065 100644
--- a/libs/libc/machine/x86_64/CMakeLists.txt
+++ b/libs/libc/machine/x86_64/CMakeLists.txt
@@ -17,3 +17,9 @@
 # the License.
 #
 # ##############################################################################
+
+add_subdirectory(gnu)
+
+set(SRCS)
+
+target_sources(c PRIVATE ${SRCS})
diff --git a/libs/libc/machine/x86_64/Kconfig b/libs/libc/machine/x86_64/Kconfig
index f72f3c094c..5e1ef94b29 100644
--- a/libs/libc/machine/x86_64/Kconfig
+++ b/libs/libc/machine/x86_64/Kconfig
@@ -2,3 +2,88 @@
 # For a description of the syntax of this configuration file,
 # see the file kconfig-language.txt in the NuttX tools repository.
 #
+
+if ARCH_TOOLCHAIN_GNU && ALLOW_BSD_COMPONENTS
+
+config X86_64_MEMCMP
+	bool "Enable optimized memcmp() for X86_64"
+	default n
+	select LIBC_ARCH_MEMCMP
+	depends on ARCH_HAVE_SSE41
+	---help---
+		Enable optimized X86_64 specific memcmp() library function
+
+config X86_64_MEMMOVE
+	bool "Enable optimized memmove()/memcpy() for X86_64"
+	default n
+	select LIBC_ARCH_MEMMOVE
+	select LIBC_ARCH_MEMCPY
+	---help---
+		Enable optimized X86_64 specific memmove()/memcpy() library function
+
+config X86_64_MEMSET
+	bool "Enable optimized memset() for X86_64"
+	default n
+	select LIBC_ARCH_MEMSET
+	---help---
+		Enable optimized X86_64 specific memset() library function
+
+config X86_64_STPCPY
+	bool "Enable optimized stpcpy() for X86_64"
+	default n
+	select LIBC_ARCH_STPCPY
+	---help---
+		Enable optimized X86_64 specific stpcpy() library function
+
+config X86_64_STPNCPY
+	bool "Enable optimized stpncpy() for X86_64"
+	default n
+	select LIBC_ARCH_STPNCPY
+	---help---
+		Enable optimized X86_64 specific stpncpy() library function
+
+config X86_64_STRCAT
+	bool "Enable optimized strcat() for X86_64"
+	default n
+	select LIBC_ARCH_STRCAT
+	---help---
+		Enable optimized X86_64 specific strcat() library function
+
+config X86_64_STRCMP
+	bool "Enable optimized strcmp() for X86_64"
+	default n
+	select LIBC_ARCH_STRCMP
+	depends on ARCH_HAVE_SSSE3
+	---help---
+		Enable optimized X86_64 specific strcmp() library function
+
+config X86_64_STRNCMP
+	bool "Enable optimized strncmp() for X86_64"
+	default n
+	select LIBC_ARCH_STRNCMP
+	depends on ARCH_HAVE_SSSE3
+	---help---
+		Enable optimized X86_64 specific strncmp() library function
+
+config X86_64_STRCPY
+	bool "Enable optimized strcpy() for X86_64"
+	default n
+	select LIBC_ARCH_STRCPY
+	---help---
+		Enable optimized X86_64 specific strcpy() library function
+
+config X86_64_STRLEN
+	bool "Enable optimized strlen() for X86_64"
+	default n
+	select LIBC_ARCH_STRLEN
+	---help---
+		Enable optimized X86_64 specific strlen() library function
+
+config X86_64_STRNCPY
+	bool "Enable optimized strncpy() for X86_64"
+	default n
+	select LIBC_ARCH_STRNCPY
+	---help---
+		Enable optimized X86_64 specific strncpy() library function
+
+endif # ARCH_TOOLCHAIN_GNU && ALLOW_BSD_COMPONENTS
diff --git a/libs/libc/machine/x86_64/Make.defs b/libs/libc/machine/x86_64/Make.defs
new file mode 100644
index 0000000000..a7d570207f
--- /dev/null
+++ b/libs/libc/machine/x86_64/Make.defs
@@ -0,0 +1,75 @@
+############################################################################
+# libs/libc/machine/x86_64/Make.defs
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.  The
+# ASF licenses this file to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance with the
+# License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+# License for the specific language governing permissions and limitations
+# under the License.
+# +############################################################################ + +ifeq ($(CONFIG_X86_64_MEMCMP),y) +ASRCS += arch_memcmp.S +endif + +ifeq ($(CONFIG_X86_64_MEMMOVE),y) +ASRCS += arch_memmove.S +endif + +ifeq ($(CONFIG_X86_64_MEMSET),y) + ifeq ($(CONFIG_ARCH_X86_64_AVX),y) + ASRCS += arch_memset_avx2.S + else + ASRCS += arch_memset_sse2.S + endif +endif + +ifeq ($(CONFIG_X86_64_STPCPY),y) +ASRCS += arch_stpcpy.S +endif + +ifeq ($(CONFIG_X86_64_STPNCPY),y) +ASRCS += arch_stpncpy.S +endif + +ifeq ($(CONFIG_X86_64_STRCAT),y) +ASRCS += arch_strcat.S +endif + +ifeq ($(CONFIG_X86_64_STRCMP),y) +ASRCS += arch_strcmp.S +endif + +ifeq ($(CONFIG_X86_64_STRCPY),y) +ASRCS += arch_strcpy.S +endif + +ifeq ($(CONFIG_X86_64_STRLEN),y) +ASRCS += arch_strlen.S +endif + +ifeq ($(CONFIG_X86_64_STRNCPY),y) +ASRCS += arch_strncpy.S +endif + +ifeq ($(CONFIG_X86_64_STRNCMP),y) +ASRCS += arch_strncmp.S +endif + +ifeq ($(CONFIG_ARCH_TOOLCHAIN_GNU),y) +DEPPATH += --dep-path machine/x86_64/gnu +VPATH += :machine/x86_64/gnu +endif + +DEPPATH += --dep-path machine/x86_64 +VPATH += :machine/x86_64 diff --git a/libs/libc/machine/x86_64/gnu/CMakeLists.txt b/libs/libc/machine/x86_64/gnu/CMakeLists.txt new file mode 100644 index 0000000000..c2d4ed8331 --- /dev/null +++ b/libs/libc/machine/x86_64/gnu/CMakeLists.txt @@ -0,0 +1,71 @@ +# ############################################################################## +# libs/libc/machine/X86_64/gnu/CMakeLists.txt +# +# Licensed to the Apache Software Foundation (ASF) under one or more contributor +# license agreements. See the NOTICE file distributed with this work for +# additional information regarding copyright ownership. The ASF licenses this +# file to you under the Apache License, Version 2.0 (the "License"); you may not +# use this file except in compliance with the License. You may obtain a copy of +# the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +# License for the specific language governing permissions and limitations under +# the License. 
+# +# ############################################################################## + +set(SRCS) + +if(CONFIG_X86_64_MEMCMP) + list(APPEND SRCS arch_memcmp.S) +endif() + +if(CONFIG_X86_64_MEMMOVE) + list(APPEND SRCS arch_memmove.S) +endif() + +if(CONFIG_X86_64_MEMSET) + if(CONFIG_ARCH_X86_64_AVX) + list(APPEND SRCS arch_memset_avx2.S) + else() + list(APPEND SRCS arch_memset_sse2.S) + endif() +endif() + +if(CONFIG_X86_64_STPCPY) + list(APPEND SRCS arch_stpcpy.S) +endif() + +if(CONFIG_X86_64_STPNCPY) + list(APPEND SRCS arch_stpncpy.S) +endif() + +if(CONFIG_X86_64_STRCAT) + list(APPEND SRCS arch_strcat.S) +endif() + +if(CONFIG_X86_64_STRCMP) + list(APPEND SRCS arch_strcmp.S) +endif() + +if(CONFIG_X86_64_STRCPY) + list(APPEND SRCS arch_strcpy.S) +endif() + +if(CONFIG_X86_64_STRLEN) + list(APPEND SRCS arch_strlen.S) +endif() + +if(CONFIG_X86_64_STRNCPY) + list(APPEND SRCS arch_strncpy.S) +endif() + +if(CONFIG_X86_64_STRNCMP) + list(APPEND SRCS arch_strncmp.S) +endif() + +target_sources(c PRIVATE ${SRCS}) diff --git a/libs/libc/machine/x86_64/gnu/arch_memcmp.S b/libs/libc/machine/x86_64/gnu/arch_memcmp.S new file mode 100644 index 0000000000..e4b8a724ca --- /dev/null +++ b/libs/libc/machine/x86_64/gnu/arch_memcmp.S @@ -0,0 +1,1814 @@ +/********************************************************************************* + * libs/libc/machine/x86_64/gnu/arch_memcmp.S + * + * Copyright (c) 2014, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * * this list of conditions and the following disclaimer in the documentation + * * and/or other materials provided with the distribution. + * + * * Neither the name of Intel Corporation nor the names of its contributors + * * may be used to endorse or promote products derived from this software + * * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + *********************************************************************************/ + +/********************************************************************************* + * Included Files + *********************************************************************************/ + +#include "cache.h" + +/********************************************************************************* + * Pre-processor Definitions + *********************************************************************************/ + +#ifndef MEMCMP +# define MEMCMP memcmp +#endif + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef ALIGN +# define ALIGN(n) .p2align n +#endif + +#ifndef cfi_startproc +# define cfi_startproc .cfi_startproc +#endif + +#ifndef cfi_endproc +# define cfi_endproc .cfi_endproc +#endif + +#ifndef ENTRY +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ + cfi_startproc +#endif + +#ifndef END +# define END(name) \ + cfi_endproc; \ + .size name, .-name +#endif + +#ifndef ALIGN +# define ALIGN(n) .p2align n +#endif + +#define JMPTBL(I, B) (I - B) + +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), %rcx; \ + add %r11, %rcx; \ + jmp *%rcx; \ + ud2 + +/********************************************************************************* + * Public Functions + *********************************************************************************/ + + .section .text.sse4.1,"ax",@progbits +ENTRY (MEMCMP) +#ifdef USE_AS_WMEMCMP + shl $2, %rdx +#endif + pxor %xmm0, %xmm0 + cmp $79, %rdx + ja L(79bytesormore) +#ifndef USE_AS_WMEMCMP + cmp $1, %rdx + je L(firstbyte) +#endif + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +#ifndef USE_AS_WMEMCMP + ALIGN (4) +L(firstbyte): + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + sub %ecx, %eax + ret +#endif + + ALIGN (4) +L(79bytesormore): + movdqu (%rsi), %xmm1 + movdqu (%rdi), %xmm2 + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + mov %rsi, %rcx + and $-16, %rsi + add $16, %rsi + sub %rsi, %rcx + + sub %rcx, %rdi + add %rcx, %rdx + test $0xf, %rdi + jz L(2aligned) + + cmp $128, %rdx + ja L(128bytesormore) +L(less128bytes): + sub $64, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(128bytesormore): + cmp $512, %rdx + ja L(512bytesormore) + cmp $256, %rdx + ja L(less512bytes) +L(less256bytes): + sub $128, %rdx + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 
64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin128) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(less512bytes): + sub $256, %rdx + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqu 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqu 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqu 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqu 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqu 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqu 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqu 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqu 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqu 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqu 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqu 192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqu 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqu 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqu 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytes) + + cmp $64, %rdx + jae L(less128bytes) + + cmp $32, %rdx + jb L(less32bytesin256) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) +L(512bytesormore): +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %r8 +#else + mov __x86_64_data_cache_size_half(%rip), %r8 +#endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_unaglined) + sub $64, %rdx + ALIGN (4) +L(64bytesormore_loop): + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae 
L(64bytesormore_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +L(L2_L3_cache_unaglined): + sub $64, %rdx + ALIGN (4) +L(L2_L3_unaligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqu 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqu 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqu 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_unaligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +/* + * This case is for machines which are sensitive for unaligned instructions. + */ + ALIGN (4) +L(2aligned): + cmp $128, %rdx + ja L(128bytesormorein2aligned) +L(less128bytesin2aligned): + sub $64, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + cmp $32, %rdx + jb L(less32bytesin64in2alinged) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin64in2alinged): + add $64, %rdi + add $64, %rsi + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) +L(128bytesormorein2aligned): + cmp $512, %rdx + ja L(512bytesormorein2aligned) + cmp $256, %rdx + ja L(256bytesormorein2aligned) +L(less256bytesin2alinged): + sub $128, %rdx + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) + + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + add $128, %rsi + add $128, %rdi + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin128in2aligned) + + movdqu (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqu 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin128in2aligned): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) +L(256bytesormorein2aligned): + + sub $256, %rdx + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + + movdqa 32(%rdi), %xmm2 + pxor 32(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(48bytesin256) 
+ + movdqa 48(%rdi), %xmm2 + pxor 48(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(64bytesin256) + + movdqa 64(%rdi), %xmm2 + pxor 64(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(80bytesin256) + + movdqa 80(%rdi), %xmm2 + pxor 80(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(96bytesin256) + + movdqa 96(%rdi), %xmm2 + pxor 96(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(112bytesin256) + + movdqa 112(%rdi), %xmm2 + pxor 112(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(128bytesin256) + + movdqa 128(%rdi), %xmm2 + pxor 128(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(144bytesin256) + + movdqa 144(%rdi), %xmm2 + pxor 144(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(160bytesin256) + + movdqa 160(%rdi), %xmm2 + pxor 160(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(176bytesin256) + + movdqa 176(%rdi), %xmm2 + pxor 176(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(192bytesin256) + + movdqa 192(%rdi), %xmm2 + pxor 192(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(208bytesin256) + + movdqa 208(%rdi), %xmm2 + pxor 208(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(224bytesin256) + + movdqa 224(%rdi), %xmm2 + pxor 224(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(240bytesin256) + + movdqa 240(%rdi), %xmm2 + pxor 240(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(256bytesin256) + + add $256, %rsi + add $256, %rdi + + cmp $128, %rdx + jae L(less256bytesin2alinged) + + cmp $64, %rdx + jae L(less128bytesin2aligned) + + cmp $32, %rdx + jb L(less32bytesin256in2alinged) + + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(16bytesin256) + + movdqa 16(%rdi), %xmm2 + pxor 16(%rsi), %xmm2 + ptest %xmm2, %xmm0 + jnc L(32bytesin256) + sub $32, %rdx + add $32, %rdi + add $32, %rsi +L(less32bytesin256in2alinged): + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + ALIGN (4) +L(512bytesormorein2aligned): +#ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %r8 +#else + mov __x86_64_data_cache_size_half(%rip), %r8 +#endif + mov %r8, %r9 + shr $1, %r8 + add %r9, %r8 + cmp %r8, %rdx + ja L(L2_L3_cache_aglined) + + sub $64, %rdx + ALIGN (4) +L(64bytesormore_loopin2aligned): + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(64bytesormore_loopin2aligned) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +L(L2_L3_cache_aglined): + sub $64, %rdx + ALIGN (4) +L(L2_L3_aligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) + movdqa (%rdi), %xmm2 + pxor (%rsi), %xmm2 + movdqa %xmm2, %xmm1 + + movdqa 16(%rdi), %xmm3 + pxor 16(%rsi), %xmm3 + por %xmm3, %xmm1 + + movdqa 32(%rdi), %xmm4 + pxor 32(%rsi), %xmm4 + por %xmm4, %xmm1 + + movdqa 48(%rdi), %xmm5 + pxor 48(%rsi), %xmm5 + por %xmm5, %xmm1 + + ptest %xmm1, %xmm0 + jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx + jae L(L2_L3_aligned_128bytes_loop) + + add $64, %rdx + add %rdx, %rsi + add %rdx, %rdi + BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + + + ALIGN (4) +L(64bytesormore_loop_end): + add $16, %rdi + add $16, %rsi + ptest %xmm2, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm3, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, %rsi + ptest %xmm4, %xmm0 + jnc L(16bytes) + + add $16, %rdi + add $16, 
%rsi + jmp L(16bytes) + +L(256bytesin256): + add $256, %rdi + add $256, %rsi + jmp L(16bytes) +L(240bytesin256): + add $240, %rdi + add $240, %rsi + jmp L(16bytes) +L(224bytesin256): + add $224, %rdi + add $224, %rsi + jmp L(16bytes) +L(208bytesin256): + add $208, %rdi + add $208, %rsi + jmp L(16bytes) +L(192bytesin256): + add $192, %rdi + add $192, %rsi + jmp L(16bytes) +L(176bytesin256): + add $176, %rdi + add $176, %rsi + jmp L(16bytes) +L(160bytesin256): + add $160, %rdi + add $160, %rsi + jmp L(16bytes) +L(144bytesin256): + add $144, %rdi + add $144, %rsi + jmp L(16bytes) +L(128bytesin256): + add $128, %rdi + add $128, %rsi + jmp L(16bytes) +L(112bytesin256): + add $112, %rdi + add $112, %rsi + jmp L(16bytes) +L(96bytesin256): + add $96, %rdi + add $96, %rsi + jmp L(16bytes) +L(80bytesin256): + add $80, %rdi + add $80, %rsi + jmp L(16bytes) +L(64bytesin256): + add $64, %rdi + add $64, %rsi + jmp L(16bytes) +L(48bytesin256): + add $16, %rdi + add $16, %rsi +L(32bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytesin256): + add $16, %rdi + add $16, %rsi +L(16bytes): + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(8bytes): + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(12bytes): + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(4bytes): + mov -4(%rsi), %ecx + mov -4(%rdi), %eax + cmp %eax, %ecx + jne L(diffin4bytes) +L(0bytes): + xor %eax, %eax + ret + +#ifndef USE_AS_WMEMCMP +/* unreal case for wmemcmp */ + ALIGN (4) +L(65bytes): + movdqu -65(%rdi), %xmm1 + movdqu -65(%rsi), %xmm2 + mov $-65, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(49bytes): + movdqu -49(%rdi), %xmm1 + movdqu -49(%rsi), %xmm2 + mov $-49, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(33bytes): + movdqu -33(%rdi), %xmm1 + movdqu -33(%rsi), %xmm2 + mov $-33, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(17bytes): + mov -17(%rdi), %rax + mov -17(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(9bytes): + mov -9(%rdi), %rax + mov -9(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(13bytes): + mov -13(%rdi), %rax + mov -13(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(5bytes): + mov -5(%rdi), %eax + mov -5(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %edx + sub %edx, %eax + ret + + ALIGN (4) +L(66bytes): + movdqu -66(%rdi), %xmm1 + movdqu -66(%rsi), %xmm2 + mov $-66, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(50bytes): + movdqu -50(%rdi), %xmm1 + movdqu -50(%rsi), %xmm2 + mov $-50, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(34bytes): + movdqu -34(%rdi), %xmm1 + movdqu -34(%rsi), %xmm2 + mov $-34, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(18bytes): + mov -18(%rdi), %rax + mov -18(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(10bytes): + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(14bytes): + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne 
L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(6bytes): + mov -6(%rdi), %eax + mov -6(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) +L(2bytes): + movzwl -2(%rsi), %ecx + movzwl -2(%rdi), %eax + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(67bytes): + movdqu -67(%rdi), %xmm2 + movdqu -67(%rsi), %xmm1 + mov $-67, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(51bytes): + movdqu -51(%rdi), %xmm2 + movdqu -51(%rsi), %xmm1 + mov $-51, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(35bytes): + movdqu -35(%rsi), %xmm1 + movdqu -35(%rdi), %xmm2 + mov $-35, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(19bytes): + mov -19(%rdi), %rax + mov -19(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) +L(11bytes): + mov -11(%rdi), %rax + mov -11(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(15bytes): + mov -15(%rdi), %rax + mov -15(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(7bytes): + mov -7(%rdi), %eax + mov -7(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(3bytes): + movzwl -3(%rdi), %eax + movzwl -3(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin2bytes) +L(1bytes): + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %ecx + sub %ecx, %eax + ret +#endif + + ALIGN (4) +L(68bytes): + movdqu -68(%rdi), %xmm2 + movdqu -68(%rsi), %xmm1 + mov $-68, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(52bytes): + movdqu -52(%rdi), %xmm2 + movdqu -52(%rsi), %xmm1 + mov $-52, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(36bytes): + movdqu -36(%rdi), %xmm2 + movdqu -36(%rsi), %xmm1 + mov $-36, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(20bytes): + movdqu -20(%rdi), %xmm2 + movdqu -20(%rsi), %xmm1 + mov $-20, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + +#ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ + ALIGN (4) +L(69bytes): + movdqu -69(%rsi), %xmm1 + movdqu -69(%rdi), %xmm2 + mov $-69, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(53bytes): + movdqu -53(%rsi), %xmm1 + movdqu -53(%rdi), %xmm2 + mov $-53, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(37bytes): + movdqu -37(%rsi), %xmm1 + movdqu -37(%rdi), %xmm2 + mov $-37, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(21bytes): + movdqu -21(%rsi), %xmm1 + movdqu -21(%rdi), %xmm2 + mov $-21, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(70bytes): + movdqu -70(%rsi), %xmm1 + movdqu -70(%rdi), %xmm2 + mov $-70, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(54bytes): + movdqu -54(%rsi), %xmm1 + movdqu -54(%rdi), %xmm2 + mov $-54, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(38bytes): + movdqu -38(%rsi), %xmm1 + movdqu -38(%rdi), %xmm2 + mov 
$-38, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(22bytes): + movdqu -22(%rsi), %xmm1 + movdqu -22(%rdi), %xmm2 + mov $-22, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(71bytes): + movdqu -71(%rsi), %xmm1 + movdqu -71(%rdi), %xmm2 + mov $-71, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(55bytes): + movdqu -55(%rdi), %xmm2 + movdqu -55(%rsi), %xmm1 + mov $-55, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(39bytes): + movdqu -39(%rdi), %xmm2 + movdqu -39(%rsi), %xmm1 + mov $-39, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(23bytes): + movdqu -23(%rdi), %xmm2 + movdqu -23(%rsi), %xmm1 + mov $-23, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret +#endif + + ALIGN (4) +L(72bytes): + movdqu -72(%rsi), %xmm1 + movdqu -72(%rdi), %xmm2 + mov $-72, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(56bytes): + movdqu -56(%rdi), %xmm2 + movdqu -56(%rsi), %xmm1 + mov $-56, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(40bytes): + movdqu -40(%rdi), %xmm2 + movdqu -40(%rsi), %xmm1 + mov $-40, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(24bytes): + movdqu -24(%rdi), %xmm2 + movdqu -24(%rsi), %xmm1 + mov $-24, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + +#ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ + ALIGN (4) +L(73bytes): + movdqu -73(%rsi), %xmm1 + movdqu -73(%rdi), %xmm2 + mov $-73, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(57bytes): + movdqu -57(%rdi), %xmm2 + movdqu -57(%rsi), %xmm1 + mov $-57, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(41bytes): + movdqu -41(%rdi), %xmm2 + movdqu -41(%rsi), %xmm1 + mov $-41, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(25bytes): + movdqu -25(%rdi), %xmm2 + movdqu -25(%rsi), %xmm1 + mov $-25, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -9(%rdi), %rax + mov -9(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzbl -1(%rdi), %eax + movzbl -1(%rsi), %ecx + sub %ecx, %eax + ret + + ALIGN (4) +L(74bytes): + movdqu -74(%rsi), %xmm1 + movdqu -74(%rdi), %xmm2 + mov $-74, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(58bytes): + movdqu -58(%rdi), %xmm2 + movdqu -58(%rsi), %xmm1 + mov $-58, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(42bytes): + movdqu -42(%rdi), %xmm2 + movdqu -42(%rsi), %xmm1 + mov $-42, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(26bytes): + movdqu -26(%rdi), %xmm2 + movdqu -26(%rsi), %xmm1 + mov $-26, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -10(%rdi), %rax + mov -10(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + movzwl -2(%rdi), %eax + movzwl -2(%rsi), %ecx + jmp L(diffin2bytes) + + ALIGN (4) +L(75bytes): + movdqu -75(%rsi), %xmm1 + movdqu -75(%rdi), %xmm2 + mov $-75, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(59bytes): + movdqu -59(%rdi), %xmm2 + movdqu -59(%rsi), %xmm1 + mov $-59, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(43bytes): + 
movdqu -43(%rdi), %xmm2 + movdqu -43(%rsi), %xmm1 + mov $-43, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(27bytes): + movdqu -27(%rdi), %xmm2 + movdqu -27(%rsi), %xmm1 + mov $-27, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -11(%rdi), %rax + mov -11(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret +#endif + ALIGN (4) +L(76bytes): + movdqu -76(%rsi), %xmm1 + movdqu -76(%rdi), %xmm2 + mov $-76, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(60bytes): + movdqu -60(%rdi), %xmm2 + movdqu -60(%rsi), %xmm1 + mov $-60, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(44bytes): + movdqu -44(%rdi), %xmm2 + movdqu -44(%rsi), %xmm1 + mov $-44, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(28bytes): + movdqu -28(%rdi), %xmm2 + movdqu -28(%rsi), %xmm1 + mov $-28, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -12(%rdi), %rax + mov -12(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -4(%rdi), %eax + mov -4(%rsi), %ecx + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret + +#ifndef USE_AS_WMEMCMP +/* unreal cases for wmemcmp */ + ALIGN (4) +L(77bytes): + movdqu -77(%rsi), %xmm1 + movdqu -77(%rdi), %xmm2 + mov $-77, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(61bytes): + movdqu -61(%rdi), %xmm2 + movdqu -61(%rsi), %xmm1 + mov $-61, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(45bytes): + movdqu -45(%rdi), %xmm2 + movdqu -45(%rsi), %xmm1 + mov $-45, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(29bytes): + movdqu -29(%rdi), %xmm2 + movdqu -29(%rsi), %xmm1 + mov $-29, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -13(%rdi), %rax + mov -13(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(78bytes): + movdqu -78(%rsi), %xmm1 + movdqu -78(%rdi), %xmm2 + mov $-78, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(62bytes): + movdqu -62(%rdi), %xmm2 + movdqu -62(%rsi), %xmm1 + mov $-62, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(46bytes): + movdqu -46(%rdi), %xmm2 + movdqu -46(%rsi), %xmm1 + mov $-46, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(30bytes): + movdqu -30(%rdi), %xmm2 + movdqu -30(%rsi), %xmm1 + mov $-30, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -14(%rdi), %rax + mov -14(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + + ALIGN (4) +L(79bytes): + movdqu -79(%rsi), %xmm1 + movdqu -79(%rdi), %xmm2 + mov $-79, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(63bytes): + movdqu -63(%rdi), %xmm2 + movdqu -63(%rsi), %xmm1 + mov $-63, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(47bytes): + movdqu -47(%rdi), %xmm2 + movdqu -47(%rsi), %xmm1 + mov $-47, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(31bytes): + movdqu -31(%rdi), %xmm2 + movdqu -31(%rsi), %xmm1 + mov $-31, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + mov -15(%rdi), %rax + mov -15(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + mov -8(%rdi), %rax + mov -8(%rsi), %rcx 
+ cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret +#endif + ALIGN (4) +L(64bytes): + movdqu -64(%rdi), %xmm2 + movdqu -64(%rsi), %xmm1 + mov $-64, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(48bytes): + movdqu -48(%rdi), %xmm2 + movdqu -48(%rsi), %xmm1 + mov $-48, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) +L(32bytes): + movdqu -32(%rdi), %xmm2 + movdqu -32(%rsi), %xmm1 + mov $-32, %dl + pxor %xmm1, %xmm2 + ptest %xmm2, %xmm0 + jnc L(less16bytes) + + mov -16(%rdi), %rax + mov -16(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + + mov -8(%rdi), %rax + mov -8(%rsi), %rcx + cmp %rax, %rcx + jne L(diffin8bytes) + xor %eax, %eax + ret + +/* + * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. + */ + ALIGN (3) +L(less16bytes): + movsbq %dl, %rdx + mov (%rsi, %rdx), %rcx + mov (%rdi, %rdx), %rax + cmp %rax, %rcx + jne L(diffin8bytes) + mov 8(%rsi, %rdx), %rcx + mov 8(%rdi, %rdx), %rax +L(diffin8bytes): + cmp %eax, %ecx + jne L(diffin4bytes) + shr $32, %rcx + shr $32, %rax + +#ifdef USE_AS_WMEMCMP +/* for wmemcmp */ + cmp %eax, %ecx + jne L(diffin4bytes) + xor %eax, %eax + ret +#endif + +L(diffin4bytes): +#ifndef USE_AS_WMEMCMP + cmp %cx, %ax + jne L(diffin2bytes) + shr $16, %ecx + shr $16, %eax +L(diffin2bytes): + cmp %cl, %al + jne L(end) + and $0xffff, %eax + and $0xffff, %ecx + sub %ecx, %eax + ret +#else + +/* for wmemcmp */ + mov $1, %eax + jl L(nequal_bigger) + neg %eax + ret + + ALIGN (4) +L(nequal_bigger): + ret + +L(unreal_case): + xor %eax, %eax + ret +#endif + + ALIGN (4) +L(end): + and $0xff, %eax + and $0xff, %ecx + sub %ecx, %eax + ret + +END (MEMCMP) + + .section .rodata.sse4.1,"a",@progbits + ALIGN (3) +#ifndef USE_AS_WMEMCMP +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(1bytes), L(table_64bytes)) + .int JMPTBL (L(2bytes), L(table_64bytes)) + .int JMPTBL (L(3bytes), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(5bytes), L(table_64bytes)) + .int JMPTBL (L(6bytes), L(table_64bytes)) + .int JMPTBL (L(7bytes), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(9bytes), L(table_64bytes)) + .int JMPTBL (L(10bytes), L(table_64bytes)) + .int JMPTBL (L(11bytes), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(13bytes), L(table_64bytes)) + .int JMPTBL (L(14bytes), L(table_64bytes)) + .int JMPTBL (L(15bytes), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(17bytes), L(table_64bytes)) + .int JMPTBL (L(18bytes), L(table_64bytes)) + .int JMPTBL (L(19bytes), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(21bytes), L(table_64bytes)) + .int JMPTBL (L(22bytes), L(table_64bytes)) + .int JMPTBL (L(23bytes), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(25bytes), L(table_64bytes)) + .int JMPTBL (L(26bytes), L(table_64bytes)) + .int JMPTBL (L(27bytes), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(29bytes), L(table_64bytes)) + .int JMPTBL (L(30bytes), L(table_64bytes)) + .int JMPTBL (L(31bytes), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(33bytes), L(table_64bytes)) + .int JMPTBL (L(34bytes), L(table_64bytes)) + .int JMPTBL (L(35bytes), L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(37bytes), L(table_64bytes)) + .int JMPTBL (L(38bytes), L(table_64bytes)) + .int JMPTBL (L(39bytes), 
L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(41bytes), L(table_64bytes)) + .int JMPTBL (L(42bytes), L(table_64bytes)) + .int JMPTBL (L(43bytes), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(45bytes), L(table_64bytes)) + .int JMPTBL (L(46bytes), L(table_64bytes)) + .int JMPTBL (L(47bytes), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(49bytes), L(table_64bytes)) + .int JMPTBL (L(50bytes), L(table_64bytes)) + .int JMPTBL (L(51bytes), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(53bytes), L(table_64bytes)) + .int JMPTBL (L(54bytes), L(table_64bytes)) + .int JMPTBL (L(55bytes), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(57bytes), L(table_64bytes)) + .int JMPTBL (L(58bytes), L(table_64bytes)) + .int JMPTBL (L(59bytes), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(61bytes), L(table_64bytes)) + .int JMPTBL (L(62bytes), L(table_64bytes)) + .int JMPTBL (L(63bytes), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) + .int JMPTBL (L(65bytes), L(table_64bytes)) + .int JMPTBL (L(66bytes), L(table_64bytes)) + .int JMPTBL (L(67bytes), L(table_64bytes)) + .int JMPTBL (L(68bytes), L(table_64bytes)) + .int JMPTBL (L(69bytes), L(table_64bytes)) + .int JMPTBL (L(70bytes), L(table_64bytes)) + .int JMPTBL (L(71bytes), L(table_64bytes)) + .int JMPTBL (L(72bytes), L(table_64bytes)) + .int JMPTBL (L(73bytes), L(table_64bytes)) + .int JMPTBL (L(74bytes), L(table_64bytes)) + .int JMPTBL (L(75bytes), L(table_64bytes)) + .int JMPTBL (L(76bytes), L(table_64bytes)) + .int JMPTBL (L(77bytes), L(table_64bytes)) + .int JMPTBL (L(78bytes), L(table_64bytes)) + .int JMPTBL (L(79bytes), L(table_64bytes)) +#else +L(table_64bytes): + .int JMPTBL (L(0bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(4bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(8bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(12bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(16bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(20bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(24bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(28bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(32bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), 
L(table_64bytes)) + .int JMPTBL (L(36bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(40bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(44bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(48bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(52bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(56bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(60bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(64bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(68bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(72bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(76bytes), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) + .int JMPTBL (L(unreal_case), L(table_64bytes)) +#endif diff --git a/libs/libc/machine/x86_64/gnu/arch_memmove.S b/libs/libc/machine/x86_64/gnu/arch_memmove.S new file mode 100644 index 0000000000..16abf91818 --- /dev/null +++ b/libs/libc/machine/x86_64/gnu/arch_memmove.S @@ -0,0 +1,533 @@ +/********************************************************************************* + * libs/libc/machine/x86_64/gnu/arch_memmove.S + * + * Copyright (c) 2014, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * * this list of conditions and the following disclaimer in the documentation + * * and/or other materials provided with the distribution. + * + * * Neither the name of Intel Corporation nor the names of its contributors + * * may be used to endorse or promote products derived from this software + * * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + *********************************************************************************/ + +/********************************************************************************* + * Included Files + *********************************************************************************/ + +#include "cache.h" + +/********************************************************************************* + * Pre-processor Definitions + *********************************************************************************/ + +#ifndef MEMMOVE +# define MEMMOVE memmove +#endif + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef cfi_startproc +# define cfi_startproc .cfi_startproc +#endif + +#ifndef cfi_endproc +# define cfi_endproc .cfi_endproc +#endif + +#ifndef cfi_rel_offset +# define cfi_rel_offset(reg, off) .cfi_rel_offset reg, off +#endif + +#ifndef cfi_restore +# define cfi_restore(reg) .cfi_restore reg +#endif + +#ifndef cfi_adjust_cfa_offset +# define cfi_adjust_cfa_offset(off) .cfi_adjust_cfa_offset off +#endif + +#ifndef ENTRY +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ + cfi_startproc +#endif + +#ifndef ALIAS_SYMBOL +# define ALIAS_SYMBOL(alias, original) \ + .globl alias; \ + .equ alias, original +#endif + +#ifndef END +# define END(name) \ + cfi_endproc; \ + .size name, .-name +#endif + +#define CFI_PUSH(REG) \ + cfi_adjust_cfa_offset (4); \ + cfi_rel_offset (REG, 0) + +#define CFI_POP(REG) \ + cfi_adjust_cfa_offset (-4); \ + cfi_restore (REG) + +#define PUSH(REG) push REG; +#define POP(REG) pop REG; + +#define ENTRANCE PUSH (%rbx); +#define RETURN_END POP (%rbx); ret +#define RETURN RETURN_END; + +/********************************************************************************* + * Public Functions + *********************************************************************************/ + + .section .text.sse2,"ax",@progbits +ENTRY (MEMMOVE) + ENTRANCE + mov %rdi, %rax + +/* Check whether we should copy backward or forward. */ + cmp %rsi, %rdi + je L(mm_return) + jg L(mm_len_0_or_more_backward) + +/* Now do checks for lengths. We do [0..16], [0..32], [0..64], [0..128] + separately. */ + cmp $16, %rdx + jbe L(mm_len_0_16_bytes_forward) + + cmp $32, %rdx + ja L(mm_len_32_or_more_forward) + +/* Copy [0..32] and return. */ + movdqu (%rsi), %xmm0 + movdqu -16(%rsi, %rdx), %xmm1 + movdqu %xmm0, (%rdi) + movdqu %xmm1, -16(%rdi, %rdx) + jmp L(mm_return) + +L(mm_len_32_or_more_forward): + cmp $64, %rdx + ja L(mm_len_64_or_more_forward) + +/* Copy [0..64] and return. 
*/ + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm1 + movdqu -16(%rsi, %rdx), %xmm2 + movdqu -32(%rsi, %rdx), %xmm3 + movdqu %xmm0, (%rdi) + movdqu %xmm1, 16(%rdi) + movdqu %xmm2, -16(%rdi, %rdx) + movdqu %xmm3, -32(%rdi, %rdx) + jmp L(mm_return) + +L(mm_len_64_or_more_forward): + cmp $128, %rdx + ja L(mm_len_128_or_more_forward) + +/* Copy [0..128] and return. */ + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm1 + movdqu 32(%rsi), %xmm2 + movdqu 48(%rsi), %xmm3 + movdqu -64(%rsi, %rdx), %xmm4 + movdqu -48(%rsi, %rdx), %xmm5 + movdqu -32(%rsi, %rdx), %xmm6 + movdqu -16(%rsi, %rdx), %xmm7 + movdqu %xmm0, (%rdi) + movdqu %xmm1, 16(%rdi) + movdqu %xmm2, 32(%rdi) + movdqu %xmm3, 48(%rdi) + movdqu %xmm4, -64(%rdi, %rdx) + movdqu %xmm5, -48(%rdi, %rdx) + movdqu %xmm6, -32(%rdi, %rdx) + movdqu %xmm7, -16(%rdi, %rdx) + jmp L(mm_return) + +L(mm_len_128_or_more_forward): +/* Aligning the address of destination. */ +/* save first unaligned 64 bytes */ + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm1 + movdqu 32(%rsi), %xmm2 + movdqu 48(%rsi), %xmm3 + + lea 64(%rdi), %r8 + and $-64, %r8 /* r8 now aligned to next 64 byte boundary */ + sub %rdi, %rsi /* rsi = src - dst = diff */ + + movdqu (%r8, %rsi), %xmm4 + movdqu 16(%r8, %rsi), %xmm5 + movdqu 32(%r8, %rsi), %xmm6 + movdqu 48(%r8, %rsi), %xmm7 + + movdqu %xmm0, (%rdi) + movdqu %xmm1, 16(%rdi) + movdqu %xmm2, 32(%rdi) + movdqu %xmm3, 48(%rdi) + movdqa %xmm4, (%r8) + movaps %xmm5, 16(%r8) + movaps %xmm6, 32(%r8) + movaps %xmm7, 48(%r8) + add $64, %r8 + + lea (%rdi, %rdx), %rbx + and $-64, %rbx + cmp %r8, %rbx + jbe L(mm_copy_remaining_forward) + + cmp $SHARED_CACHE_SIZE_HALF, %rdx + jae L(mm_large_page_loop_forward) + + .p2align 4 +L(mm_main_loop_forward): + + prefetcht0 128(%r8, %rsi) + + movdqu (%r8, %rsi), %xmm0 + movdqu 16(%r8, %rsi), %xmm1 + movdqu 32(%r8, %rsi), %xmm2 + movdqu 48(%r8, %rsi), %xmm3 + movdqa %xmm0, (%r8) + movaps %xmm1, 16(%r8) + movaps %xmm2, 32(%r8) + movaps %xmm3, 48(%r8) + lea 64(%r8), %r8 + cmp %r8, %rbx + ja L(mm_main_loop_forward) + +L(mm_copy_remaining_forward): + add %rdi, %rdx + sub %r8, %rdx +/* We copied all up till %rdi position in the dst. + In %rdx now is how many bytes are left to copy. + Now we need to advance %r8. 
*/ + lea (%r8, %rsi), %r9 + +L(mm_remaining_0_64_bytes_forward): + cmp $32, %rdx + ja L(mm_remaining_33_64_bytes_forward) + cmp $16, %rdx + ja L(mm_remaining_17_32_bytes_forward) + test %rdx, %rdx + .p2align 4,,2 + je L(mm_return) + + cmpb $8, %dl + ja L(mm_remaining_9_16_bytes_forward) + cmpb $4, %dl + .p2align 4,,5 + ja L(mm_remaining_5_8_bytes_forward) + cmpb $2, %dl + .p2align 4,,1 + ja L(mm_remaining_3_4_bytes_forward) + movzbl -1(%r9,%rdx), %esi + movzbl (%r9), %ebx + movb %sil, -1(%r8,%rdx) + movb %bl, (%r8) + jmp L(mm_return) + +L(mm_remaining_33_64_bytes_forward): + movdqu (%r9), %xmm0 + movdqu 16(%r9), %xmm1 + movdqu -32(%r9, %rdx), %xmm2 + movdqu -16(%r9, %rdx), %xmm3 + movdqu %xmm0, (%r8) + movdqu %xmm1, 16(%r8) + movdqu %xmm2, -32(%r8, %rdx) + movdqu %xmm3, -16(%r8, %rdx) + jmp L(mm_return) + +L(mm_remaining_17_32_bytes_forward): + movdqu (%r9), %xmm0 + movdqu -16(%r9, %rdx), %xmm1 + movdqu %xmm0, (%r8) + movdqu %xmm1, -16(%r8, %rdx) + jmp L(mm_return) + +L(mm_remaining_5_8_bytes_forward): + movl (%r9), %esi + movl -4(%r9,%rdx), %ebx + movl %esi, (%r8) + movl %ebx, -4(%r8,%rdx) + jmp L(mm_return) + +L(mm_remaining_9_16_bytes_forward): + mov (%r9), %rsi + mov -8(%r9, %rdx), %rbx + mov %rsi, (%r8) + mov %rbx, -8(%r8, %rdx) + jmp L(mm_return) + +L(mm_remaining_3_4_bytes_forward): + movzwl -2(%r9,%rdx), %esi + movzwl (%r9), %ebx + movw %si, -2(%r8,%rdx) + movw %bx, (%r8) + jmp L(mm_return) + +L(mm_len_0_16_bytes_forward): + testb $24, %dl + jne L(mm_len_9_16_bytes_forward) + testb $4, %dl + .p2align 4,,5 + jne L(mm_len_5_8_bytes_forward) + test %rdx, %rdx + .p2align 4,,2 + je L(mm_return) + testb $2, %dl + .p2align 4,,1 + jne L(mm_len_2_4_bytes_forward) + movzbl -1(%rsi,%rdx), %ebx + movzbl (%rsi), %esi + movb %bl, -1(%rdi,%rdx) + movb %sil, (%rdi) + jmp L(mm_return) + +L(mm_len_2_4_bytes_forward): + movzwl -2(%rsi,%rdx), %ebx + movzwl (%rsi), %esi + movw %bx, -2(%rdi,%rdx) + movw %si, (%rdi) + jmp L(mm_return) + +L(mm_len_5_8_bytes_forward): + movl (%rsi), %ebx + movl -4(%rsi,%rdx), %esi + movl %ebx, (%rdi) + movl %esi, -4(%rdi,%rdx) + jmp L(mm_return) + +L(mm_len_9_16_bytes_forward): + mov (%rsi), %rbx + mov -8(%rsi, %rdx), %rsi + mov %rbx, (%rdi) + mov %rsi, -8(%rdi, %rdx) + jmp L(mm_return) + +L(mm_recalc_len): +/* Compute in %rdx how many bytes are left to copy after + the main loop stops. */ + mov %rbx, %rdx + sub %rdi, %rdx +/* The code for copying backwards. */ +L(mm_len_0_or_more_backward): + +/* Now do checks for lengths. We do [0..16], [16..32], [32..64], [64..128] + separately. */ + cmp $16, %rdx + jbe L(mm_len_0_16_bytes_backward) + + cmp $32, %rdx + ja L(mm_len_32_or_more_backward) + +/* Copy [0..32] and return. */ + movdqu (%rsi), %xmm0 + movdqu -16(%rsi, %rdx), %xmm1 + movdqu %xmm0, (%rdi) + movdqu %xmm1, -16(%rdi, %rdx) + jmp L(mm_return) + +L(mm_len_32_or_more_backward): + cmp $64, %rdx + ja L(mm_len_64_or_more_backward) + +/* Copy [0..64] and return. */ + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm1 + movdqu -16(%rsi, %rdx), %xmm2 + movdqu -32(%rsi, %rdx), %xmm3 + movdqu %xmm0, (%rdi) + movdqu %xmm1, 16(%rdi) + movdqu %xmm2, -16(%rdi, %rdx) + movdqu %xmm3, -32(%rdi, %rdx) + jmp L(mm_return) + +L(mm_len_64_or_more_backward): + cmp $128, %rdx + ja L(mm_len_128_or_more_backward) + +/* Copy [0..128] and return. 
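+   As in the forward path, all eight 16-byte loads below are issued before
+   the first store, so this block is overlap-safe in either direction for
+   any length up to 128 bytes.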
+ */
+  movdqu (%rsi), %xmm0
+  movdqu 16(%rsi), %xmm1
+  movdqu 32(%rsi), %xmm2
+  movdqu 48(%rsi), %xmm3
+  movdqu -64(%rsi, %rdx), %xmm4
+  movdqu -48(%rsi, %rdx), %xmm5
+  movdqu -32(%rsi, %rdx), %xmm6
+  movdqu -16(%rsi, %rdx), %xmm7
+  movdqu %xmm0, (%rdi)
+  movdqu %xmm1, 16(%rdi)
+  movdqu %xmm2, 32(%rdi)
+  movdqu %xmm3, 48(%rdi)
+  movdqu %xmm4, -64(%rdi, %rdx)
+  movdqu %xmm5, -48(%rdi, %rdx)
+  movdqu %xmm6, -32(%rdi, %rdx)
+  movdqu %xmm7, -16(%rdi, %rdx)
+  jmp L(mm_return)
+
+L(mm_len_128_or_more_backward):
+/* Aligning the address of destination. We need to save the last unaligned
+   64 bytes from the source in order not to overwrite them. */
+  movdqu -16(%rsi, %rdx), %xmm0
+  movdqu -32(%rsi, %rdx), %xmm1
+  movdqu -48(%rsi, %rdx), %xmm2
+  movdqu -64(%rsi, %rdx), %xmm3
+
+  lea (%rdi, %rdx), %r9
+  and $-64, %r9 /* r9 = aligned dst */
+
+  mov %rsi, %r8
+  sub %rdi, %r8 /* r8 = src - dst, diff */
+
+  movdqu -16(%r9, %r8), %xmm4
+  movdqu -32(%r9, %r8), %xmm5
+  movdqu -48(%r9, %r8), %xmm6
+  movdqu -64(%r9, %r8), %xmm7
+
+  movdqu %xmm0, -16(%rdi, %rdx)
+  movdqu %xmm1, -32(%rdi, %rdx)
+  movdqu %xmm2, -48(%rdi, %rdx)
+  movdqu %xmm3, -64(%rdi, %rdx)
+  movdqa %xmm4, -16(%r9)
+  movaps %xmm5, -32(%r9)
+  movaps %xmm6, -48(%r9)
+  movaps %xmm7, -64(%r9)
+  lea -64(%r9), %r9
+
+  lea 64(%rdi), %rbx
+  and $-64, %rbx
+
+  cmp %r9, %rbx
+  jae L(mm_recalc_len)
+
+  cmp $SHARED_CACHE_SIZE_HALF, %rdx
+  jae L(mm_large_page_loop_backward)
+
+  .p2align 4
+L(mm_main_loop_backward):
+
+  prefetcht0 -128(%r9, %r8)
+
+  movdqu -64(%r9, %r8), %xmm0
+  movdqu -48(%r9, %r8), %xmm1
+  movdqu -32(%r9, %r8), %xmm2
+  movdqu -16(%r9, %r8), %xmm3
+  movdqa %xmm0, -64(%r9)
+  movaps %xmm1, -48(%r9)
+  movaps %xmm2, -32(%r9)
+  movaps %xmm3, -16(%r9)
+  lea -64(%r9), %r9
+  cmp %r9, %rbx
+  jb L(mm_main_loop_backward)
+  jmp L(mm_recalc_len)
+
+/* Copy [0..16] and return. */
+L(mm_len_0_16_bytes_backward):
+  testb $24, %dl
+  jnz L(mm_len_9_16_bytes_backward)
+  testb $4, %dl
+  .p2align 4,,5
+  jnz L(mm_len_5_8_bytes_backward)
+  test %rdx, %rdx
+  .p2align 4,,2
+  je L(mm_return)
+  testb $2, %dl
+  .p2align 4,,1
+  jne L(mm_len_3_4_bytes_backward)
+  movzbl -1(%rsi,%rdx), %ebx
+  movzbl (%rsi), %ecx
+  movb %bl, -1(%rdi,%rdx)
+  movb %cl, (%rdi)
+  jmp L(mm_return)
+
+L(mm_len_3_4_bytes_backward):
+  movzwl -2(%rsi,%rdx), %ebx
+  movzwl (%rsi), %ecx
+  movw %bx, -2(%rdi,%rdx)
+  movw %cx, (%rdi)
+  jmp L(mm_return)
+
+L(mm_len_9_16_bytes_backward):
+  movl -4(%rsi,%rdx), %ebx
+  movl -8(%rsi,%rdx), %ecx
+  movl %ebx, -4(%rdi,%rdx)
+  movl %ecx, -8(%rdi,%rdx)
+  sub $8, %rdx
+  jmp L(mm_len_0_16_bytes_backward)
+
+L(mm_len_5_8_bytes_backward):
+  movl (%rsi), %ebx
+  movl -4(%rsi,%rdx), %ecx
+  movl %ebx, (%rdi)
+  movl %ecx, -4(%rdi,%rdx)
+
+L(mm_return):
+  RETURN
+
+/* Big length copy forward part. */
+
+  .p2align 4
+L(mm_large_page_loop_forward):
+  movdqu (%r8, %rsi), %xmm0
+  movdqu 16(%r8, %rsi), %xmm1
+  movdqu 32(%r8, %rsi), %xmm2
+  movdqu 48(%r8, %rsi), %xmm3
+  movntdq %xmm0, (%r8)
+  movntdq %xmm1, 16(%r8)
+  movntdq %xmm2, 32(%r8)
+  movntdq %xmm3, 48(%r8)
+  lea 64(%r8), %r8
+  cmp %r8, %rbx
+  ja L(mm_large_page_loop_forward)
+  sfence
+  jmp L(mm_copy_remaining_forward)
+
+/* Big length copy backward part.
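+   Like the forward variant above, this loop is used once the length
+   reaches SHARED_CACHE_SIZE_HALF: movntdq writes bypass the cache so a
+   huge copy does not evict the working set.  Non-temporal stores are
+   weakly ordered, hence the sfence before leaving the loop.  Rough shape
+   in C (names illustrative only):
+
+     while (low < p) {                  /* 64 bytes per iteration */
+       p -= 64;
+       for (int i = 0; i < 4; i++)      /* four 16-byte lanes     */
+         _mm_stream_si128((__m128i *)(p + 16 * i),
+             _mm_loadu_si128((const __m128i *)(p + diff + 16 * i)));
+     }
+     _mm_sfence();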
*/ + .p2align 4 +L(mm_large_page_loop_backward): + movdqu -64(%r9, %r8), %xmm0 + movdqu -48(%r9, %r8), %xmm1 + movdqu -32(%r9, %r8), %xmm2 + movdqu -16(%r9, %r8), %xmm3 + movntdq %xmm0, -64(%r9) + movntdq %xmm1, -48(%r9) + movntdq %xmm2, -32(%r9) + movntdq %xmm3, -16(%r9) + lea -64(%r9), %r9 + cmp %r9, %rbx + jb L(mm_large_page_loop_backward) + sfence + jmp L(mm_recalc_len) + +END (MEMMOVE) + +ALIAS_SYMBOL(memcpy, MEMMOVE) diff --git a/libs/libc/machine/x86_64/gnu/arch_memset_avx2.S b/libs/libc/machine/x86_64/gnu/arch_memset_avx2.S new file mode 100644 index 0000000000..3b39b0cbd4 --- /dev/null +++ b/libs/libc/machine/x86_64/gnu/arch_memset_avx2.S @@ -0,0 +1,179 @@ +/********************************************************************************* + * libs/libc/machine/x86_64/gnu/arch_memset_avx2.S + * + * Copyright (c) 2014, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * * this list of conditions and the following disclaimer in the documentation + * * and/or other materials provided with the distribution. + * + * * Neither the name of Intel Corporation nor the names of its contributors + * * may be used to endorse or promote products derived from this software + * * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + *********************************************************************************/ + +/******************************************************************************** + * Included Files + *********************************************************************************/ + +#include "cache.h" + +/********************************************************************************* + * Pre-processor Definitions + *********************************************************************************/ + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef ALIGN +# define ALIGN(n) .p2align n +#endif + +#define ENTRY(__f) \ + .text; \ + .global __f; \ + .balign 16; \ + .type __f, @function; \ +__f: \ + .cfi_startproc; + +#define END(__f) \ + .cfi_endproc; \ + .size __f, .- __f; + +/********************************************************************************* + * Public Functions + *********************************************************************************/ + + .section .text.avx2,"ax",@progbits + +ENTRY(memset) + movq %rdi, %rax + and $0xff, %rsi + mov $0x0101010101010101, %rcx + imul %rsi, %rcx + cmpq $16, %rdx + jae L(16bytesormore) + testb $8, %dl + jnz L(8_15bytes) + testb $4, %dl + jnz L(4_7bytes) + testb $2, %dl + jnz L(2_3bytes) + testb $1, %dl + jz 1f + movb %cl, (%rdi) +1: ret + +L(8_15bytes): + movq %rcx, (%rdi) + movq %rcx, -8(%rdi, %rdx) + ret + +L(4_7bytes): + movl %ecx, (%rdi) + movl %ecx, -4(%rdi, %rdx) + ret + +L(2_3bytes): + movw %cx, (%rdi) + movw %cx, -2(%rdi, %rdx) + ret + + ALIGN (4) +L(16bytesormore): + movd %rcx, %xmm0 + pshufd $0, %xmm0, %xmm0 + movdqu %xmm0, (%rdi) + movdqu %xmm0, -16(%rdi, %rdx) + cmpq $32, %rdx + jbe L(done) + movdqu %xmm0, 16(%rdi) + movdqu %xmm0, -32(%rdi, %rdx) + cmpq $64, %rdx + jbe L(done) + movdqu %xmm0, 32(%rdi) + movdqu %xmm0, 48(%rdi) + movdqu %xmm0, -64(%rdi, %rdx) + movdqu %xmm0, -48(%rdi, %rdx) + cmpq $128, %rdx + jbe L(done) + vpbroadcastb %xmm0, %ymm0 + vmovdqu %ymm0, 64(%rdi) + vmovdqu %ymm0, 96(%rdi) + vmovdqu %ymm0, -128(%rdi, %rdx) + vmovdqu %ymm0, -96(%rdi, %rdx) + cmpq $256, %rdx + jbe L(done) + + ALIGN (4) + leaq 128(%rdi), %rcx + andq $-128, %rcx + movq %rdx, %r8 + addq %rdi, %rdx + andq $-128, %rdx + cmpq %rcx, %rdx + je L(done) + +#ifdef SHARED_CACHE_SIZE + cmp $SHARED_CACHE_SIZE, %r8 +#else + cmp __x86_64_shared_cache_size(%rip), %r8 +#endif + ja L(non_temporal_loop) + + ALIGN (4) +L(normal_loop): + vmovdqa %ymm0, (%rcx) + vmovdqa %ymm0, 32(%rcx) + vmovdqa %ymm0, 64(%rcx) + vmovdqa %ymm0, 96(%rcx) + addq $128, %rcx + cmpq %rcx, %rdx + jne L(normal_loop) + jmp L(done) + + ALIGN (4) +L(non_temporal_loop): + movntdq %xmm0, (%rcx) + movntdq %xmm0, 16(%rcx) + movntdq %xmm0, 32(%rcx) + movntdq %xmm0, 48(%rcx) + movntdq %xmm0, 64(%rcx) + movntdq %xmm0, 80(%rcx) + movntdq %xmm0, 96(%rcx) + movntdq %xmm0, 112(%rcx) + leaq 128(%rcx), %rcx + cmpq %rcx, %rdx + jne L(non_temporal_loop) + /* We used non-temporal stores, so we need a fence here. */ + sfence + +L(done): + /* We used the ymm registers, and that can break SSE2 performance + * unless you do this. 
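+  * ("This" is the vzeroupper below: it clears the upper halves of the
+  * ymm registers so that subsequent SSE code does not pay the
+  * AVX-to-SSE transition penalty for the dirty upper state.  The fill
+  * pattern itself was built at entry by multiplying the byte value by
+  * 0x0101010101010101; e.g. 0x5c * 0x0101010101010101 ==
+  * 0x5c5c5c5c5c5c5c5c, replicating it into all eight byte lanes.)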
+ */ + vzeroupper + ret + +END(memset) diff --git a/libs/libc/machine/x86_64/gnu/arch_memset_sse2.S b/libs/libc/machine/x86_64/gnu/arch_memset_sse2.S new file mode 100644 index 0000000000..1b6961c83e --- /dev/null +++ b/libs/libc/machine/x86_64/gnu/arch_memset_sse2.S @@ -0,0 +1,166 @@ +/********************************************************************************* + * libs/libc/machine/x86_64/gnu/arch_memset_sse2.S + * + * Copyright (c) 2014, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * * this list of conditions and the following disclaimer in the documentation + * * and/or other materials provided with the distribution. + * + * * Neither the name of Intel Corporation nor the names of its contributors + * * may be used to endorse or promote products derived from this software + * * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + *********************************************************************************/ + +/******************************************************************************** + * Included Files + *********************************************************************************/ + +#include "cache.h" + +/********************************************************************************* + * Pre-processor Definitions + *********************************************************************************/ + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef ALIGN +# define ALIGN(n) .p2align n +#endif + +#define ENTRY(__f) \ + .text; \ + .global __f; \ + .balign 16; \ + .type __f, @function; \ +__f: \ + .cfi_startproc; + +#define END(__f) \ + .cfi_endproc; \ + .size __f, .- __f; + +/********************************************************************************* + * Public Functions + *********************************************************************************/ + + .section .text.sse2,"ax",@progbits + +ENTRY(memset) + movq %rdi, %rax + and $0xff, %rsi + mov $0x0101010101010101, %rcx + imul %rsi, %rcx + cmpq $16, %rdx + jae L(16bytesormore) + testb $8, %dl + jnz L(8_15bytes) + testb $4, %dl + jnz L(4_7bytes) + testb $2, %dl + jnz L(2_3bytes) + testb $1, %dl + jz L(return) + movb %cl, (%rdi) +L(return): + ret + +L(8_15bytes): + movq %rcx, (%rdi) + movq %rcx, -8(%rdi, %rdx) + ret + +L(4_7bytes): + movl %ecx, (%rdi) + movl %ecx, -4(%rdi, %rdx) + ret + +L(2_3bytes): + movw %cx, (%rdi) + movw %cx, -2(%rdi, %rdx) + ret + + ALIGN (4) +L(16bytesormore): + movd %rcx, %xmm0 + pshufd $0, %xmm0, %xmm0 + movdqu %xmm0, (%rdi) + movdqu %xmm0, -16(%rdi, %rdx) + cmpq $32, %rdx + jbe L(32bytesless) + movdqu %xmm0, 16(%rdi) + movdqu %xmm0, -32(%rdi, %rdx) + cmpq $64, %rdx + jbe L(64bytesless) + movdqu %xmm0, 32(%rdi) + movdqu %xmm0, 48(%rdi) + movdqu %xmm0, -64(%rdi, %rdx) + movdqu %xmm0, -48(%rdi, %rdx) + cmpq $128, %rdx + ja L(128bytesmore) +L(32bytesless): +L(64bytesless): + ret + + ALIGN (4) +L(128bytesmore): + leaq 64(%rdi), %rcx + andq $-64, %rcx + movq %rdx, %r8 + addq %rdi, %rdx + andq $-64, %rdx + cmpq %rcx, %rdx + je L(return) + +#ifdef SHARED_CACHE_SIZE + cmp $SHARED_CACHE_SIZE, %r8 +#else + cmp __x86_64_shared_cache_size(%rip), %r8 +#endif + ja L(128bytesmore_nt) + + ALIGN (4) +L(128bytesmore_normal): + movdqa %xmm0, (%rcx) + movaps %xmm0, 0x10(%rcx) + movaps %xmm0, 0x20(%rcx) + movaps %xmm0, 0x30(%rcx) + addq $64, %rcx + cmpq %rcx, %rdx + jne L(128bytesmore_normal) + ret + + ALIGN (4) +L(128bytesmore_nt): + movntdq %xmm0, (%rcx) + movntdq %xmm0, 0x10(%rcx) + movntdq %xmm0, 0x20(%rcx) + movntdq %xmm0, 0x30(%rcx) + leaq 64(%rcx), %rcx + cmpq %rcx, %rdx + jne L(128bytesmore_nt) + sfence + ret + +END(memset) diff --git a/libs/libc/machine/x86_64/gnu/arch_stpcpy.S b/libs/libc/machine/x86_64/gnu/arch_stpcpy.S new file mode 100644 index 0000000000..d9a2bd8eb6 --- /dev/null +++ b/libs/libc/machine/x86_64/gnu/arch_stpcpy.S @@ -0,0 +1,40 @@ +/********************************************************************************* + * libs/libc/machine/x86_64/gnu/arch_stpcpy.S + * + * Copyright (c) 2014, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * * this list of conditions and the following disclaimer. 
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * * this list of conditions and the following disclaimer in the documentation
+ * * and/or other materials provided with the distribution.
+ *
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * * may be used to endorse or promote products derived from this software
+ * * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *********************************************************************************/
+
+/********************************************************************************
+ * Included Files
+ *********************************************************************************/
+
+#define USE_AS_STPCPY
+#define STRCPY stpcpy
+#include "arch_strcpy.S"
diff --git a/libs/libc/machine/x86_64/gnu/arch_stpncpy.S b/libs/libc/machine/x86_64/gnu/arch_stpncpy.S
new file mode 100644
index 0000000000..cff948d791
--- /dev/null
+++ b/libs/libc/machine/x86_64/gnu/arch_stpncpy.S
@@ -0,0 +1,41 @@
+/*********************************************************************************
+ * libs/libc/machine/x86_64/gnu/arch_stpncpy.S
+ *
+ * Copyright (c) 2014, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * * Redistributions of source code must retain the above copyright notice,
+ * * this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright notice,
+ * * this list of conditions and the following disclaimer in the documentation
+ * * and/or other materials provided with the distribution.
+ *
+ * * Neither the name of Intel Corporation nor the names of its contributors
+ * * may be used to endorse or promote products derived from this software
+ * * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + *********************************************************************************/ + +/******************************************************************************** + * Included Files + *********************************************************************************/ + +#define USE_AS_STRNCPY +#define USE_AS_STPCPY +#define STRCPY stpncpy +#include "arch_strcpy.S" diff --git a/libs/libc/machine/x86_64/gnu/arch_strcat.S b/libs/libc/machine/x86_64/gnu/arch_strcat.S new file mode 100644 index 0000000000..f76031554f --- /dev/null +++ b/libs/libc/machine/x86_64/gnu/arch_strcat.S @@ -0,0 +1,98 @@ +/********************************************************************************* + * libs/libc/machine/x86_64/gnu/arch_strcat.S + * + * Copyright (c) 2014, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * * this list of conditions and the following disclaimer in the documentation + * * and/or other materials provided with the distribution. + * + * * Neither the name of Intel Corporation nor the names of its contributors + * * may be used to endorse or promote products derived from this software + * * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + *********************************************************************************/ + +/********************************************************************************* + * Pre-processor Definitions + *********************************************************************************/ + +#ifndef STRCAT +# define STRCAT strcat +#endif + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef cfi_startproc +# define cfi_startproc .cfi_startproc +#endif + +#ifndef cfi_endproc +# define cfi_endproc .cfi_endproc +#endif + +#ifndef ENTRY +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ + cfi_startproc +#endif + +#ifndef END +# define END(name) \ + cfi_endproc; \ + .size name, .-name +#endif + +#define USE_AS_STRCAT + +/********************************************************************************* + * Public Functions + *********************************************************************************/ + +.text +ENTRY (STRCAT) + mov %rdi, %r9 +#ifdef USE_AS_STRNCAT + mov %rdx, %r8 +#endif + +#define RETURN jmp L(Strcpy) +#include "arch_strlen.S" + +#undef RETURN +#define RETURN ret + +L(Strcpy): + lea (%r9, %rax), %rdi + mov %rsi, %rcx + mov %r9, %rax /* save result */ + +#ifdef USE_AS_STRNCAT + test %r8, %r8 + jz L(ExitZero) +# define USE_AS_STRNCPY +#endif +#include "arch_strcpy.S" diff --git a/libs/libc/machine/x86_64/gnu/arch_strcmp.S b/libs/libc/machine/x86_64/gnu/arch_strcmp.S new file mode 100644 index 0000000000..4b75831b78 --- /dev/null +++ b/libs/libc/machine/x86_64/gnu/arch_strcmp.S @@ -0,0 +1,1937 @@ +/********************************************************************************* + * libs/libc/machine/x86_64/gnu/arch_strcmp.S + * + * Copyright (c) 2014, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * * this list of conditions and the following disclaimer in the documentation + * * and/or other materials provided with the distribution. + * + * * Neither the name of Intel Corporation nor the names of its contributors + * * may be used to endorse or promote products derived from this software + * * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ *********************************************************************************/
+
+/*********************************************************************************
+ * Pre-processor Definitions
+ *********************************************************************************/
+
+#ifdef USE_AS_STRNCMP
+/* Since the counter, %r11, is unsigned, we branch to strcmp_exitz
+   if the new counter > the old one or is 0. */
+#define UPDATE_STRNCMP_COUNTER \
+  /* calculate the number of bytes left to compare */ \
+  lea -16(%rcx, %r11), %r9; \
+  cmp %r9, %r11; \
+  jb L(strcmp_exitz); \
+  test %r9, %r9; \
+  je L(strcmp_exitz); \
+  mov %r9, %r11
+
+#else
+#define UPDATE_STRNCMP_COUNTER
+#ifndef STRCMP
+#define STRCMP strcmp
+#endif
+#endif
+
+#ifndef L
+# define L(label) .L##label
+#endif
+
+#ifndef cfi_startproc
+# define cfi_startproc .cfi_startproc
+#endif
+
+#ifndef cfi_endproc
+# define cfi_endproc .cfi_endproc
+#endif
+
+#ifndef ENTRY
+# define ENTRY(name) \
+  .type name, @function; \
+  .globl name; \
+  .p2align 4; \
+name: \
+  cfi_startproc
+#endif
+
+#ifndef END
+# define END(name) \
+  cfi_endproc; \
+  .size name, .-name
+#endif
+#define RETURN ret
+
+/*********************************************************************************
+ * Public Functions
+ *********************************************************************************/
+
+  .section .text.ssse3,"ax",@progbits
+ENTRY (STRCMP)
+/*
+ * This implementation uses SSE to compare up to 16 bytes at a time.
+ */
+#ifdef USE_AS_STRNCMP
+  test %rdx, %rdx
+  je L(strcmp_exitz)
+  cmp $1, %rdx
+  je L(Byte0)
+  mov %rdx, %r11
+#endif
+  mov %esi, %ecx
+  mov %edi, %eax
+/* Use 64bit AND here to avoid long NOP padding. */
+  and $0x3f, %rcx /* rsi alignment in cache line */
+  and $0x3f, %rax /* rdi alignment in cache line */
+  cmp $0x30, %ecx
+  ja L(crosscache) /* rsi: 16-byte load will cross cache line */
+  cmp $0x30, %eax
+  ja L(crosscache) /* rdi: 16-byte load will cross cache line */
+  movlpd (%rdi), %xmm1
+  movlpd (%rsi), %xmm2
+  movhpd 8(%rdi), %xmm1
+  movhpd 8(%rsi), %xmm2
+  pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */
+  pcmpeqb %xmm1, %xmm0 /* Any null chars? */
+  pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */
+  psubb %xmm0, %xmm1 /* packed sub of comparison results */
+  pmovmskb %xmm1, %edx
+  sub $0xffff, %edx /* edx == 0 if the first 16 bytes match and hold no null */
+  jnz L(less16bytes) /* if not, find the differing byte or null char */
+#ifdef USE_AS_STRNCMP
+  sub $16, %r11
+  jbe L(strcmp_exitz) /* finish comparison */
+#endif
+  add $16, %rsi /* prepare to search next 16 bytes */
+  add $16, %rdi /* prepare to search next 16 bytes */
+
+ /*
+  * Determine source and destination string offsets from 16-byte alignment.
+  * Use the relative offset difference between the two to determine which
+  * case below to use.
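+  *
+  * Concretely: after the crosscache code below rounds both pointers down
+  * to a 16-byte boundary, the dispatch computes
+  * 15 + min(off_rsi, off_rdi) - max(off_rsi, off_rdi), a value in 0..14,
+  * and uses it to index unaligned_table (equal offsets branch straight to
+  * ashr_0).  As a rough C sketch (names illustrative only):
+  *
+  *   if (off_rsi == off_rdi) goto ashr_0;
+  *   if (off_rsi < off_rdi)  swap_operands();  /* r8d records the swap */
+  *   goto *unaligned_table[15 + off_rdi - off_rsi];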
+ */ + .p2align 4 +L(crosscache): + and $0xfffffffffffffff0, %rsi /* force %rsi is 16 byte aligned */ + and $0xfffffffffffffff0, %rdi /* force %rdi is 16 byte aligned */ + mov $0xffff, %edx /* for equivalent offset */ + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ + cmp %eax, %ecx + je L(ashr_0) /* rsi and rdi relative offset same */ + ja L(bigger) + mov %edx, %r8d /* r8d is offset flag for exit tail */ + xchg %ecx, %eax + xchg %rsi, %rdi +L(bigger): + lea 15(%rax), %r9 + sub %rcx, %r9 + lea L(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 + lea (%r10, %r9), %r10 + jmp *%r10 /* jump to corresponding case */ + +/* + * The following cases will be handled by ashr_0 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(0~15) n(0~15) 15(15+ n-n) ashr_0 + */ + .p2align 4 +L(ashr_0): + + movdqa (%rsi), %xmm1 + pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + /* + * edx must be the same with r9d if in left byte (16-rcx) is equal to + * the start from (16-rax) and no null char was seen. + */ + jne L(less32bytes) /* mismatch or null char */ + UPDATE_STRNCMP_COUNTER + mov $16, %rcx + mov $16, %r9 + pxor %xmm0, %xmm0 /* clear xmm0, may have changed above */ + + /* + * Now both strings are aligned at 16-byte boundary. Loop over strings + * checking 32-bytes per iteration. + */ + .p2align 4 +L(loop_ashr_0): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) /* mismatch or null char seen */ + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + add $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + add $16, %rcx + jmp L(loop_ashr_0) + +/* + * The following cases will be handled by ashr_1 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(15) n -15 0(15 +(n-15) - n) ashr_1 + */ + .p2align 4 +L(ashr_1): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + pslldq $15, %xmm2 /* shift first string to align with second */ + pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ + psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ + sub %r9d, %edx + jnz L(less32bytes) /* mismatch or null char seen */ + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads*/ + mov $1, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 1(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +L(loop_ashr_1): + add $16, %r10 + jg L(nibble_ashr_1) /* cross page boundary */ + +L(gobble_ashr_1): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + + palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg L(nibble_ashr_1) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 /* store for next cycle */ + + palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_1) + + /* + * Nibble avoids loads across page boundary. This is to avoid a potential + * access into unmapped memory. + */ + .p2align 4 +L(nibble_ashr_1): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char*/ + pmovmskb %xmm0, %edx + test $0xfffe, %edx + jnz L(ashr_1_exittail) /* find null char*/ + +#ifdef USE_AS_STRNCMP + cmp $14, %r11 + jbe L(ashr_1_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 /* substract 4K from %r10 */ + jmp L(gobble_ashr_1) + + /* + * Once find null char, determine if there is a string mismatch + * before the null char. + */ + .p2align 4 +L(ashr_1_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $1, %xmm0 + psrldq $1, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_2 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(14~15) n -14 1(15 +(n-14) - n) ashr_2 + */ + .p2align 4 +L(ashr_2): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $14, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz L(less32bytes) + movdqa (%rdi), %xmm3 + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $2, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 2(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +L(loop_ashr_2): + add $16, %r10 + jg L(nibble_ashr_2) + +L(gobble_ashr_2): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg L(nibble_ashr_2) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_2) + + .p2align 4 +L(nibble_ashr_2): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfffc, %edx + jnz L(ashr_2_exittail) + +#ifdef USE_AS_STRNCMP + cmp $13, %r11 + jbe L(ashr_2_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp L(gobble_ashr_2) + + .p2align 4 +L(ashr_2_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $2, %xmm0 + psrldq $2, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_3 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(13~15) n -13 2(15 +(n-13) - n) ashr_3 + */ + .p2align 4 +L(ashr_3): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $13, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz L(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $3, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 3(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +L(loop_ashr_3): + add $16, %r10 + jg L(nibble_ashr_3) + +L(gobble_ashr_3): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg L(nibble_ashr_3) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_3) + + .p2align 4 +L(nibble_ashr_3): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfff8, %edx + jnz L(ashr_3_exittail) + +#ifdef USE_AS_STRNCMP + cmp $12, %r11 + jbe L(ashr_3_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp L(gobble_ashr_3) + + .p2align 4 +L(ashr_3_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $3, %xmm0 + psrldq $3, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_4 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(12~15) n -12 3(15 +(n-12) - n) ashr_4 + */ + .p2align 4 +L(ashr_4): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $12, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz L(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $4, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 4(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +L(loop_ashr_4): + add $16, %r10 + jg L(nibble_ashr_4) + +L(gobble_ashr_4): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg L(nibble_ashr_4) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_4) + + .p2align 4 +L(nibble_ashr_4): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfff0, %edx + jnz L(ashr_4_exittail) + +#ifdef USE_AS_STRNCMP + cmp $11, %r11 + jbe L(ashr_4_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp L(gobble_ashr_4) + + .p2align 4 +L(ashr_4_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $4, %xmm0 + psrldq $4, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_5 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(11~15) n - 11 4(15 +(n-11) - n) ashr_5 + */ + .p2align 4 +L(ashr_5): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $11, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz L(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $5, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 5(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +L(loop_ashr_5): + add $16, %r10 + jg L(nibble_ashr_5) + +L(gobble_ashr_5): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg L(nibble_ashr_5) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_5) + + .p2align 4 +L(nibble_ashr_5): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xffe0, %edx + jnz L(ashr_5_exittail) + +#ifdef USE_AS_STRNCMP + cmp $10, %r11 + jbe L(ashr_5_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp L(gobble_ashr_5) + + .p2align 4 +L(ashr_5_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $5, %xmm0 + psrldq $5, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_6 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(10~15) n - 10 5(15 +(n-10) - n) ashr_6 + */ + .p2align 4 +L(ashr_6): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $10, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz L(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $6, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 6(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +L(loop_ashr_6): + add $16, %r10 + jg L(nibble_ashr_6) + +L(gobble_ashr_6): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg L(nibble_ashr_6) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_6) + + .p2align 4 +L(nibble_ashr_6): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xffc0, %edx + jnz L(ashr_6_exittail) + +#ifdef USE_AS_STRNCMP + cmp $9, %r11 + jbe L(ashr_6_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp L(gobble_ashr_6) + + .p2align 4 +L(ashr_6_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $6, %xmm0 + psrldq $6, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_7 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(9~15) n - 9 6(15 +(n - 9) - n) ashr_7 + */ + .p2align 4 +L(ashr_7): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $9, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz L(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $7, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 7(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +L(loop_ashr_7): + add $16, %r10 + jg L(nibble_ashr_7) + +L(gobble_ashr_7): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg L(nibble_ashr_7) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_7) + + .p2align 4 +L(nibble_ashr_7): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xff80, %edx + jnz L(ashr_7_exittail) + +#ifdef USE_AS_STRNCMP + cmp $8, %r11 + jbe L(ashr_7_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp L(gobble_ashr_7) + + .p2align 4 +L(ashr_7_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $7, %xmm0 + psrldq $7, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_8 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(8~15) n - 8 7(15 +(n - 8) - n) ashr_8 + */ + .p2align 4 +L(ashr_8): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $8, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz L(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $8, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 8(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +L(loop_ashr_8): + add $16, %r10 + jg L(nibble_ashr_8) + +L(gobble_ashr_8): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg L(nibble_ashr_8) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_8) + + .p2align 4 +L(nibble_ashr_8): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xff00, %edx + jnz L(ashr_8_exittail) + +#ifdef USE_AS_STRNCMP + cmp $7, %r11 + jbe L(ashr_8_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp L(gobble_ashr_8) + + .p2align 4 +L(ashr_8_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $8, %xmm0 + psrldq $8, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_9 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(7~15) n - 7 8(15 +(n - 7) - n) ashr_9 + */ + .p2align 4 +L(ashr_9): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $7, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz L(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $9, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 9(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +L(loop_ashr_9): + add $16, %r10 + jg L(nibble_ashr_9) + +L(gobble_ashr_9): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg L(nibble_ashr_9) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 /* store for next cycle */ + jmp L(loop_ashr_9) + + .p2align 4 +L(nibble_ashr_9): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfe00, %edx + jnz L(ashr_9_exittail) + +#ifdef USE_AS_STRNCMP + cmp $6, %r11 + jbe L(ashr_9_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp L(gobble_ashr_9) + + .p2align 4 +L(ashr_9_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $9, %xmm0 + psrldq $9, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_10 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(6~15) n - 6 9(15 +(n - 6) - n) ashr_10 + */ + .p2align 4 +L(ashr_10): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $6, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz L(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $10, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 10(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +L(loop_ashr_10): + add $16, %r10 + jg L(nibble_ashr_10) + +L(gobble_ashr_10): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg L(nibble_ashr_10) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_10) + + .p2align 4 +L(nibble_ashr_10): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xfc00, %edx + jnz L(ashr_10_exittail) + +#ifdef USE_AS_STRNCMP + cmp $5, %r11 + jbe L(ashr_10_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp L(gobble_ashr_10) + + .p2align 4 +L(ashr_10_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $10, %xmm0 + psrldq $10, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_11 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(5~15) n - 5 10(15 +(n - 5) - n) ashr_11 + */ + .p2align 4 +L(ashr_11): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $5, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz L(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $11, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 11(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +L(loop_ashr_11): + add $16, %r10 + jg L(nibble_ashr_11) + +L(gobble_ashr_11): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg L(nibble_ashr_11) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_11) + + .p2align 4 +L(nibble_ashr_11): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xf800, %edx + jnz L(ashr_11_exittail) + +#ifdef USE_AS_STRNCMP + cmp $4, %r11 + jbe L(ashr_11_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp L(gobble_ashr_11) + + .p2align 4 +L(ashr_11_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $11, %xmm0 + psrldq $11, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_12 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(4~15) n - 4 11(15 +(n - 4) - n) ashr_12 + */ + .p2align 4 +L(ashr_12): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $4, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz L(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $12, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 12(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +L(loop_ashr_12): + add $16, %r10 + jg L(nibble_ashr_12) + +L(gobble_ashr_12): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg L(nibble_ashr_12) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_12) + + .p2align 4 +L(nibble_ashr_12): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xf000, %edx + jnz L(ashr_12_exittail) + +#ifdef USE_AS_STRNCMP + cmp $3, %r11 + jbe L(ashr_12_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp L(gobble_ashr_12) + + .p2align 4 +L(ashr_12_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $12, %xmm0 + psrldq $12, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_13 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(3~15) n - 3 12(15 +(n - 3) - n) ashr_13 + */ + .p2align 4 +L(ashr_13): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $3, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz L(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $13, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 13(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +L(loop_ashr_13): + add $16, %r10 + jg L(nibble_ashr_13) + +L(gobble_ashr_13): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg L(nibble_ashr_13) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_13) + + .p2align 4 +L(nibble_ashr_13): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xe000, %edx + jnz L(ashr_13_exittail) + +#ifdef USE_AS_STRNCMP + cmp $2, %r11 + jbe L(ashr_13_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp L(gobble_ashr_13) + + .p2align 4 +L(ashr_13_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $13, %xmm0 + psrldq $13, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_14 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(2~15) n - 2 13(15 +(n - 2) - n) ashr_14 + */ + .p2align 4 +L(ashr_14): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $2, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz L(less32bytes) + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $14, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 14(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +L(loop_ashr_14): + add $16, %r10 + jg L(nibble_ashr_14) + +L(gobble_ashr_14): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg L(nibble_ashr_14) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_14) + + .p2align 4 +L(nibble_ashr_14): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0xc000, %edx + jnz L(ashr_14_exittail) + +#ifdef USE_AS_STRNCMP + cmp $1, %r11 + jbe L(ashr_14_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp L(gobble_ashr_14) + + .p2align 4 +L(ashr_14_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $14, %xmm0 + psrldq $14, %xmm3 + jmp L(aftertail) + +/* + * The following cases will be handled by ashr_15 + * rcx(offset of rsi) rax(offset of rdi) relative offset corresponding case + * n(1~15) n - 1 14(15 +(n - 1) - n) ashr_15 + */ + .p2align 4 +L(ashr_15): + pxor %xmm0, %xmm0 + movdqa (%rdi), %xmm2 + movdqa (%rsi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pslldq $1, %xmm2 + pcmpeqb %xmm1, %xmm2 + psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d + sub %r9d, %edx + jnz L(less32bytes) + + movdqa (%rdi), %xmm3 + + UPDATE_STRNCMP_COUNTER + + pxor %xmm0, %xmm0 + mov $16, %rcx /* index for loads */ + mov $15, %r9d /* byte position left over from less32bytes case */ + /* + * Setup %r10 value allows us to detect crossing a page boundary. + * When %r10 goes positive we have crossed a page boundary and + * need to do a nibble. 
+ */ + lea 15(%rdi), %r10 + and $0xfff, %r10 /* offset into 4K page */ + + sub $0x1000, %r10 /* subtract 4K pagesize */ + + .p2align 4 +L(loop_ashr_15): + add $16, %r10 + jg L(nibble_ashr_15) + +L(gobble_ashr_15): + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + + add $16, %r10 + jg L(nibble_ashr_15) /* cross page boundary */ + + movdqa (%rsi, %rcx), %xmm1 + movdqa (%rdi, %rcx), %xmm2 + movdqa %xmm2, %xmm4 + + palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ + + pcmpeqb %xmm1, %xmm0 + pcmpeqb %xmm2, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + sub $0xffff, %edx + jnz L(exit) + +#ifdef USE_AS_STRNCMP + sub $16, %r11 + jbe L(strcmp_exitz) +#endif + + add $16, %rcx + movdqa %xmm4, %xmm3 + jmp L(loop_ashr_15) + + .p2align 4 +L(nibble_ashr_15): + pcmpeqb %xmm3, %xmm0 /* check nibble for null char */ + pmovmskb %xmm0, %edx + test $0x8000, %edx + jnz L(ashr_15_exittail) + +#ifdef USE_AS_STRNCMP + test %r11, %r11 + je L(ashr_15_exittail) +#endif + + pxor %xmm0, %xmm0 + sub $0x1000, %r10 + jmp L(gobble_ashr_15) + + .p2align 4 +L(ashr_15_exittail): + movdqa (%rsi, %rcx), %xmm1 + psrldq $15, %xmm3 + psrldq $15, %xmm0 + + .p2align 4 +L(aftertail): + pcmpeqb %xmm3, %xmm1 + psubb %xmm0, %xmm1 + pmovmskb %xmm1, %edx + not %edx + + .p2align 4 +L(exit): + lea -16(%r9, %rcx), %rax /* locate the exact offset for rdi */ +L(less32bytes): + lea (%rdi, %rax), %rdi /* locate the exact address for first operand(rdi) */ + lea (%rsi, %rcx), %rsi /* locate the exact address for second operand(rsi) */ + test %r8d, %r8d + jz L(ret) + xchg %rsi, %rdi /* recover original order according to flag(%r8d) */ + + .p2align 4 +L(ret): +L(less16bytes): + bsf %rdx, %rdx /* find and store bit index in %rdx */ + +#ifdef USE_AS_STRNCMP + sub %rdx, %r11 + jbe L(strcmp_exitz) +#endif + movzbl (%rsi, %rdx), %ecx + movzbl (%rdi, %rdx), %eax + + sub %ecx, %eax + ret + +L(strcmp_exitz): + xor %eax, %eax + ret + + .p2align 4 +L(Byte0): + movzbl (%rsi), %ecx + movzbl (%rdi), %eax + + sub %ecx, %eax + ret +END (STRCMP) + + .section .rodata,"a",@progbits + .p2align 3 +L(unaligned_table): + .int L(ashr_1) - L(unaligned_table) + .int L(ashr_2) - L(unaligned_table) + .int L(ashr_3) - L(unaligned_table) + .int L(ashr_4) - L(unaligned_table) + .int L(ashr_5) - L(unaligned_table) + .int L(ashr_6) - L(unaligned_table) + .int L(ashr_7) - L(unaligned_table) + .int L(ashr_8) - L(unaligned_table) + .int L(ashr_9) - L(unaligned_table) + .int L(ashr_10) - L(unaligned_table) + .int L(ashr_11) - L(unaligned_table) + .int L(ashr_12) - L(unaligned_table) + .int L(ashr_13) - L(unaligned_table) + .int L(ashr_14) - L(unaligned_table) + .int L(ashr_15) - L(unaligned_table) + .int L(ashr_0) - L(unaligned_table) diff --git a/libs/libc/machine/x86_64/gnu/arch_strcpy.S b/libs/libc/machine/x86_64/gnu/arch_strcpy.S new file mode 100644 index 0000000000..7164518fd9 --- /dev/null +++ b/libs/libc/machine/x86_64/gnu/arch_strcpy.S @@ -0,0 +1,1932 @@ +/********************************************************************************* + * libs/libc/machine/x86_64/gnu/arch_strcpy.S + * + * Copyright (c) 2014, Intel Corporation + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * * this list of conditions and the following disclaimer in the documentation + * * and/or other materials provided with the distribution. + * + * * Neither the name of Intel Corporation nor the names of its contributors + * * may be used to endorse or promote products derived from this software + * * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + *********************************************************************************/ + +/********************************************************************************* + * Pre-processor Definitions + *********************************************************************************/ + +#ifndef USE_AS_STRCAT + +# ifndef STRCPY +# define STRCPY strcpy +# endif + +# ifndef L +# define L(label) .L##label +# endif + +# ifndef cfi_startproc +# define cfi_startproc .cfi_startproc +# endif + +# ifndef cfi_endproc +# define cfi_endproc .cfi_endproc +# endif + +# ifndef ENTRY +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ + cfi_startproc +# endif + +# ifndef END +# define END(name) \ + cfi_endproc; \ + .size name, .-name +# endif + +#endif + +#define JMPTBL(I, B) I - B +#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ + lea TABLE(%rip), %r11; \ + movslq (%r11, INDEX, SCALE), %rcx; \ + lea (%r11, %rcx), %rcx; \ + jmp *%rcx + +#ifndef USE_AS_STRCAT + +# define RETURN ret + +/********************************************************************************* + * Public Functions + *********************************************************************************/ + +.text +ENTRY (STRCPY) +# ifdef USE_AS_STRNCPY + mov %rdx, %r8 + test %r8, %r8 + jz L(ExitZero) +# endif + mov %rsi, %rcx +# ifndef USE_AS_STPCPY + mov %rdi, %rax /* save result */ +# endif + +#endif + and $63, %rcx + cmp $32, %rcx + jbe L(SourceStringAlignmentLess32) + + and $-16, %rsi + and $15, %rcx + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + + pcmpeqb (%rsi), %xmm1 + pmovmskb %xmm1, %rdx + shr %cl, %rdx +#ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + mov $16, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# else + mov $17, %r10 + sub %rcx, %r10 + cmp %r10, %r8 +# endif + jbe L(CopyFrom1To16BytesTailCase2OrCase3) +#endif + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail) + + pcmpeqb 16(%rsi), %xmm0 + pmovmskb %xmm0, %rdx +#ifdef USE_AS_STRNCPY + add $16, %r10 + cmp 
%r10, %r8 + jbe L(CopyFrom1To32BytesCase2OrCase3) +#endif + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes) + + movdqu (%rsi, %rcx), %xmm1 /* copy 16 bytes */ + movdqu %xmm1, (%rdi) + +/* If source adress alignment != destination adress alignment */ + .p2align 4 +L(Unalign16Both): + sub %rcx, %rdi +#ifdef USE_AS_STRNCPY + add %rcx, %r8 +#endif + mov $16, %rcx + movdqa (%rsi, %rcx), %xmm1 + movaps 16(%rsi, %rcx), %xmm2 + movdqu %xmm1, (%rdi, %rcx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +#ifdef USE_AS_STRNCPY + sub $48, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +#endif + test %rdx, %rdx +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm2) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + movaps 16(%rsi, %rcx), %xmm3 + movdqu %xmm2, (%rdi, %rcx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +#endif + test %rdx, %rdx +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm3) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + movaps 16(%rsi, %rcx), %xmm4 + movdqu %xmm3, (%rdi, %rcx) + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +#endif + test %rdx, %rdx +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm4) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + movaps 16(%rsi, %rcx), %xmm1 + movdqu %xmm4, (%rdi, %rcx) + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +#endif + test %rdx, %rdx +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm1) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + movaps 16(%rsi, %rcx), %xmm2 + movdqu %xmm1, (%rdi, %rcx) + pcmpeqb %xmm2, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +#endif + test %rdx, %rdx +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm2) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + movaps 16(%rsi, %rcx), %xmm3 + movdqu %xmm2, (%rdi, %rcx) + pcmpeqb %xmm3, %xmm0 + pmovmskb %xmm0, %rdx + add $16, %rcx +#ifdef USE_AS_STRNCPY + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) +#endif + test %rdx, %rdx +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm3) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + movdqu %xmm3, (%rdi, %rcx) + mov %rsi, %rdx + lea 16(%rsi, %rcx), %rsi + and $-0x40, %rsi + sub %rsi, %rdx + sub %rdx, %rdi +#ifdef USE_AS_STRNCPY + lea 128(%r8, %rdx), %r8 +#endif +L(Unaligned64Loop): + movaps (%rsi), %xmm2 + movaps %xmm2, %xmm4 + movaps 16(%rsi), %xmm5 + movaps 32(%rsi), %xmm3 + movaps %xmm3, %xmm6 + movaps 48(%rsi), %xmm7 + pminub %xmm5, %xmm2 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb %xmm0, %xmm3 + pmovmskb %xmm3, %rdx +#ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(UnalignedLeaveCase2OrCase3) +#endif + test %rdx, %rdx + jnz L(Unaligned64Leave) + +L(Unaligned64Loop_start): + add $64, %rdi + add $64, %rsi + movdqu %xmm4, -64(%rdi) + movaps (%rsi), %xmm2 + movdqa %xmm2, %xmm4 + movdqu %xmm5, -48(%rdi) + movaps 16(%rsi), %xmm5 + pminub %xmm5, %xmm2 + movaps 32(%rsi), %xmm3 + movdqu %xmm6, -32(%rdi) + movaps %xmm3, %xmm6 + movdqu %xmm7, -16(%rdi) + movaps 48(%rsi), %xmm7 + pminub %xmm7, %xmm3 + pminub %xmm2, %xmm3 + pcmpeqb 
%xmm0, %xmm3 + pmovmskb %xmm3, %rdx +#ifdef USE_AS_STRNCPY + sub $64, %r8 + jbe L(UnalignedLeaveCase2OrCase3) +#endif + test %rdx, %rdx + jz L(Unaligned64Loop_start) + +L(Unaligned64Leave): + pxor %xmm1, %xmm1 + + pcmpeqb %xmm4, %xmm0 + pcmpeqb %xmm5, %xmm1 + pmovmskb %xmm0, %rdx + pmovmskb %xmm1, %rcx + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnaligned_0) + test %rcx, %rcx + jnz L(CopyFrom1To16BytesUnaligned_16) + + pcmpeqb %xmm6, %xmm0 + pcmpeqb %xmm7, %xmm1 + pmovmskb %xmm0, %rdx + pmovmskb %xmm1, %rcx + test %rdx, %rdx + jnz L(CopyFrom1To16BytesUnaligned_32) + + bsf %rcx, %rdx + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) + movdqu %xmm6, 32(%rdi) +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 48(%rdi, %rdx), %rax +# endif + movdqu %xmm7, 48(%rdi) + add $15, %r8 + sub %rdx, %r8 + lea 49(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +#else + add $48, %rsi + add $48, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +#endif + +/* If source adress alignment == destination adress alignment */ + +L(SourceStringAlignmentLess32): + pxor %xmm0, %xmm0 + movdqu (%rsi), %xmm1 + movdqu 16(%rsi), %xmm2 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %rdx + +#ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $16, %r8 +# else + cmp $17, %r8 +# endif + jbe L(CopyFrom1To16BytesTail1Case2OrCase3) +#endif + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1) + + pcmpeqb %xmm2, %xmm0 + movdqu %xmm1, (%rdi) + pmovmskb %xmm0, %rdx + +#ifdef USE_AS_STRNCPY +# if defined USE_AS_STPCPY || defined USE_AS_STRCAT + cmp $32, %r8 +# else + cmp $33, %r8 +# endif + jbe L(CopyFrom1To32Bytes1Case2OrCase3) +#endif + test %rdx, %rdx + jnz L(CopyFrom1To32Bytes1) + + and $15, %rcx + and $-16, %rsi + + jmp L(Unalign16Both) + +/*------End of main part with loops---------------------*/ + +/* Case1 */ + +#if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) + .p2align 4 +L(CopyFrom1To16Bytes): + add %rcx, %rdi + add %rcx, %rsi + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +#endif + .p2align 4 +L(CopyFrom1To16BytesTail): + add %rcx, %rsi + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1): + add $16, %rsi + add $16, %rdi +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $16, %r8 +#endif +L(CopyFrom1To16BytesTail1): + bsf %rdx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To32Bytes): + bsf %rdx, %rdx + add %rcx, %rsi + add $16, %rdx + sub %rcx, %rdx + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_0): + bsf %rdx, %rdx +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +# endif + movdqu %xmm4, (%rdi) + add $63, %r8 + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +#else + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +#endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_16): + bsf %rcx, %rdx + movdqu %xmm4, (%rdi) +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY + lea 16(%rdi, %rdx), %rax +# endif + movdqu %xmm5, 16(%rdi) + add $47, %r8 + sub %rdx, %r8 + lea 17(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +#else + add $16, %rsi + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +#endif + + .p2align 4 +L(CopyFrom1To16BytesUnaligned_32): + bsf %rdx, %rdx + movdqu %xmm4, (%rdi) + movdqu %xmm5, 16(%rdi) +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT +# ifdef USE_AS_STPCPY 
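+	/* stpcpy-style builds must return a pointer to the copied NUL, so
+	 * %rax is set to its exact address: the base of this 16-byte block
+	 * (32(%rdi)) plus the terminator's byte index in %rdx. */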
+ lea 32(%rdi, %rdx), %rax +# endif + movdqu %xmm6, 32(%rdi) + add $31, %r8 + sub %rdx, %r8 + lea 33(%rdi, %rdx), %rdi + jmp L(StrncpyFillTailWithZero) +#else + add $32, %rsi + add $32, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) +#endif + +#ifdef USE_AS_STRNCPY +# ifndef USE_AS_STRCAT + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm6): + movdqu %xmm6, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm5): + movdqu %xmm5, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm4): + movdqu %xmm4, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm3): + movdqu %xmm3, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm1): + movdqu %xmm1, (%rdi, %rcx) + jmp L(CopyFrom1To16BytesXmmExit) +# endif + + .p2align 4 +L(CopyFrom1To16BytesExit): + BRANCH_TO_JMPTBL_ENTRY (L(ExitTable), %rdx, 4) + +/* Case2 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2): + add $16, %r8 + add %rcx, %rdi + add %rcx, %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2): + add %rcx, %rsi + bsf %rdx, %rdx + add $16, %rdx + sub %rcx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +L(CopyFrom1To16BytesTailCase2): + add %rcx, %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +L(CopyFrom1To16BytesTail1Case2): + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +/* Case2 or Case3, Case3 */ + + .p2align 4 +L(CopyFrom1To16BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesCase2) +L(CopyFrom1To16BytesCase3): + add $16, %r8 + add %rcx, %rdi + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32BytesCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To32BytesCase2) + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To16BytesTailCase2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTailCase2) + add %rcx, %rsi + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(CopyFrom1To32Bytes1Case2OrCase3): + add $16, %rdi + add $16, %rsi + sub $16, %r8 +L(CopyFrom1To16BytesTail1Case2OrCase3): + test %rdx, %rdx + jnz L(CopyFrom1To16BytesTail1Case2) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + +#endif + +/*------------End labels regarding with copying 1-16 bytes--and 1-32 bytes----*/ + + .p2align 4 +L(Exit1): + mov %dh, (%rdi) +#ifdef USE_AS_STPCPY + lea (%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $1, %r8 + lea 1(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit2): + mov (%rsi), %dx + mov %dx, (%rdi) +#ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $2, %r8 + lea 2(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit3): + mov (%rsi), %cx + mov %cx, (%rdi) + mov %dh, 2(%rdi) +#ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $3, %r8 + lea 3(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit4): + mov (%rsi), %edx + mov %edx, (%rdi) +#ifdef USE_AS_STPCPY + lea 
3(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $4, %r8 + lea 4(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit5): + mov (%rsi), %ecx + mov %dh, 4(%rdi) + mov %ecx, (%rdi) +#ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $5, %r8 + lea 5(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit6): + mov (%rsi), %ecx + mov 4(%rsi), %dx + mov %ecx, (%rdi) + mov %dx, 4(%rdi) +#ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $6, %r8 + lea 6(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit7): + mov (%rsi), %ecx + mov 3(%rsi), %edx + mov %ecx, (%rdi) + mov %edx, 3(%rdi) +#ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $7, %r8 + lea 7(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit8): + mov (%rsi), %rdx + mov %rdx, (%rdi) +#ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $8, %r8 + lea 8(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit9): + mov (%rsi), %rcx + mov %dh, 8(%rdi) + mov %rcx, (%rdi) +#ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $9, %r8 + lea 9(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit10): + mov (%rsi), %rcx + mov 8(%rsi), %dx + mov %rcx, (%rdi) + mov %dx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $10, %r8 + lea 10(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit11): + mov (%rsi), %rcx + mov 7(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $11, %r8 + lea 11(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit12): + mov (%rsi), %rcx + mov 8(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $12, %r8 + lea 12(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit13): + mov (%rsi), %rcx + mov 5(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 5(%rdi) +#ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $13, %r8 + lea 13(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit14): + mov (%rsi), %rcx + mov 6(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 6(%rdi) +#ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $14, %r8 + lea 14(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit15): + mov (%rsi), %rcx + mov 7(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $15, %r8 + lea 15(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit16): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +#ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $16, %r8 + lea 16(%rdi), %rdi + jnz 
L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit17): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) + mov %dh, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $17, %r8 + lea 17(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit18): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %cx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $18, %r8 + lea 18(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit19): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $19, %r8 + lea 19(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit20): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $20, %r8 + lea 20(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit21): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + mov %dh, 20(%rdi) +#ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $21, %r8 + lea 21(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit22): + movdqu (%rsi), %xmm0 + mov 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 14(%rdi) +#ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $22, %r8 + lea 22(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit23): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $23, %r8 + lea 23(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit24): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $24, %r8 + lea 24(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit25): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) + mov %dh, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $25, %r8 + lea 25(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit26): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $26, %r8 + lea 26(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit27): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 23(%rdi) +#ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $27, %r8 + lea 27(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit28): + movdqu (%rsi), 
%xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $28, %r8 + lea 28(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit29): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) +#ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $29, %r8 + lea 29(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit30): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) +#ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $30, %r8 + lea 30(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit31): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $31, %r8 + lea 31(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + + .p2align 4 +L(Exit32): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +#endif +#if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT + sub $32, %r8 + lea 32(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) +#endif + RETURN + +#ifdef USE_AS_STRNCPY + + .p2align 4 +L(StrncpyExit0): +#ifdef USE_AS_STPCPY + mov %rdi, %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, (%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit1): + mov (%rsi), %dl + mov %dl, (%rdi) +#ifdef USE_AS_STPCPY + lea 1(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 1(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit2): + mov (%rsi), %dx + mov %dx, (%rdi) +#ifdef USE_AS_STPCPY + lea 2(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 2(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit3): + mov (%rsi), %cx + mov 2(%rsi), %dl + mov %cx, (%rdi) + mov %dl, 2(%rdi) +#ifdef USE_AS_STPCPY + lea 3(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 3(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit4): + mov (%rsi), %edx + mov %edx, (%rdi) +#ifdef USE_AS_STPCPY + lea 4(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 4(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit5): + mov (%rsi), %ecx + mov 4(%rsi), %dl + mov %ecx, (%rdi) + mov %dl, 4(%rdi) +#ifdef USE_AS_STPCPY + lea 5(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 5(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit6): + mov (%rsi), %ecx + mov 4(%rsi), %dx + mov %ecx, (%rdi) + mov %dx, 4(%rdi) +#ifdef USE_AS_STPCPY + lea 6(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 6(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit7): + mov (%rsi), %ecx + mov 3(%rsi), %edx + mov %ecx, (%rdi) + mov %edx, 3(%rdi) +#ifdef USE_AS_STPCPY + lea 7(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 7(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit8): + mov (%rsi), %rdx + mov %rdx, (%rdi) +#ifdef USE_AS_STPCPY + lea 8(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 8(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit9): + mov (%rsi), %rcx + mov 8(%rsi), %dl + mov %rcx, (%rdi) + mov %dl, 8(%rdi) 
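+	/* The count was exhausted before a terminator was seen, so the
+	 * stpcpy-style return value set below is dest + n rather than a
+	 * pointer to a NUL byte. */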
+#ifdef USE_AS_STPCPY + lea 9(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 9(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit10): + mov (%rsi), %rcx + mov 8(%rsi), %dx + mov %rcx, (%rdi) + mov %dx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 10(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 10(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit11): + mov (%rsi), %rcx + mov 7(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 11(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 11(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit12): + mov (%rsi), %rcx + mov 8(%rsi), %edx + mov %rcx, (%rdi) + mov %edx, 8(%rdi) +#ifdef USE_AS_STPCPY + lea 12(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 12(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit13): + mov (%rsi), %rcx + mov 5(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 5(%rdi) +#ifdef USE_AS_STPCPY + lea 13(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 13(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit14): + mov (%rsi), %rcx + mov 6(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 6(%rdi) +#ifdef USE_AS_STPCPY + lea 14(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 14(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit15): + mov (%rsi), %rcx + mov 7(%rsi), %rdx + mov %rcx, (%rdi) + mov %rdx, 7(%rdi) +#ifdef USE_AS_STPCPY + lea 15(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 15(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit16): + movdqu (%rsi), %xmm0 + movdqu %xmm0, (%rdi) +#ifdef USE_AS_STPCPY + lea 16(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 16(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit17): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %cl, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 17(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 17(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit18): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %cx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 18(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 18(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit19): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 19(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 19(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit20): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 20(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 20(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit21): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %ecx + mov 20(%rsi), %dl + movdqu %xmm0, (%rdi) + mov %ecx, 16(%rdi) + mov %dl, 20(%rdi) +#ifdef USE_AS_STPCPY + lea 21(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 21(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit22): + movdqu (%rsi), %xmm0 + mov 14(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 14(%rdi) +#ifdef USE_AS_STPCPY + lea 22(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 22(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit23): + movdqu (%rsi), %xmm0 + mov 15(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 23(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 23(%rdi) +#endif + RETURN + + .p2align 4 
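+/*
+ * Each L(StrncpyExitN) block stores exactly N bytes, using the widest
+ * available moves and overlapping stores where N is not a power of two.
+ * Unlike the L(ExitN) family it appends no terminator of its own, since
+ * strncpy output need not be NUL-terminated; only the strcat build
+ * writes one extra zero byte past the copy.
+ */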
+L(StrncpyExit24): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rcx + movdqu %xmm0, (%rdi) + mov %rcx, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 24(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 24(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit25): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cl + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cl, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 25(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 25(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit26): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %cx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %cx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 26(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 26(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit27): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 23(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 23(%rdi) +#ifdef USE_AS_STPCPY + lea 27(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 27(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit28): + movdqu (%rsi), %xmm0 + mov 16(%rsi), %rdx + mov 24(%rsi), %ecx + movdqu %xmm0, (%rdi) + mov %rdx, 16(%rdi) + mov %ecx, 24(%rdi) +#ifdef USE_AS_STPCPY + lea 28(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 28(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit29): + movdqu (%rsi), %xmm0 + movdqu 13(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 13(%rdi) +#ifdef USE_AS_STPCPY + lea 29(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 29(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit30): + movdqu (%rsi), %xmm0 + movdqu 14(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 14(%rdi) +#ifdef USE_AS_STPCPY + lea 30(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 30(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit31): + movdqu (%rsi), %xmm0 + movdqu 15(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 15(%rdi) +#ifdef USE_AS_STPCPY + lea 31(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 31(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit32): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) +#ifdef USE_AS_STPCPY + lea 32(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 32(%rdi) +#endif + RETURN + + .p2align 4 +L(StrncpyExit33): + movdqu (%rsi), %xmm0 + movdqu 16(%rsi), %xmm2 + mov 32(%rsi), %cl + movdqu %xmm0, (%rdi) + movdqu %xmm2, 16(%rdi) + mov %cl, 32(%rdi) +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 33(%rdi) +#endif + RETURN + +#ifndef USE_AS_STRCAT + + .p2align 4 +L(Fill0): + RETURN + + .p2align 4 +L(Fill1): + mov %dl, (%rdi) + RETURN + + .p2align 4 +L(Fill2): + mov %dx, (%rdi) + RETURN + + .p2align 4 +L(Fill3): + mov %edx, -1(%rdi) + RETURN + + .p2align 4 +L(Fill4): + mov %edx, (%rdi) + RETURN + + .p2align 4 +L(Fill5): + mov %edx, (%rdi) + mov %dl, 4(%rdi) + RETURN + + .p2align 4 +L(Fill6): + mov %edx, (%rdi) + mov %dx, 4(%rdi) + RETURN + + .p2align 4 +L(Fill7): + mov %rdx, -1(%rdi) + RETURN + + .p2align 4 +L(Fill8): + mov %rdx, (%rdi) + RETURN + + .p2align 4 +L(Fill9): + mov %rdx, (%rdi) + mov %dl, 8(%rdi) + RETURN + + .p2align 4 +L(Fill10): + mov %rdx, (%rdi) + mov %dx, 8(%rdi) + RETURN + + .p2align 4 +L(Fill11): + mov %rdx, (%rdi) + mov %edx, 7(%rdi) + RETURN + + .p2align 4 +L(Fill12): + mov %rdx, (%rdi) + mov %edx, 8(%rdi) + RETURN + + .p2align 4 +L(Fill13): + mov %rdx, (%rdi) + mov %rdx, 5(%rdi) + 
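+	/* %rdx and %xmm0 are zero throughout the fill path; each L(FillN)
+	 * block writes exactly N zero bytes (13 here, as two overlapping
+	 * 8-byte stores at offsets 0 and 5).  Dispatch is through
+	 * L(FillTable), a table of 32-bit self-relative offsets indexed by
+	 * the remaining count. */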
RETURN + + .p2align 4 +L(Fill14): + mov %rdx, (%rdi) + mov %rdx, 6(%rdi) + RETURN + + .p2align 4 +L(Fill15): + movdqu %xmm0, -1(%rdi) + RETURN + + .p2align 4 +L(Fill16): + movdqu %xmm0, (%rdi) + RETURN + + .p2align 4 +L(CopyFrom1To16BytesUnalignedXmm2): + movdqu %xmm2, (%rdi, %rcx) + + .p2align 4 +L(CopyFrom1To16BytesXmmExit): + bsf %rdx, %rdx + add $15, %r8 + add %rcx, %rdi +#ifdef USE_AS_STPCPY + lea (%rdi, %rdx), %rax +#endif + sub %rdx, %r8 + lea 1(%rdi, %rdx), %rdi + + .p2align 4 +L(StrncpyFillTailWithZero): + pxor %xmm0, %xmm0 + xor %rdx, %rdx + sub $16, %r8 + jbe L(StrncpyFillExit) + + movdqu %xmm0, (%rdi) + add $16, %rdi + + mov %rdi, %rsi + and $0xf, %rsi + sub %rsi, %rdi + add %rsi, %r8 + sub $64, %r8 + jb L(StrncpyFillLess64) + +L(StrncpyFillLoopMovdqa): + movdqa %xmm0, (%rdi) + movdqa %xmm0, 16(%rdi) + movdqa %xmm0, 32(%rdi) + movdqa %xmm0, 48(%rdi) + add $64, %rdi + sub $64, %r8 + jae L(StrncpyFillLoopMovdqa) + +L(StrncpyFillLess64): + add $32, %r8 + jl L(StrncpyFillLess32) + movdqa %xmm0, (%rdi) + movdqa %xmm0, 16(%rdi) + add $32, %rdi + sub $16, %r8 + jl L(StrncpyFillExit) + movdqa %xmm0, (%rdi) + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +L(StrncpyFillLess32): + add $16, %r8 + jl L(StrncpyFillExit) + movdqa %xmm0, (%rdi) + add $16, %rdi + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +L(StrncpyFillExit): + add $16, %r8 + BRANCH_TO_JMPTBL_ENTRY (L(FillTable), %r8, 4) + +/* end of ifndef USE_AS_STRCAT */ +#endif + + .p2align 4 +L(UnalignedLeaveCase2OrCase3): + test %rdx, %rdx + jnz L(Unaligned64LeaveCase2) +L(Unaligned64LeaveCase3): + lea 64(%r8), %rcx + and $-16, %rcx + add $48, %r8 + jl L(CopyFrom1To16BytesCase3) + movdqu %xmm4, (%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm5, 16(%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm6, 32(%rdi) + sub $16, %r8 + jb L(CopyFrom1To16BytesCase3) + movdqu %xmm7, 48(%rdi) +#ifdef USE_AS_STPCPY + lea 64(%rdi), %rax +#endif +#ifdef USE_AS_STRCAT + xor %ch, %ch + movb %ch, 64(%rdi) +#endif + RETURN + + .p2align 4 +L(Unaligned64LeaveCase2): + xor %rcx, %rcx + pcmpeqb %xmm4, %xmm0 + pmovmskb %xmm0, %rdx + add $48, %r8 + jle L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +#ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm4) +#else + jnz L(CopyFrom1To16Bytes) +#endif + pcmpeqb %xmm5, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm4, (%rdi) + add $16, %rcx + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +#ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm5) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + pcmpeqb %xmm6, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm5, 16(%rdi) + add $16, %rcx + sub $16, %r8 + jbe L(CopyFrom1To16BytesCase2OrCase3) + test %rdx, %rdx +#ifndef USE_AS_STRCAT + jnz L(CopyFrom1To16BytesUnalignedXmm6) +#else + jnz L(CopyFrom1To16Bytes) +#endif + + pcmpeqb %xmm7, %xmm0 + pmovmskb %xmm0, %rdx + movdqu %xmm6, 32(%rdi) + lea 16(%rdi, %rcx), %rdi + lea 16(%rsi, %rcx), %rsi + bsf %rdx, %rdx + cmp %r8, %rdx + jb L(CopyFrom1To16BytesExit) + BRANCH_TO_JMPTBL_ENTRY (L(ExitStrncpyTable), %r8, 4) + + .p2align 4 +L(ExitZero): +#ifndef USE_AS_STRCAT + mov %rdi, %rax +#endif + RETURN + +#endif + +#ifndef USE_AS_STRCAT +END (STRCPY) +#else +END (STRCAT) +#endif + .p2align 4 + .section .rodata +L(ExitTable): + .int JMPTBL(L(Exit1), L(ExitTable)) + .int JMPTBL(L(Exit2), L(ExitTable)) + .int JMPTBL(L(Exit3), L(ExitTable)) + .int JMPTBL(L(Exit4), L(ExitTable)) + .int JMPTBL(L(Exit5), L(ExitTable)) + .int JMPTBL(L(Exit6), L(ExitTable)) + 
.int JMPTBL(L(Exit7), L(ExitTable)) + .int JMPTBL(L(Exit8), L(ExitTable)) + .int JMPTBL(L(Exit9), L(ExitTable)) + .int JMPTBL(L(Exit10), L(ExitTable)) + .int JMPTBL(L(Exit11), L(ExitTable)) + .int JMPTBL(L(Exit12), L(ExitTable)) + .int JMPTBL(L(Exit13), L(ExitTable)) + .int JMPTBL(L(Exit14), L(ExitTable)) + .int JMPTBL(L(Exit15), L(ExitTable)) + .int JMPTBL(L(Exit16), L(ExitTable)) + .int JMPTBL(L(Exit17), L(ExitTable)) + .int JMPTBL(L(Exit18), L(ExitTable)) + .int JMPTBL(L(Exit19), L(ExitTable)) + .int JMPTBL(L(Exit20), L(ExitTable)) + .int JMPTBL(L(Exit21), L(ExitTable)) + .int JMPTBL(L(Exit22), L(ExitTable)) + .int JMPTBL(L(Exit23), L(ExitTable)) + .int JMPTBL(L(Exit24), L(ExitTable)) + .int JMPTBL(L(Exit25), L(ExitTable)) + .int JMPTBL(L(Exit26), L(ExitTable)) + .int JMPTBL(L(Exit27), L(ExitTable)) + .int JMPTBL(L(Exit28), L(ExitTable)) + .int JMPTBL(L(Exit29), L(ExitTable)) + .int JMPTBL(L(Exit30), L(ExitTable)) + .int JMPTBL(L(Exit31), L(ExitTable)) + .int JMPTBL(L(Exit32), L(ExitTable)) +#ifdef USE_AS_STRNCPY +L(ExitStrncpyTable): + .int JMPTBL(L(StrncpyExit0), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit1), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit2), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit3), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit4), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit5), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit6), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit7), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit8), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit9), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit10), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit11), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit12), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit13), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit14), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit15), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit16), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit17), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit18), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit19), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit20), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit21), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit22), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit23), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit24), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit25), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit26), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit27), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit28), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit29), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit30), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit31), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit32), L(ExitStrncpyTable)) + .int JMPTBL(L(StrncpyExit33), L(ExitStrncpyTable)) +# ifndef USE_AS_STRCAT + .p2align 4 +L(FillTable): + .int JMPTBL(L(Fill0), L(FillTable)) + .int JMPTBL(L(Fill1), L(FillTable)) + .int JMPTBL(L(Fill2), L(FillTable)) + .int JMPTBL(L(Fill3), L(FillTable)) + .int JMPTBL(L(Fill4), L(FillTable)) + .int JMPTBL(L(Fill5), L(FillTable)) + .int JMPTBL(L(Fill6), L(FillTable)) + .int JMPTBL(L(Fill7), L(FillTable)) + .int JMPTBL(L(Fill8), L(FillTable)) + .int JMPTBL(L(Fill9), L(FillTable)) + .int JMPTBL(L(Fill10), L(FillTable)) + .int JMPTBL(L(Fill11), L(FillTable)) + .int JMPTBL(L(Fill12), L(FillTable)) + .int JMPTBL(L(Fill13), L(FillTable)) + .int JMPTBL(L(Fill14), L(FillTable)) + .int JMPTBL(L(Fill15), L(FillTable)) + .int JMPTBL(L(Fill16), L(FillTable)) +# 
endif +#endif diff --git a/libs/libc/machine/x86_64/gnu/arch_strlen.S b/libs/libc/machine/x86_64/gnu/arch_strlen.S new file mode 100644 index 0000000000..b0128a7d01 --- /dev/null +++ b/libs/libc/machine/x86_64/gnu/arch_strlen.S @@ -0,0 +1,305 @@ +/********************************************************************************* + * libs/libc/machine/x86_64/gnu/arch_strlen.S + * + * Copyright (c) 2014, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * * this list of conditions and the following disclaimer in the documentation + * * and/or other materials provided with the distribution. + * + * * Neither the name of Intel Corporation nor the names of its contributors + * * may be used to endorse or promote products derived from this software + * * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + *********************************************************************************/ + +/********************************************************************************* + * Pre-processor Definitions + *********************************************************************************/ + +#ifndef USE_AS_STRCAT + +#ifndef STRLEN +# define STRLEN strlen +#endif + +#ifndef L +# define L(label) .L##label +#endif + +#ifndef cfi_startproc +# define cfi_startproc .cfi_startproc +#endif + +#ifndef cfi_endproc +# define cfi_endproc .cfi_endproc +#endif + +#ifndef ENTRY +# define ENTRY(name) \ + .type name, @function; \ + .globl name; \ + .p2align 4; \ +name: \ + cfi_startproc +#endif + +#ifndef END +# define END(name) \ + cfi_endproc; \ + .size name, .-name +#endif +#define RETURN ret + +/********************************************************************************* + * Public Functions + *********************************************************************************/ + + .section .text.sse2,"ax",@progbits +ENTRY (STRLEN) +/* end ifndef USE_AS_STRCAT */ +#endif + xor %rax, %rax + mov %edi, %ecx + and $0x3f, %ecx + pxor %xmm0, %xmm0 + cmp $0x30, %ecx + ja L(next) + movdqu (%rdi), %xmm1 + pcmpeqb %xmm1, %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit_less16) + mov %rdi, %rax + and $-16, %rax + jmp L(align16_start) +L(next): + mov %rdi, %rax + and $-16, %rax + pcmpeqb (%rax), %xmm0 + mov $-1, %r10d + sub %rax, %rcx + shl %cl, %r10d + pmovmskb %xmm0, %edx + and %r10d, %edx + jnz L(exit) +L(align16_start): + pxor %xmm0, %xmm0 + pxor %xmm1, %xmm1 + pxor %xmm2, %xmm2 + pxor %xmm3, %xmm3 + pcmpeqb 16(%rax), %xmm0 + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + pcmpeqb 80(%rax), %xmm0 + add $64, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + test %edx, %edx + jnz L(exit64) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 80(%rax), %xmm0 + add $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm1 + add $16, %rax + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm2 + add $16, %rax + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit) + + test $0x3f, %rax + jz L(align64_loop) + + pcmpeqb 16(%rax), %xmm3 + add $16, %rax + pmovmskb %xmm3, 
%edx + test %edx, %edx + jnz L(exit) + + add $16, %rax + .p2align 4 + L(align64_loop): + movaps (%rax), %xmm4 + pminub 16(%rax), %xmm4 + movaps 32(%rax), %xmm5 + pminub 48(%rax), %xmm5 + add $64, %rax + pminub %xmm4, %xmm5 + pcmpeqb %xmm0, %xmm5 + pmovmskb %xmm5, %edx + test %edx, %edx + jz L(align64_loop) + + + pcmpeqb -64(%rax), %xmm0 + sub $80, %rax + pmovmskb %xmm0, %edx + test %edx, %edx + jnz L(exit16) + + pcmpeqb 32(%rax), %xmm1 + pmovmskb %xmm1, %edx + test %edx, %edx + jnz L(exit32) + + pcmpeqb 48(%rax), %xmm2 + pmovmskb %xmm2, %edx + test %edx, %edx + jnz L(exit48) + + pcmpeqb 64(%rax), %xmm3 + pmovmskb %xmm3, %edx + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax + RETURN + + .p2align 4 +L(exit): + sub %rdi, %rax +L(exit_less16): + bsf %rdx, %rdx + add %rdx, %rax + RETURN + .p2align 4 +L(exit16): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $16, %rax + RETURN + .p2align 4 +L(exit32): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $32, %rax + RETURN + .p2align 4 +L(exit48): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $48, %rax + RETURN + .p2align 4 +L(exit64): + sub %rdi, %rax + bsf %rdx, %rdx + add %rdx, %rax + add $64, %rax +#ifndef USE_AS_STRCAT + RETURN + +END (STRLEN) +#endif diff --git a/libs/libc/machine/x86_64/gnu/arch_strncmp.S b/libs/libc/machine/x86_64/gnu/arch_strncmp.S new file mode 100644 index 0000000000..d13d9d3c72 --- /dev/null +++ b/libs/libc/machine/x86_64/gnu/arch_strncmp.S @@ -0,0 +1,40 @@ +/********************************************************************************* + * libs/libc/machine/x86_64/gnu/arch_strncmp.S + * + * Copyright (c) 2014, Intel Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright notice, + * * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above copyright notice, + * * this list of conditions and the following disclaimer in the documentation + * * and/or other materials provided with the distribution. + * + * * Neither the name of Intel Corporation nor the names of its contributors + * * may be used to endorse or promote products derived from this software + * * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED + * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR + * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON + * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *
+ *********************************************************************************/
+
+/*********************************************************************************
+ * Included Files
+ *********************************************************************************/
+
+#define USE_AS_STRNCMP
+#define STRCMP strncmp
+#include "arch_strcmp.S"
diff --git a/libs/libc/machine/x86_64/gnu/arch_strncpy.S b/libs/libc/machine/x86_64/gnu/arch_strncpy.S
new file mode 100644
index 0000000000..42ad3c09eb
--- /dev/null
+++ b/libs/libc/machine/x86_64/gnu/arch_strncpy.S
@@ -0,0 +1,40 @@
+/*********************************************************************************
+ * libs/libc/machine/x86_64/gnu/arch_strncpy.S
+ *
+ * Copyright (c) 2014, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *   * this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *   * this list of conditions and the following disclaimer in the documentation
+ *   * and/or other materials provided with the distribution.
+ *
+ *   * Neither the name of Intel Corporation nor the names of its contributors
+ *   * may be used to endorse or promote products derived from this software
+ *   * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *********************************************************************************/
+
+/*********************************************************************************
+ * Included Files
+ *********************************************************************************/
+
+#define USE_AS_STRNCPY
+#define STRCPY strncpy
+#include "arch_strcpy.S"
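Note on the two wrappers above: arch_strncmp.S and arch_strncpy.S contain no code of their own. Each defines a pair of macros and then #includes the corresponding base file, which compiles its single body under a different name with the extra length checks enabled. The same one-body-many-entry-points trick can be sketched in plain C, here with a generator macro instead of an #include; all names in this sketch are hypothetical:

    #include <stdio.h>
    #include <stddef.h>

    /* One body; BOUNDED selects strcpy- or strncpy-like behavior, much as
     * USE_AS_STRNCPY does for the shared assembly body. */

    #define DEFINE_COPY(NAME, BOUNDED)                        \
      static char *NAME(char *d, const char *s, size_t n)     \
      {                                                       \
        size_t i = 0;                                         \
        (void)n;                                              \
        while (s[i] != '\0' && (!(BOUNDED) || i < n))         \
          {                                                   \
            d[i] = s[i];                                      \
            i++;                                              \
          }                                                   \
        if (!(BOUNDED) || i < n)                              \
          {                                                   \
            d[i] = '\0';                                      \
          }                                                   \
        return d;                                             \
      }

    DEFINE_COPY(copy_str, 0)   /* unbounded, strcpy-like */
    DEFINE_COPY(copy_strn, 1)  /* bounded, strncpy-like (no zero padding) */

    int main(void)
    {
      char a[16];
      char b[16];

      printf("%s\n", copy_str(a, "hello", 0));     /* hello */
      printf("%.5s\n", copy_strn(b, "world!", 5)); /* world */
      return 0;
    }

The include-with-macros variant used by the patch has the advantage that the shared body lives in an ordinary assembler file that can still be built stand-alone as strcmp/strcpy.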
diff --git a/libs/libc/machine/x86_64/gnu/cache.h b/libs/libc/machine/x86_64/gnu/cache.h
new file mode 100644
index 0000000000..444e36f27c
--- /dev/null
+++ b/libs/libc/machine/x86_64/gnu/cache.h
@@ -0,0 +1,49 @@
+/*********************************************************************************
+ * libs/libc/machine/x86_64/gnu/cache.h
+ *
+ * Copyright (c) 2014, Intel Corporation
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ *   * Redistributions of source code must retain the above copyright notice,
+ *   * this list of conditions and the following disclaimer.
+ *
+ *   * Redistributions in binary form must reproduce the above copyright notice,
+ *   * this list of conditions and the following disclaimer in the documentation
+ *   * and/or other materials provided with the distribution.
+ *
+ *   * Neither the name of Intel Corporation nor the names of its contributors
+ *   * may be used to endorse or promote products derived from this software
+ *   * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
+ * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
+ * ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ *********************************************************************************/
+
+#ifndef __LIBS_LIBC_MACHINE_X86_64_GNU_CACHE_H
+#define __LIBS_LIBC_MACHINE_X86_64_GNU_CACHE_H
+
+/*********************************************************************************
+ * Pre-processor Definitions
+ *********************************************************************************/
+
+/* Values are optimized for Core Architecture */
+
+#define SHARED_CACHE_SIZE (4096 * 1024)  /* Core Architecture L2 Cache */
+#define DATA_CACHE_SIZE   (24 * 1024)    /* Core Architecture L1 Data Cache */
+
+#define SHARED_CACHE_SIZE_HALF (SHARED_CACHE_SIZE / 2)
+#define DATA_CACHE_SIZE_HALF   (DATA_CACHE_SIZE / 2)
+
+#endif /* __LIBS_LIBC_MACHINE_X86_64_GNU_CACHE_H */
diff --git a/libs/libc/string/lib_stpcpy.c b/libs/libc/string/lib_stpcpy.c
index 30492aca34..38095366c0 100644
--- a/libs/libc/string/lib_stpcpy.c
+++ b/libs/libc/string/lib_stpcpy.c
@@ -43,7 +43,7 @@
  *
  ****************************************************************************/
 
-#ifndef CONFIG_ARCH_STPCPY
+#ifndef CONFIG_LIBC_ARCH_STPCPY
 #undef stpcpy /* See mm/README.txt */
 FAR char *stpcpy(FAR char *dest, FAR const char *src)
 {
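Note on the final hunk: the generic C stpcpy must be guarded by the same LIBC_ARCH_STPCPY symbol the rest of the patch wires up, otherwise both the portable body and the optimized assembly would be linked into the image. In outline (a hedged sketch with NuttX decorations dropped; CONFIG_LIBC_ARCH_STPCPY is the real symbol, the body is abbreviated):

    #ifndef CONFIG_LIBC_ARCH_STPCPY
    /* Portable fallback, compiled only when no arch-optimized
     * stpcpy is configured. */

    char *stpcpy(char *dest, const char *src)
    {
      /* Copy including the terminator, returning a pointer to it */

      while ((*dest = *src++) != '\0')
        {
          dest++;
        }

      return dest;
    }
    #endif /* CONFIG_LIBC_ARCH_STPCPY */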