diff --git a/libs/libc/machine/xtensa/Kconfig b/libs/libc/machine/xtensa/Kconfig index f72f3c094c..232fb73889 100644 --- a/libs/libc/machine/xtensa/Kconfig +++ b/libs/libc/machine/xtensa/Kconfig @@ -2,3 +2,46 @@ # For a description of the syntax of this configuration file, # see the file kconfig-language.txt in the NuttX tools repository. # + +config XTENSA_MEMCPY + bool "Enable optimized memcpy() for XTENSA" + select LIBC_ARCH_MEMCPY + ---help--- + Enable optimized XTENSA specific memcpy() library function + +config XTENSA_MEMMOVE + bool "Enable optimized memmove() for XTENSA" + select LIBC_ARCH_MEMMOVE + ---help--- + Enable optimized XTENSA specific memmove() library function + +config XTENSA_MEMSET + bool "Enable optimized memset() for XTENSA" + select LIBC_ARCH_MEMSET + ---help--- + Enable optimized XTENSA specific memset() library function + +config XTENSA_STRCMP + bool "Enable optimized strcmp() for XTENSA" + select LIBC_ARCH_STRCMP + ---help--- + Enable optimized XTENSA specific strcmp() library function + +config XTENSA_STRCPY + bool "Enable optimized strcpy() for XTENSA" + select LIBC_ARCH_STRCPY + ---help--- + Enable optimized XTENSA specific strcpy() library function + +config XTENSA_STRLEN + bool "Enable optimized strlen() for XTENSA" + select LIBC_ARCH_STRLEN + ---help--- + Enable optimized XTENSA specific strlen() library function + +config XTENSA_STRNCPY + bool "Enable optimized strncpy() for XTENSA" + select LIBC_ARCH_STRNCPY + ---help--- + Enable optimized XTENSA specific strncpy() library function + diff --git a/libs/libc/machine/xtensa/Make.defs b/libs/libc/machine/xtensa/Make.defs index 8f33a82488..379c7da79a 100644 --- a/libs/libc/machine/xtensa/Make.defs +++ b/libs/libc/machine/xtensa/Make.defs @@ -19,10 +19,37 @@ ############################################################################ ifeq ($(CONFIG_LIBC_ARCH_ELF),y) - CSRCS += arch_elf.c +endif + +ifeq ($(CONFIG_XTENSA_MEMCPY),y) +ASRCS += arch_memcpy.S +endif + +ifeq ($(CONFIG_XTENSA_MEMMOVE),y) +ASRCS += arch_memmove.S +endif + +ifeq ($(CONFIG_XTENSA_MEMSET),y) +ASRCS += arch_memset.S +endif + +ifeq ($(CONFIG_XTENSA_STRCPY),y) +ASRCS += arch_strcpy.S +endif + +ifeq ($(CONFIG_XTENSA_STRLEN),y) +ASRCS += arch_strlen.S +endif + +ifeq ($(CONFIG_XTENSA_STRNCPY),y) +ASRCS += arch_strncpy.S +endif + +ifeq ($(CONFIG_XTENSA_STRCMP),y) +ASRCS += arch_strcmp.S +endif DEPPATH += --dep-path machine/xtensa VPATH += :machine/xtensa -endif diff --git a/libs/libc/machine/xtensa/arch_memcpy.S b/libs/libc/machine/xtensa/arch_memcpy.S new file mode 100644 index 0000000000..47de6dd2b6 --- /dev/null +++ b/libs/libc/machine/xtensa/arch_memcpy.S @@ -0,0 +1,281 @@ +/**************************************************************************** + * libs/libc/machine/xtensa/arch_memcpy.S + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. The + * ASF licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the + * License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the + * License for the specific language governing permissions and limitations + * under the License. + * + ****************************************************************************/ + +/**************************************************************************** + * Included Files + ****************************************************************************/ + +#include "xtensa_asm.h" + +#include +#include + +/**************************************************************************** + * Pre-processor Macros + ****************************************************************************/ + +/* set to 1 when running on ISS (simulator) with the + lint or ferret client, or 0 to save a few cycles */ + +#define SIM_CHECKS_ALIGNMENT 0 + +/**************************************************************************** + * Public Functions + ****************************************************************************/ + + .section .text + .begin schedule + .literal_position + + .local .Ldst1mod2 + .local .Ldst2mod4 + .local .Lbytecopy + + .align 4 + .global memcpy + .type memcpy, @function +memcpy: + ENTRY(16) + /* a2 = dst, a3 = src, a4 = len */ + + mov a5, a2 # copy dst so that a2 is return value + bbsi.l a2, 0, .Ldst1mod2 + bbsi.l a2, 1, .Ldst2mod4 +.Ldstaligned: + + /* Get number of loop iterations with 16B per iteration. */ + srli a7, a4, 4 + + /* Check if source is aligned. */ + slli a8, a3, 30 + bnez a8, .Lsrcunaligned + + /* Destination and source are word-aligned, use word copy. */ +#if XCHAL_HAVE_LOOPS + loopnez a7, 2f +#else + beqz a7, 2f + slli a8, a7, 4 + add a8, a8, a3 # a8 = end of last 16B source chunk +#endif +1: l32i a6, a3, 0 + l32i a7, a3, 4 + s32i a6, a5, 0 + l32i a6, a3, 8 + + s32i a7, a5, 4 + l32i a7, a3, 12 + s32i a6, a5, 8 + addi a3, a3, 16 + s32i a7, a5, 12 + addi a5, a5, 16 +#if !XCHAL_HAVE_LOOPS + bltu a3, a8, 1b +#endif + + /* Copy any leftover pieces smaller than 16B. */ +2: bbci.l a4, 3, 3f + + /* Copy 8 bytes. */ + l32i a6, a3, 0 + l32i a7, a3, 4 + addi a3, a3, 8 + s32i a6, a5, 0 + s32i a7, a5, 4 + addi a5, a5, 8 + +3: bbsi.l a4, 2, 4f + bbsi.l a4, 1, 5f + bbsi.l a4, 0, 6f + RET(16) + + # .align 4 + /* Copy 4 bytes. */ +4: l32i a6, a3, 0 + addi a3, a3, 4 + s32i a6, a5, 0 + addi a5, a5, 4 + bbsi.l a4, 1, 5f + bbsi.l a4, 0, 6f + RET(16) + + /* Copy 2 bytes. */ +5: l16ui a6, a3, 0 + addi a3, a3, 2 + s16i a6, a5, 0 + addi a5, a5, 2 + bbsi.l a4, 0, 6f + RET(16) + + /* Copy 1 byte. */ +6: l8ui a6, a3, 0 + s8i a6, a5, 0 + +.Ldone: + RET(16) + + +/* Destination is aligned; source is unaligned. */ + + # .align 4 +.Lsrcunaligned: + /* Avoid loading anything for zero-length copies. */ + beqz a4, .Ldone + + /* Copy 16 bytes per iteration for word-aligned dst and + unaligned src. */ + ssa8 a3 # set shift amount from byte offset +#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT + srli a11, a8, 30 # save unalignment offset for below + sub a3, a3, a11 # align a3 +#endif + l32i a6, a3, 0 # load first word +#if XCHAL_HAVE_LOOPS + loopnez a7, 2f +#else + beqz a7, 2f + slli a10, a7, 4 + add a10, a10, a3 # a10 = end of last 16B source chunk +#endif +1: l32i a7, a3, 4 + l32i a8, a3, 8 + src_b a6, a6, a7 + s32i a6, a5, 0 + l32i a9, a3, 12 + src_b a7, a7, a8 + s32i a7, a5, 4 + l32i a6, a3, 16 + src_b a8, a8, a9 + s32i a8, a5, 8 + addi a3, a3, 16 + src_b a9, a9, a6 + s32i a9, a5, 12 + addi a5, a5, 16 +#if !XCHAL_HAVE_LOOPS + bltu a3, a10, 1b +#endif + +2: bbci.l a4, 3, 3f + + /* Copy 8 bytes. 
*/ + l32i a7, a3, 4 + l32i a8, a3, 8 + src_b a6, a6, a7 + s32i a6, a5, 0 + addi a3, a3, 8 + src_b a7, a7, a8 + s32i a7, a5, 4 + addi a5, a5, 8 + mov a6, a8 + +3: bbci.l a4, 2, 4f + + /* Copy 4 bytes. */ + l32i a7, a3, 4 + addi a3, a3, 4 + src_b a6, a6, a7 + s32i a6, a5, 0 + addi a5, a5, 4 + mov a6, a7 +4: +#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT + add a3, a3, a11 # readjust a3 with correct misalignment +#endif + bbsi.l a4, 1, 5f + bbsi.l a4, 0, 6f + RET(16) + + /* Copy 2 bytes. */ +5: l8ui a6, a3, 0 + l8ui a7, a3, 1 + addi a3, a3, 2 + s8i a6, a5, 0 + s8i a7, a5, 1 + addi a5, a5, 2 + bbsi.l a4, 0, 6f + RET(16) + + /* Copy 1 byte. */ +6: l8ui a6, a3, 0 + s8i a6, a5, 0 + RET(16) + + + # .align XCHAL_INST_FETCH_WIDTH +__memcpy_aux: + + /* Skip bytes to get proper alignment for three-byte loop */ +# .skip XCHAL_INST_FETCH_WIDTH - 3 + +.Lbytecopy: +#if XCHAL_HAVE_LOOPS + loopnez a4, 2f +#else + beqz a4, 2f + add a7, a3, a4 # a7 = end address for source +#endif +1: l8ui a6, a3, 0 + addi a3, a3, 1 + s8i a6, a5, 0 + addi a5, a5, 1 +#if !XCHAL_HAVE_LOOPS + bltu a3, a7, 1b +#endif +2: RET(16) + + +/* Destination is unaligned. */ + + # .align 4 +.Ldst1mod2: # dst is only byte aligned + + /* Do short copies byte-by-byte. */ + bltui a4, 7, .Lbytecopy + + /* Copy 1 byte. */ + l8ui a6, a3, 0 + addi a3, a3, 1 + addi a4, a4, -1 + s8i a6, a5, 0 + addi a5, a5, 1 + + /* Return to main algorithm if dst is now aligned. */ + bbci.l a5, 1, .Ldstaligned + +.Ldst2mod4: # dst has 16-bit alignment + + /* Do short copies byte-by-byte. */ + bltui a4, 6, .Lbytecopy + + /* Copy 2 bytes. */ + l8ui a6, a3, 0 + l8ui a7, a3, 1 + addi a3, a3, 2 + addi a4, a4, -2 + s8i a6, a5, 0 + s8i a7, a5, 1 + addi a5, a5, 2 + + /* dst is now aligned; return to main algorithm. */ + j .Ldstaligned + + .end schedule + + .size memcpy, . - memcpy diff --git a/libs/libc/machine/xtensa/arch_memmove.S b/libs/libc/machine/xtensa/arch_memmove.S new file mode 100644 index 0000000000..7ce56c47b7 --- /dev/null +++ b/libs/libc/machine/xtensa/arch_memmove.S @@ -0,0 +1,480 @@ +/**************************************************************************** + * libs/libc/machine/xtensa/arch_memset.S + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. The + * ASF licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the + * License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + ****************************************************************************/ + +/**************************************************************************** + * Included Files + ****************************************************************************/ + +#include "xtensa_asm.h" + +#include +#include + +/**************************************************************************** + * Pre-processor Macros + ****************************************************************************/ + +/* set to 1 when running on ISS (simulator) with the + lint or ferret client, or 0 to save a few cycles */ + +#define SIM_CHECKS_ALIGNMENT 0 + +/**************************************************************************** + * Public Functions + ****************************************************************************/ + .text + .begin schedule + .global memmove + +/* + * Byte by byte copy + */ + .align 4 + .byte 0 # 1 mod 4 alignment for LOOPNEZ + # (0 mod 4 alignment for LBEG) +.Lbytecopy: +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lbytecopydone +#else /* !XCHAL_HAVE_LOOPS */ + beqz a4, .Lbytecopydone + add a7, a3, a4 # a7 = end address for source +#endif /* !XCHAL_HAVE_LOOPS */ +.Lnextbyte: + l8ui a6, a3, 0 + addi a3, a3, 1 + s8i a6, a5, 0 + addi a5, a5, 1 +#if !XCHAL_HAVE_LOOPS + bne a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end +#endif /* !XCHAL_HAVE_LOOPS */ +.Lbytecopydone: + RET(16) + +/* + * Destination is unaligned + */ + + .align 4 +.Ldst1mod2: # dst is only byte aligned + _bltui a4, 7, .Lbytecopy # do short copies byte by byte + + # copy 1 byte + l8ui a6, a3, 0 + addi a3, a3, 1 + addi a4, a4, -1 + s8i a6, a5, 0 + addi a5, a5, 1 + _bbci.l a5, 1, .Ldstaligned # if dst is now aligned, then + # return to main algorithm +.Ldst2mod4: # dst 16-bit aligned + # copy 2 bytes + _bltui a4, 6, .Lbytecopy # do short copies byte by byte + l8ui a6, a3, 0 + l8ui a7, a3, 1 + addi a3, a3, 2 + addi a4, a4, -2 + s8i a6, a5, 0 + s8i a7, a5, 1 + addi a5, a5, 2 + j .Ldstaligned # dst is now aligned, return to main algorithm + +.Lcommon: + bbsi.l a2, 0, .Ldst1mod2 # if dst is 1 mod 2 + bbsi.l a2, 1, .Ldst2mod4 # if dst is 2 mod 4 +.Ldstaligned: # return here from .Ldst?mod? once dst is aligned + srli a7, a4, 4 # number of loop iterations with 16B + # per iteration + movi a8, 3 # if source is not aligned, + bany a3, a8, .Lsrcunaligned # then use shifting copy + /* + * Destination and source are word-aligned, use word copy. 
+ */ + # copy 16 bytes per iteration for word-aligned dst and word-aligned src +#if XCHAL_HAVE_LOOPS + loopnez a7, .Loop1done +#else /* !XCHAL_HAVE_LOOPS */ + beqz a7, .Loop1done + slli a8, a7, 4 + add a8, a8, a3 # a8 = end of last 16B source chunk +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop1: + l32i a6, a3, 0 + l32i a7, a3, 4 + s32i a6, a5, 0 + l32i a6, a3, 8 + s32i a7, a5, 4 + l32i a7, a3, 12 + s32i a6, a5, 8 + addi a3, a3, 16 + s32i a7, a5, 12 + addi a5, a5, 16 +#if !XCHAL_HAVE_LOOPS + bne a3, a8, .Loop1 # continue loop if a3:src != a8:src_end +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop1done: + bbci.l a4, 3, .L2 + # copy 8 bytes + l32i a6, a3, 0 + l32i a7, a3, 4 + addi a3, a3, 8 + s32i a6, a5, 0 + s32i a7, a5, 4 + addi a5, a5, 8 +.L2: + bbsi.l a4, 2, .L3 + bbsi.l a4, 1, .L4 + bbsi.l a4, 0, .L5 + RET(16) +.L3: + # copy 4 bytes + l32i a6, a3, 0 + addi a3, a3, 4 + s32i a6, a5, 0 + addi a5, a5, 4 + bbsi.l a4, 1, .L4 + bbsi.l a4, 0, .L5 + RET(16) +.L4: + # copy 2 bytes + l16ui a6, a3, 0 + addi a3, a3, 2 + s16i a6, a5, 0 + addi a5, a5, 2 + bbsi.l a4, 0, .L5 + RET(16) +.L5: + # copy 1 byte + l8ui a6, a3, 0 + s8i a6, a5, 0 + RET(16) + +/* + * Destination is aligned, Source is unaligned + */ + + .align 4 +.Lsrcunaligned: + _beqz a4, .Ldone # avoid loading anything for zero-length copies + # copy 16 bytes per iteration for word-aligned dst and unaligned src + ssa8 a3 # set shift amount from byte offset + +#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT + and a11, a3, a8 # save unalignment offset for below + sub a3, a3, a11 # align a3 +#endif + l32i a6, a3, 0 # load first word +#if XCHAL_HAVE_LOOPS + loopnez a7, .Loop2done +#else /* !XCHAL_HAVE_LOOPS */ + beqz a7, .Loop2done + slli a10, a7, 4 + add a10, a10, a3 # a10 = end of last 16B source chunk +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop2: + l32i a7, a3, 4 + l32i a8, a3, 8 + src_b a6, a6, a7 + s32i a6, a5, 0 + l32i a9, a3, 12 + src_b a7, a7, a8 + s32i a7, a5, 4 + l32i a6, a3, 16 + src_b a8, a8, a9 + s32i a8, a5, 8 + addi a3, a3, 16 + src_b a9, a9, a6 + s32i a9, a5, 12 + addi a5, a5, 16 +#if !XCHAL_HAVE_LOOPS + bne a3, a10, .Loop2 # continue loop if a3:src != a10:src_end +#endif /* !XCHAL_HAVE_LOOPS */ +.Loop2done: + bbci.l a4, 3, .L12 + # copy 8 bytes + l32i a7, a3, 4 + l32i a8, a3, 8 + src_b a6, a6, a7 + s32i a6, a5, 0 + addi a3, a3, 8 + src_b a7, a7, a8 + s32i a7, a5, 4 + addi a5, a5, 8 + mov a6, a8 +.L12: + bbci.l a4, 2, .L13 + # copy 4 bytes + l32i a7, a3, 4 + addi a3, a3, 4 + src_b a6, a6, a7 + s32i a6, a5, 0 + addi a5, a5, 4 + mov a6, a7 +.L13: +#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT + add a3, a3, a11 # readjust a3 with correct misalignment +#endif + bbsi.l a4, 1, .L14 + bbsi.l a4, 0, .L15 +.Ldone: RET(16) +.L14: + # copy 2 bytes + l8ui a6, a3, 0 + l8ui a7, a3, 1 + addi a3, a3, 2 + s8i a6, a5, 0 + s8i a7, a5, 1 + addi a5, a5, 2 + bbsi.l a4, 0, .L15 + RET(16) +.L15: + # copy 1 byte + l8ui a6, a3, 0 + s8i a6, a5, 0 + RET(16) + +/* + * Byte by byte copy + */ + .align 4 + .byte 0 # 1 mod 4 alignment for LOOPNEZ + # (0 mod 4 alignment for LBEG) +.Lbackbytecopy: +#if XCHAL_HAVE_LOOPS + loopnez a4, .Lbackbytecopydone +#else /* !XCHAL_HAVE_LOOPS */ + beqz a4, .Lbackbytecopydone + sub a7, a3, a4 # a7 = start address for source +#endif /* !XCHAL_HAVE_LOOPS */ +.Lbacknextbyte: + addi a3, a3, -1 + l8ui a6, a3, 0 + addi a5, a5, -1 + s8i a6, a5, 0 +#if !XCHAL_HAVE_LOOPS + bne a3, a7, .Lbacknextbyte # continue loop if + # $a3:src != $a7:src_start +#endif /* !XCHAL_HAVE_LOOPS */ +.Lbackbytecopydone: + RET(16) + +/* + * Destination is 
unaligned + */ + + .align 4 +.Lbackdst1mod2: # dst is only byte aligned + _bltui a4, 7, .Lbackbytecopy # do short copies byte by byte + + # copy 1 byte + addi a3, a3, -1 + l8ui a6, a3, 0 + addi a5, a5, -1 + s8i a6, a5, 0 + addi a4, a4, -1 + _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then + # return to main algorithm +.Lbackdst2mod4: # dst 16-bit aligned + # copy 2 bytes + _bltui a4, 6, .Lbackbytecopy # do short copies byte by byte + addi a3, a3, -2 + l8ui a6, a3, 0 + l8ui a7, a3, 1 + addi a5, a5, -2 + s8i a6, a5, 0 + s8i a7, a5, 1 + addi a4, a4, -2 + j .Lbackdstaligned # dst is now aligned, + # return to main algorithm + + .align 4 +memmove: + + ENTRY(16) + # a2/ dst, a3/ src, a4/ len + mov a5, a2 # copy dst so that a2 is return value +.Lmovecommon: + sub a6, a5, a3 + bgeu a6, a4, .Lcommon + + add a5, a5, a4 + add a3, a3, a4 + + bbsi.l a5, 0, .Lbackdst1mod2 # if dst is 1 mod 2 + bbsi.l a5, 1, .Lbackdst2mod4 # if dst is 2 mod 4 +.Lbackdstaligned: # return here from .Lbackdst?mod? once dst is aligned + srli a7, a4, 4 # number of loop iterations with 16B + # per iteration + movi a8, 3 # if source is not aligned, + bany a3, a8, .Lbacksrcunaligned # then use shifting copy + /* + * Destination and source are word-aligned, use word copy. + */ + # copy 16 bytes per iteration for word-aligned dst and word-aligned src +#if XCHAL_HAVE_LOOPS + loopnez a7, .backLoop1done +#else /* !XCHAL_HAVE_LOOPS */ + beqz a7, .backLoop1done + slli a8, a7, 4 + sub a8, a3, a8 # a8 = start of first 16B source chunk +#endif /* !XCHAL_HAVE_LOOPS */ +.backLoop1: + addi a3, a3, -16 + l32i a7, a3, 12 + l32i a6, a3, 8 + addi a5, a5, -16 + s32i a7, a5, 12 + l32i a7, a3, 4 + s32i a6, a5, 8 + l32i a6, a3, 0 + s32i a7, a5, 4 + s32i a6, a5, 0 +#if !XCHAL_HAVE_LOOPS + bne a3, a8, .backLoop1 # continue loop if a3:src != a8:src_start +#endif /* !XCHAL_HAVE_LOOPS */ +.backLoop1done: + bbci.l a4, 3, .Lback2 + # copy 8 bytes + addi a3, a3, -8 + l32i a6, a3, 0 + l32i a7, a3, 4 + addi a5, a5, -8 + s32i a6, a5, 0 + s32i a7, a5, 4 +.Lback2: + bbsi.l a4, 2, .Lback3 + bbsi.l a4, 1, .Lback4 + bbsi.l a4, 0, .Lback5 + RET(16) +.Lback3: + # copy 4 bytes + addi a3, a3, -4 + l32i a6, a3, 0 + addi a5, a5, -4 + s32i a6, a5, 0 + bbsi.l a4, 1, .Lback4 + bbsi.l a4, 0, .Lback5 + RET(16) +.Lback4: + # copy 2 bytes + addi a3, a3, -2 + l16ui a6, a3, 0 + addi a5, a5, -2 + s16i a6, a5, 0 + bbsi.l a4, 0, .Lback5 + RET(16) +.Lback5: + # copy 1 byte + addi a3, a3, -1 + l8ui a6, a3, 0 + addi a5, a5, -1 + s8i a6, a5, 0 + RET(16) + +/* + * Destination is aligned, Source is unaligned + */ + + .align 4 +.Lbacksrcunaligned: + _beqz a4, .Lbackdone # avoid loading anything for zero-length copies + # copy 16 bytes per iteration for word-aligned dst and unaligned src + ssa8 a3 # set shift amount from byte offset +#define SIM_CHECKS_ALIGNMENT 1 /* set to 1 when running on ISS with + * the lint or ferret client, or 0 + * to save a few cycles */ +#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT + and a11, a3, a8 # save unalignment offset for below + sub a3, a3, a11 # align a3 +#endif + l32i a6, a3, 0 # load first word +#if XCHAL_HAVE_LOOPS + loopnez a7, .backLoop2done +#else /* !XCHAL_HAVE_LOOPS */ + beqz a7, .backLoop2done + slli a10, a7, 4 + sub a10, a3, a10 # a10 = start of first 16B source chunk +#endif /* !XCHAL_HAVE_LOOPS */ +.backLoop2: + addi a3, a3, -16 + l32i a7, a3, 12 + l32i a8, a3, 8 + addi a5, a5, -16 + src_b a6, a7, a6 + s32i a6, a5, 12 + l32i a9, a3, 4 + src_b a7, a8, a7 + s32i a7, a5, 8 + l32i a6, a3, 0 + src_b a8, a9, a8 + s32i a8, 
a5, 4 + src_b a9, a6, a9 + s32i a9, a5, 0 +#if !XCHAL_HAVE_LOOPS + bne a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start +#endif /* !XCHAL_HAVE_LOOPS */ +.backLoop2done: + bbci.l a4, 3, .Lback12 + # copy 8 bytes + addi a3, a3, -8 + l32i a7, a3, 4 + l32i a8, a3, 0 + addi a5, a5, -8 + src_b a6, a7, a6 + s32i a6, a5, 4 + src_b a7, a8, a7 + s32i a7, a5, 0 + mov a6, a8 +.Lback12: + bbci.l a4, 2, .Lback13 + # copy 4 bytes + addi a3, a3, -4 + l32i a7, a3, 0 + addi a5, a5, -4 + src_b a6, a7, a6 + s32i a6, a5, 0 + mov a6, a7 +.Lback13: +#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT + add a3, a3, a11 # readjust a3 with correct misalignment +#endif + bbsi.l a4, 1, .Lback14 + bbsi.l a4, 0, .Lback15 +.Lbackdone: + RET(16) +.Lback14: + # copy 2 bytes + addi a3, a3, -2 + l8ui a6, a3, 0 + l8ui a7, a3, 1 + addi a5, a5, -2 + s8i a6, a5, 0 + s8i a7, a5, 1 + bbsi.l a4, 0, .Lback15 + RET(16) +.Lback15: + # copy 1 byte + addi a3, a3, -1 + addi a5, a5, -1 + l8ui a6, a3, 0 + s8i a6, a5, 0 + RET(16) + + .end schedule + .size memmove, . - memmove diff --git a/libs/libc/machine/xtensa/arch_memset.S b/libs/libc/machine/xtensa/arch_memset.S new file mode 100644 index 0000000000..488172f874 --- /dev/null +++ b/libs/libc/machine/xtensa/arch_memset.S @@ -0,0 +1,179 @@ +/**************************************************************************** + * libs/libc/machine/xtensa/arch_memset.S + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. The + * ASF licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the + * License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + ****************************************************************************/ + +/**************************************************************************** + * Included Files + ****************************************************************************/ + +#include "xtensa_asm.h" + +#include +#include + +/**************************************************************************** + * Public Functions + ****************************************************************************/ + +/* void *memset (void *dst, int c, size_t length) + + The algorithm is as follows: + + Create a word with c in all byte positions. + + If the destination is aligned, set 16B chunks with a loop, and then + finish up with 8B, 4B, 2B, and 1B stores conditional on the length. + + If the destination is unaligned, align it by conditionally + setting 1B and/or 2B and then go to aligned case. + + This code tries to use fall-through branches for the common + case of an aligned destination (except for the branches to + the alignment labels). */ + + +/* Byte-by-byte set. 
*/ + + .section .text + .begin schedule + .literal_position + + .local .Lbyteset + .local .Ldst1mod2 + .local .Ldst2mod4 + + .align 4 + .global memset + .type memset, @function +memset: + ENTRY(16) + /* a2 = dst, a3 = c, a4 = length */ + + /* Duplicate character into all bytes of word. */ + extui a3, a3, 0, 8 + slli a7, a3, 8 + or a3, a3, a7 + slli a7, a3, 16 + or a3, a3, a7 + + mov a5, a2 // copy dst so that a2 is return value + + /* Check if dst is unaligned. */ + bbsi.l a2, 0, .Ldst1mod2 + bbsi.l a2, 1, .Ldst2mod4 + j .Ldstaligned + +.Ldst1mod2: // dst is only byte aligned + + /* Do short sizes byte-by-byte. */ + bltui a4, 8, .Lbyteset + + /* Set 1 byte. */ + s8i a3, a5, 0 + addi a5, a5, 1 + addi a4, a4, -1 + + /* Now retest if dst is aligned. */ + bbci.l a5, 1, .Ldstaligned + +.Ldst2mod4: // dst has 16-bit alignment + + /* Do short sizes byte-by-byte. */ + bltui a4, 8, .Lbyteset + + /* Set 2 bytes. */ + s16i a3, a5, 0 + addi a5, a5, 2 + addi a4, a4, -2 + + /* dst is now aligned; fall through to main algorithm */ + +.Ldstaligned: + + /* Get number of loop iterations with 16B per iteration. */ + srli a7, a4, 4 + + /* Destination is word-aligned. */ +#if XCHAL_HAVE_LOOPS + loopnez a7, 2f +#else + beqz a7, 2f + slli a6, a7, 4 + add a6, a6, a5 // a6 = end of last 16B chunk +#endif + /* Set 16 bytes per iteration. */ +1: s32i a3, a5, 0 + s32i a3, a5, 4 + s32i a3, a5, 8 + s32i a3, a5, 12 + addi a5, a5, 16 +#if !XCHAL_HAVE_LOOPS + bltu a5, a6, 1b +#endif + + /* Set any leftover pieces smaller than 16B. */ +2: bbci.l a4, 3, 3f + + /* Set 8 bytes. */ + s32i a3, a5, 0 + s32i a3, a5, 4 + addi a5, a5, 8 + +3: bbci.l a4, 2, 4f + + /* Set 4 bytes. */ + s32i a3, a5, 0 + addi a5, a5, 4 + +4: bbci.l a4, 1, 5f + + /* Set 2 bytes. */ + s16i a3, a5, 0 + addi a5, a5, 2 + +5: bbci.l a4, 0, 6f + + /* Set 1 byte. */ + s8i a3, a5, 0 +6: RET(16) + + + // .align XCHAL_INST_FETCH_WIDTH +__memset_aux: + + /* Skip bytes to get proper alignment for three-byte loop */ +// .skip XCHAL_INST_FETCH_WIDTH - 3 + +.Lbyteset: +#if XCHAL_HAVE_LOOPS + loopnez a4, 2f +#else + beqz a4, 2f + add a6, a5, a4 // a6 = ending address +#endif +1: s8i a3, a5, 0 + addi a5, a5, 1 +#if !XCHAL_HAVE_LOOPS + bltu a5, a6, 1b +#endif +2: RET(16) + + .end schedule + + .size memset, . - memset diff --git a/libs/libc/machine/xtensa/arch_strcmp.S b/libs/libc/machine/xtensa/arch_strcmp.S new file mode 100644 index 0000000000..aab50bee9e --- /dev/null +++ b/libs/libc/machine/xtensa/arch_strcmp.S @@ -0,0 +1,767 @@ +/**************************************************************************** + * libs/libc/machine/xtensa/arch_strcmp.S + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. The + * ASF licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the + * License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + ****************************************************************************/ + +/**************************************************************************** + * Included Files + ****************************************************************************/ + +#include "xtensa_asm.h" + +#include +#include + +/**************************************************************************** + * Pre-processor Macros + ****************************************************************************/ + +#define MASK4 0x40404040 + +/**************************************************************************** + * Public Functions + ****************************************************************************/ + + .section .text + .begin schedule + .align 4 + .literal_position + + .global strcmp + .type strcmp,@function + .align 4 + +strcmp: + +#if XCHAL_HAVE_LOOPS && XCHAL_HAVE_DENSITY && !XCHAL_HAVE_BE && XCHAL_HAVE_FLIX3 +/* Fast version for FLIX3 Little Endian */ + + + ENTRY(16) + /* a2 = s1, a3 = s2 */ + + l8ui a8, a2, 0 # byte 0 from s1 + l8ui a9, a3, 0 # byte 0 from s2 + movi a10, 3 # mask + movi a5, 0xfffffffc + or a11, a2, a3 + movi a4, MASK0 # mask for byte 0 + movi a7, MASK4 + addi a3, a3, -8 + addi a2, a2, -8 + and a5, a5, a2 + bne.w18 a8, a9, .Lretdiff + l32i a8, a5, 8 # get word from aligned variant of s1 + + bany.w18 a11, a10, .Lnot_aligned + +/* s1 is word-aligned; s2 is word-aligned. + + If the zero-overhead loop option is available, use an (almost) + infinite zero-overhead loop with conditional exits so we only pay + for taken branches when exiting the loop. */ + +/* New algorithm, relying on the fact that all normal ASCII is between + 32 and 127. + + Rather than check all bytes for zero: + Take one word (4 bytes). Call it w1. + Shift w1 left by one into w1'. + Or w1 and w1'. For all normal ASCII bit 6 will be 1; for zero it won't. + Check that all 4 bit 6's (one for each byte) are one: + If they are, we are definitely not done. + If they are not, we are probably done, but need to check for zero. */ + +.Laligned: + /* Loop forever */ +1: + loop a0, .Laligned_done + + /* First unrolled loop body. */ + l32i a9, a3, 8 # get word from s2 + addi a3, a3, 8 # advance s2 pointer + slli a5, a8, 1 + or a10, a8, a5 + {l32i a11, a2, 12 # get word from s1+4 + bne.w18 a8, a9, .Lwne2} + l32i a9, a3, 4 # get word from s2+4 + bnall.w18 a10, a7, .Lprobeq + + /* Second unrolled loop body. */ + slli a5, a11, 1 + or a10, a11, a5 + addi a2, a2, 8 # advance s1 pointer + mov a8, a11 + bne.w18 a11, a9, .Lwne2 + l32i a8, a2, 8 # get word from s1 + bnall.w18 a10, a7, .Lprobeq2 + +.Laligned_done: + l32i a8, a2, 8 # get word from s1 + j 1b + +.Lnot_aligned: + xor a11, a2, a3 # compare low two bits of s1 and s2 + bany a11, a10, .Lunaligned # if they have different alignment + + /* s1/s2 are not word-aligned. 
*/ + movi a5, 0xfffffffc + addi a2, a2, 1 # advance s1 + beqz a9, .Leq # bytes equal, if zero, strings are equal + addi a3, a3, 1 # advance s2 + and a6, a2, a5 + l32i a8, a6, 8 # get word from s1 + bnone a2, a10, .Laligned # if s1/s2 now aligned + l8ui a8, a2, 8 # byte 1 from s1 + l8ui a9, a3, 8 # byte 1 from s2 + addi a2, a2, 1 # advance s1 + bne a8, a9, .Lretdiff # if different, return difference + beqz a8, .Leq # bytes equal, if zero, strings are equal + addi a3, a3, 1 # advance s2 + and a6, a2, a5 + l32i a8, a6, 8 # get word from s1 + bnone a2, a10, .Laligned # if s1/s2 now aligned + l8ui a8, a2, 8 # byte 2 from s1 + l8ui a9, a3, 8 # byte 2 from s2 + addi a2, a2, 1 # advance s1 + bne a8, a9, .Lretdiff # if different, return difference + beqz a8, .Leq # bytes equal, if zero, strings are equal + addi a3, a3, 1 # advance s2 + l32i a8, a2, 8 # get word from s1 + j .Laligned + +/* s1 and s2 have different alignment. + + If the zero-overhead loop option is available, use an (almost) + infinite zero-overhead loop with conditional exits so we only pay + for taken branches when exiting the loop. + + Note: It is important for this unaligned case to come before the + code for aligned strings, because otherwise some of the branches + above cannot reach and have to be transformed to branches around + jumps. The unaligned code is smaller and the branches can reach + over it. */ + +.Lunaligned: + movi.n a8, 0 # set up for the maximum loop count + loop a8, .Lretdiff # loop forever (almost anyway) + l8ui a8, a2, 8 + l8ui a9, a3, 8 + addi a2, a2, 1 + bne a8, a9, .Lretdiff + addi a3, a3, 1 + beqz a8, .Lretdiff +.Lretdiff: + sub a2, a8, a9 + RET(16) + + +.Lprobeq2: + /* Adjust pointers to account for the loop unrolling. */ + mov a8, a11 + addi a2, a2, -4 + addi a3, a3, 4 + + /* align (0 mod 4) */ +.Lprobeq: + /* Words are probably equal, but check for sure. + If not, loop over the rest of string using normal algorithm. */ + + bnone a8, a4, .Leq # if byte 0 is zero + movi a5, MASK1 # mask for byte 1 + movi a6, MASK2 # mask for byte 2 + bnone a8, a5, .Leq # if byte 1 is zero + movi a7, MASK3 # mask for byte 3 + bnone a8, a6, .Leq # if byte 2 is zero + bnone a8, a7, .Leq # if byte 3 is zero + /* align (1 mod 4) */ + addi.n a2, a2, 12 # advance s1 pointer + addi.n a3, a3, 4 # advance s2 pointer + /* align (1 mod 4) or (2 mod 4) */ +1: + loop a0, .Lend # loop forever (a4 is bigger than max iters) + + l32i a8, a2, 0 # get word from s1 + l32i a9, a3, 0 # get word from s2 + addi a2, a2, 4 # advance s1 pointer + bne a8, a9, .Lwne + bnone a8, a4, .Leq # if byte 0 is zero + bnone a8, a5, .Leq # if byte 1 is zero + bnone a8, a6, .Leq # if byte 2 is zero + bnone a8, a7, .Leq # if byte 3 is zero + addi a3, a3, 4 # advance s2 pointer +.Lend: + j 1b + + /* Words are equal; some byte is zero. */ +.Leq: movi a2, 0 # return equal + RET(16) + +.Lwne2: /* Words are not equal. On big-endian processors, if none of the + bytes are zero, the return value can be determined by a simple + comparison. */ +.Lwne: /* Words are not equal. 
*/ + xor a2, a8, a9 # get word with nonzero in byte that differs + extui a10, a8, 0, 8 + extui a11, a9, 0, 8 + movi a5, MASK1 # mask for byte 1 + bany.w18 a2, a4, .Ldiff0 # if byte 0 differs + + bnone.w18 a8, a4, .Leq # if byte 0 is zero + movi a6, MASK2 # mask for byte 2 + bany.w18 a2, a5, .Ldiff1 # if byte 1 differs + extui a10, a8, 24, 8 + bnone.w18 a8, a5, .Leq # if byte 1 is zero + extui a11, a9, 24, 8 + bany.w18 a2, a6, .Ldiff2 # if byte 2 differs + sub a2, a10, a11 + bnone.w18 a8, a6, .Leq # if byte 2 is zero + /* Little-endian is a little more difficult because can't subtract + whole words. */ +.Ldiff3: + /* Bytes 0-2 are equal; byte 3 is different. + For little-endian need to have a sign bit for the difference. */ + RET(16) +.Ldiff0: + /* Byte 0 is different. */ + sub a2, a10, a11 + RET(16) + +.Ldiff1: + /* Byte 0 is equal; byte 1 is different. */ + extui a10, a8, 8, 8 + extui a11, a9, 8, 8 + sub a2, a10, a11 + RET(16) + +.Ldiff2: + /* Bytes 0-1 are equal; byte 2 is different. */ + extui a10, a8, 16, 8 + extui a11, a9, 16, 8 + sub a2, a10, a11 + RET(16) + +#else +#if XCHAL_HAVE_LOOPS && XCHAL_HAVE_DENSITY && !XCHAL_HAVE_BE && XCHAL_HAVE_PDX4 +/* Fast version for FLIX3 Little Endian */ + + + ENTRY(16) + /* a2 = s1, a3 = s2 */ + + l8ui a8, a2, 0 # byte 0 from s1 + l8ui a9, a3, 0 # byte 0 from s2 + movi a10, 3 # mask + movi a5, 0xfffffffc + or a11, a2, a3 + movi a4, MASK0 # mask for byte 0 + movi a7, MASK4 + addi a3, a3, -8 + addi a2, a2, -8 + and a5, a5, a2 + bne.w15 a8, a9, .Lretdiff + l32i a8, a5, 8 # get word from aligned variant of s1 + + bany.w15 a11, a10, .Lnot_aligned + +/* s1 is word-aligned; s2 is word-aligned. + + If the zero-overhead loop option is available, use an (almost) + infinite zero-overhead loop with conditional exits so we only pay + for taken branches when exiting the loop. */ + +/* New algorithm, relying on the fact that all normal ASCII is between + 32 and 127. + + Rather than check all bytes for zero: + Take one word (4 bytes). Call it w1. + Shift w1 left by one into w1'. + Or w1 and w1'. For all normal ASCII bit 6 will be 1; for zero it won't. + Check that all 4 bit 6's (one for each byte) are one: + If they are, we are definitely not done. + If they are not, we are probably done, but need to check for zero. */ + +.Laligned: + /* Loop forever */ +1: + loop a0, .Laligned_done + + /* First unrolled loop body. */ + l32i a9, a3, 8 # get word from s2 + addi a3, a3, 8 # advance s2 pointer + slli a5, a8, 1 + or a10, a8, a5 + { + bne.w15 a8, a9, .Lwne2 + l32i a11, a2, 12 # get word from s1+4 + nop + nop + } + l32i a9, a3, 4 # get word from s2+4 + bnall.w15 a10, a7, .Lprobeq + + /* Second unrolled loop body. */ + slli a5, a11, 1 + or a10, a11, a5 + addi a2, a2, 8 # advance s1 pointer + mov a8, a11 + bne.w15 a11, a9, .Lwne2 + l32i a8, a2, 8 # get word from s1 + bnall.w15 a10, a7, .Lprobeq2 + +.Laligned_done: + l32i a8, a2, 8 # get word from s1 + j 1b + +.Lnot_aligned: + xor a11, a2, a3 # compare low two bits of s1 and s2 + bany a11, a10, .Lunaligned # if they have different alignment + + /* s1/s2 are not word-aligned. 
*/ + movi a5, 0xfffffffc + addi a2, a2, 1 # advance s1 + beqz a9, .Leq # bytes equal, if zero, strings are equal + addi a3, a3, 1 # advance s2 + and a6, a2, a5 + l32i a8, a6, 8 # get word from s1 + bnone a2, a10, .Laligned # if s1/s2 now aligned + l8ui a8, a2, 8 # byte 1 from s1 + l8ui a9, a3, 8 # byte 1 from s2 + addi a2, a2, 1 # advance s1 + bne a8, a9, .Lretdiff # if different, return difference + beqz a8, .Leq # bytes equal, if zero, strings are equal + addi a3, a3, 1 # advance s2 + and a6, a2, a5 + l32i a8, a6, 8 # get word from s1 + bnone a2, a10, .Laligned # if s1/s2 now aligned + l8ui a8, a2, 8 # byte 2 from s1 + l8ui a9, a3, 8 # byte 2 from s2 + addi a2, a2, 1 # advance s1 + bne a8, a9, .Lretdiff # if different, return difference + beqz a8, .Leq # bytes equal, if zero, strings are equal + addi a3, a3, 1 # advance s2 + l32i a8, a2, 8 # get word from s1 + j .Laligned + +/* s1 and s2 have different alignment. + + If the zero-overhead loop option is available, use an (almost) + infinite zero-overhead loop with conditional exits so we only pay + for taken branches when exiting the loop. + + Note: It is important for this unaligned case to come before the + code for aligned strings, because otherwise some of the branches + above cannot reach and have to be transformed to branches around + jumps. The unaligned code is smaller and the branches can reach + over it. */ + +.Lunaligned: + movi.n a8, 0 # set up for the maximum loop count + loop a8, .Lretdiff # loop forever (almost anyway) + l8ui a8, a2, 8 + l8ui a9, a3, 8 + addi a2, a2, 1 + bne a8, a9, .Lretdiff + addi a3, a3, 1 + beqz a8, .Lretdiff +.Lretdiff: + sub a2, a8, a9 + RET(16) + + +.Lprobeq2: + /* Adjust pointers to account for the loop unrolling. */ + mov a8, a11 + addi a2, a2, -4 + addi a3, a3, 4 + + /* align (0 mod 4) */ +.Lprobeq: + /* Words are probably equal, but check for sure. + If not, loop over the rest of string using normal algorithm. */ + + bnone a8, a4, .Leq # if byte 0 is zero + movi a5, MASK1 # mask for byte 1 + movi a6, MASK2 # mask for byte 2 + bnone a8, a5, .Leq # if byte 1 is zero + movi a7, MASK3 # mask for byte 3 + bnone a8, a6, .Leq # if byte 2 is zero + bnone a8, a7, .Leq # if byte 3 is zero + /* align (1 mod 4) */ + addi.n a2, a2, 12 # advance s1 pointer + addi.n a3, a3, 4 # advance s2 pointer + /* align (1 mod 4) or (2 mod 4) */ +1: + loop a0, .Lend # loop forever (a4 is bigger than max iters) + + l32i a8, a2, 0 # get word from s1 + l32i a9, a3, 0 # get word from s2 + addi a2, a2, 4 # advance s1 pointer + bne a8, a9, .Lwne + bnone a8, a4, .Leq # if byte 0 is zero + bnone a8, a5, .Leq # if byte 1 is zero + bnone a8, a6, .Leq # if byte 2 is zero + bnone a8, a7, .Leq # if byte 3 is zero + addi a3, a3, 4 # advance s2 pointer +.Lend: + j 1b + + /* Words are equal; some byte is zero. */ +.Leq: movi a2, 0 # return equal + RET(16) + +.Lwne2: /* Words are not equal. On big-endian processors, if none of the + bytes are zero, the return value can be determined by a simple + comparison. */ +.Lwne: /* Words are not equal. 
*/ + xor a2, a8, a9 # get word with nonzero in byte that differs + extui a10, a8, 0, 8 + extui a11, a9, 0, 8 + movi a5, MASK1 # mask for byte 1 + bany.w15 a2, a4, .Ldiff0 # if byte 0 differs + + bnone.w15 a8, a4, .Leq # if byte 0 is zero + movi a6, MASK2 # mask for byte 2 + bany.w15 a2, a5, .Ldiff1 # if byte 1 differs + extui a10, a8, 24, 8 + bnone.w15 a8, a5, .Leq # if byte 1 is zero + extui a11, a9, 24, 8 + bany.w15 a2, a6, .Ldiff2 # if byte 2 differs + sub a2, a10, a11 + bnone.w15 a8, a6, .Leq # if byte 2 is zero + /* Little-endian is a little more difficult because can't subtract + whole words. */ +.Ldiff3: + /* Bytes 0-2 are equal; byte 3 is different. + For little-endian need to have a sign bit for the difference. */ + RET(16) +.Ldiff0: + /* Byte 0 is different. */ + sub a2, a10, a11 + RET(16) + +.Ldiff1: + /* Byte 0 is equal; byte 1 is different. */ + extui a10, a8, 8, 8 + extui a11, a9, 8, 8 + sub a2, a10, a11 + RET(16) + +.Ldiff2: + /* Bytes 0-1 are equal; byte 2 is different. */ + extui a10, a8, 16, 8 + extui a11, a9, 16, 8 + sub a2, a10, a11 + RET(16) + + +#else /* Not FLIX3 */ + ENTRY(16) + /* a2 = s1, a3 = s2 */ + + l8ui a8, a2, 0 # byte 0 from s1 + l8ui a9, a3, 0 # byte 0 from s2 + movi a10, 3 # mask + bne a8, a9, .Lretdiff + + or a11, a2, a3 + bnone a11, a10, .Laligned + + xor a11, a2, a3 # compare low two bits of s1 and s2 + bany a11, a10, .Lunaligned # if they have different alignment + + /* s1/s2 are not word-aligned. */ + addi a2, a2, 1 # advance s1 + beqz a8, .Leq # bytes equal, if zero, strings are equal + addi a3, a3, 1 # advance s2 + bnone a2, a10, .Laligned # if s1/s2 now aligned + l8ui a8, a2, 0 # byte 1 from s1 + l8ui a9, a3, 0 # byte 1 from s2 + addi a2, a2, 1 # advance s1 + bne a8, a9, .Lretdiff # if different, return difference + beqz a8, .Leq # bytes equal, if zero, strings are equal + addi a3, a3, 1 # advance s2 + bnone a2, a10, .Laligned # if s1/s2 now aligned + l8ui a8, a2, 0 # byte 2 from s1 + l8ui a9, a3, 0 # byte 2 from s2 + addi a2, a2, 1 # advance s1 + bne a8, a9, .Lretdiff # if different, return difference + beqz a8, .Leq # bytes equal, if zero, strings are equal + addi a3, a3, 1 # advance s2 + j .Laligned + +/* s1 and s2 have different alignment. + + If the zero-overhead loop option is available, use an (almost) + infinite zero-overhead loop with conditional exits so we only pay + for taken branches when exiting the loop. + + Note: It is important for this unaligned case to come before the + code for aligned strings, because otherwise some of the branches + above cannot reach and have to be transformed to branches around + jumps. The unaligned code is smaller and the branches can reach + over it. */ + + .align 4 +#if XCHAL_HAVE_LOOPS +#if XCHAL_HAVE_DENSITY + /* (2 mod 4) alignment for loop instruction */ +#else + /* (1 mod 4) alignment for loop instruction */ + .byte 0 + .byte 0 +#endif +#endif +.Lunaligned: +#if XCHAL_HAVE_LOOPS +#if XCHAL_HAVE_DENSITY + _movi.n a8, 0 # set up for the maximum loop count +#else + _movi a8, 0 # set up for the maximum loop count +#endif + loop a8, .Lretdiff # loop forever (almost anyway) +#endif +.Lnextbyte: + l8ui a8, a2, 0 + l8ui a9, a3, 0 + addi a2, a2, 1 + bne a8, a9, .Lretdiff + addi a3, a3, 1 +#if XCHAL_HAVE_LOOPS + beqz a8, .Lretdiff +#else + bnez a8, .Lnextbyte +#endif +.Lretdiff: + sub a2, a8, a9 + RET(16) + +/* s1 is word-aligned; s2 is word-aligned. 
+ + If the zero-overhead loop option is available, use an (almost) + infinite zero-overhead loop with conditional exits so we only pay + for taken branches when exiting the loop. */ + +/* New algorithm, relying on the fact that all normal ASCII is between + 32 and 127. + + Rather than check all bytes for zero: + Take one word (4 bytes). Call it w1. + Shift w1 left by one into w1'. + Or w1 and w1'. For all normal ASCII bit 6 will be 1; for zero it won't. + Check that all 4 bit 6's (one for each byte) are one: + If they are, we are definitely not done. + If they are not, we are probably done, but need to check for zero. */ + + .align 4 +#if XCHAL_HAVE_LOOPS +#if XCHAL_HAVE_CONST16 + /* (2 mod 4) alignment for loop instruction */ + .byte 0 +#endif +.Laligned: + movi a4, MASK0 # mask for byte 0 + movi a7, MASK4 + + /* Loop forever */ +1: + loop a0, .Laligned_done + + /* First unrolled loop body. */ + l32i a8, a2, 0 # get word from s1 + l32i a9, a3, 0 # get word from s2 + slli a5, a8, 1 + bne a8, a9, .Lwne2 + or a9, a8, a5 + bnall a9, a7, .Lprobeq + + /* Second unrolled loop body. */ + l32i a8, a2, 4 # get word from s1+4 + l32i a9, a3, 4 # get word from s2+4 + slli a5, a8, 1 + bne a8, a9, .Lwne2 + or a9, a8, a5 + bnall a9, a7, .Lprobeq2 + + addi a2, a2, 8 # advance s1 pointer + addi a3, a3, 8 # advance s2 pointer +.Laligned_done: + j 1b + +.Lprobeq2: + /* Adjust pointers to account for the loop unrolling. */ + addi a2, a2, 4 + addi a3, a3, 4 + +#else /* !XCHAL_HAVE_LOOPS */ + +.Laligned: + movi a4, MASK0 # mask for byte 0 + movi a7, MASK4 + j .Lfirstword +.Lnextword: + addi a2, a2, 4 # advance s1 pointer + addi a3, a3, 4 # advance s2 pointer +.Lfirstword: + l32i a8, a2, 0 # get word from s1 + l32i a9, a3, 0 # get word from s2 + slli a5, a8, 1 + bne a8, a9, .Lwne2 + or a9, a8, a5 + ball a9, a7, .Lnextword +#endif /* !XCHAL_HAVE_LOOPS */ + + /* align (0 mod 4) */ +.Lprobeq: + /* Words are probably equal, but check for sure. + If not, loop over the rest of string using normal algorithm. 
*/ + + bnone a8, a4, .Leq # if byte 0 is zero + movi a5, MASK1 # mask for byte 1 + movi a6, MASK2 # mask for byte 2 + bnone a8, a5, .Leq # if byte 1 is zero + movi a7, MASK3 # mask for byte 3 + bnone a8, a6, .Leq # if byte 2 is zero + bnone a8, a7, .Leq # if byte 3 is zero + /* align (1 mod 4) */ +#if XCHAL_HAVE_DENSITY + addi.n a2, a2, 4 # advance s1 pointer + addi.n a3, a3, 4 # advance s2 pointer + /* align (1 mod 4) or (2 mod 4) */ +#else + addi a2, a2, 4 # advance s1 pointer + addi a3, a3, 4 # advance s2 pointer + or a1, a1, a1 # nop +#if XCHAL_HAVE_CONST16 + or a1, a1, a1 # nop +#endif + /* align (2 mod 4) */ +#endif /* XCHAL_HAVE_DENSITY */ +#if XCHAL_HAVE_LOOPS +1: + loop a0, .Leq # loop forever (a4 is bigger than max iters) + l32i a8, a2, 0 # get word from s1 + l32i a9, a3, 0 # get word from s2 + addi a2, a2, 4 # advance s1 pointer + bne a8, a9, .Lwne + bnone a8, a4, .Leq # if byte 0 is zero + bnone a8, a5, .Leq # if byte 1 is zero + bnone a8, a6, .Leq # if byte 2 is zero + bnone a8, a7, .Leq # if byte 3 is zero + addi a3, a3, 4 # advance s2 pointer + j 1b +#else /* !XCHAL_HAVE_LOOPS */ + + j .Lfirstword2 +.Lnextword2: + addi a3, a3, 4 # advance s2 pointer +.Lfirstword2: + l32i a8, a2, 0 # get word from s1 + l32i a9, a3, 0 # get word from s2 + addi a2, a2, 4 # advance s1 pointer + bne a8, a9, .Lwne + bnone a8, a4, .Leq # if byte 0 is zero + bnone a8, a5, .Leq # if byte 1 is zero + bnone a8, a6, .Leq # if byte 2 is zero + bany a8, a7, .Lnextword2 # if byte 3 is zero +#endif /* !XCHAL_HAVE_LOOPS */ + + /* Words are equal; some byte is zero. */ +.Leq: movi a2, 0 # return equal + RET(16) + +.Lwne2: /* Words are not equal. On big-endian processors, if none of the + bytes are zero, the return value can be determined by a simple + comparison. */ +#if XCHAL_HAVE_BE + or a10, a8, a5 + bnall a10, a7, .Lsomezero + bgeu a8, a9, .Lposreturn + movi a2, -1 + RET(16) +.Lposreturn: + movi a2, 1 + RET(16) +.Lsomezero: # There is probably some zero byte. +#endif /* XCHAL_HAVE_BE */ +.Lwne: /* Words are not equal. */ + xor a2, a8, a9 # get word with nonzero in byte that differs + bany a2, a4, .Ldiff0 # if byte 0 differs + movi a5, MASK1 # mask for byte 1 + bnone a8, a4, .Leq # if byte 0 is zero + bany a2, a5, .Ldiff1 # if byte 1 differs + movi a6, MASK2 # mask for byte 2 + bnone a8, a5, .Leq # if byte 1 is zero + bany a2, a6, .Ldiff2 # if byte 2 differs + bnone a8, a6, .Leq # if byte 2 is zero +#if XCHAL_HAVE_BE +.Ldiff3: +.Ldiff2: +.Ldiff1: + /* Byte 0 is equal (at least) and there is a difference before a zero + byte. Just subtract words to get the return value. + The high order equal bytes cancel, leaving room for the sign. */ + sub a2, a8, a9 + RET(16) + +.Ldiff0: + /* Need to make room for the sign, so can't subtract whole words. */ + extui a10, a8, 24, 8 + extui a11, a9, 24, 8 + sub a2, a10, a11 + RET(16) + +#else /* !XCHAL_HAVE_BE */ + /* Little-endian is a little more difficult because can't subtract + whole words. */ +.Ldiff3: + /* Bytes 0-2 are equal; byte 3 is different. + For little-endian need to have a sign bit for the difference. */ + extui a10, a8, 24, 8 + extui a11, a9, 24, 8 + sub a2, a10, a11 + RET(16) + +.Ldiff0: + /* Byte 0 is different. */ + extui a10, a8, 0, 8 + extui a11, a9, 0, 8 + sub a2, a10, a11 + RET(16) + +.Ldiff1: + /* Byte 0 is equal; byte 1 is different. */ + extui a10, a8, 8, 8 + extui a11, a9, 8, 8 + sub a2, a10, a11 + RET(16) + +.Ldiff2: + /* Bytes 0-1 are equal; byte 2 is different. 
*/ + extui a10, a8, 16, 8 + extui a11, a9, 16, 8 + sub a2, a10, a11 + RET(16) + +#endif /* !XCHAL_HAVE_BE */ +#endif /* FLIX3 */ +#endif /* FLIX3 */ + + .end schedule + .size strcmp, . - strcmp + diff --git a/libs/libc/machine/xtensa/arch_strcpy.S b/libs/libc/machine/xtensa/arch_strcpy.S new file mode 100644 index 0000000000..b062d87e37 --- /dev/null +++ b/libs/libc/machine/xtensa/arch_strcpy.S @@ -0,0 +1,243 @@ +/**************************************************************************** + * libs/libc/machine/xtensa/arch_strcpy.S + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. The + * ASF licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the + * License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + ****************************************************************************/ + +/**************************************************************************** + * Included Files + ****************************************************************************/ + +#include "xtensa_asm.h" + +#include +#include + +/**************************************************************************** + * Public Functions + ****************************************************************************/ + + .section .text + .begin schedule + .align 4 + .literal_position + .global strcpy + .type strcpy, @function +strcpy: + ENTRY(16) + /* a2 = dst, a3 = src */ + + mov a10, a2 # leave dst in return value register + movi a4, MASK0 + movi a5, MASK1 + movi a6, MASK2 + movi a7, MASK3 + bbsi.l a3, 0, .Lsrc1mod2 + bbsi.l a3, 1, .Lsrc2mod4 +.Lsrcaligned: + + /* Check if the destination is aligned. */ + movi a8, 3 + bnone a10, a8, .Laligned + + j .Ldstunaligned + +.Lsrc1mod2: # src address is odd + l8ui a8, a3, 0 # get byte 0 + addi a3, a3, 1 # advance src pointer + s8i a8, a10, 0 # store byte 0 + beqz a8, 1f # if byte 0 is zero + addi a10, a10, 1 # advance dst pointer + bbci.l a3, 1, .Lsrcaligned # if src is now word-aligned + +.Lsrc2mod4: # src address is 2 mod 4 + l8ui a8, a3, 0 # get byte 0 + /* 1-cycle interlock */ + s8i a8, a10, 0 # store byte 0 + beqz a8, 1f # if byte 0 is zero + l8ui a8, a3, 1 # get byte 0 + addi a3, a3, 2 # advance src pointer + s8i a8, a10, 1 # store byte 0 + addi a10, a10, 2 # advance dst pointer + bnez a8, .Lsrcaligned +1: RET(16) + + +/* dst is word-aligned; src is word-aligned. 
*/ + + .align 4 +#if XCHAL_HAVE_LOOPS +#if XCHAL_HAVE_DENSITY + /* (2 mod 4) alignment for loop instruction */ +#else + /* (1 mod 4) alignment for loop instruction */ + .byte 0 + .byte 0 +#endif +.Laligned: +#if XCHAL_HAVE_DENSITY + _movi.n a8, 0 # set up for the maximum loop count +#else + _movi a8, 0 # set up for the maximum loop count +#endif + loop a8, .Lz3 # loop forever (almost anyway) + l32i a8, a3, 0 # get word from src + addi a3, a3, 4 # advance src pointer + bnone a8, a4, .Lz0 # if byte 0 is zero + bnone a8, a5, .Lz1 # if byte 1 is zero + bnone a8, a6, .Lz2 # if byte 2 is zero + s32i a8, a10, 0 # store word to dst + bnone a8, a7, .Lz3 # if byte 3 is zero + addi a10, a10, 4 # advance dst pointer + +#else /* !XCHAL_HAVE_LOOPS */ + +1: addi a10, a10, 4 # advance dst pointer +.Laligned: + l32i a8, a3, 0 # get word from src + addi a3, a3, 4 # advance src pointer + bnone a8, a4, .Lz0 # if byte 0 is zero + bnone a8, a5, .Lz1 # if byte 1 is zero + bnone a8, a6, .Lz2 # if byte 2 is zero + s32i a8, a10, 0 # store word to dst + bany a8, a7, 1b # if byte 3 is zero +#endif /* !XCHAL_HAVE_LOOPS */ + +.Lz3: /* Byte 3 is zero. */ + RET(16) + +.Lz0: /* Byte 0 is zero. */ +#if XCHAL_HAVE_BE + movi a8, 0 +#endif + s8i a8, a10, 0 + RET(16) + +.Lz1: /* Byte 1 is zero. */ +#if XCHAL_HAVE_BE + extui a8, a8, 16, 16 +#endif + s16i a8, a10, 0 + RET(16) + +.Lz2: /* Byte 2 is zero. */ +#if XCHAL_HAVE_BE + extui a8, a8, 16, 16 +#endif + s16i a8, a10, 0 + movi a8, 0 + s8i a8, a10, 2 + RET(16) + +#if 1 +/* For now just use byte copy loop for the unaligned destination case. */ + + .align 4 +#if XCHAL_HAVE_LOOPS +#if XCHAL_HAVE_DENSITY + /* (2 mod 4) alignment for loop instruction */ +#else + /* (1 mod 4) alignment for loop instruction */ + .byte 0 + .byte 0 +#endif +#endif +.Ldstunaligned: + +#if XCHAL_HAVE_LOOPS +#if XCHAL_HAVE_DENSITY + _movi.n a8, 0 # set up for the maximum loop count +#else + _movi a8, 0 # set up for the maximum loop count +#endif + loop a8, 2f # loop forever (almost anyway) +#endif +1: l8ui a8, a3, 0 + addi a3, a3, 1 + s8i a8, a10, 0 + addi a10, a10, 1 +#if XCHAL_HAVE_LOOPS + beqz a8, 2f +#else + bnez a8, 1b +#endif +2: RET(16) + +#else /* 0 */ + +/* This code is not functional yet. */ + +.Ldstunaligned: + l32i a9, a2, 0 # load word from dst +#if XCHAL_HAVE_BE + ssa8b a9 # rotate by dst alignment so that + src a9, a9, a9 # shift in loop will put back in place + ssa8l a9 # shift left by byte*8 +#else + ssa8l a9 # rotate by dst alignment so that + src a9, a9, a9 # shift in loop will put back in place + ssa8b a9 # shift left by 32-byte*8 +#endif + +/* dst is word-aligned; src is unaligned. */ + +.Ldstunalignedloop: + l32i a8, a3, 0 # get word from src + /* 1-cycle interlock */ + bnone a8, a4, .Lu0 # if byte 0 is zero + bnone a8, a5, .Lu1 # if byte 1 is zero + bnone a8, a6, .Lu2 # if byte 2 is zero + src a9, a8, a9 # combine last word and this word + s32i a9, a10, 0 # store word to dst + bnone a8, a7, .Lu3 # if byte 3 is nonzero, iterate + l32i a9, a3, 4 # get word from src + addi a3, a3, 8 # advance src pointer + bnone a9, a4, .Lu4 # if byte 0 is zero + bnone a9, a5, .Lu5 # if byte 1 is zero + bnone a9, a6, .Lu6 # if byte 2 is zero + src a8, a9, a8 # combine last word and this word + s32i a8, a10, 4 # store word to dst + addi a10, a10, 8 # advance dst pointer + bany a8, a7, .Ldstunalignedloop # if byte 3 is nonzero, iterate + + /* Byte 7 is zero. */ +.Lu7: RET(16) + +.Lu0: /* Byte 0 is zero. */ +#if XCHAL_HAVE_BE + movi a8, 0 +#endif + s8i a8, a10, 0 + RET(16) + +.Lu1: /* Byte 1 is zero. 
*/ +#if XCHAL_HAVE_BE + extui a8, a8, 16, 16 +#endif + s16i a8, a10, 0 + RET(16) + +.Lu2: /* Byte 2 is zero. */ + s16i a8, a10, 0 + movi a8, 0 + s8i a8, a10, 2 + RET(16) + +#endif /* 0 */ + .end schedule + + .size strcpy, . - strcpy diff --git a/libs/libc/machine/xtensa/arch_strlen.S b/libs/libc/machine/xtensa/arch_strlen.S new file mode 100644 index 0000000000..686268e5cb --- /dev/null +++ b/libs/libc/machine/xtensa/arch_strlen.S @@ -0,0 +1,123 @@ +/**************************************************************************** + * libs/libc/machine/xtensa/arch_strlen.S + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. The + * ASF licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the + * License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + ****************************************************************************/ + +/**************************************************************************** + * Included Files + ****************************************************************************/ + +#include "xtensa_asm.h" + +#include +#include + +/**************************************************************************** + * Public Functions + ****************************************************************************/ + + .section .text + .begin schedule + .align 4 + .literal_position + .global strlen + .type strlen, @function +strlen: + ENTRY(16) + /* a2 = s */ + + addi a3, a2, -4 # because we overincrement at the end + movi a4, MASK0 + movi a5, MASK1 + movi a6, MASK2 + movi a7, MASK3 + bbsi.l a2, 0, .L1mod2 + bbsi.l a2, 1, .L2mod4 + j .Laligned + +.L1mod2: # address is odd + l8ui a8, a3, 4 # get byte 0 + addi a3, a3, 1 # advance string pointer + beqz a8, .Lz3 # if byte 0 is zero + bbci.l a3, 1, .Laligned # if string pointer is now word-aligned + +.L2mod4: # address is 2 mod 4 + addi a3, a3, 2 # advance ptr for aligned access + l32i a8, a3, 0 # get word with first two bytes of string + bnone a8, a6, .Lz2 # if byte 2 (of word, not string) is zero + bany a8, a7, .Laligned # if byte 3 (of word, not string) is nonzero + + /* Byte 3 is zero. */ + addi a3, a3, 3 # point to zero byte + sub a2, a3, a2 # subtract to get length + RET(16) + + +/* String is word-aligned. 
*/ + + .align 4 +#if XCHAL_HAVE_LOOPS +#if XCHAL_HAVE_DENSITY + /* (2 mod 4) alignment for loop instruction */ +#else + /* (1 mod 4) alignment for loop instruction */ + .byte 0 + .byte 0 +#endif +#endif +.Laligned: +#if XCHAL_HAVE_LOOPS +#if XCHAL_HAVE_DENSITY + _movi.n a8, 0 # set up for the maximum loop count +#else + _movi a8, 0 # set up for the maximum loop count +#endif + loop a8, .Lz3 # loop forever (almost anyway) +#endif +1: l32i a8, a3, 4 # get next word of string + addi a3, a3, 4 # advance string pointer + bnone a8, a4, .Lz0 # if byte 0 is zero + bnone a8, a5, .Lz1 # if byte 1 is zero + bnone a8, a6, .Lz2 # if byte 2 is zero +#if XCHAL_HAVE_LOOPS + bnone a8, a7, .Lz3 # if byte 3 is zero +#else + bany a8, a7, 1b # repeat if byte 3 is non-zero +#endif + +.Lz3: /* Byte 3 is zero. */ + addi a3, a3, 3 # point to zero byte + /* Fall through.... */ + +.Lz0: /* Byte 0 is zero. */ + sub a2, a3, a2 # subtract to get length + RET(16) + +.Lz1: /* Byte 1 is zero. */ + addi a3, a3, 1 # point to zero byte + sub a2, a3, a2 # subtract to get length + RET(16) + +.Lz2: /* Byte 2 is zero. */ + addi a3, a3, 2 # point to zero byte + sub a2, a3, a2 # subtract to get length + RET(16) + + .end schedule + + .size strlen, . - strlen diff --git a/libs/libc/machine/xtensa/arch_strncpy.S b/libs/libc/machine/xtensa/arch_strncpy.S new file mode 100644 index 0000000000..297f00c781 --- /dev/null +++ b/libs/libc/machine/xtensa/arch_strncpy.S @@ -0,0 +1,265 @@ +/**************************************************************************** + * libs/libc/machine/xtensa/arch_strncpy.S + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. The + * ASF licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the + * License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. 
+ * + ****************************************************************************/ + +/**************************************************************************** + * Included Files + ****************************************************************************/ + +#include "xtensa_asm.h" + +#include +#include + +/**************************************************************************** + * Public Functions + ****************************************************************************/ + + .section .text +.begin schedule + .align 4 + .literal_position +__strncpy_aux: + +.Lsrc1mod2: # src address is odd + l8ui a8, a3, 0 # get byte 0 + addi a3, a3, 1 # advance src pointer + s8i a8, a10, 0 # store byte 0 + addi a4, a4, -1 # decrement n + beqz a4, .Lret # if n is zero + addi a10, a10, 1 # advance dst pointer + beqz a8, .Lfill # if byte 0 is zero + bbci.l a3, 1, .Lsrcaligned # if src is now word-aligned + +.Lsrc2mod4: # src address is 2 mod 4 + l8ui a8, a3, 0 # get byte 0 + addi a4, a4, -1 # decrement n + s8i a8, a10, 0 # store byte 0 + beqz a4, .Lret # if n is zero + addi a10, a10, 1 # advance dst pointer + beqz a8, .Lfill # if byte 0 is zero + l8ui a8, a3, 1 # get byte 0 + addi a3, a3, 2 # advance src pointer + s8i a8, a10, 0 # store byte 0 + addi a4, a4, -1 # decrement n + beqz a4, .Lret # if n is zero + addi a10, a10, 1 # advance dst pointer + bnez a8, .Lsrcaligned + j .Lfill + +.Lret: + RET(16) + + .align 4 + .global strncpy + .type strncpy, @function +strncpy: + ENTRY(16) + /* a2 = dst, a3 = src */ + + mov a10, a2 # leave dst in return value register + beqz a4, .Lret # if n is zero + + movi a11, MASK0 + movi a5, MASK1 + movi a6, MASK2 + movi a7, MASK3 + bbsi.l a3, 0, .Lsrc1mod2 + bbsi.l a3, 1, .Lsrc2mod4 +.Lsrcaligned: + + /* Check if the destination is aligned. */ + movi a8, 3 + bnone a10, a8, .Laligned + + j .Ldstunaligned + + +/* Fill the dst with zeros -- n is at least 1. */ + +.Lfill: + movi a9, 0 + bbsi.l a10, 0, .Lfill1mod2 + bbsi.l a10, 1, .Lfill2mod4 +.Lfillaligned: + blti a4, 4, .Lfillcleanup + + /* Loop filling complete words with zero. */ +#if XCHAL_HAVE_LOOPS + + srai a8, a4, 2 + loop a8, 1f + s32i a9, a10, 0 + addi a10, a10, 4 + +1: slli a8, a8, 2 + sub a4, a4, a8 + +#else /* !XCHAL_HAVE_LOOPS */ + +1: s32i a9, a10, 0 + addi a10, a10, 4 + addi a4, a4, -4 + bgei a4, 4, 1b + +#endif /* !XCHAL_HAVE_LOOPS */ + + beqz a4, 2f + +.Lfillcleanup: + /* Fill leftover (1 to 3) bytes with zero. */ + s8i a9, a10, 0 # store byte 0 + addi a4, a4, -1 # decrement n + addi a10, a10, 1 + bnez a4, .Lfillcleanup + +2: RET(16) + +.Lfill1mod2: # dst address is odd + s8i a9, a10, 0 # store byte 0 + addi a4, a4, -1 # decrement n + beqz a4, 2b # if n is zero + addi a10, a10, 1 # advance dst pointer + bbci.l a10, 1, .Lfillaligned # if dst is now word-aligned + +.Lfill2mod4: # dst address is 2 mod 4 + s8i a9, a10, 0 # store byte 0 + addi a4, a4, -1 # decrement n + beqz a4, 2b # if n is zero + s8i a9, a10, 1 # store byte 1 + addi a4, a4, -1 # decrement n + beqz a4, 2b # if n is zero + addi a10, a10, 2 # advance dst pointer + j .Lfillaligned + + +/* dst is word-aligned; src is word-aligned; n is at least 1. 
*/ + + .align 4 +#if XCHAL_HAVE_LOOPS +#if XCHAL_HAVE_DENSITY + /* (2 mod 4) alignment for loop instruction */ +#else + /* (1 mod 4) alignment for loop instruction */ + .byte 0 + .byte 0 +#endif +#endif +.Laligned: +#if XCHAL_HAVE_LOOPS +#if XCHAL_HAVE_DENSITY + _movi.n a8, 0 # set up for the maximum loop count +#else + _movi a8, 0 # set up for the maximum loop count +#endif + loop a8, 1f # loop forever (almost anyway) + blti a4, 5, .Ldstunaligned # n is near limit; do one at a time + l32i a8, a3, 0 # get word from src + addi a3, a3, 4 # advance src pointer + bnone a8, a11, .Lz0 # if byte 0 is zero + bnone a8, a5, .Lz1 # if byte 1 is zero + bnone a8, a6, .Lz2 # if byte 2 is zero + s32i a8, a10, 0 # store word to dst + addi a4, a4, -4 # decrement n + addi a10, a10, 4 # advance dst pointer + bnone a8, a7, .Lfill # if byte 3 is zero +1: + +#else /* !XCHAL_HAVE_LOOPS */ + +1: blti a4, 5, .Ldstunaligned # n is near limit; do one at a time + l32i a8, a3, 0 # get word from src + addi a3, a3, 4 # advance src pointer + bnone a8, a11, .Lz0 # if byte 0 is zero + bnone a8, a5, .Lz1 # if byte 1 is zero + bnone a8, a6, .Lz2 # if byte 2 is zero + s32i a8, a10, 0 # store word to dst + addi a4, a4, -4 # decrement n + addi a10, a10, 4 # advance dst pointer + bany a8, a7, 1b # no zeroes +#endif /* !XCHAL_HAVE_LOOPS */ + + j .Lfill + +.Lz0: /* Byte 0 is zero. */ +#if XCHAL_HAVE_BE + movi a8, 0 +#endif + s8i a8, a10, 0 + addi a4, a4, -1 # decrement n + addi a10, a10, 1 # advance dst pointer + j .Lfill + +.Lz1: /* Byte 1 is zero. */ +#if XCHAL_HAVE_BE + extui a8, a8, 16, 16 +#endif + s16i a8, a10, 0 + addi a4, a4, -2 # decrement n + addi a10, a10, 2 # advance dst pointer + j .Lfill + +.Lz2: /* Byte 2 is zero. */ +#if XCHAL_HAVE_BE + extui a8, a8, 16, 16 +#endif + s16i a8, a10, 0 + movi a8, 0 + s8i a8, a10, 2 + addi a4, a4, -3 # decrement n + addi a10, a10, 3 # advance dst pointer + j .Lfill + + .align 4 +#if XCHAL_HAVE_LOOPS +#if XCHAL_HAVE_DENSITY + /* (2 mod 4) alignment for loop instruction */ +#else + /* (1 mod 4) alignment for loop instruction */ + .byte 0 + .byte 0 +#endif +#endif +.Ldstunaligned: + +#if XCHAL_HAVE_LOOPS +#if XCHAL_HAVE_DENSITY + _movi.n a8, 0 # set up for the maximum loop count +#else + _movi a8, 0 # set up for the maximum loop count +#endif + loop a8, 2f # loop forever (almost anyway) +#endif +1: l8ui a8, a3, 0 + addi a3, a3, 1 + s8i a8, a10, 0 + addi a4, a4, -1 + beqz a4, 3f + addi a10, a10, 1 +#if XCHAL_HAVE_LOOPS + beqz a8, 2f +#else + bnez a8, 1b +#endif +2: j .Lfill + +3: RET(16) +.end schedule + + .size strncpy, . - strncpy diff --git a/libs/libc/machine/xtensa/xtensa_asm.h b/libs/libc/machine/xtensa/xtensa_asm.h new file mode 100644 index 0000000000..99137635c5 --- /dev/null +++ b/libs/libc/machine/xtensa/xtensa_asm.h @@ -0,0 +1,62 @@ +/**************************************************************************** + * libs/libc/machine/xtensa/xtensa_asm.h + * + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. The + * ASF licenses this file to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance with the + * License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations + * under the License. + * + ****************************************************************************/ + +/**************************************************************************** + * Included Files + ****************************************************************************/ + +#include + +/**************************************************************************** + * Assembly Language Macros + ****************************************************************************/ + + .macro src_b r, w0, w1 +#if XCHAL_HAVE_BE + src \r, \w0, \w1 +#else + src \r, \w1, \w0 +#endif + .endm + + .macro ssa8 r +#if XCHAL_HAVE_BE + ssa8b \r +#else + ssa8l \r +#endif + .endm + +/**************************************************************************** + * Pre-processor Macros + ****************************************************************************/ + +#if XCHAL_HAVE_BE +# define MASK0 0xff000000 +# define MASK1 0x00ff0000 +# define MASK2 0x0000ff00 +# define MASK3 0x000000ff +#else +# define MASK0 0x000000ff +# define MASK1 0x0000ff00 +# define MASK2 0x00ff0000 +# define MASK3 0xff000000 +#endif +
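
Note on the word-at-a-time scan (illustration only, not part of the patch):
arch_strlen.S and arch_strncpy.S test one byte of each loaded word against the
endian-dependent MASK0..MASK3 constants defined above in xtensa_asm.h; a
"bnone" against a mask branches when that byte is zero. The C sketch below
models the same idea. WORDS_BIGENDIAN is a hypothetical stand-in for
XCHAL_HAVE_BE, and model_strlen() is an illustrative name, not a symbol added
by this patch.

#include <stddef.h>
#include <stdint.h>

#ifdef WORDS_BIGENDIAN            /* assumption: stands in for XCHAL_HAVE_BE */
#  define MASK0 0xff000000u       /* byte at the lowest address */
#  define MASK1 0x00ff0000u
#  define MASK2 0x0000ff00u
#  define MASK3 0x000000ffu
#else
#  define MASK0 0x000000ffu       /* byte at the lowest address */
#  define MASK1 0x0000ff00u
#  define MASK2 0x00ff0000u
#  define MASK3 0xff000000u
#endif

static size_t model_strlen(const char *s)
{
  const char *p = s;

  /* Advance byte-wise until p is word-aligned (the assembly handles the
   * 1-mod-2 and 2-mod-4 cases separately for speed).
   */

  while (((uintptr_t)p & 3) != 0)
    {
      if (*p == '\0')
        {
          return (size_t)(p - s);
        }

      p++;
    }

  /* Scan one aligned word per iteration.  Each mask isolates one byte of
   * the word, mirroring the bnone tests against MASK0..MASK3 in the
   * .Laligned loop.  The aligned load may read a few bytes past the
   * terminator but never crosses the word that contains it.
   */

  for (; ; p += 4)
    {
      uint32_t w = *(const uint32_t *)p;

      if ((w & MASK0) == 0) return (size_t)(p - s);
      if ((w & MASK1) == 0) return (size_t)(p - s) + 1;
      if ((w & MASK2) == 0) return (size_t)(p - s) + 2;
      if ((w & MASK3) == 0) return (size_t)(p - s) + 3;
    }
}

The masks swap ends between the two configurations because a big-endian load
places the lowest-addressed byte in the most significant bits of the register,
while a little-endian load places it in the least significant bits; the
assembly can therefore use the same bnone test sequence on either endianness.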