libc:machine:xtensa: add xtensa libc implementation

N/A

Signed-off-by: zhuyanlin <zhuyanlin1@xiaomi.com>
zhuyanlin 2021-10-28 11:56:18 +08:00 committed by Xiang Xiao
parent 580d17cc02
commit cfcff5f570
10 changed files with 2472 additions and 2 deletions


@@ -2,3 +2,46 @@
# For a description of the syntax of this configuration file,
# see the file kconfig-language.txt in the NuttX tools repository.
#
config XTENSA_MEMCPY
bool "Enable optimized memcpy() for XTENSA"
select LIBC_ARCH_MEMCPY
---help---
Enable optimized XTENSA specific memcpy() library function
config XTENSA_MEMMOVE
bool "Enable optimized memmove() for XTENSA"
select LIBC_ARCH_MEMMOVE
---help---
Enable optimized XTENSA specific memmove() library function
config XTENSA_MEMSET
bool "Enable optimized memset() for XTENSA"
select LIBC_ARCH_MEMSET
---help---
Enable optimized XTENSA specific memset() library function
config XTENSA_STRCMP
bool "Enable optimized strcmp() for XTENSA"
select LIBC_ARCH_STRCMP
---help---
Enable optimized XTENSA specific strcmp() library function
config XTENSA_STRCPY
bool "Enable optimized strcpy() for XTENSA"
select LIBC_ARCH_STRCPY
---help---
Enable optimized XTENSA specific strcpy() library function
config XTENSA_STRLEN
bool "Enable optimized strlen() for XTENSA"
select LIBC_ARCH_STRLEN
---help---
Enable optimized XTENSA specific strlen() library function
config XTENSA_STRNCPY
bool "Enable optimized strncpy() for XTENSA"
select LIBC_ARCH_STRNCPY
---help---
Enable optimized XTENSA specific strncpy() library function
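
Each option maps to a CONFIG_XTENSA_* symbol tested by the make fragment below, so a board that wants all of the optimized routines would simply add the corresponding lines to its configuration, e.g.:

CONFIG_XTENSA_MEMCPY=y
CONFIG_XTENSA_MEMMOVE=y
CONFIG_XTENSA_MEMSET=y
CONFIG_XTENSA_STRCMP=y
CONFIG_XTENSA_STRCPY=y
CONFIG_XTENSA_STRLEN=y
CONFIG_XTENSA_STRNCPY=y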


@@ -19,10 +19,37 @@
############################################################################
ifeq ($(CONFIG_LIBC_ARCH_ELF),y)
CSRCS += arch_elf.c
endif
ifeq ($(CONFIG_XTENSA_MEMCPY),y)
ASRCS += arch_memcpy.S
endif
ifeq ($(CONFIG_XTENSA_MEMMOVE),y)
ASRCS += arch_memmove.S
endif
ifeq ($(CONFIG_XTENSA_MEMSET),y)
ASRCS += arch_memset.S
endif
ifeq ($(CONFIG_XTENSA_STRCPY),y)
ASRCS += arch_strcpy.S
endif
ifeq ($(CONFIG_XTENSA_STRLEN),y)
ASRCS += arch_strlen.S
endif
ifeq ($(CONFIG_XTENSA_STRNCPY),y)
ASRCS += arch_strncpy.S
endif
ifeq ($(CONFIG_XTENSA_STRCMP),y)
ASRCS += arch_strcmp.S
endif
DEPPATH += --dep-path machine/xtensa
VPATH += :machine/xtensa
endif


@@ -0,0 +1,281 @@
/****************************************************************************
* libs/libc/machine/xtensa/arch_memcpy.S
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
****************************************************************************/
/****************************************************************************
* Included Files
****************************************************************************/
#include "xtensa_asm.h"
#include <arch/chip/core-isa.h>
#include <arch/xtensa/xtensa_abi.h>
/****************************************************************************
* Pre-processor Macros
****************************************************************************/
/* set to 1 when running on ISS (simulator) with the
lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT 0
/****************************************************************************
* Public Functions
****************************************************************************/
.section .text
.begin schedule
.literal_position
.local .Ldst1mod2
.local .Ldst2mod4
.local .Lbytecopy
.align 4
.global memcpy
.type memcpy, @function
memcpy:
ENTRY(16)
/* a2 = dst, a3 = src, a4 = len */
mov a5, a2 # copy dst so that a2 is return value
bbsi.l a2, 0, .Ldst1mod2
bbsi.l a2, 1, .Ldst2mod4
.Ldstaligned:
/* Get number of loop iterations with 16B per iteration. */
srli a7, a4, 4
/* Check if source is aligned. */
slli a8, a3, 30
bnez a8, .Lsrcunaligned
/* Destination and source are word-aligned, use word copy. */
#if XCHAL_HAVE_LOOPS
loopnez a7, 2f
#else
beqz a7, 2f
slli a8, a7, 4
add a8, a8, a3 # a8 = end of last 16B source chunk
#endif
1: l32i a6, a3, 0
l32i a7, a3, 4
s32i a6, a5, 0
l32i a6, a3, 8
s32i a7, a5, 4
l32i a7, a3, 12
s32i a6, a5, 8
addi a3, a3, 16
s32i a7, a5, 12
addi a5, a5, 16
#if !XCHAL_HAVE_LOOPS
bltu a3, a8, 1b
#endif
/* Copy any leftover pieces smaller than 16B. */
2: bbci.l a4, 3, 3f
/* Copy 8 bytes. */
l32i a6, a3, 0
l32i a7, a3, 4
addi a3, a3, 8
s32i a6, a5, 0
s32i a7, a5, 4
addi a5, a5, 8
3: bbsi.l a4, 2, 4f
bbsi.l a4, 1, 5f
bbsi.l a4, 0, 6f
RET(16)
# .align 4
/* Copy 4 bytes. */
4: l32i a6, a3, 0
addi a3, a3, 4
s32i a6, a5, 0
addi a5, a5, 4
bbsi.l a4, 1, 5f
bbsi.l a4, 0, 6f
RET(16)
/* Copy 2 bytes. */
5: l16ui a6, a3, 0
addi a3, a3, 2
s16i a6, a5, 0
addi a5, a5, 2
bbsi.l a4, 0, 6f
RET(16)
/* Copy 1 byte. */
6: l8ui a6, a3, 0
s8i a6, a5, 0
.Ldone:
RET(16)
/* Destination is aligned; source is unaligned. */
# .align 4
.Lsrcunaligned:
/* Avoid loading anything for zero-length copies. */
beqz a4, .Ldone
/* Copy 16 bytes per iteration for word-aligned dst and
unaligned src. */
ssa8 a3 # set shift amount from byte offset
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
srli a11, a8, 30 # save unalignment offset for below
sub a3, a3, a11 # align a3
#endif
l32i a6, a3, 0 # load first word
#if XCHAL_HAVE_LOOPS
loopnez a7, 2f
#else
beqz a7, 2f
slli a10, a7, 4
add a10, a10, a3 # a10 = end of last 16B source chunk
#endif
1: l32i a7, a3, 4
l32i a8, a3, 8
src_b a6, a6, a7
s32i a6, a5, 0
l32i a9, a3, 12
src_b a7, a7, a8
s32i a7, a5, 4
l32i a6, a3, 16
src_b a8, a8, a9
s32i a8, a5, 8
addi a3, a3, 16
src_b a9, a9, a6
s32i a9, a5, 12
addi a5, a5, 16
#if !XCHAL_HAVE_LOOPS
bltu a3, a10, 1b
#endif
2: bbci.l a4, 3, 3f
/* Copy 8 bytes. */
l32i a7, a3, 4
l32i a8, a3, 8
src_b a6, a6, a7
s32i a6, a5, 0
addi a3, a3, 8
src_b a7, a7, a8
s32i a7, a5, 4
addi a5, a5, 8
mov a6, a8
3: bbci.l a4, 2, 4f
/* Copy 4 bytes. */
l32i a7, a3, 4
addi a3, a3, 4
src_b a6, a6, a7
s32i a6, a5, 0
addi a5, a5, 4
mov a6, a7
4:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
add a3, a3, a11 # readjust a3 with correct misalignment
#endif
bbsi.l a4, 1, 5f
bbsi.l a4, 0, 6f
RET(16)
/* Copy 2 bytes. */
5: l8ui a6, a3, 0
l8ui a7, a3, 1
addi a3, a3, 2
s8i a6, a5, 0
s8i a7, a5, 1
addi a5, a5, 2
bbsi.l a4, 0, 6f
RET(16)
/* Copy 1 byte. */
6: l8ui a6, a3, 0
s8i a6, a5, 0
RET(16)
# .align XCHAL_INST_FETCH_WIDTH
__memcpy_aux:
/* Skip bytes to get proper alignment for three-byte loop */
# .skip XCHAL_INST_FETCH_WIDTH - 3
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
loopnez a4, 2f
#else
beqz a4, 2f
add a7, a3, a4 # a7 = end address for source
#endif
1: l8ui a6, a3, 0
addi a3, a3, 1
s8i a6, a5, 0
addi a5, a5, 1
#if !XCHAL_HAVE_LOOPS
bltu a3, a7, 1b
#endif
2: RET(16)
/* Destination is unaligned. */
# .align 4
.Ldst1mod2: # dst is only byte aligned
/* Do short copies byte-by-byte. */
bltui a4, 7, .Lbytecopy
/* Copy 1 byte. */
l8ui a6, a3, 0
addi a3, a3, 1
addi a4, a4, -1
s8i a6, a5, 0
addi a5, a5, 1
/* Return to main algorithm if dst is now aligned. */
bbci.l a5, 1, .Ldstaligned
.Ldst2mod4: # dst has 16-bit alignment
/* Do short copies byte-by-byte. */
bltui a4, 6, .Lbytecopy
/* Copy 2 bytes. */
l8ui a6, a3, 0
l8ui a7, a3, 1
addi a3, a3, 2
addi a4, a4, -2
s8i a6, a5, 0
s8i a7, a5, 1
addi a5, a5, 2
/* dst is now aligned; return to main algorithm. */
j .Ldstaligned
.end schedule
.size memcpy, . - memcpy
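
The routine follows the usual Xtensa pattern: byte-align the destination, then move 16 bytes per iteration, word-for-word when the source is also word-aligned or via the src_b funnel shift when it is not, and finish with an 8/4/2/1-byte tail selected by the low bits of the length. A rough C model of the aligned fast path only, for orientation (not the code that is built):

#include <stddef.h>
#include <stdint.h>

/* Copy 16 bytes per iteration, then mop up the 8/4/2/1-byte tail,
 * mirroring the bit tests the assembly performs on the length. */
static void *memcpy_aligned_sketch(void *dst, const void *src, size_t len)
{
  uint32_t *d = dst;
  const uint32_t *s = src;

  for (size_t n = len >> 4; n != 0; n--)          /* 16B chunks */
    {
      d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
      d += 4; s += 4;
    }

  if (len & 8) { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; }
  if (len & 4) { *d++ = *s++; }

  uint16_t *d2 = (uint16_t *)d;
  const uint16_t *s2 = (const uint16_t *)s;
  if (len & 2) { *d2++ = *s2++; }
  if (len & 1) { *(uint8_t *)d2 = *(const uint8_t *)s2; }

  return dst;
}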


@@ -0,0 +1,480 @@
/****************************************************************************
* libs/libc/machine/xtensa/arch_memmove.S
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
****************************************************************************/
/****************************************************************************
* Included Files
****************************************************************************/
#include "xtensa_asm.h"
#include <arch/chip/core-isa.h>
#include <arch/xtensa/xtensa_abi.h>
/****************************************************************************
* Pre-processor Macros
****************************************************************************/
/* set to 1 when running on ISS (simulator) with the
lint or ferret client, or 0 to save a few cycles */
#define SIM_CHECKS_ALIGNMENT 0
/****************************************************************************
* Public Functions
****************************************************************************/
.text
.begin schedule
.global memmove
/*
* Byte by byte copy
*/
.align 4
.byte 0 # 1 mod 4 alignment for LOOPNEZ
# (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
loopnez a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
beqz a4, .Lbytecopydone
add a7, a3, a4 # a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
l8ui a6, a3, 0
addi a3, a3, 1
s8i a6, a5, 0
addi a5, a5, 1
#if !XCHAL_HAVE_LOOPS
bne a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
RET(16)
/*
* Destination is unaligned
*/
.align 4
.Ldst1mod2: # dst is only byte aligned
_bltui a4, 7, .Lbytecopy # do short copies byte by byte
# copy 1 byte
l8ui a6, a3, 0
addi a3, a3, 1
addi a4, a4, -1
s8i a6, a5, 0
addi a5, a5, 1
_bbci.l a5, 1, .Ldstaligned # if dst is now aligned, then
# return to main algorithm
.Ldst2mod4: # dst 16-bit aligned
# copy 2 bytes
_bltui a4, 6, .Lbytecopy # do short copies byte by byte
l8ui a6, a3, 0
l8ui a7, a3, 1
addi a3, a3, 2
addi a4, a4, -2
s8i a6, a5, 0
s8i a7, a5, 1
addi a5, a5, 2
j .Ldstaligned # dst is now aligned, return to main algorithm
.Lcommon:
bbsi.l a2, 0, .Ldst1mod2 # if dst is 1 mod 2
bbsi.l a2, 1, .Ldst2mod4 # if dst is 2 mod 4
.Ldstaligned: # return here from .Ldst?mod? once dst is aligned
srli a7, a4, 4 # number of loop iterations with 16B
# per iteration
movi a8, 3 # if source is not aligned,
bany a3, a8, .Lsrcunaligned # then use shifting copy
/*
* Destination and source are word-aligned, use word copy.
*/
# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
loopnez a7, .Loop1done
#else /* !XCHAL_HAVE_LOOPS */
beqz a7, .Loop1done
slli a8, a7, 4
add a8, a8, a3 # a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
l32i a6, a3, 0
l32i a7, a3, 4
s32i a6, a5, 0
l32i a6, a3, 8
s32i a7, a5, 4
l32i a7, a3, 12
s32i a6, a5, 8
addi a3, a3, 16
s32i a7, a5, 12
addi a5, a5, 16
#if !XCHAL_HAVE_LOOPS
bne a3, a8, .Loop1 # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
bbci.l a4, 3, .L2
# copy 8 bytes
l32i a6, a3, 0
l32i a7, a3, 4
addi a3, a3, 8
s32i a6, a5, 0
s32i a7, a5, 4
addi a5, a5, 8
.L2:
bbsi.l a4, 2, .L3
bbsi.l a4, 1, .L4
bbsi.l a4, 0, .L5
RET(16)
.L3:
# copy 4 bytes
l32i a6, a3, 0
addi a3, a3, 4
s32i a6, a5, 0
addi a5, a5, 4
bbsi.l a4, 1, .L4
bbsi.l a4, 0, .L5
RET(16)
.L4:
# copy 2 bytes
l16ui a6, a3, 0
addi a3, a3, 2
s16i a6, a5, 0
addi a5, a5, 2
bbsi.l a4, 0, .L5
RET(16)
.L5:
# copy 1 byte
l8ui a6, a3, 0
s8i a6, a5, 0
RET(16)
/*
* Destination is aligned, Source is unaligned
*/
.align 4
.Lsrcunaligned:
_beqz a4, .Ldone # avoid loading anything for zero-length copies
# copy 16 bytes per iteration for word-aligned dst and unaligned src
ssa8 a3 # set shift amount from byte offset
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
and a11, a3, a8 # save unalignment offset for below
sub a3, a3, a11 # align a3
#endif
l32i a6, a3, 0 # load first word
#if XCHAL_HAVE_LOOPS
loopnez a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
beqz a7, .Loop2done
slli a10, a7, 4
add a10, a10, a3 # a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
l32i a7, a3, 4
l32i a8, a3, 8
src_b a6, a6, a7
s32i a6, a5, 0
l32i a9, a3, 12
src_b a7, a7, a8
s32i a7, a5, 4
l32i a6, a3, 16
src_b a8, a8, a9
s32i a8, a5, 8
addi a3, a3, 16
src_b a9, a9, a6
s32i a9, a5, 12
addi a5, a5, 16
#if !XCHAL_HAVE_LOOPS
bne a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
bbci.l a4, 3, .L12
# copy 8 bytes
l32i a7, a3, 4
l32i a8, a3, 8
src_b a6, a6, a7
s32i a6, a5, 0
addi a3, a3, 8
src_b a7, a7, a8
s32i a7, a5, 4
addi a5, a5, 8
mov a6, a8
.L12:
bbci.l a4, 2, .L13
# copy 4 bytes
l32i a7, a3, 4
addi a3, a3, 4
src_b a6, a6, a7
s32i a6, a5, 0
addi a5, a5, 4
mov a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
add a3, a3, a11 # readjust a3 with correct misalignment
#endif
bbsi.l a4, 1, .L14
bbsi.l a4, 0, .L15
.Ldone: RET(16)
.L14:
# copy 2 bytes
l8ui a6, a3, 0
l8ui a7, a3, 1
addi a3, a3, 2
s8i a6, a5, 0
s8i a7, a5, 1
addi a5, a5, 2
bbsi.l a4, 0, .L15
RET(16)
.L15:
# copy 1 byte
l8ui a6, a3, 0
s8i a6, a5, 0
RET(16)
/*
* Byte by byte copy
*/
.align 4
.byte 0 # 1 mod 4 alignment for LOOPNEZ
# (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
loopnez a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
beqz a4, .Lbackbytecopydone
sub a7, a3, a4 # a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
addi a3, a3, -1
l8ui a6, a3, 0
addi a5, a5, -1
s8i a6, a5, 0
#if !XCHAL_HAVE_LOOPS
bne a3, a7, .Lbacknextbyte # continue loop if
# $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
RET(16)
/*
* Destination is unaligned
*/
.align 4
.Lbackdst1mod2: # dst is only byte aligned
_bltui a4, 7, .Lbackbytecopy # do short copies byte by byte
# copy 1 byte
addi a3, a3, -1
l8ui a6, a3, 0
addi a5, a5, -1
s8i a6, a5, 0
addi a4, a4, -1
_bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
# return to main algorithm
.Lbackdst2mod4: # dst 16-bit aligned
# copy 2 bytes
_bltui a4, 6, .Lbackbytecopy # do short copies byte by byte
addi a3, a3, -2
l8ui a6, a3, 0
l8ui a7, a3, 1
addi a5, a5, -2
s8i a6, a5, 0
s8i a7, a5, 1
addi a4, a4, -2
j .Lbackdstaligned # dst is now aligned,
# return to main algorithm
.align 4
memmove:
ENTRY(16)
# a2/ dst, a3/ src, a4/ len
mov a5, a2 # copy dst so that a2 is return value
.Lmovecommon:
sub a6, a5, a3
bgeu a6, a4, .Lcommon
add a5, a5, a4
add a3, a3, a4
bbsi.l a5, 0, .Lbackdst1mod2 # if dst is 1 mod 2
bbsi.l a5, 1, .Lbackdst2mod4 # if dst is 2 mod 4
.Lbackdstaligned: # return here from .Lbackdst?mod? once dst is aligned
srli a7, a4, 4 # number of loop iterations with 16B
# per iteration
movi a8, 3 # if source is not aligned,
bany a3, a8, .Lbacksrcunaligned # then use shifting copy
/*
* Destination and source are word-aligned, use word copy.
*/
# copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
loopnez a7, .backLoop1done
#else /* !XCHAL_HAVE_LOOPS */
beqz a7, .backLoop1done
slli a8, a7, 4
sub a8, a3, a8 # a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1:
addi a3, a3, -16
l32i a7, a3, 12
l32i a6, a3, 8
addi a5, a5, -16
s32i a7, a5, 12
l32i a7, a3, 4
s32i a6, a5, 8
l32i a6, a3, 0
s32i a7, a5, 4
s32i a6, a5, 0
#if !XCHAL_HAVE_LOOPS
bne a3, a8, .backLoop1 # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1done:
bbci.l a4, 3, .Lback2
# copy 8 bytes
addi a3, a3, -8
l32i a6, a3, 0
l32i a7, a3, 4
addi a5, a5, -8
s32i a6, a5, 0
s32i a7, a5, 4
.Lback2:
bbsi.l a4, 2, .Lback3
bbsi.l a4, 1, .Lback4
bbsi.l a4, 0, .Lback5
RET(16)
.Lback3:
# copy 4 bytes
addi a3, a3, -4
l32i a6, a3, 0
addi a5, a5, -4
s32i a6, a5, 0
bbsi.l a4, 1, .Lback4
bbsi.l a4, 0, .Lback5
RET(16)
.Lback4:
# copy 2 bytes
addi a3, a3, -2
l16ui a6, a3, 0
addi a5, a5, -2
s16i a6, a5, 0
bbsi.l a4, 0, .Lback5
RET(16)
.Lback5:
# copy 1 byte
addi a3, a3, -1
l8ui a6, a3, 0
addi a5, a5, -1
s8i a6, a5, 0
RET(16)
/*
* Destination is aligned, Source is unaligned
*/
.align 4
.Lbacksrcunaligned:
_beqz a4, .Lbackdone # avoid loading anything for zero-length copies
# copy 16 bytes per iteration for word-aligned dst and unaligned src
ssa8 a3 # set shift amount from byte offset
#define SIM_CHECKS_ALIGNMENT 1 /* set to 1 when running on ISS with
* the lint or ferret client, or 0
* to save a few cycles */
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
and a11, a3, a8 # save unalignment offset for below
sub a3, a3, a11 # align a3
#endif
l32i a6, a3, 0 # load first word
#if XCHAL_HAVE_LOOPS
loopnez a7, .backLoop2done
#else /* !XCHAL_HAVE_LOOPS */
beqz a7, .backLoop2done
slli a10, a7, 4
sub a10, a3, a10 # a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2:
addi a3, a3, -16
l32i a7, a3, 12
l32i a8, a3, 8
addi a5, a5, -16
src_b a6, a7, a6
s32i a6, a5, 12
l32i a9, a3, 4
src_b a7, a8, a7
s32i a7, a5, 8
l32i a6, a3, 0
src_b a8, a9, a8
s32i a8, a5, 4
src_b a9, a6, a9
s32i a9, a5, 0
#if !XCHAL_HAVE_LOOPS
bne a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2done:
bbci.l a4, 3, .Lback12
# copy 8 bytes
addi a3, a3, -8
l32i a7, a3, 4
l32i a8, a3, 0
addi a5, a5, -8
src_b a6, a7, a6
s32i a6, a5, 4
src_b a7, a8, a7
s32i a7, a5, 0
mov a6, a8
.Lback12:
bbci.l a4, 2, .Lback13
# copy 4 bytes
addi a3, a3, -4
l32i a7, a3, 0
addi a5, a5, -4
src_b a6, a7, a6
s32i a6, a5, 0
mov a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
add a3, a3, a11 # readjust a3 with correct misalignment
#endif
bbsi.l a4, 1, .Lback14
bbsi.l a4, 0, .Lback15
.Lbackdone:
RET(16)
.Lback14:
# copy 2 bytes
addi a3, a3, -2
l8ui a6, a3, 0
l8ui a7, a3, 1
addi a5, a5, -2
s8i a6, a5, 0
s8i a7, a5, 1
bbsi.l a4, 0, .Lback15
RET(16)
.Lback15:
# copy 1 byte
addi a3, a3, -1
addi a5, a5, -1
l8ui a6, a3, 0
s8i a6, a5, 0
RET(16)
.end schedule
.size memmove, . - memmove
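
The entry code at .Lmovecommon carries the whole overlap policy: dst - src is computed as an unsigned value and compared against len. When it is at least len, a forward pass cannot clobber unread source bytes (this also covers dst < src, where the subtraction wraps around), so control falls through to the forward copy at .Lcommon; otherwise both pointers are advanced to the end and the .Lback* paths copy backwards. A small C sketch of that decision, for illustration only:

#include <stddef.h>
#include <stdint.h>

static void *memmove_sketch(void *dst, const void *src, size_t len)
{
  uint8_t *d = dst;
  const uint8_t *s = src;

  if ((uintptr_t)d - (uintptr_t)s >= len)
    {
      /* Forward copy is safe: no destructive overlap, or dst below src. */
      while (len--)
        {
          *d++ = *s++;
        }
    }
  else
    {
      /* dst overlaps the tail of src: copy from the end backwards. */
      d += len;
      s += len;
      while (len--)
        {
          *--d = *--s;
        }
    }

  return dst;
}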


@@ -0,0 +1,179 @@
/****************************************************************************
* libs/libc/machine/xtensa/arch_memset.S
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
****************************************************************************/
/****************************************************************************
* Included Files
****************************************************************************/
#include "xtensa_asm.h"
#include <arch/chip/core-isa.h>
#include <arch/xtensa/xtensa_abi.h>
/****************************************************************************
* Public Functions
****************************************************************************/
/* void *memset (void *dst, int c, size_t length)
The algorithm is as follows:
Create a word with c in all byte positions.
If the destination is aligned, set 16B chunks with a loop, and then
finish up with 8B, 4B, 2B, and 1B stores conditional on the length.
If the destination is unaligned, align it by conditionally
setting 1B and/or 2B and then go to aligned case.
This code tries to use fall-through branches for the common
case of an aligned destination (except for the branches to
the alignment labels). */
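
For orientation, a minimal C model of that strategy (an illustration of the flow only; the tuned assembly below additionally routes very short unaligned fills through the byte loop):

#include <stddef.h>
#include <stdint.h>

static void *memset_sketch(void *dst, int c, size_t len)
{
  uint8_t *d = dst;
  uint32_t w = (uint8_t)c;

  w |= w << 8;                                 /* duplicate c into all 4 bytes */
  w |= w << 16;

  while (((uintptr_t)d & 3) != 0 && len > 0)   /* align the destination */
    {
      *d++ = (uint8_t)c;
      len--;
    }

  for (size_t n = len >> 4; n != 0; n--)       /* 16B per iteration */
    {
      ((uint32_t *)d)[0] = w;
      ((uint32_t *)d)[1] = w;
      ((uint32_t *)d)[2] = w;
      ((uint32_t *)d)[3] = w;
      d += 16;
    }

  for (len &= 15; len > 0; len--)              /* 1..15 leftover bytes */
    {
      *d++ = (uint8_t)c;
    }

  return dst;
}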
/* Byte-by-byte set. */
.section .text
.begin schedule
.literal_position
.local .Lbyteset
.local .Ldst1mod2
.local .Ldst2mod4
.align 4
.global memset
.type memset, @function
memset:
ENTRY(16)
/* a2 = dst, a3 = c, a4 = length */
/* Duplicate character into all bytes of word. */
extui a3, a3, 0, 8
slli a7, a3, 8
or a3, a3, a7
slli a7, a3, 16
or a3, a3, a7
mov a5, a2 // copy dst so that a2 is return value
/* Check if dst is unaligned. */
bbsi.l a2, 0, .Ldst1mod2
bbsi.l a2, 1, .Ldst2mod4
j .Ldstaligned
.Ldst1mod2: // dst is only byte aligned
/* Do short sizes byte-by-byte. */
bltui a4, 8, .Lbyteset
/* Set 1 byte. */
s8i a3, a5, 0
addi a5, a5, 1
addi a4, a4, -1
/* Now retest if dst is aligned. */
bbci.l a5, 1, .Ldstaligned
.Ldst2mod4: // dst has 16-bit alignment
/* Do short sizes byte-by-byte. */
bltui a4, 8, .Lbyteset
/* Set 2 bytes. */
s16i a3, a5, 0
addi a5, a5, 2
addi a4, a4, -2
/* dst is now aligned; fall through to main algorithm */
.Ldstaligned:
/* Get number of loop iterations with 16B per iteration. */
srli a7, a4, 4
/* Destination is word-aligned. */
#if XCHAL_HAVE_LOOPS
loopnez a7, 2f
#else
beqz a7, 2f
slli a6, a7, 4
add a6, a6, a5 // a6 = end of last 16B chunk
#endif
/* Set 16 bytes per iteration. */
1: s32i a3, a5, 0
s32i a3, a5, 4
s32i a3, a5, 8
s32i a3, a5, 12
addi a5, a5, 16
#if !XCHAL_HAVE_LOOPS
bltu a5, a6, 1b
#endif
/* Set any leftover pieces smaller than 16B. */
2: bbci.l a4, 3, 3f
/* Set 8 bytes. */
s32i a3, a5, 0
s32i a3, a5, 4
addi a5, a5, 8
3: bbci.l a4, 2, 4f
/* Set 4 bytes. */
s32i a3, a5, 0
addi a5, a5, 4
4: bbci.l a4, 1, 5f
/* Set 2 bytes. */
s16i a3, a5, 0
addi a5, a5, 2
5: bbci.l a4, 0, 6f
/* Set 1 byte. */
s8i a3, a5, 0
6: RET(16)
// .align XCHAL_INST_FETCH_WIDTH
__memset_aux:
/* Skip bytes to get proper alignment for three-byte loop */
// .skip XCHAL_INST_FETCH_WIDTH - 3
.Lbyteset:
#if XCHAL_HAVE_LOOPS
loopnez a4, 2f
#else
beqz a4, 2f
add a6, a5, a4 // a6 = ending address
#endif
1: s8i a3, a5, 0
addi a5, a5, 1
#if !XCHAL_HAVE_LOOPS
bltu a5, a6, 1b
#endif
2: RET(16)
.end schedule
.size memset, . - memset


@@ -0,0 +1,767 @@
/****************************************************************************
* libs/libc/machine/xtensa/arch_strcmp.S
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
****************************************************************************/
/****************************************************************************
* Included Files
****************************************************************************/
#include "xtensa_asm.h"
#include <arch/chip/core-isa.h>
#include <arch/xtensa/xtensa_abi.h>
/****************************************************************************
* Pre-processor Macros
****************************************************************************/
#define MASK4 0x40404040
/****************************************************************************
* Public Functions
****************************************************************************/
.section .text
.begin schedule
.align 4
.literal_position
.global strcmp
.type strcmp,@function
.align 4
strcmp:
#if XCHAL_HAVE_LOOPS && XCHAL_HAVE_DENSITY && !XCHAL_HAVE_BE && XCHAL_HAVE_FLIX3
/* Fast version for FLIX3 Little Endian */
ENTRY(16)
/* a2 = s1, a3 = s2 */
l8ui a8, a2, 0 # byte 0 from s1
l8ui a9, a3, 0 # byte 0 from s2
movi a10, 3 # mask
movi a5, 0xfffffffc
or a11, a2, a3
movi a4, MASK0 # mask for byte 0
movi a7, MASK4
addi a3, a3, -8
addi a2, a2, -8
and a5, a5, a2
bne.w18 a8, a9, .Lretdiff
l32i a8, a5, 8 # get word from aligned variant of s1
bany.w18 a11, a10, .Lnot_aligned
/* s1 is word-aligned; s2 is word-aligned.
If the zero-overhead loop option is available, use an (almost)
infinite zero-overhead loop with conditional exits so we only pay
for taken branches when exiting the loop. */
/* New algorithm, relying on the fact that all normal ASCII is between
32 and 127.
Rather than check all bytes for zero:
Take one word (4 bytes). Call it w1.
Shift w1 left by one into w1'.
Or w1 and w1'. For all normal ASCII bit 6 will be 1; for zero it won't.
Check that all 4 bit 6's (one for each byte) are one:
If they are, we are definitely not done.
If they are not, we are probably done, but need to check for zero. */
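
In C terms the probe described above is just the following test (a sketch; 0x40404040 is the MASK4 constant held in a7):

#include <stdbool.h>
#include <stdint.h>

/* For plain ASCII (0x20..0x7f) every byte of (w | (w << 1)) has bit 6
 * set, so a clear 0x40 bit means the word may hold a terminating NUL
 * and the careful per-byte check must run. */
static bool word_may_contain_nul(uint32_t w)
{
  uint32_t probe = w | (w << 1);

  return (probe & 0x40404040u) != 0x40404040u;
}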
.Laligned:
/* Loop forever */
1:
loop a0, .Laligned_done
/* First unrolled loop body. */
l32i a9, a3, 8 # get word from s2
addi a3, a3, 8 # advance s2 pointer
slli a5, a8, 1
or a10, a8, a5
{l32i a11, a2, 12 # get word from s1+4
bne.w18 a8, a9, .Lwne2}
l32i a9, a3, 4 # get word from s2+4
bnall.w18 a10, a7, .Lprobeq
/* Second unrolled loop body. */
slli a5, a11, 1
or a10, a11, a5
addi a2, a2, 8 # advance s1 pointer
mov a8, a11
bne.w18 a11, a9, .Lwne2
l32i a8, a2, 8 # get word from s1
bnall.w18 a10, a7, .Lprobeq2
.Laligned_done:
l32i a8, a2, 8 # get word from s1
j 1b
.Lnot_aligned:
xor a11, a2, a3 # compare low two bits of s1 and s2
bany a11, a10, .Lunaligned # if they have different alignment
/* s1/s2 are not word-aligned. */
movi a5, 0xfffffffc
addi a2, a2, 1 # advance s1
beqz a9, .Leq # bytes equal, if zero, strings are equal
addi a3, a3, 1 # advance s2
and a6, a2, a5
l32i a8, a6, 8 # get word from s1
bnone a2, a10, .Laligned # if s1/s2 now aligned
l8ui a8, a2, 8 # byte 1 from s1
l8ui a9, a3, 8 # byte 1 from s2
addi a2, a2, 1 # advance s1
bne a8, a9, .Lretdiff # if different, return difference
beqz a8, .Leq # bytes equal, if zero, strings are equal
addi a3, a3, 1 # advance s2
and a6, a2, a5
l32i a8, a6, 8 # get word from s1
bnone a2, a10, .Laligned # if s1/s2 now aligned
l8ui a8, a2, 8 # byte 2 from s1
l8ui a9, a3, 8 # byte 2 from s2
addi a2, a2, 1 # advance s1
bne a8, a9, .Lretdiff # if different, return difference
beqz a8, .Leq # bytes equal, if zero, strings are equal
addi a3, a3, 1 # advance s2
l32i a8, a2, 8 # get word from s1
j .Laligned
/* s1 and s2 have different alignment.
If the zero-overhead loop option is available, use an (almost)
infinite zero-overhead loop with conditional exits so we only pay
for taken branches when exiting the loop.
Note: It is important for this unaligned case to come before the
code for aligned strings, because otherwise some of the branches
above cannot reach and have to be transformed to branches around
jumps. The unaligned code is smaller and the branches can reach
over it. */
.Lunaligned:
movi.n a8, 0 # set up for the maximum loop count
loop a8, .Lretdiff # loop forever (almost anyway)
l8ui a8, a2, 8
l8ui a9, a3, 8
addi a2, a2, 1
bne a8, a9, .Lretdiff
addi a3, a3, 1
beqz a8, .Lretdiff
.Lretdiff:
sub a2, a8, a9
RET(16)
.Lprobeq2:
/* Adjust pointers to account for the loop unrolling. */
mov a8, a11
addi a2, a2, -4
addi a3, a3, 4
/* align (0 mod 4) */
.Lprobeq:
/* Words are probably equal, but check for sure.
If not, loop over the rest of string using normal algorithm. */
bnone a8, a4, .Leq # if byte 0 is zero
movi a5, MASK1 # mask for byte 1
movi a6, MASK2 # mask for byte 2
bnone a8, a5, .Leq # if byte 1 is zero
movi a7, MASK3 # mask for byte 3
bnone a8, a6, .Leq # if byte 2 is zero
bnone a8, a7, .Leq # if byte 3 is zero
/* align (1 mod 4) */
addi.n a2, a2, 12 # advance s1 pointer
addi.n a3, a3, 4 # advance s2 pointer
/* align (1 mod 4) or (2 mod 4) */
1:
loop a0, .Lend # loop forever (a4 is bigger than max iters)
l32i a8, a2, 0 # get word from s1
l32i a9, a3, 0 # get word from s2
addi a2, a2, 4 # advance s1 pointer
bne a8, a9, .Lwne
bnone a8, a4, .Leq # if byte 0 is zero
bnone a8, a5, .Leq # if byte 1 is zero
bnone a8, a6, .Leq # if byte 2 is zero
bnone a8, a7, .Leq # if byte 3 is zero
addi a3, a3, 4 # advance s2 pointer
.Lend:
j 1b
/* Words are equal; some byte is zero. */
.Leq: movi a2, 0 # return equal
RET(16)
.Lwne2: /* Words are not equal. On big-endian processors, if none of the
bytes are zero, the return value can be determined by a simple
comparison. */
.Lwne: /* Words are not equal. */
xor a2, a8, a9 # get word with nonzero in byte that differs
extui a10, a8, 0, 8
extui a11, a9, 0, 8
movi a5, MASK1 # mask for byte 1
bany.w18 a2, a4, .Ldiff0 # if byte 0 differs
bnone.w18 a8, a4, .Leq # if byte 0 is zero
movi a6, MASK2 # mask for byte 2
bany.w18 a2, a5, .Ldiff1 # if byte 1 differs
extui a10, a8, 24, 8
bnone.w18 a8, a5, .Leq # if byte 1 is zero
extui a11, a9, 24, 8
bany.w18 a2, a6, .Ldiff2 # if byte 2 differs
sub a2, a10, a11
bnone.w18 a8, a6, .Leq # if byte 2 is zero
/* Little-endian is a little more difficult because can't subtract
whole words. */
.Ldiff3:
/* Bytes 0-2 are equal; byte 3 is different.
For little-endian need to have a sign bit for the difference. */
RET(16)
.Ldiff0:
/* Byte 0 is different. */
sub a2, a10, a11
RET(16)
.Ldiff1:
/* Byte 0 is equal; byte 1 is different. */
extui a10, a8, 8, 8
extui a11, a9, 8, 8
sub a2, a10, a11
RET(16)
.Ldiff2:
/* Bytes 0-1 are equal; byte 2 is different. */
extui a10, a8, 16, 8
extui a11, a9, 16, 8
sub a2, a10, a11
RET(16)
#else
#if XCHAL_HAVE_LOOPS && XCHAL_HAVE_DENSITY && !XCHAL_HAVE_BE && XCHAL_HAVE_PDX4
/* Fast version for FLIX3 Little Endian */
ENTRY(16)
/* a2 = s1, a3 = s2 */
l8ui a8, a2, 0 # byte 0 from s1
l8ui a9, a3, 0 # byte 0 from s2
movi a10, 3 # mask
movi a5, 0xfffffffc
or a11, a2, a3
movi a4, MASK0 # mask for byte 0
movi a7, MASK4
addi a3, a3, -8
addi a2, a2, -8
and a5, a5, a2
bne.w15 a8, a9, .Lretdiff
l32i a8, a5, 8 # get word from aligned variant of s1
bany.w15 a11, a10, .Lnot_aligned
/* s1 is word-aligned; s2 is word-aligned.
If the zero-overhead loop option is available, use an (almost)
infinite zero-overhead loop with conditional exits so we only pay
for taken branches when exiting the loop. */
/* New algorithm, relying on the fact that all normal ASCII is between
32 and 127.
Rather than check all bytes for zero:
Take one word (4 bytes). Call it w1.
Shift w1 left by one into w1'.
Or w1 and w1'. For all normal ASCII bit 6 will be 1; for zero it won't.
Check that all 4 bit 6's (one for each byte) are one:
If they are, we are definitely not done.
If they are not, we are probably done, but need to check for zero. */
.Laligned:
/* Loop forever */
1:
loop a0, .Laligned_done
/* First unrolled loop body. */
l32i a9, a3, 8 # get word from s2
addi a3, a3, 8 # advance s2 pointer
slli a5, a8, 1
or a10, a8, a5
{
bne.w15 a8, a9, .Lwne2
l32i a11, a2, 12 # get word from s1+4
nop
nop
}
l32i a9, a3, 4 # get word from s2+4
bnall.w15 a10, a7, .Lprobeq
/* Second unrolled loop body. */
slli a5, a11, 1
or a10, a11, a5
addi a2, a2, 8 # advance s1 pointer
mov a8, a11
bne.w15 a11, a9, .Lwne2
l32i a8, a2, 8 # get word from s1
bnall.w15 a10, a7, .Lprobeq2
.Laligned_done:
l32i a8, a2, 8 # get word from s1
j 1b
.Lnot_aligned:
xor a11, a2, a3 # compare low two bits of s1 and s2
bany a11, a10, .Lunaligned # if they have different alignment
/* s1/s2 are not word-aligned. */
movi a5, 0xfffffffc
addi a2, a2, 1 # advance s1
beqz a9, .Leq # bytes equal, if zero, strings are equal
addi a3, a3, 1 # advance s2
and a6, a2, a5
l32i a8, a6, 8 # get word from s1
bnone a2, a10, .Laligned # if s1/s2 now aligned
l8ui a8, a2, 8 # byte 1 from s1
l8ui a9, a3, 8 # byte 1 from s2
addi a2, a2, 1 # advance s1
bne a8, a9, .Lretdiff # if different, return difference
beqz a8, .Leq # bytes equal, if zero, strings are equal
addi a3, a3, 1 # advance s2
and a6, a2, a5
l32i a8, a6, 8 # get word from s1
bnone a2, a10, .Laligned # if s1/s2 now aligned
l8ui a8, a2, 8 # byte 2 from s1
l8ui a9, a3, 8 # byte 2 from s2
addi a2, a2, 1 # advance s1
bne a8, a9, .Lretdiff # if different, return difference
beqz a8, .Leq # bytes equal, if zero, strings are equal
addi a3, a3, 1 # advance s2
l32i a8, a2, 8 # get word from s1
j .Laligned
/* s1 and s2 have different alignment.
If the zero-overhead loop option is available, use an (almost)
infinite zero-overhead loop with conditional exits so we only pay
for taken branches when exiting the loop.
Note: It is important for this unaligned case to come before the
code for aligned strings, because otherwise some of the branches
above cannot reach and have to be transformed to branches around
jumps. The unaligned code is smaller and the branches can reach
over it. */
.Lunaligned:
movi.n a8, 0 # set up for the maximum loop count
loop a8, .Lretdiff # loop forever (almost anyway)
l8ui a8, a2, 8
l8ui a9, a3, 8
addi a2, a2, 1
bne a8, a9, .Lretdiff
addi a3, a3, 1
beqz a8, .Lretdiff
.Lretdiff:
sub a2, a8, a9
RET(16)
.Lprobeq2:
/* Adjust pointers to account for the loop unrolling. */
mov a8, a11
addi a2, a2, -4
addi a3, a3, 4
/* align (0 mod 4) */
.Lprobeq:
/* Words are probably equal, but check for sure.
If not, loop over the rest of string using normal algorithm. */
bnone a8, a4, .Leq # if byte 0 is zero
movi a5, MASK1 # mask for byte 1
movi a6, MASK2 # mask for byte 2
bnone a8, a5, .Leq # if byte 1 is zero
movi a7, MASK3 # mask for byte 3
bnone a8, a6, .Leq # if byte 2 is zero
bnone a8, a7, .Leq # if byte 3 is zero
/* align (1 mod 4) */
addi.n a2, a2, 12 # advance s1 pointer
addi.n a3, a3, 4 # advance s2 pointer
/* align (1 mod 4) or (2 mod 4) */
1:
loop a0, .Lend # loop forever (a4 is bigger than max iters)
l32i a8, a2, 0 # get word from s1
l32i a9, a3, 0 # get word from s2
addi a2, a2, 4 # advance s1 pointer
bne a8, a9, .Lwne
bnone a8, a4, .Leq # if byte 0 is zero
bnone a8, a5, .Leq # if byte 1 is zero
bnone a8, a6, .Leq # if byte 2 is zero
bnone a8, a7, .Leq # if byte 3 is zero
addi a3, a3, 4 # advance s2 pointer
.Lend:
j 1b
/* Words are equal; some byte is zero. */
.Leq: movi a2, 0 # return equal
RET(16)
.Lwne2: /* Words are not equal. On big-endian processors, if none of the
bytes are zero, the return value can be determined by a simple
comparison. */
.Lwne: /* Words are not equal. */
xor a2, a8, a9 # get word with nonzero in byte that differs
extui a10, a8, 0, 8
extui a11, a9, 0, 8
movi a5, MASK1 # mask for byte 1
bany.w15 a2, a4, .Ldiff0 # if byte 0 differs
bnone.w15 a8, a4, .Leq # if byte 0 is zero
movi a6, MASK2 # mask for byte 2
bany.w15 a2, a5, .Ldiff1 # if byte 1 differs
extui a10, a8, 24, 8
bnone.w15 a8, a5, .Leq # if byte 1 is zero
extui a11, a9, 24, 8
bany.w15 a2, a6, .Ldiff2 # if byte 2 differs
sub a2, a10, a11
bnone.w15 a8, a6, .Leq # if byte 2 is zero
/* Little-endian is a little more difficult because can't subtract
whole words. */
.Ldiff3:
/* Bytes 0-2 are equal; byte 3 is different.
For little-endian need to have a sign bit for the difference. */
RET(16)
.Ldiff0:
/* Byte 0 is different. */
sub a2, a10, a11
RET(16)
.Ldiff1:
/* Byte 0 is equal; byte 1 is different. */
extui a10, a8, 8, 8
extui a11, a9, 8, 8
sub a2, a10, a11
RET(16)
.Ldiff2:
/* Bytes 0-1 are equal; byte 2 is different. */
extui a10, a8, 16, 8
extui a11, a9, 16, 8
sub a2, a10, a11
RET(16)
#else /* Not FLIX3 */
ENTRY(16)
/* a2 = s1, a3 = s2 */
l8ui a8, a2, 0 # byte 0 from s1
l8ui a9, a3, 0 # byte 0 from s2
movi a10, 3 # mask
bne a8, a9, .Lretdiff
or a11, a2, a3
bnone a11, a10, .Laligned
xor a11, a2, a3 # compare low two bits of s1 and s2
bany a11, a10, .Lunaligned # if they have different alignment
/* s1/s2 are not word-aligned. */
addi a2, a2, 1 # advance s1
beqz a8, .Leq # bytes equal, if zero, strings are equal
addi a3, a3, 1 # advance s2
bnone a2, a10, .Laligned # if s1/s2 now aligned
l8ui a8, a2, 0 # byte 1 from s1
l8ui a9, a3, 0 # byte 1 from s2
addi a2, a2, 1 # advance s1
bne a8, a9, .Lretdiff # if different, return difference
beqz a8, .Leq # bytes equal, if zero, strings are equal
addi a3, a3, 1 # advance s2
bnone a2, a10, .Laligned # if s1/s2 now aligned
l8ui a8, a2, 0 # byte 2 from s1
l8ui a9, a3, 0 # byte 2 from s2
addi a2, a2, 1 # advance s1
bne a8, a9, .Lretdiff # if different, return difference
beqz a8, .Leq # bytes equal, if zero, strings are equal
addi a3, a3, 1 # advance s2
j .Laligned
/* s1 and s2 have different alignment.
If the zero-overhead loop option is available, use an (almost)
infinite zero-overhead loop with conditional exits so we only pay
for taken branches when exiting the loop.
Note: It is important for this unaligned case to come before the
code for aligned strings, because otherwise some of the branches
above cannot reach and have to be transformed to branches around
jumps. The unaligned code is smaller and the branches can reach
over it. */
.align 4
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
/* (2 mod 4) alignment for loop instruction */
#else
/* (1 mod 4) alignment for loop instruction */
.byte 0
.byte 0
#endif
#endif
.Lunaligned:
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
_movi.n a8, 0 # set up for the maximum loop count
#else
_movi a8, 0 # set up for the maximum loop count
#endif
loop a8, .Lretdiff # loop forever (almost anyway)
#endif
.Lnextbyte:
l8ui a8, a2, 0
l8ui a9, a3, 0
addi a2, a2, 1
bne a8, a9, .Lretdiff
addi a3, a3, 1
#if XCHAL_HAVE_LOOPS
beqz a8, .Lretdiff
#else
bnez a8, .Lnextbyte
#endif
.Lretdiff:
sub a2, a8, a9
RET(16)
/* s1 is word-aligned; s2 is word-aligned.
If the zero-overhead loop option is available, use an (almost)
infinite zero-overhead loop with conditional exits so we only pay
for taken branches when exiting the loop. */
/* New algorithm, relying on the fact that all normal ASCII is between
32 and 127.
Rather than check all bytes for zero:
Take one word (4 bytes). Call it w1.
Shift w1 left by one into w1'.
Or w1 and w1'. For all normal ASCII bit 6 will be 1; for zero it won't.
Check that all 4 bit 6's (one for each byte) are one:
If they are, we are definitely not done.
If they are not, we are probably done, but need to check for zero. */
.align 4
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_CONST16
/* (2 mod 4) alignment for loop instruction */
.byte 0
#endif
.Laligned:
movi a4, MASK0 # mask for byte 0
movi a7, MASK4
/* Loop forever */
1:
loop a0, .Laligned_done
/* First unrolled loop body. */
l32i a8, a2, 0 # get word from s1
l32i a9, a3, 0 # get word from s2
slli a5, a8, 1
bne a8, a9, .Lwne2
or a9, a8, a5
bnall a9, a7, .Lprobeq
/* Second unrolled loop body. */
l32i a8, a2, 4 # get word from s1+4
l32i a9, a3, 4 # get word from s2+4
slli a5, a8, 1
bne a8, a9, .Lwne2
or a9, a8, a5
bnall a9, a7, .Lprobeq2
addi a2, a2, 8 # advance s1 pointer
addi a3, a3, 8 # advance s2 pointer
.Laligned_done:
j 1b
.Lprobeq2:
/* Adjust pointers to account for the loop unrolling. */
addi a2, a2, 4
addi a3, a3, 4
#else /* !XCHAL_HAVE_LOOPS */
.Laligned:
movi a4, MASK0 # mask for byte 0
movi a7, MASK4
j .Lfirstword
.Lnextword:
addi a2, a2, 4 # advance s1 pointer
addi a3, a3, 4 # advance s2 pointer
.Lfirstword:
l32i a8, a2, 0 # get word from s1
l32i a9, a3, 0 # get word from s2
slli a5, a8, 1
bne a8, a9, .Lwne2
or a9, a8, a5
ball a9, a7, .Lnextword
#endif /* !XCHAL_HAVE_LOOPS */
/* align (0 mod 4) */
.Lprobeq:
/* Words are probably equal, but check for sure.
If not, loop over the rest of string using normal algorithm. */
bnone a8, a4, .Leq # if byte 0 is zero
movi a5, MASK1 # mask for byte 1
movi a6, MASK2 # mask for byte 2
bnone a8, a5, .Leq # if byte 1 is zero
movi a7, MASK3 # mask for byte 3
bnone a8, a6, .Leq # if byte 2 is zero
bnone a8, a7, .Leq # if byte 3 is zero
/* align (1 mod 4) */
#if XCHAL_HAVE_DENSITY
addi.n a2, a2, 4 # advance s1 pointer
addi.n a3, a3, 4 # advance s2 pointer
/* align (1 mod 4) or (2 mod 4) */
#else
addi a2, a2, 4 # advance s1 pointer
addi a3, a3, 4 # advance s2 pointer
or a1, a1, a1 # nop
#if XCHAL_HAVE_CONST16
or a1, a1, a1 # nop
#endif
/* align (2 mod 4) */
#endif /* XCHAL_HAVE_DENSITY */
#if XCHAL_HAVE_LOOPS
1:
loop a0, .Leq # loop forever (a4 is bigger than max iters)
l32i a8, a2, 0 # get word from s1
l32i a9, a3, 0 # get word from s2
addi a2, a2, 4 # advance s1 pointer
bne a8, a9, .Lwne
bnone a8, a4, .Leq # if byte 0 is zero
bnone a8, a5, .Leq # if byte 1 is zero
bnone a8, a6, .Leq # if byte 2 is zero
bnone a8, a7, .Leq # if byte 3 is zero
addi a3, a3, 4 # advance s2 pointer
j 1b
#else /* !XCHAL_HAVE_LOOPS */
j .Lfirstword2
.Lnextword2:
addi a3, a3, 4 # advance s2 pointer
.Lfirstword2:
l32i a8, a2, 0 # get word from s1
l32i a9, a3, 0 # get word from s2
addi a2, a2, 4 # advance s1 pointer
bne a8, a9, .Lwne
bnone a8, a4, .Leq # if byte 0 is zero
bnone a8, a5, .Leq # if byte 1 is zero
bnone a8, a6, .Leq # if byte 2 is zero
bany a8, a7, .Lnextword2 # if byte 3 is not zero
#endif /* !XCHAL_HAVE_LOOPS */
/* Words are equal; some byte is zero. */
.Leq: movi a2, 0 # return equal
RET(16)
.Lwne2: /* Words are not equal. On big-endian processors, if none of the
bytes are zero, the return value can be determined by a simple
comparison. */
#if XCHAL_HAVE_BE
or a10, a8, a5
bnall a10, a7, .Lsomezero
bgeu a8, a9, .Lposreturn
movi a2, -1
RET(16)
.Lposreturn:
movi a2, 1
RET(16)
.Lsomezero: # There is probably some zero byte.
#endif /* XCHAL_HAVE_BE */
.Lwne: /* Words are not equal. */
xor a2, a8, a9 # get word with nonzero in byte that differs
bany a2, a4, .Ldiff0 # if byte 0 differs
movi a5, MASK1 # mask for byte 1
bnone a8, a4, .Leq # if byte 0 is zero
bany a2, a5, .Ldiff1 # if byte 1 differs
movi a6, MASK2 # mask for byte 2
bnone a8, a5, .Leq # if byte 1 is zero
bany a2, a6, .Ldiff2 # if byte 2 differs
bnone a8, a6, .Leq # if byte 2 is zero
#if XCHAL_HAVE_BE
.Ldiff3:
.Ldiff2:
.Ldiff1:
/* Byte 0 is equal (at least) and there is a difference before a zero
byte. Just subtract words to get the return value.
The high order equal bytes cancel, leaving room for the sign. */
sub a2, a8, a9
RET(16)
.Ldiff0:
/* Need to make room for the sign, so can't subtract whole words. */
extui a10, a8, 24, 8
extui a11, a9, 24, 8
sub a2, a10, a11
RET(16)
#else /* !XCHAL_HAVE_BE */
/* Little-endian is a little more difficult because can't subtract
whole words. */
.Ldiff3:
/* Bytes 0-2 are equal; byte 3 is different.
For little-endian need to have a sign bit for the difference. */
extui a10, a8, 24, 8
extui a11, a9, 24, 8
sub a2, a10, a11
RET(16)
.Ldiff0:
/* Byte 0 is different. */
extui a10, a8, 0, 8
extui a11, a9, 0, 8
sub a2, a10, a11
RET(16)
.Ldiff1:
/* Byte 0 is equal; byte 1 is different. */
extui a10, a8, 8, 8
extui a11, a9, 8, 8
sub a2, a10, a11
RET(16)
.Ldiff2:
/* Bytes 0-1 are equal; byte 2 is different. */
extui a10, a8, 16, 8
extui a11, a9, 16, 8
sub a2, a10, a11
RET(16)
#endif /* !XCHAL_HAVE_BE */
#endif /* FLIX3 */
#endif /* FLIX3 */
.end schedule
.size strcmp, . - strcmp


@@ -0,0 +1,243 @@
/****************************************************************************
* libs/libc/machine/xtensa/arch_strcpy.S
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
****************************************************************************/
/****************************************************************************
* Included Files
****************************************************************************/
#include "xtensa_asm.h"
#include <arch/chip/core-isa.h>
#include <arch/xtensa/xtensa_abi.h>
/****************************************************************************
* Public Functions
****************************************************************************/
.section .text
.begin schedule
.align 4
.literal_position
.global strcpy
.type strcpy, @function
strcpy:
ENTRY(16)
/* a2 = dst, a3 = src */
mov a10, a2 # leave dst in return value register
movi a4, MASK0
movi a5, MASK1
movi a6, MASK2
movi a7, MASK3
bbsi.l a3, 0, .Lsrc1mod2
bbsi.l a3, 1, .Lsrc2mod4
.Lsrcaligned:
/* Check if the destination is aligned. */
movi a8, 3
bnone a10, a8, .Laligned
j .Ldstunaligned
.Lsrc1mod2: # src address is odd
l8ui a8, a3, 0 # get byte 0
addi a3, a3, 1 # advance src pointer
s8i a8, a10, 0 # store byte 0
beqz a8, 1f # if byte 0 is zero
addi a10, a10, 1 # advance dst pointer
bbci.l a3, 1, .Lsrcaligned # if src is now word-aligned
.Lsrc2mod4: # src address is 2 mod 4
l8ui a8, a3, 0 # get byte 0
/* 1-cycle interlock */
s8i a8, a10, 0 # store byte 0
beqz a8, 1f # if byte 0 is zero
l8ui a8, a3, 1 # get byte 1
addi a3, a3, 2 # advance src pointer
s8i a8, a10, 1 # store byte 1
addi a10, a10, 2 # advance dst pointer
bnez a8, .Lsrcaligned
1: RET(16)
/* dst is word-aligned; src is word-aligned. */
.align 4
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
/* (2 mod 4) alignment for loop instruction */
#else
/* (1 mod 4) alignment for loop instruction */
.byte 0
.byte 0
#endif
.Laligned:
#if XCHAL_HAVE_DENSITY
_movi.n a8, 0 # set up for the maximum loop count
#else
_movi a8, 0 # set up for the maximum loop count
#endif
loop a8, .Lz3 # loop forever (almost anyway)
l32i a8, a3, 0 # get word from src
addi a3, a3, 4 # advance src pointer
bnone a8, a4, .Lz0 # if byte 0 is zero
bnone a8, a5, .Lz1 # if byte 1 is zero
bnone a8, a6, .Lz2 # if byte 2 is zero
s32i a8, a10, 0 # store word to dst
bnone a8, a7, .Lz3 # if byte 3 is zero
addi a10, a10, 4 # advance dst pointer
#else /* !XCHAL_HAVE_LOOPS */
1: addi a10, a10, 4 # advance dst pointer
.Laligned:
l32i a8, a3, 0 # get word from src
addi a3, a3, 4 # advance src pointer
bnone a8, a4, .Lz0 # if byte 0 is zero
bnone a8, a5, .Lz1 # if byte 1 is zero
bnone a8, a6, .Lz2 # if byte 2 is zero
s32i a8, a10, 0 # store word to dst
bany a8, a7, 1b # if byte 3 is not zero
#endif /* !XCHAL_HAVE_LOOPS */
.Lz3: /* Byte 3 is zero. */
RET(16)
.Lz0: /* Byte 0 is zero. */
#if XCHAL_HAVE_BE
movi a8, 0
#endif
s8i a8, a10, 0
RET(16)
.Lz1: /* Byte 1 is zero. */
#if XCHAL_HAVE_BE
extui a8, a8, 16, 16
#endif
s16i a8, a10, 0
RET(16)
.Lz2: /* Byte 2 is zero. */
#if XCHAL_HAVE_BE
extui a8, a8, 16, 16
#endif
s16i a8, a10, 0
movi a8, 0
s8i a8, a10, 2
RET(16)
#if 1
/* For now just use byte copy loop for the unaligned destination case. */
.align 4
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
/* (2 mod 4) alignment for loop instruction */
#else
/* (1 mod 4) alignment for loop instruction */
.byte 0
.byte 0
#endif
#endif
.Ldstunaligned:
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
_movi.n a8, 0 # set up for the maximum loop count
#else
_movi a8, 0 # set up for the maximum loop count
#endif
loop a8, 2f # loop forever (almost anyway)
#endif
1: l8ui a8, a3, 0
addi a3, a3, 1
s8i a8, a10, 0
addi a10, a10, 1
#if XCHAL_HAVE_LOOPS
beqz a8, 2f
#else
bnez a8, 1b
#endif
2: RET(16)
#else /* 0 */
/* This code is not functional yet. */
.Ldstunaligned:
l32i a9, a2, 0 # load word from dst
#if XCHAL_HAVE_BE
ssa8b a9 # rotate by dst alignment so that
src a9, a9, a9 # shift in loop will put back in place
ssa8l a9 # shift left by byte*8
#else
ssa8l a9 # rotate by dst alignment so that
src a9, a9, a9 # shift in loop will put back in place
ssa8b a9 # shift left by 32-byte*8
#endif
/* dst is word-aligned; src is unaligned. */
.Ldstunalignedloop:
l32i a8, a3, 0 # get word from src
/* 1-cycle interlock */
bnone a8, a4, .Lu0 # if byte 0 is zero
bnone a8, a5, .Lu1 # if byte 1 is zero
bnone a8, a6, .Lu2 # if byte 2 is zero
src a9, a8, a9 # combine last word and this word
s32i a9, a10, 0 # store word to dst
bnone a8, a7, .Lu3 # if byte 3 is zero
l32i a9, a3, 4 # get word from src
addi a3, a3, 8 # advance src pointer
bnone a9, a4, .Lu4 # if byte 0 is zero
bnone a9, a5, .Lu5 # if byte 1 is zero
bnone a9, a6, .Lu6 # if byte 2 is zero
src a8, a9, a8 # combine last word and this word
s32i a8, a10, 4 # store word to dst
addi a10, a10, 8 # advance dst pointer
bany a8, a7, .Ldstunalignedloop # if byte 3 is nonzero, iterate
/* Byte 7 is zero. */
.Lu7: RET(16)
.Lu0: /* Byte 0 is zero. */
#if XCHAL_HAVE_BE
movi a8, 0
#endif
s8i a8, a10, 0
RET(16)
.Lu1: /* Byte 1 is zero. */
#if XCHAL_HAVE_BE
extui a8, a8, 16, 16
#endif
s16i a8, a10, 0
RET(16)
.Lu2: /* Byte 2 is zero. */
s16i a8, a10, 0
movi a8, 0
s8i a8, a10, 2
RET(16)
#endif /* 0 */
.end schedule
.size strcpy, . - strcpy


@@ -0,0 +1,123 @@
/****************************************************************************
* libs/libc/machine/xtensa/arch_strlen.S
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
****************************************************************************/
/****************************************************************************
* Included Files
****************************************************************************/
#include "xtensa_asm.h"
#include <arch/chip/core-isa.h>
#include <arch/xtensa/xtensa_abi.h>
/****************************************************************************
* Public Functions
****************************************************************************/
.section .text
.begin schedule
.align 4
.literal_position
.global strlen
.type strlen, @function
strlen:
ENTRY(16)
/* a2 = s */
addi a3, a2, -4 # because we overincrement at the end
movi a4, MASK0
movi a5, MASK1
movi a6, MASK2
movi a7, MASK3
bbsi.l a2, 0, .L1mod2
bbsi.l a2, 1, .L2mod4
j .Laligned
.L1mod2: # address is odd
l8ui a8, a3, 4 # get byte 0
addi a3, a3, 1 # advance string pointer
beqz a8, .Lz3 # if byte 0 is zero
bbci.l a3, 1, .Laligned # if string pointer is now word-aligned
.L2mod4: # address is 2 mod 4
addi a3, a3, 2 # advance ptr for aligned access
l32i a8, a3, 0 # get word with first two bytes of string
bnone a8, a6, .Lz2 # if byte 2 (of word, not string) is zero
bany a8, a7, .Laligned # if byte 3 (of word, not string) is nonzero
/* Byte 3 is zero. */
addi a3, a3, 3 # point to zero byte
sub a2, a3, a2 # subtract to get length
RET(16)
/* String is word-aligned. */
.align 4
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
/* (2 mod 4) alignment for loop instruction */
#else
/* (1 mod 4) alignment for loop instruction */
.byte 0
.byte 0
#endif
#endif
.Laligned:
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
_movi.n a8, 0 # set up for the maximum loop count
#else
_movi a8, 0 # set up for the maximum loop count
#endif
loop a8, .Lz3 # loop forever (almost anyway)
#endif
1: l32i a8, a3, 4 # get next word of string
addi a3, a3, 4 # advance string pointer
bnone a8, a4, .Lz0 # if byte 0 is zero
bnone a8, a5, .Lz1 # if byte 1 is zero
bnone a8, a6, .Lz2 # if byte 2 is zero
#if XCHAL_HAVE_LOOPS
bnone a8, a7, .Lz3 # if byte 3 is zero
#else
bany a8, a7, 1b # repeat if byte 3 is non-zero
#endif
.Lz3: /* Byte 3 is zero. */
addi a3, a3, 3 # point to zero byte
/* Fall through.... */
.Lz0: /* Byte 0 is zero. */
sub a2, a3, a2 # subtract to get length
RET(16)
.Lz1: /* Byte 1 is zero. */
addi a3, a3, 1 # point to zero byte
sub a2, a3, a2 # subtract to get length
RET(16)
.Lz2: /* Byte 2 is zero. */
addi a3, a3, 2 # point to zero byte
sub a2, a3, a2 # subtract to get length
RET(16)
.end schedule
.size strlen, . - strlen
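
The scan is word-at-a-time: a3 is biased to s - 4 because each iteration loads at a3 + 4 and then advances a3, so when a zero byte is found a3 points at the word containing it and the length falls out of a simple subtraction. A little-endian C model of the same loop (illustrative; the big-endian build uses the mirrored MASK0..MASK3 values from xtensa_asm.h):

#include <stddef.h>
#include <stdint.h>

static size_t strlen_sketch(const char *s)
{
  const char *p = s;

  while (((uintptr_t)p & 3) != 0)     /* byte loop until word-aligned */
    {
      if (*p == '\0')
        {
          return (size_t)(p - s);
        }

      p++;
    }

  for (; ; p += 4)                    /* then test one word per pass */
    {
      uint32_t w = *(const uint32_t *)p;

      if ((w & 0x000000ffu) == 0) return (size_t)(p - s);      /* byte 0 */
      if ((w & 0x0000ff00u) == 0) return (size_t)(p - s) + 1;  /* byte 1 */
      if ((w & 0x00ff0000u) == 0) return (size_t)(p - s) + 2;  /* byte 2 */
      if ((w & 0xff000000u) == 0) return (size_t)(p - s) + 3;  /* byte 3 */
    }
}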


@@ -0,0 +1,265 @@
/****************************************************************************
* libs/libc/machine/xtensa/arch_strncpy.S
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
****************************************************************************/
/****************************************************************************
* Included Files
****************************************************************************/
#include "xtensa_asm.h"
#include <arch/chip/core-isa.h>
#include <arch/xtensa/xtensa_abi.h>
/****************************************************************************
* Public Functions
****************************************************************************/
.section .text
.begin schedule
.align 4
.literal_position
__strncpy_aux:
.Lsrc1mod2: # src address is odd
l8ui a8, a3, 0 # get byte 0
addi a3, a3, 1 # advance src pointer
s8i a8, a10, 0 # store byte 0
addi a4, a4, -1 # decrement n
beqz a4, .Lret # if n is zero
addi a10, a10, 1 # advance dst pointer
beqz a8, .Lfill # if byte 0 is zero
bbci.l a3, 1, .Lsrcaligned # if src is now word-aligned
.Lsrc2mod4: # src address is 2 mod 4
l8ui a8, a3, 0 # get byte 0
addi a4, a4, -1 # decrement n
s8i a8, a10, 0 # store byte 0
beqz a4, .Lret # if n is zero
addi a10, a10, 1 # advance dst pointer
beqz a8, .Lfill # if byte 0 is zero
l8ui a8, a3, 1 # get byte 1
addi a3, a3, 2 # advance src pointer
s8i a8, a10, 0 # store byte 1
addi a4, a4, -1 # decrement n
beqz a4, .Lret # if n is zero
addi a10, a10, 1 # advance dst pointer
bnez a8, .Lsrcaligned
j .Lfill
.Lret:
RET(16)
.align 4
.global strncpy
.type strncpy, @function
strncpy:
ENTRY(16)
/* a2 = dst, a3 = src */
mov a10, a2 # leave dst in return value register
beqz a4, .Lret # if n is zero
movi a11, MASK0
movi a5, MASK1
movi a6, MASK2
movi a7, MASK3
bbsi.l a3, 0, .Lsrc1mod2
bbsi.l a3, 1, .Lsrc2mod4
.Lsrcaligned:
/* Check if the destination is aligned. */
movi a8, 3
bnone a10, a8, .Laligned
j .Ldstunaligned
/* Fill the dst with zeros -- n is at least 1. */
.Lfill:
movi a9, 0
bbsi.l a10, 0, .Lfill1mod2
bbsi.l a10, 1, .Lfill2mod4
.Lfillaligned:
blti a4, 4, .Lfillcleanup
/* Loop filling complete words with zero. */
#if XCHAL_HAVE_LOOPS
srai a8, a4, 2
loop a8, 1f
s32i a9, a10, 0
addi a10, a10, 4
1: slli a8, a8, 2
sub a4, a4, a8
#else /* !XCHAL_HAVE_LOOPS */
1: s32i a9, a10, 0
addi a10, a10, 4
addi a4, a4, -4
bgei a4, 4, 1b
#endif /* !XCHAL_HAVE_LOOPS */
beqz a4, 2f
.Lfillcleanup:
/* Fill leftover (1 to 3) bytes with zero. */
s8i a9, a10, 0 # store byte 0
addi a4, a4, -1 # decrement n
addi a10, a10, 1
bnez a4, .Lfillcleanup
2: RET(16)
.Lfill1mod2: # dst address is odd
s8i a9, a10, 0 # store byte 0
addi a4, a4, -1 # decrement n
beqz a4, 2b # if n is zero
addi a10, a10, 1 # advance dst pointer
bbci.l a10, 1, .Lfillaligned # if dst is now word-aligned
.Lfill2mod4: # dst address is 2 mod 4
s8i a9, a10, 0 # store byte 0
addi a4, a4, -1 # decrement n
beqz a4, 2b # if n is zero
s8i a9, a10, 1 # store byte 1
addi a4, a4, -1 # decrement n
beqz a4, 2b # if n is zero
addi a10, a10, 2 # advance dst pointer
j .Lfillaligned
/* dst is word-aligned; src is word-aligned; n is at least 1. */
.align 4
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
/* (2 mod 4) alignment for loop instruction */
#else
/* (1 mod 4) alignment for loop instruction */
.byte 0
.byte 0
#endif
#endif
.Laligned:
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
_movi.n a8, 0 # set up for the maximum loop count
#else
_movi a8, 0 # set up for the maximum loop count
#endif
loop a8, 1f # loop forever (almost anyway)
blti a4, 5, .Ldstunaligned # n is near limit; do one at a time
l32i a8, a3, 0 # get word from src
addi a3, a3, 4 # advance src pointer
bnone a8, a11, .Lz0 # if byte 0 is zero
bnone a8, a5, .Lz1 # if byte 1 is zero
bnone a8, a6, .Lz2 # if byte 2 is zero
s32i a8, a10, 0 # store word to dst
addi a4, a4, -4 # decrement n
addi a10, a10, 4 # advance dst pointer
bnone a8, a7, .Lfill # if byte 3 is zero
1:
#else /* !XCHAL_HAVE_LOOPS */
1: blti a4, 5, .Ldstunaligned # n is near limit; do one at a time
l32i a8, a3, 0 # get word from src
addi a3, a3, 4 # advance src pointer
bnone a8, a11, .Lz0 # if byte 0 is zero
bnone a8, a5, .Lz1 # if byte 1 is zero
bnone a8, a6, .Lz2 # if byte 2 is zero
s32i a8, a10, 0 # store word to dst
addi a4, a4, -4 # decrement n
addi a10, a10, 4 # advance dst pointer
bany a8, a7, 1b # no zeroes
#endif /* !XCHAL_HAVE_LOOPS */
j .Lfill
.Lz0: /* Byte 0 is zero. */
#if XCHAL_HAVE_BE
movi a8, 0
#endif
s8i a8, a10, 0
addi a4, a4, -1 # decrement n
addi a10, a10, 1 # advance dst pointer
j .Lfill
.Lz1: /* Byte 1 is zero. */
#if XCHAL_HAVE_BE
extui a8, a8, 16, 16
#endif
s16i a8, a10, 0
addi a4, a4, -2 # decrement n
addi a10, a10, 2 # advance dst pointer
j .Lfill
.Lz2: /* Byte 2 is zero. */
#if XCHAL_HAVE_BE
extui a8, a8, 16, 16
#endif
s16i a8, a10, 0
movi a8, 0
s8i a8, a10, 2
addi a4, a4, -3 # decrement n
addi a10, a10, 3 # advance dst pointer
j .Lfill
.align 4
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
/* (2 mod 4) alignment for loop instruction */
#else
/* (1 mod 4) alignment for loop instruction */
.byte 0
.byte 0
#endif
#endif
.Ldstunaligned:
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
_movi.n a8, 0 # set up for the maximum loop count
#else
_movi a8, 0 # set up for the maximum loop count
#endif
loop a8, 2f # loop forever (almost anyway)
#endif
1: l8ui a8, a3, 0
addi a3, a3, 1
s8i a8, a10, 0
addi a4, a4, -1
beqz a4, 3f
addi a10, a10, 1
#if XCHAL_HAVE_LOOPS
beqz a8, 2f
#else
bnez a8, 1b
#endif
2: j .Lfill
3: RET(16)
.end schedule
.size strncpy, . - strncpy
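
Note the .Lfill paths: once the source terminator is reached, strncpy must pad the rest of the n destination bytes with NULs, which is why much of this file is a zero-fill loop rather than a copy loop. The reference behavior in C:

#include <stddef.h>

static char *strncpy_ref(char *dst, const char *src, size_t n)
{
  size_t i = 0;

  while (i < n && src[i] != '\0')     /* copy up to the terminator */
    {
      dst[i] = src[i];
      i++;
    }

  while (i < n)                       /* then pad the remainder */
    {
      dst[i] = '\0';
      i++;
    }

  return dst;
}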


@@ -0,0 +1,62 @@
/****************************************************************************
* libs/libc/machine/xtensa/xtensa_asm.h
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
****************************************************************************/
/****************************************************************************
* Included Files
****************************************************************************/
#include <arch/chip/core-isa.h>
/****************************************************************************
* Assembly Language Macros
****************************************************************************/
.macro src_b r, w0, w1
#if XCHAL_HAVE_BE
src \r, \w0, \w1
#else
src \r, \w1, \w0
#endif
.endm
.macro ssa8 r
#if XCHAL_HAVE_BE
ssa8b \r
#else
ssa8l \r
#endif
.endm
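
These two macros model the funnel shift used by the unaligned copy loops: ssa8 derives the shift amount from the low two address bits (SSA8L for little endian, SSA8B for big endian), and src_b then concatenates two neighbouring source words and extracts the misaligned 32-bit window. A little-endian C sketch of one src_b step, assuming shift = 8 * (src_address & 3) and a nonzero misalignment (illustrative only):

#include <stdint.h>

static uint32_t src_b_sketch(uint32_t this_word, uint32_t next_word,
                             unsigned int shift)
{
  /* Low bytes come from the current word, the rest from the next one. */
  return (this_word >> shift) | (next_word << (32u - shift));
}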
/****************************************************************************
* Pre-processor Macros
****************************************************************************/
#if XCHAL_HAVE_BE
# define MASK0 0xff000000
# define MASK1 0x00ff0000
# define MASK2 0x0000ff00
# define MASK3 0x000000ff
#else
# define MASK0 0x000000ff
# define MASK1 0x0000ff00
# define MASK2 0x00ff0000
# define MASK3 0xff000000
#endif