libc/machine/xtensa: add Xtensa libc implementation

Signed-off-by: zhuyanlin <zhuyanlin1@xiaomi.com>

parent 580d17cc02
commit cfcff5f570

@@ -2,3 +2,46 @@
# For a description of the syntax of this configuration file,
# see the file kconfig-language.txt in the NuttX tools repository.
#

config XTENSA_MEMCPY
	bool "Enable optimized memcpy() for XTENSA"
	select LIBC_ARCH_MEMCPY
	---help---
		Enable the optimized Xtensa-specific memcpy() library function.

config XTENSA_MEMMOVE
	bool "Enable optimized memmove() for XTENSA"
	select LIBC_ARCH_MEMMOVE
	---help---
		Enable the optimized Xtensa-specific memmove() library function.

config XTENSA_MEMSET
	bool "Enable optimized memset() for XTENSA"
	select LIBC_ARCH_MEMSET
	---help---
		Enable the optimized Xtensa-specific memset() library function.

config XTENSA_STRCMP
	bool "Enable optimized strcmp() for XTENSA"
	select LIBC_ARCH_STRCMP
	---help---
		Enable the optimized Xtensa-specific strcmp() library function.

config XTENSA_STRCPY
	bool "Enable optimized strcpy() for XTENSA"
	select LIBC_ARCH_STRCPY
	---help---
		Enable the optimized Xtensa-specific strcpy() library function.

config XTENSA_STRLEN
	bool "Enable optimized strlen() for XTENSA"
	select LIBC_ARCH_STRLEN
	---help---
		Enable the optimized Xtensa-specific strlen() library function.

config XTENSA_STRNCPY
	bool "Enable optimized strncpy() for XTENSA"
	select LIBC_ARCH_STRNCPY
	---help---
		Enable the optimized Xtensa-specific strncpy() library function.
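These options default to n; a board opts in from its configuration. A minimal sketch, assuming the usual NuttX defconfig syntax (pick whichever routines the board wants):

CONFIG_XTENSA_MEMCPY=y
CONFIG_XTENSA_MEMSET=y

Each option's "select LIBC_ARCH_*" line then steers the common libc build away from the generic C routine and toward the assembly file added below.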
@@ -19,10 +19,37 @@
############################################################################

ifeq ($(CONFIG_LIBC_ARCH_ELF),y)

CSRCS += arch_elf.c
endif

ifeq ($(CONFIG_XTENSA_MEMCPY),y)
ASRCS += arch_memcpy.S
endif

ifeq ($(CONFIG_XTENSA_MEMMOVE),y)
ASRCS += arch_memmove.S
endif

ifeq ($(CONFIG_XTENSA_MEMSET),y)
ASRCS += arch_memset.S
endif

ifeq ($(CONFIG_XTENSA_STRCPY),y)
ASRCS += arch_strcpy.S
endif

ifeq ($(CONFIG_XTENSA_STRLEN),y)
ASRCS += arch_strlen.S
endif

ifeq ($(CONFIG_XTENSA_STRNCPY),y)
ASRCS += arch_strncpy.S
endif

ifeq ($(CONFIG_XTENSA_STRCMP),y)
ASRCS += arch_strcmp.S
endif

DEPPATH += --dep-path machine/xtensa
VPATH += :machine/xtensa

endif
libs/libc/machine/xtensa/arch_memcpy.S (new file, 281 lines)
@@ -0,0 +1,281 @@
/****************************************************************************
 * libs/libc/machine/xtensa/arch_memcpy.S
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.  The
 * ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the
 * License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 ****************************************************************************/

/****************************************************************************
 * Included Files
 ****************************************************************************/

#include "xtensa_asm.h"

#include <arch/chip/core-isa.h>
#include <arch/xtensa/xtensa_abi.h>

/****************************************************************************
 * Pre-processor Macros
 ****************************************************************************/

/* Set to 1 when running on ISS (simulator) with the lint or ferret client,
 * or 0 to save a few cycles.
 */

#define SIM_CHECKS_ALIGNMENT  0

/****************************************************************************
 * Public Functions
 ****************************************************************************/
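/* Editor's note: a reading aid only, not part of the build.  When both
 * pointers end up word-aligned, the fast path below is roughly this C
 * (4-byte words; the names are illustrative):
 *
 *   void *memcpy_sketch(void *dst, const void *src, size_t n)
 *   {
 *     uint8_t *d = (uint8_t *)dst;
 *     const uint8_t *s = (const uint8_t *)src;
 *
 *     while (n > 0 && ((uintptr_t)d & 3) != 0)   // .Ldst1mod2/.Ldst2mod4
 *       {
 *         *d++ = *s++;
 *         n--;
 *       }
 *
 *     if (((uintptr_t)s & 3) == 0)               // else: shifting copy
 *       {
 *         uint32_t *dw = (uint32_t *)d;
 *         const uint32_t *sw = (const uint32_t *)s;
 *         size_t i;
 *
 *         for (i = n >> 4; i > 0; i--)           // 16 bytes per iteration
 *           {
 *             dw[0] = sw[0]; dw[1] = sw[1];
 *             dw[2] = sw[2]; dw[3] = sw[3];
 *             dw += 4; sw += 4;
 *           }
 *
 *         if (n & 8) { dw[0] = sw[0]; dw[1] = sw[1]; dw += 2; sw += 2; }
 *         if (n & 4) { dw[0] = sw[0]; dw++; sw++; }
 *         d = (uint8_t *)dw; s = (const uint8_t *)sw;
 *         if (n & 2) { d[0] = s[0]; d[1] = s[1]; d += 2; s += 2; }
 *         if (n & 1) { *d = *s; }
 *       }
 *
 *     return dst;
 *   }
 *
 * The assembly additionally byte-copies very short requests and handles an
 * unaligned source with the SSA8/SRC funnel-shift sequence further down.
 */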
    .section .text
    .begin schedule
    .literal_position

    .local  .Ldst1mod2
    .local  .Ldst2mod4
    .local  .Lbytecopy

    .align  4
    .global memcpy
    .type   memcpy, @function
memcpy:
    ENTRY(16)
    /* a2 = dst, a3 = src, a4 = len */

    mov     a5, a2              # copy dst so that a2 is return value
    bbsi.l  a2, 0, .Ldst1mod2
    bbsi.l  a2, 1, .Ldst2mod4
.Ldstaligned:

    /* Get number of loop iterations with 16B per iteration. */

    srli    a7, a4, 4

    /* Check if source is aligned. */

    slli    a8, a3, 30
    bnez    a8, .Lsrcunaligned

    /* Destination and source are word-aligned, use word copy. */

#if XCHAL_HAVE_LOOPS
    loopnez a7, 2f
#else
    beqz    a7, 2f
    slli    a8, a7, 4
    add     a8, a8, a3          # a8 = end of last 16B source chunk
#endif
1:  l32i    a6, a3, 0
    l32i    a7, a3, 4
    s32i    a6, a5, 0
    l32i    a6, a3, 8

    s32i    a7, a5, 4
    l32i    a7, a3, 12
    s32i    a6, a5, 8
    addi    a3, a3, 16
    s32i    a7, a5, 12
    addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
    bltu    a3, a8, 1b
#endif

    /* Copy any leftover pieces smaller than 16B. */

2:  bbci.l  a4, 3, 3f

    /* Copy 8 bytes. */

    l32i    a6, a3, 0
    l32i    a7, a3, 4
    addi    a3, a3, 8
    s32i    a6, a5, 0
    s32i    a7, a5, 4
    addi    a5, a5, 8

3:  bbsi.l  a4, 2, 4f
    bbsi.l  a4, 1, 5f
    bbsi.l  a4, 0, 6f
    RET(16)

# .align 4
    /* Copy 4 bytes. */

4:  l32i    a6, a3, 0
    addi    a3, a3, 4
    s32i    a6, a5, 0
    addi    a5, a5, 4
    bbsi.l  a4, 1, 5f
    bbsi.l  a4, 0, 6f
    RET(16)

    /* Copy 2 bytes. */

5:  l16ui   a6, a3, 0
    addi    a3, a3, 2
    s16i    a6, a5, 0
    addi    a5, a5, 2
    bbsi.l  a4, 0, 6f
    RET(16)

    /* Copy 1 byte. */

6:  l8ui    a6, a3, 0
    s8i     a6, a5, 0

.Ldone:
    RET(16)

    /* Destination is aligned; source is unaligned. */

# .align 4
.Lsrcunaligned:
    /* Avoid loading anything for zero-length copies. */

    beqz    a4, .Ldone

    /* Copy 16 bytes per iteration for word-aligned dst and
     * unaligned src.
     */
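    /* Editor's note: a reading aid only.  SSA8 derives the shift amount
     * from the low two bits of the source address, and each SRC_B (the
     * endian-aware funnel shift from xtensa_asm.h) combines two adjacent
     * aligned source words into one aligned destination word.  On
     * little-endian, one step is roughly this C:
     *
     *   k = 8 * (src_addr & 3);                  // nonzero on this path
     *   out = (w_lo >> k) | (w_hi << (32 - k));  // w_lo, w_hi: adjacent
     *                                            // aligned words
     *
     * so the loop issues only aligned l32i loads yet stores correctly
     * shifted data.
     */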
    ssa8    a3                  # set shift amount from byte offset
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
    srli    a11, a8, 30         # save unalignment offset for below
    sub     a3, a3, a11         # align a3
#endif
    l32i    a6, a3, 0           # load first word
#if XCHAL_HAVE_LOOPS
    loopnez a7, 2f
#else
    beqz    a7, 2f
    slli    a10, a7, 4
    add     a10, a10, a3        # a10 = end of last 16B source chunk
#endif
1:  l32i    a7, a3, 4
    l32i    a8, a3, 8
    src_b   a6, a6, a7
    s32i    a6, a5, 0
    l32i    a9, a3, 12
    src_b   a7, a7, a8
    s32i    a7, a5, 4
    l32i    a6, a3, 16
    src_b   a8, a8, a9
    s32i    a8, a5, 8
    addi    a3, a3, 16
    src_b   a9, a9, a6
    s32i    a9, a5, 12
    addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
    bltu    a3, a10, 1b
#endif

2:  bbci.l  a4, 3, 3f

    /* Copy 8 bytes. */

    l32i    a7, a3, 4
    l32i    a8, a3, 8
    src_b   a6, a6, a7
    s32i    a6, a5, 0
    addi    a3, a3, 8
    src_b   a7, a7, a8
    s32i    a7, a5, 4
    addi    a5, a5, 8
    mov     a6, a8

3:  bbci.l  a4, 2, 4f

    /* Copy 4 bytes. */

    l32i    a7, a3, 4
    addi    a3, a3, 4
    src_b   a6, a6, a7
    s32i    a6, a5, 0
    addi    a5, a5, 4
    mov     a6, a7
4:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
    add     a3, a3, a11         # readjust a3 with correct misalignment
#endif
    bbsi.l  a4, 1, 5f
    bbsi.l  a4, 0, 6f
    RET(16)

    /* Copy 2 bytes. */

5:  l8ui    a6, a3, 0
    l8ui    a7, a3, 1
    addi    a3, a3, 2
    s8i     a6, a5, 0
    s8i     a7, a5, 1
    addi    a5, a5, 2
    bbsi.l  a4, 0, 6f
    RET(16)

    /* Copy 1 byte. */

6:  l8ui    a6, a3, 0
    s8i     a6, a5, 0
    RET(16)

# .align XCHAL_INST_FETCH_WIDTH
__memcpy_aux:

    /* Skip bytes to get proper alignment for three-byte loop */

# .skip XCHAL_INST_FETCH_WIDTH - 3

.Lbytecopy:
#if XCHAL_HAVE_LOOPS
    loopnez a4, 2f
#else
    beqz    a4, 2f
    add     a7, a3, a4          # a7 = end address for source
#endif
1:  l8ui    a6, a3, 0
    addi    a3, a3, 1
    s8i     a6, a5, 0
    addi    a5, a5, 1
#if !XCHAL_HAVE_LOOPS
    bltu    a3, a7, 1b
#endif
2:  RET(16)

    /* Destination is unaligned. */

# .align 4
.Ldst1mod2:                     # dst is only byte aligned

    /* Do short copies byte-by-byte. */

    bltui   a4, 7, .Lbytecopy

    /* Copy 1 byte. */

    l8ui    a6, a3, 0
    addi    a3, a3, 1
    addi    a4, a4, -1
    s8i     a6, a5, 0
    addi    a5, a5, 1

    /* Return to main algorithm if dst is now aligned. */

    bbci.l  a5, 1, .Ldstaligned

.Ldst2mod4:                     # dst has 16-bit alignment

    /* Do short copies byte-by-byte. */

    bltui   a4, 6, .Lbytecopy

    /* Copy 2 bytes. */

    l8ui    a6, a3, 0
    l8ui    a7, a3, 1
    addi    a3, a3, 2
    addi    a4, a4, -2
    s8i     a6, a5, 0
    s8i     a7, a5, 1
    addi    a5, a5, 2

    /* dst is now aligned; return to main algorithm. */

    j       .Ldstaligned

    .end schedule

    .size   memcpy, . - memcpy
libs/libc/machine/xtensa/arch_memmove.S (new file, 480 lines)
@@ -0,0 +1,480 @@
/****************************************************************************
 * libs/libc/machine/xtensa/arch_memmove.S
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.  The
 * ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the
 * License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 ****************************************************************************/

/****************************************************************************
 * Included Files
 ****************************************************************************/

#include "xtensa_asm.h"

#include <arch/chip/core-isa.h>
#include <arch/xtensa/xtensa_abi.h>

/****************************************************************************
 * Pre-processor Macros
 ****************************************************************************/

/* Set to 1 when running on ISS (simulator) with the lint or ferret client,
 * or 0 to save a few cycles.
 */

#define SIM_CHECKS_ALIGNMENT  0

/****************************************************************************
 * Public Functions
 ****************************************************************************/
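/* Editor's note: a reading aid only.  memmove below picks the copy
 * direction with a single unsigned comparison.  Because (dst - src) wraps
 * modulo 2^32, "dst - src >= len" is true exactly when a forward copy
 * cannot clobber unread source bytes:
 *
 *   if ((uintptr_t)dst - (uintptr_t)src >= n)
 *     copy_forward(dst, src, n);            // memcpy-style, .Lcommon
 *   else
 *     copy_backward(dst + n, src + n, n);   // high-to-low, .Lback* labels
 *
 * copy_forward/copy_backward are illustrative names for the two halves of
 * this file, not real symbols.
 */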
    .text
    .begin schedule
    .global memmove

/*
 * Byte by byte copy
 */
    .align  4
    .byte   0                   # 1 mod 4 alignment for LOOPNEZ
                                # (0 mod 4 alignment for LBEG)
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
    loopnez a4, .Lbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
    beqz    a4, .Lbytecopydone
    add     a7, a3, a4          # a7 = end address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lnextbyte:
    l8ui    a6, a3, 0
    addi    a3, a3, 1
    s8i     a6, a5, 0
    addi    a5, a5, 1
#if !XCHAL_HAVE_LOOPS
    bne     a3, a7, .Lnextbyte  # continue loop if $a3:src != $a7:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Lbytecopydone:
    RET(16)

/*
 * Destination is unaligned
 */

    .align  4
.Ldst1mod2:                     # dst is only byte aligned
    _bltui  a4, 7, .Lbytecopy   # do short copies byte by byte

    # copy 1 byte
    l8ui    a6, a3, 0
    addi    a3, a3, 1
    addi    a4, a4, -1
    s8i     a6, a5, 0
    addi    a5, a5, 1
    _bbci.l a5, 1, .Ldstaligned # if dst is now aligned, then
                                # return to main algorithm
.Ldst2mod4:                     # dst 16-bit aligned
    # copy 2 bytes
    _bltui  a4, 6, .Lbytecopy   # do short copies byte by byte
    l8ui    a6, a3, 0
    l8ui    a7, a3, 1
    addi    a3, a3, 2
    addi    a4, a4, -2
    s8i     a6, a5, 0
    s8i     a7, a5, 1
    addi    a5, a5, 2
    j       .Ldstaligned        # dst is now aligned, return to main algorithm

.Lcommon:
    bbsi.l  a2, 0, .Ldst1mod2   # if dst is 1 mod 2
    bbsi.l  a2, 1, .Ldst2mod4   # if dst is 2 mod 4
.Ldstaligned:                   # return here from .Ldst?mod? once dst is aligned
    srli    a7, a4, 4           # number of loop iterations with 16B
                                # per iteration
    movi    a8, 3               # if source is not aligned,
    bany    a3, a8, .Lsrcunaligned  # then use shifting copy
/*
 * Destination and source are word-aligned, use word copy.
 */
    # copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
    loopnez a7, .backLoop1done
#else /* !XCHAL_HAVE_LOOPS */
    beqz    a7, .Loop1done
    slli    a8, a7, 4
    add     a8, a8, a3          # a8 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1:
    l32i    a6, a3, 0
    l32i    a7, a3, 4
    s32i    a6, a5, 0
    l32i    a6, a3, 8
    s32i    a7, a5, 4
    l32i    a7, a3, 12
    s32i    a6, a5, 8
    addi    a3, a3, 16
    s32i    a7, a5, 12
    addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
    bne     a3, a8, .Loop1      # continue loop if a3:src != a8:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop1done:
    bbci.l  a4, 3, .L2
    # copy 8 bytes
    l32i    a6, a3, 0
    l32i    a7, a3, 4
    addi    a3, a3, 8
    s32i    a6, a5, 0
    s32i    a7, a5, 4
    addi    a5, a5, 8
.L2:
    bbsi.l  a4, 2, .L3
    bbsi.l  a4, 1, .L4
    bbsi.l  a4, 0, .L5
    RET(16)
.L3:
    # copy 4 bytes
    l32i    a6, a3, 0
    addi    a3, a3, 4
    s32i    a6, a5, 0
    addi    a5, a5, 4
    bbsi.l  a4, 1, .L4
    bbsi.l  a4, 0, .L5
    RET(16)
.L4:
    # copy 2 bytes
    l16ui   a6, a3, 0
    addi    a3, a3, 2
    s16i    a6, a5, 0
    addi    a5, a5, 2
    bbsi.l  a4, 0, .L5
    RET(16)
.L5:
    # copy 1 byte
    l8ui    a6, a3, 0
    s8i     a6, a5, 0
    RET(16)

/*
 * Destination is aligned, Source is unaligned
 */

    .align  4
.Lsrcunaligned:
    _beqz   a4, .Ldone          # avoid loading anything for zero-length copies
    # copy 16 bytes per iteration for word-aligned dst and unaligned src
    ssa8    a3                  # set shift amount from byte offset

#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
    and     a11, a3, a8         # save unalignment offset for below
    sub     a3, a3, a11         # align a3
#endif
    l32i    a6, a3, 0           # load first word
#if XCHAL_HAVE_LOOPS
    loopnez a7, .Loop2done
#else /* !XCHAL_HAVE_LOOPS */
    beqz    a7, .Loop2done
    slli    a10, a7, 4
    add     a10, a10, a3        # a10 = end of last 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2:
    l32i    a7, a3, 4
    l32i    a8, a3, 8
    src_b   a6, a6, a7
    s32i    a6, a5, 0
    l32i    a9, a3, 12
    src_b   a7, a7, a8
    s32i    a7, a5, 4
    l32i    a6, a3, 16
    src_b   a8, a8, a9
    s32i    a8, a5, 8
    addi    a3, a3, 16
    src_b   a9, a9, a6
    s32i    a9, a5, 12
    addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
    bne     a3, a10, .Loop2     # continue loop if a3:src != a10:src_end
#endif /* !XCHAL_HAVE_LOOPS */
.Loop2done:
    bbci.l  a4, 3, .L12
    # copy 8 bytes
    l32i    a7, a3, 4
    l32i    a8, a3, 8
    src_b   a6, a6, a7
    s32i    a6, a5, 0
    addi    a3, a3, 8
    src_b   a7, a7, a8
    s32i    a7, a5, 4
    addi    a5, a5, 8
    mov     a6, a8
.L12:
    bbci.l  a4, 2, .L13
    # copy 4 bytes
    l32i    a7, a3, 4
    addi    a3, a3, 4
    src_b   a6, a6, a7
    s32i    a6, a5, 0
    addi    a5, a5, 4
    mov     a6, a7
.L13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
    add     a3, a3, a11         # readjust a3 with correct misalignment
#endif
    bbsi.l  a4, 1, .L14
    bbsi.l  a4, 0, .L15
.Ldone: RET(16)
.L14:
    # copy 2 bytes
    l8ui    a6, a3, 0
    l8ui    a7, a3, 1
    addi    a3, a3, 2
    s8i     a6, a5, 0
    s8i     a7, a5, 1
    addi    a5, a5, 2
    bbsi.l  a4, 0, .L15
    RET(16)
.L15:
    # copy 1 byte
    l8ui    a6, a3, 0
    s8i     a6, a5, 0
    RET(16)

/*
 * Byte by byte copy
 */
    .align  4
    .byte   0                   # 1 mod 4 alignment for LOOPNEZ
                                # (0 mod 4 alignment for LBEG)
.Lbackbytecopy:
#if XCHAL_HAVE_LOOPS
    loopnez a4, .Lbackbytecopydone
#else /* !XCHAL_HAVE_LOOPS */
    beqz    a4, .Lbackbytecopydone
    sub     a7, a3, a4          # a7 = start address for source
#endif /* !XCHAL_HAVE_LOOPS */
.Lbacknextbyte:
    addi    a3, a3, -1
    l8ui    a6, a3, 0
    addi    a5, a5, -1
    s8i     a6, a5, 0
#if !XCHAL_HAVE_LOOPS
    bne     a3, a7, .Lbacknextbyte  # continue loop if
                                    # $a3:src != $a7:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.Lbackbytecopydone:
    RET(16)

/*
 * Destination is unaligned
 */

    .align  4
.Lbackdst1mod2:                 # dst is only byte aligned
    _bltui  a4, 7, .Lbackbytecopy   # do short copies byte by byte

    # copy 1 byte
    addi    a3, a3, -1
    l8ui    a6, a3, 0
    addi    a5, a5, -1
    s8i     a6, a5, 0
    addi    a4, a4, -1
    _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
                                    # return to main algorithm
.Lbackdst2mod4:                 # dst 16-bit aligned
    # copy 2 bytes
    _bltui  a4, 6, .Lbackbytecopy   # do short copies byte by byte
    addi    a3, a3, -2
    l8ui    a6, a3, 0
    l8ui    a7, a3, 1
    addi    a5, a5, -2
    s8i     a6, a5, 0
    s8i     a7, a5, 1
    addi    a4, a4, -2
    j       .Lbackdstaligned    # dst is now aligned,
                                # return to main algorithm

    .align  4
memmove:

    ENTRY(16)
    # a2/ dst, a3/ src, a4/ len
    mov     a5, a2              # copy dst so that a2 is return value
.Lmovecommon:
    sub     a6, a5, a3
    bgeu    a6, a4, .Lcommon

    add     a5, a5, a4
    add     a3, a3, a4

    bbsi.l  a5, 0, .Lbackdst1mod2   # if dst is 1 mod 2
    bbsi.l  a5, 1, .Lbackdst2mod4   # if dst is 2 mod 4
.Lbackdstaligned:               # return here from .Lbackdst?mod? once dst is aligned
    srli    a7, a4, 4           # number of loop iterations with 16B
                                # per iteration
    movi    a8, 3               # if source is not aligned,
    bany    a3, a8, .Lbacksrcunaligned  # then use shifting copy
/*
 * Destination and source are word-aligned, use word copy.
 */
    # copy 16 bytes per iteration for word-aligned dst and word-aligned src
#if XCHAL_HAVE_LOOPS
    loopnez a7, .backLoop1done
#else /* !XCHAL_HAVE_LOOPS */
    beqz    a7, .backLoop1done
    slli    a8, a7, 4
    sub     a8, a3, a8          # a8 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1:
    addi    a3, a3, -16
    l32i    a7, a3, 12
    l32i    a6, a3, 8
    addi    a5, a5, -16
    s32i    a7, a5, 12
    l32i    a7, a3, 4
    s32i    a6, a5, 8
    l32i    a6, a3, 0
    s32i    a7, a5, 4
    s32i    a6, a5, 0
#if !XCHAL_HAVE_LOOPS
    bne     a3, a8, .backLoop1  # continue loop if a3:src != a8:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop1done:
    bbci.l  a4, 3, .Lback2
    # copy 8 bytes
    addi    a3, a3, -8
    l32i    a6, a3, 0
    l32i    a7, a3, 4
    addi    a5, a5, -8
    s32i    a6, a5, 0
    s32i    a7, a5, 4
.Lback2:
    bbsi.l  a4, 2, .Lback3
    bbsi.l  a4, 1, .Lback4
    bbsi.l  a4, 0, .Lback5
    RET(16)
.Lback3:
    # copy 4 bytes
    addi    a3, a3, -4
    l32i    a6, a3, 0
    addi    a5, a5, -4
    s32i    a6, a5, 0
    bbsi.l  a4, 1, .Lback4
    bbsi.l  a4, 0, .Lback5
    RET(16)
.Lback4:
    # copy 2 bytes
    addi    a3, a3, -2
    l16ui   a6, a3, 0
    addi    a5, a5, -2
    s16i    a6, a5, 0
    bbsi.l  a4, 0, .Lback5
    RET(16)
.Lback5:
    # copy 1 byte
    addi    a3, a3, -1
    l8ui    a6, a3, 0
    addi    a5, a5, -1
    s8i     a6, a5, 0
    RET(16)

/*
 * Destination is aligned, Source is unaligned
 */

    .align  4
.Lbacksrcunaligned:
    _beqz   a4, .Lbackdone      # avoid loading anything for zero-length copies
    # copy 16 bytes per iteration for word-aligned dst and unaligned src
    ssa8    a3                  # set shift amount from byte offset
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
    and     a11, a3, a8         # save unalignment offset for below
    sub     a3, a3, a11         # align a3
#endif
    l32i    a6, a3, 0           # load first word
#if XCHAL_HAVE_LOOPS
    loopnez a7, .backLoop2done
#else /* !XCHAL_HAVE_LOOPS */
    beqz    a7, .backLoop2done
    slli    a10, a7, 4
    sub     a10, a3, a10        # a10 = start of first 16B source chunk
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2:
    addi    a3, a3, -16
    l32i    a7, a3, 12
    l32i    a8, a3, 8
    addi    a5, a5, -16
    src_b   a6, a7, a6
    s32i    a6, a5, 12
    l32i    a9, a3, 4
    src_b   a7, a8, a7
    s32i    a7, a5, 8
    l32i    a6, a3, 0
    src_b   a8, a9, a8
    s32i    a8, a5, 4
    src_b   a9, a6, a9
    s32i    a9, a5, 0
#if !XCHAL_HAVE_LOOPS
    bne     a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
#endif /* !XCHAL_HAVE_LOOPS */
.backLoop2done:
    bbci.l  a4, 3, .Lback12
    # copy 8 bytes
    addi    a3, a3, -8
    l32i    a7, a3, 4
    l32i    a8, a3, 0
    addi    a5, a5, -8
    src_b   a6, a7, a6
    s32i    a6, a5, 4
    src_b   a7, a8, a7
    s32i    a7, a5, 0
    mov     a6, a8
.Lback12:
    bbci.l  a4, 2, .Lback13
    # copy 4 bytes
    addi    a3, a3, -4
    l32i    a7, a3, 0
    addi    a5, a5, -4
    src_b   a6, a7, a6
    s32i    a6, a5, 0
    mov     a6, a7
.Lback13:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
    add     a3, a3, a11         # readjust a3 with correct misalignment
#endif
    bbsi.l  a4, 1, .Lback14
    bbsi.l  a4, 0, .Lback15
.Lbackdone:
    RET(16)
.Lback14:
    # copy 2 bytes
    addi    a3, a3, -2
    l8ui    a6, a3, 0
    l8ui    a7, a3, 1
    addi    a5, a5, -2
    s8i     a6, a5, 0
    s8i     a7, a5, 1
    bbsi.l  a4, 0, .Lback15
    RET(16)
.Lback15:
    # copy 1 byte
    addi    a3, a3, -1
    addi    a5, a5, -1
    l8ui    a6, a3, 0
    s8i     a6, a5, 0
    RET(16)

    .end schedule
    .size   memmove, . - memmove
libs/libc/machine/xtensa/arch_memset.S (new file, 179 lines)
@@ -0,0 +1,179 @@
/****************************************************************************
 * libs/libc/machine/xtensa/arch_memset.S
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.  The
 * ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the
 * License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 ****************************************************************************/

/****************************************************************************
 * Included Files
 ****************************************************************************/

#include "xtensa_asm.h"

#include <arch/chip/core-isa.h>
#include <arch/xtensa/xtensa_abi.h>

/****************************************************************************
 * Public Functions
 ****************************************************************************/

/* void *memset (void *dst, int c, size_t length)
 *
 * The algorithm is as follows:
 *
 * Create a word with c in all byte positions.
 *
 * If the destination is aligned, set 16B chunks with a loop, and then
 * finish up with 8B, 4B, 2B, and 1B stores conditional on the length.
 *
 * If the destination is unaligned, align it by conditionally
 * setting 1B and/or 2B and then go to the aligned case.
 *
 * This code tries to use fall-through branches for the common
 * case of an aligned destination (except for the branches to
 * the alignment labels).
 */
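/* Editor's note: a reading aid only.  "A word with c in all byte
 * positions" is built with two shift/or steps, which in C is:
 *
 *   uint32_t w = (uint8_t)c;   // extui a3, a3, 0, 8
 *   w |= w << 8;               // slli/or
 *   w |= w << 16;              // slli/or
 *
 * e.g. c = 0x5a gives w = 0x5a5a5a5a, ready for the s32i stores below.
 */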
/* Byte-by-byte set. */

    .section .text
    .begin schedule
    .literal_position

    .local  .Lbyteset
    .local  .Ldst1mod2
    .local  .Ldst2mod4

    .align  4
    .global memset
    .type   memset, @function
memset:
    ENTRY(16)
    /* a2 = dst, a3 = c, a4 = length */

    /* Duplicate character into all bytes of word. */

    extui   a3, a3, 0, 8
    slli    a7, a3, 8
    or      a3, a3, a7
    slli    a7, a3, 16
    or      a3, a3, a7

    mov     a5, a2              // copy dst so that a2 is return value

    /* Check if dst is unaligned. */

    bbsi.l  a2, 0, .Ldst1mod2
    bbsi.l  a2, 1, .Ldst2mod4
    j       .Ldstaligned

.Ldst1mod2:                     // dst is only byte aligned

    /* Do short sizes byte-by-byte. */

    bltui   a4, 8, .Lbyteset

    /* Set 1 byte. */

    s8i     a3, a5, 0
    addi    a5, a5, 1
    addi    a4, a4, -1

    /* Now retest if dst is aligned. */

    bbci.l  a5, 1, .Ldstaligned

.Ldst2mod4:                     // dst has 16-bit alignment

    /* Do short sizes byte-by-byte. */

    bltui   a4, 8, .Lbyteset

    /* Set 2 bytes. */

    s16i    a3, a5, 0
    addi    a5, a5, 2
    addi    a4, a4, -2

    /* dst is now aligned; fall through to main algorithm */

.Ldstaligned:

    /* Get number of loop iterations with 16B per iteration. */

    srli    a7, a4, 4

    /* Destination is word-aligned. */

#if XCHAL_HAVE_LOOPS
    loopnez a7, 2f
#else
    beqz    a7, 2f
    slli    a6, a7, 4
    add     a6, a6, a5          // a6 = end of last 16B chunk
#endif
    /* Set 16 bytes per iteration. */

1:  s32i    a3, a5, 0
    s32i    a3, a5, 4
    s32i    a3, a5, 8
    s32i    a3, a5, 12
    addi    a5, a5, 16
#if !XCHAL_HAVE_LOOPS
    bltu    a5, a6, 1b
#endif

    /* Set any leftover pieces smaller than 16B. */

2:  bbci.l  a4, 3, 3f

    /* Set 8 bytes. */

    s32i    a3, a5, 0
    s32i    a3, a5, 4
    addi    a5, a5, 8

3:  bbci.l  a4, 2, 4f

    /* Set 4 bytes. */

    s32i    a3, a5, 0
    addi    a5, a5, 4

4:  bbci.l  a4, 1, 5f

    /* Set 2 bytes. */

    s16i    a3, a5, 0
    addi    a5, a5, 2

5:  bbci.l  a4, 0, 6f

    /* Set 1 byte. */

    s8i     a3, a5, 0
6:  RET(16)

// .align XCHAL_INST_FETCH_WIDTH
__memset_aux:

    /* Skip bytes to get proper alignment for three-byte loop */

// .skip XCHAL_INST_FETCH_WIDTH - 3

.Lbyteset:
#if XCHAL_HAVE_LOOPS
    loopnez a4, 2f
#else
    beqz    a4, 2f
    add     a6, a5, a4          // a6 = ending address
#endif
1:  s8i     a3, a5, 0
    addi    a5, a5, 1
#if !XCHAL_HAVE_LOOPS
    bltu    a5, a6, 1b
#endif
2:  RET(16)

    .end schedule

    .size   memset, . - memset
libs/libc/machine/xtensa/arch_strcmp.S (new file, 767 lines)
@@ -0,0 +1,767 @@
/****************************************************************************
 * libs/libc/machine/xtensa/arch_strcmp.S
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.  The
 * ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the
 * License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 ****************************************************************************/

/****************************************************************************
 * Included Files
 ****************************************************************************/

#include "xtensa_asm.h"

#include <arch/chip/core-isa.h>
#include <arch/xtensa/xtensa_abi.h>

/****************************************************************************
 * Pre-processor Macros
 ****************************************************************************/

#define MASK4 0x40404040

/****************************************************************************
 * Public Functions
 ****************************************************************************/

    .section .text
    .begin schedule
    .align  4
    .literal_position

    .global strcmp
    .type   strcmp, @function
    .align  4

strcmp:

#if XCHAL_HAVE_LOOPS && XCHAL_HAVE_DENSITY && !XCHAL_HAVE_BE && XCHAL_HAVE_FLIX3
/* Fast version for FLIX3 Little Endian */

    ENTRY(16)
    /* a2 = s1, a3 = s2 */

    l8ui    a8, a2, 0           # byte 0 from s1
    l8ui    a9, a3, 0           # byte 0 from s2
    movi    a10, 3              # mask
    movi    a5, 0xfffffffc
    or      a11, a2, a3
    movi    a4, MASK0           # mask for byte 0
    movi    a7, MASK4
    addi    a3, a3, -8
    addi    a2, a2, -8
    and     a5, a5, a2
    bne.w18 a8, a9, .Lretdiff
    l32i    a8, a5, 8           # get word from aligned variant of s1

    bany.w18 a11, a10, .Lnot_aligned

    /* s1 is word-aligned; s2 is word-aligned.
     *
     * If the zero-overhead loop option is available, use an (almost)
     * infinite zero-overhead loop with conditional exits so we only pay
     * for taken branches when exiting the loop.
     */

    /* New algorithm, relying on the fact that all normal ASCII is between
     * 32 and 127.
     *
     * Rather than check all bytes for zero:
     * Take one word (4 bytes).  Call it w1.
     * Shift w1 left by one into w1'.
     * Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it
     * won't.  Check that all 4 bit 6's (one for each byte) are one:
     * If they are, we are definitely not done.
     * If they are not, we are probably done, but need to check for zero.
     */
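    /* Editor's note: a reading aid only.  The probe above in C, for one
     * word w of four string bytes:
     *
     *   uint32_t probe = w | (w << 1);           // slli + or
     *   if ((probe & 0x40404040) == 0x40404040)  // ball/bnall vs. MASK4
     *     ;  // bit 6 set in every byte: no NUL byte here, keep going
     *   else
     *     ;  // possible NUL; take the exact per-byte checks (.Lprobeq)
     *
     * A NUL byte (0x00) stays 0 under w | (w << 1), so its bit 6 is clear;
     * printable ASCII (0x20..0x7e) always ends up with bit 6 set.  Bytes
     * such as 0x80 can also leave bit 6 clear, which is why the "probably
     * done" slow path re-checks each byte for zero.
     */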
.Laligned:
    /* Loop forever */
1:
    loop    a0, .Laligned_done

    /* First unrolled loop body. */

    l32i    a9, a3, 8           # get word from s2
    addi    a3, a3, 8           # advance s2 pointer
    slli    a5, a8, 1
    or      a10, a8, a5
    { l32i    a11, a2, 12       # get word from s1+4
      bne.w18 a8, a9, .Lwne2 }
    l32i    a9, a3, 4           # get word from s2+4
    bnall.w18 a10, a7, .Lprobeq

    /* Second unrolled loop body. */

    slli    a5, a11, 1
    or      a10, a11, a5
    addi    a2, a2, 8           # advance s1 pointer
    mov     a8, a11
    bne.w18 a11, a9, .Lwne2
    l32i    a8, a2, 8           # get word from s1
    bnall.w18 a10, a7, .Lprobeq2

.Laligned_done:
    l32i    a8, a2, 8           # get word from s1
    j       1b

.Lnot_aligned:
    xor     a11, a2, a3         # compare low two bits of s1 and s2
    bany    a11, a10, .Lunaligned   # if they have different alignment

    /* s1/s2 are not word-aligned. */

    movi    a5, 0xfffffffc
    addi    a2, a2, 1           # advance s1
    beqz    a9, .Leq            # bytes equal, if zero, strings are equal
    addi    a3, a3, 1           # advance s2
    and     a6, a2, a5
    l32i    a8, a6, 8           # get word from s1
    bnone   a2, a10, .Laligned  # if s1/s2 now aligned
    l8ui    a8, a2, 8           # byte 1 from s1
    l8ui    a9, a3, 8           # byte 1 from s2
    addi    a2, a2, 1           # advance s1
    bne     a8, a9, .Lretdiff   # if different, return difference
    beqz    a8, .Leq            # bytes equal, if zero, strings are equal
    addi    a3, a3, 1           # advance s2
    and     a6, a2, a5
    l32i    a8, a6, 8           # get word from s1
    bnone   a2, a10, .Laligned  # if s1/s2 now aligned
    l8ui    a8, a2, 8           # byte 2 from s1
    l8ui    a9, a3, 8           # byte 2 from s2
    addi    a2, a2, 1           # advance s1
    bne     a8, a9, .Lretdiff   # if different, return difference
    beqz    a8, .Leq            # bytes equal, if zero, strings are equal
    addi    a3, a3, 1           # advance s2
    l32i    a8, a2, 8           # get word from s1
    j       .Laligned

    /* s1 and s2 have different alignment.
     *
     * If the zero-overhead loop option is available, use an (almost)
     * infinite zero-overhead loop with conditional exits so we only pay
     * for taken branches when exiting the loop.
     *
     * Note: It is important for this unaligned case to come before the
     * code for aligned strings, because otherwise some of the branches
     * above cannot reach and have to be transformed to branches around
     * jumps.  The unaligned code is smaller and the branches can reach
     * over it.
     */

.Lunaligned:
    movi.n  a8, 0               # set up for the maximum loop count
    loop    a8, .Lretdiff       # loop forever (almost anyway)
    l8ui    a8, a2, 8
    l8ui    a9, a3, 8
    addi    a2, a2, 1
    bne     a8, a9, .Lretdiff
    addi    a3, a3, 1
    beqz    a8, .Lretdiff
.Lretdiff:
    sub     a2, a8, a9
    RET(16)

.Lprobeq2:
    /* Adjust pointers to account for the loop unrolling. */

    mov     a8, a11
    addi    a2, a2, -4
    addi    a3, a3, 4

    /* align (0 mod 4) */
.Lprobeq:
    /* Words are probably equal, but check for sure.
     * If not, loop over the rest of string using normal algorithm.
     */

    bnone   a8, a4, .Leq        # if byte 0 is zero
    movi    a5, MASK1           # mask for byte 1
    movi    a6, MASK2           # mask for byte 2
    bnone   a8, a5, .Leq        # if byte 1 is zero
    movi    a7, MASK3           # mask for byte 3
    bnone   a8, a6, .Leq        # if byte 2 is zero
    bnone   a8, a7, .Leq        # if byte 3 is zero
    /* align (1 mod 4) */
    addi.n  a2, a2, 12          # advance s1 pointer
    addi.n  a3, a3, 4           # advance s2 pointer
    /* align (1 mod 4) or (2 mod 4) */
1:
    loop    a0, .Lend           # loop forever (a4 is bigger than max iters)

    l32i    a8, a2, 0           # get word from s1
    l32i    a9, a3, 0           # get word from s2
    addi    a2, a2, 4           # advance s1 pointer
    bne     a8, a9, .Lwne
    bnone   a8, a4, .Leq        # if byte 0 is zero
    bnone   a8, a5, .Leq        # if byte 1 is zero
    bnone   a8, a6, .Leq        # if byte 2 is zero
    bnone   a8, a7, .Leq        # if byte 3 is zero
    addi    a3, a3, 4           # advance s2 pointer
.Lend:
    j       1b

    /* Words are equal; some byte is zero. */
.Leq:   movi    a2, 0           # return equal
    RET(16)

.Lwne2: /* Words are not equal.  On big-endian processors, if none of the
         * bytes are zero, the return value can be determined by a simple
         * comparison.
         */
.Lwne:  /* Words are not equal. */
    xor     a2, a8, a9          # get word with nonzero in byte that differs
    extui   a10, a8, 0, 8
    extui   a11, a9, 0, 8
    movi    a5, MASK1           # mask for byte 1
    bany.w18 a2, a4, .Ldiff0    # if byte 0 differs

    bnone.w18 a8, a4, .Leq      # if byte 0 is zero
    movi    a6, MASK2           # mask for byte 2
    bany.w18 a2, a5, .Ldiff1    # if byte 1 differs
    extui   a10, a8, 24, 8
    bnone.w18 a8, a5, .Leq      # if byte 1 is zero
    extui   a11, a9, 24, 8
    bany.w18 a2, a6, .Ldiff2    # if byte 2 differs
    sub     a2, a10, a11
    bnone.w18 a8, a6, .Leq      # if byte 2 is zero
    /* Little-endian is a little more difficult because we can't subtract
     * whole words.
     */
.Ldiff3:
    /* Bytes 0-2 are equal; byte 3 is different.
     * For little-endian we need to have a sign bit for the difference.
     */
    RET(16)
.Ldiff0:
    /* Byte 0 is different. */
    sub     a2, a10, a11
    RET(16)

.Ldiff1:
    /* Byte 0 is equal; byte 1 is different. */
    extui   a10, a8, 8, 8
    extui   a11, a9, 8, 8
    sub     a2, a10, a11
    RET(16)

.Ldiff2:
    /* Bytes 0-1 are equal; byte 2 is different. */
    extui   a10, a8, 16, 8
    extui   a11, a9, 16, 8
    sub     a2, a10, a11
    RET(16)

#else
#if XCHAL_HAVE_LOOPS && XCHAL_HAVE_DENSITY && !XCHAL_HAVE_BE && XCHAL_HAVE_PDX4
/* Fast version for PDX4 Little Endian */

    ENTRY(16)
    /* a2 = s1, a3 = s2 */

    l8ui    a8, a2, 0           # byte 0 from s1
    l8ui    a9, a3, 0           # byte 0 from s2
    movi    a10, 3              # mask
    movi    a5, 0xfffffffc
    or      a11, a2, a3
    movi    a4, MASK0           # mask for byte 0
    movi    a7, MASK4
    addi    a3, a3, -8
    addi    a2, a2, -8
    and     a5, a5, a2
    bne.w15 a8, a9, .Lretdiff
    l32i    a8, a5, 8           # get word from aligned variant of s1

    bany.w15 a11, a10, .Lnot_aligned

    /* s1 is word-aligned; s2 is word-aligned.
     *
     * If the zero-overhead loop option is available, use an (almost)
     * infinite zero-overhead loop with conditional exits so we only pay
     * for taken branches when exiting the loop.
     */

    /* New algorithm, relying on the fact that all normal ASCII is between
     * 32 and 127.
     *
     * Rather than check all bytes for zero:
     * Take one word (4 bytes).  Call it w1.
     * Shift w1 left by one into w1'.
     * Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it
     * won't.  Check that all 4 bit 6's (one for each byte) are one:
     * If they are, we are definitely not done.
     * If they are not, we are probably done, but need to check for zero.
     */

.Laligned:
    /* Loop forever */
1:
    loop    a0, .Laligned_done

    /* First unrolled loop body. */

    l32i    a9, a3, 8           # get word from s2
    addi    a3, a3, 8           # advance s2 pointer
    slli    a5, a8, 1
    or      a10, a8, a5
    {
      bne.w15 a8, a9, .Lwne2
      l32i    a11, a2, 12       # get word from s1+4
      nop
      nop
    }
    l32i    a9, a3, 4           # get word from s2+4
    bnall.w15 a10, a7, .Lprobeq

    /* Second unrolled loop body. */

    slli    a5, a11, 1
    or      a10, a11, a5
    addi    a2, a2, 8           # advance s1 pointer
    mov     a8, a11
    bne.w15 a11, a9, .Lwne2
    l32i    a8, a2, 8           # get word from s1
    bnall.w15 a10, a7, .Lprobeq2

.Laligned_done:
    l32i    a8, a2, 8           # get word from s1
    j       1b

.Lnot_aligned:
    xor     a11, a2, a3         # compare low two bits of s1 and s2
    bany    a11, a10, .Lunaligned   # if they have different alignment

    /* s1/s2 are not word-aligned. */

    movi    a5, 0xfffffffc
    addi    a2, a2, 1           # advance s1
    beqz    a9, .Leq            # bytes equal, if zero, strings are equal
    addi    a3, a3, 1           # advance s2
    and     a6, a2, a5
    l32i    a8, a6, 8           # get word from s1
    bnone   a2, a10, .Laligned  # if s1/s2 now aligned
    l8ui    a8, a2, 8           # byte 1 from s1
    l8ui    a9, a3, 8           # byte 1 from s2
    addi    a2, a2, 1           # advance s1
    bne     a8, a9, .Lretdiff   # if different, return difference
    beqz    a8, .Leq            # bytes equal, if zero, strings are equal
    addi    a3, a3, 1           # advance s2
    and     a6, a2, a5
    l32i    a8, a6, 8           # get word from s1
    bnone   a2, a10, .Laligned  # if s1/s2 now aligned
    l8ui    a8, a2, 8           # byte 2 from s1
    l8ui    a9, a3, 8           # byte 2 from s2
    addi    a2, a2, 1           # advance s1
    bne     a8, a9, .Lretdiff   # if different, return difference
    beqz    a8, .Leq            # bytes equal, if zero, strings are equal
    addi    a3, a3, 1           # advance s2
    l32i    a8, a2, 8           # get word from s1
    j       .Laligned

    /* s1 and s2 have different alignment.
     *
     * If the zero-overhead loop option is available, use an (almost)
     * infinite zero-overhead loop with conditional exits so we only pay
     * for taken branches when exiting the loop.
     *
     * Note: It is important for this unaligned case to come before the
     * code for aligned strings, because otherwise some of the branches
     * above cannot reach and have to be transformed to branches around
     * jumps.  The unaligned code is smaller and the branches can reach
     * over it.
     */

.Lunaligned:
    movi.n  a8, 0               # set up for the maximum loop count
    loop    a8, .Lretdiff       # loop forever (almost anyway)
    l8ui    a8, a2, 8
    l8ui    a9, a3, 8
    addi    a2, a2, 1
    bne     a8, a9, .Lretdiff
    addi    a3, a3, 1
    beqz    a8, .Lretdiff
.Lretdiff:
    sub     a2, a8, a9
    RET(16)

.Lprobeq2:
    /* Adjust pointers to account for the loop unrolling. */

    mov     a8, a11
    addi    a2, a2, -4
    addi    a3, a3, 4

    /* align (0 mod 4) */
.Lprobeq:
    /* Words are probably equal, but check for sure.
     * If not, loop over the rest of string using normal algorithm.
     */

    bnone   a8, a4, .Leq        # if byte 0 is zero
    movi    a5, MASK1           # mask for byte 1
    movi    a6, MASK2           # mask for byte 2
    bnone   a8, a5, .Leq        # if byte 1 is zero
    movi    a7, MASK3           # mask for byte 3
    bnone   a8, a6, .Leq        # if byte 2 is zero
    bnone   a8, a7, .Leq        # if byte 3 is zero
    /* align (1 mod 4) */
    addi.n  a2, a2, 12          # advance s1 pointer
    addi.n  a3, a3, 4           # advance s2 pointer
    /* align (1 mod 4) or (2 mod 4) */
1:
    loop    a0, .Lend           # loop forever (a4 is bigger than max iters)

    l32i    a8, a2, 0           # get word from s1
    l32i    a9, a3, 0           # get word from s2
    addi    a2, a2, 4           # advance s1 pointer
    bne     a8, a9, .Lwne
    bnone   a8, a4, .Leq        # if byte 0 is zero
    bnone   a8, a5, .Leq        # if byte 1 is zero
    bnone   a8, a6, .Leq        # if byte 2 is zero
    bnone   a8, a7, .Leq        # if byte 3 is zero
    addi    a3, a3, 4           # advance s2 pointer
.Lend:
    j       1b

    /* Words are equal; some byte is zero. */
.Leq:   movi    a2, 0           # return equal
    RET(16)

.Lwne2: /* Words are not equal.  On big-endian processors, if none of the
         * bytes are zero, the return value can be determined by a simple
         * comparison.
         */
.Lwne:  /* Words are not equal. */
    xor     a2, a8, a9          # get word with nonzero in byte that differs
    extui   a10, a8, 0, 8
    extui   a11, a9, 0, 8
    movi    a5, MASK1           # mask for byte 1
    bany.w15 a2, a4, .Ldiff0    # if byte 0 differs

    bnone.w15 a8, a4, .Leq      # if byte 0 is zero
    movi    a6, MASK2           # mask for byte 2
    bany.w15 a2, a5, .Ldiff1    # if byte 1 differs
    extui   a10, a8, 24, 8
    bnone.w15 a8, a5, .Leq      # if byte 1 is zero
    extui   a11, a9, 24, 8
    bany.w15 a2, a6, .Ldiff2    # if byte 2 differs
    sub     a2, a10, a11
    bnone.w15 a8, a6, .Leq      # if byte 2 is zero
    /* Little-endian is a little more difficult because we can't subtract
     * whole words.
     */
.Ldiff3:
    /* Bytes 0-2 are equal; byte 3 is different.
     * For little-endian we need to have a sign bit for the difference.
     */
    RET(16)
.Ldiff0:
    /* Byte 0 is different. */
    sub     a2, a10, a11
    RET(16)

.Ldiff1:
    /* Byte 0 is equal; byte 1 is different. */
    extui   a10, a8, 8, 8
    extui   a11, a9, 8, 8
    sub     a2, a10, a11
    RET(16)

.Ldiff2:
    /* Bytes 0-1 are equal; byte 2 is different. */
    extui   a10, a8, 16, 8
    extui   a11, a9, 16, 8
    sub     a2, a10, a11
    RET(16)

#else /* Not PDX4 */
    ENTRY(16)
    /* a2 = s1, a3 = s2 */

    l8ui    a8, a2, 0           # byte 0 from s1
    l8ui    a9, a3, 0           # byte 0 from s2
    movi    a10, 3              # mask
    bne     a8, a9, .Lretdiff

    or      a11, a2, a3
    bnone   a11, a10, .Laligned

    xor     a11, a2, a3         # compare low two bits of s1 and s2
    bany    a11, a10, .Lunaligned   # if they have different alignment

    /* s1/s2 are not word-aligned. */

    addi    a2, a2, 1           # advance s1
    beqz    a8, .Leq            # bytes equal, if zero, strings are equal
    addi    a3, a3, 1           # advance s2
    bnone   a2, a10, .Laligned  # if s1/s2 now aligned
    l8ui    a8, a2, 0           # byte 1 from s1
    l8ui    a9, a3, 0           # byte 1 from s2
    addi    a2, a2, 1           # advance s1
    bne     a8, a9, .Lretdiff   # if different, return difference
    beqz    a8, .Leq            # bytes equal, if zero, strings are equal
    addi    a3, a3, 1           # advance s2
    bnone   a2, a10, .Laligned  # if s1/s2 now aligned
    l8ui    a8, a2, 0           # byte 2 from s1
    l8ui    a9, a3, 0           # byte 2 from s2
    addi    a2, a2, 1           # advance s1
    bne     a8, a9, .Lretdiff   # if different, return difference
    beqz    a8, .Leq            # bytes equal, if zero, strings are equal
    addi    a3, a3, 1           # advance s2
    j       .Laligned

    /* s1 and s2 have different alignment.
     *
     * If the zero-overhead loop option is available, use an (almost)
     * infinite zero-overhead loop with conditional exits so we only pay
     * for taken branches when exiting the loop.
     *
     * Note: It is important for this unaligned case to come before the
     * code for aligned strings, because otherwise some of the branches
     * above cannot reach and have to be transformed to branches around
     * jumps.  The unaligned code is smaller and the branches can reach
     * over it.
     */

    .align  4
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
    /* (2 mod 4) alignment for loop instruction */
#else
    /* (1 mod 4) alignment for loop instruction */
    .byte   0
    .byte   0
#endif
#endif
.Lunaligned:
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
    _movi.n a8, 0               # set up for the maximum loop count
#else
    _movi   a8, 0               # set up for the maximum loop count
#endif
    loop    a8, .Lretdiff       # loop forever (almost anyway)
#endif
.Lnextbyte:
    l8ui    a8, a2, 0
    l8ui    a9, a3, 0
    addi    a2, a2, 1
    bne     a8, a9, .Lretdiff
    addi    a3, a3, 1
#if XCHAL_HAVE_LOOPS
    beqz    a8, .Lretdiff
#else
    bnez    a8, .Lnextbyte
#endif
.Lretdiff:
    sub     a2, a8, a9
    RET(16)

    /* s1 is word-aligned; s2 is word-aligned.
     *
     * If the zero-overhead loop option is available, use an (almost)
     * infinite zero-overhead loop with conditional exits so we only pay
     * for taken branches when exiting the loop.
     */

    /* New algorithm, relying on the fact that all normal ASCII is between
     * 32 and 127.
     *
     * Rather than check all bytes for zero:
     * Take one word (4 bytes).  Call it w1.
     * Shift w1 left by one into w1'.
     * Or w1 and w1'.  For all normal ASCII bit 6 will be 1; for zero it
     * won't.  Check that all 4 bit 6's (one for each byte) are one:
     * If they are, we are definitely not done.
     * If they are not, we are probably done, but need to check for zero.
     */

    .align  4
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_CONST16
    /* (2 mod 4) alignment for loop instruction */
    .byte   0
#endif
.Laligned:
    movi    a4, MASK0           # mask for byte 0
    movi    a7, MASK4

    /* Loop forever */
1:
    loop    a0, .Laligned_done

    /* First unrolled loop body. */

    l32i    a8, a2, 0           # get word from s1
    l32i    a9, a3, 0           # get word from s2
    slli    a5, a8, 1
    bne     a8, a9, .Lwne2
    or      a9, a8, a5
    bnall   a9, a7, .Lprobeq

    /* Second unrolled loop body. */

    l32i    a8, a2, 4           # get word from s1+4
    l32i    a9, a3, 4           # get word from s2+4
    slli    a5, a8, 1
    bne     a8, a9, .Lwne2
    or      a9, a8, a5
    bnall   a9, a7, .Lprobeq2

    addi    a2, a2, 8           # advance s1 pointer
    addi    a3, a3, 8           # advance s2 pointer
.Laligned_done:
    j       1b

.Lprobeq2:
    /* Adjust pointers to account for the loop unrolling. */

    addi    a2, a2, 4
    addi    a3, a3, 4

#else /* !XCHAL_HAVE_LOOPS */

.Laligned:
    movi    a4, MASK0           # mask for byte 0
    movi    a7, MASK4
    j       .Lfirstword
.Lnextword:
    addi    a2, a2, 4           # advance s1 pointer
    addi    a3, a3, 4           # advance s2 pointer
.Lfirstword:
    l32i    a8, a2, 0           # get word from s1
    l32i    a9, a3, 0           # get word from s2
    slli    a5, a8, 1
    bne     a8, a9, .Lwne2
    or      a9, a8, a5
    ball    a9, a7, .Lnextword
#endif /* !XCHAL_HAVE_LOOPS */

    /* align (0 mod 4) */
.Lprobeq:
    /* Words are probably equal, but check for sure.
     * If not, loop over the rest of string using normal algorithm.
     */

    bnone   a8, a4, .Leq        # if byte 0 is zero
    movi    a5, MASK1           # mask for byte 1
    movi    a6, MASK2           # mask for byte 2
    bnone   a8, a5, .Leq        # if byte 1 is zero
    movi    a7, MASK3           # mask for byte 3
    bnone   a8, a6, .Leq        # if byte 2 is zero
    bnone   a8, a7, .Leq        # if byte 3 is zero
    /* align (1 mod 4) */
#if XCHAL_HAVE_DENSITY
    addi.n  a2, a2, 4           # advance s1 pointer
    addi.n  a3, a3, 4           # advance s2 pointer
    /* align (1 mod 4) or (2 mod 4) */
#else
    addi    a2, a2, 4           # advance s1 pointer
    addi    a3, a3, 4           # advance s2 pointer
    or      a1, a1, a1          # nop
#if XCHAL_HAVE_CONST16
    or      a1, a1, a1          # nop
#endif
    /* align (2 mod 4) */
#endif /* XCHAL_HAVE_DENSITY */
#if XCHAL_HAVE_LOOPS
1:
    loop    a0, .Leq            # loop forever (a4 is bigger than max iters)
    l32i    a8, a2, 0           # get word from s1
    l32i    a9, a3, 0           # get word from s2
    addi    a2, a2, 4           # advance s1 pointer
    bne     a8, a9, .Lwne
    bnone   a8, a4, .Leq        # if byte 0 is zero
    bnone   a8, a5, .Leq        # if byte 1 is zero
    bnone   a8, a6, .Leq        # if byte 2 is zero
    bnone   a8, a7, .Leq        # if byte 3 is zero
    addi    a3, a3, 4           # advance s2 pointer
    j       1b
#else /* !XCHAL_HAVE_LOOPS */

    j       .Lfirstword2
.Lnextword2:
    addi    a3, a3, 4           # advance s2 pointer
.Lfirstword2:
    l32i    a8, a2, 0           # get word from s1
    l32i    a9, a3, 0           # get word from s2
    addi    a2, a2, 4           # advance s1 pointer
    bne     a8, a9, .Lwne
    bnone   a8, a4, .Leq        # if byte 0 is zero
    bnone   a8, a5, .Leq        # if byte 1 is zero
    bnone   a8, a6, .Leq        # if byte 2 is zero
    bany    a8, a7, .Lnextword2 # continue if byte 3 is nonzero
#endif /* !XCHAL_HAVE_LOOPS */

    /* Words are equal; some byte is zero. */
.Leq:   movi    a2, 0           # return equal
    RET(16)

.Lwne2: /* Words are not equal.  On big-endian processors, if none of the
         * bytes are zero, the return value can be determined by a simple
         * comparison.
         */
#if XCHAL_HAVE_BE
    or      a10, a8, a5
    bnall   a10, a7, .Lsomezero
    bgeu    a8, a9, .Lposreturn
    movi    a2, -1
    RET(16)
.Lposreturn:
    movi    a2, 1
    RET(16)
.Lsomezero:                     # There is probably some zero byte.
#endif /* XCHAL_HAVE_BE */
.Lwne:  /* Words are not equal. */
    xor     a2, a8, a9          # get word with nonzero in byte that differs
    bany    a2, a4, .Ldiff0     # if byte 0 differs
    movi    a5, MASK1           # mask for byte 1
    bnone   a8, a4, .Leq        # if byte 0 is zero
    bany    a2, a5, .Ldiff1     # if byte 1 differs
    movi    a6, MASK2           # mask for byte 2
    bnone   a8, a5, .Leq        # if byte 1 is zero
    bany    a2, a6, .Ldiff2     # if byte 2 differs
    bnone   a8, a6, .Leq        # if byte 2 is zero
#if XCHAL_HAVE_BE
.Ldiff3:
.Ldiff2:
.Ldiff1:
    /* Byte 0 is equal (at least) and there is a difference before a zero
     * byte.  Just subtract words to get the return value.
     * The high order equal bytes cancel, leaving room for the sign.
     */
    sub     a2, a8, a9
    RET(16)

.Ldiff0:
    /* Need to make room for the sign, so can't subtract whole words. */
    extui   a10, a8, 24, 8
    extui   a11, a9, 24, 8
    sub     a2, a10, a11
    RET(16)

#else /* !XCHAL_HAVE_BE */
    /* Little-endian is a little more difficult because we can't subtract
     * whole words.
     */
.Ldiff3:
    /* Bytes 0-2 are equal; byte 3 is different.
     * For little-endian we need to have a sign bit for the difference.
     */
    extui   a10, a8, 24, 8
    extui   a11, a9, 24, 8
    sub     a2, a10, a11
    RET(16)

.Ldiff0:
    /* Byte 0 is different. */
    extui   a10, a8, 0, 8
    extui   a11, a9, 0, 8
    sub     a2, a10, a11
    RET(16)

.Ldiff1:
    /* Byte 0 is equal; byte 1 is different. */
    extui   a10, a8, 8, 8
    extui   a11, a9, 8, 8
    sub     a2, a10, a11
    RET(16)

.Ldiff2:
    /* Bytes 0-1 are equal; byte 2 is different. */
    extui   a10, a8, 16, 8
    extui   a11, a9, 16, 8
    sub     a2, a10, a11
    RET(16)

#endif /* !XCHAL_HAVE_BE */
#endif /* PDX4 */
#endif /* FLIX3 */

    .end schedule
    .size   strcmp, . - strcmp
libs/libc/machine/xtensa/arch_strcpy.S (new file, 243 lines)
@@ -0,0 +1,243 @@
/****************************************************************************
 * libs/libc/machine/xtensa/arch_strcpy.S
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.  The
 * ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the
 * License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 ****************************************************************************/

/****************************************************************************
 * Included Files
 ****************************************************************************/

#include "xtensa_asm.h"

#include <arch/chip/core-isa.h>
#include <arch/xtensa/xtensa_abi.h>

/****************************************************************************
 * Public Functions
 ****************************************************************************/
|
||||
|
||||
.section .text
|
||||
.begin schedule
|
||||
.align 4
|
||||
.literal_position
|
||||
.global strcpy
|
||||
.type strcpy, @function
|
||||
strcpy:
|
||||
ENTRY(16)
|
||||
/* a2 = dst, a3 = src */
|
||||
|
||||
mov a10, a2 # leave dst in return value register
|
||||
movi a4, MASK0
|
||||
movi a5, MASK1
|
||||
movi a6, MASK2
|
||||
movi a7, MASK3
|
||||
bbsi.l a3, 0, .Lsrc1mod2
|
||||
bbsi.l a3, 1, .Lsrc2mod4
|
||||
.Lsrcaligned:
|
||||
|
||||
/* Check if the destination is aligned. */
|
||||
movi a8, 3
|
||||
bnone a10, a8, .Laligned
|
||||
|
||||
j .Ldstunaligned
|
||||
|
||||
.Lsrc1mod2: # src address is odd
|
||||
l8ui a8, a3, 0 # get byte 0
|
||||
addi a3, a3, 1 # advance src pointer
|
||||
s8i a8, a10, 0 # store byte 0
|
||||
beqz a8, 1f # if byte 0 is zero
|
||||
addi a10, a10, 1 # advance dst pointer
|
||||
bbci.l a3, 1, .Lsrcaligned # if src is now word-aligned
|
||||
|
||||
.Lsrc2mod4: # src address is 2 mod 4
|
||||
l8ui a8, a3, 0 # get byte 0
|
||||
/* 1-cycle interlock */
|
||||
s8i a8, a10, 0 # store byte 0
|
||||
beqz a8, 1f # if byte 0 is zero
|
||||
l8ui a8, a3, 1 # get byte 0
|
||||
addi a3, a3, 2 # advance src pointer
|
||||
s8i a8, a10, 1 # store byte 0
|
||||
addi a10, a10, 2 # advance dst pointer
|
||||
bnez a8, .Lsrcaligned
|
||||
1: RET(16)
|
||||
|
||||
|
||||
/* dst is word-aligned; src is word-aligned. */
|
||||
|
||||
.align 4
|
||||
#if XCHAL_HAVE_LOOPS
|
||||
#if XCHAL_HAVE_DENSITY
|
||||
/* (2 mod 4) alignment for loop instruction */
|
||||
#else
|
||||
/* (1 mod 4) alignment for loop instruction */
|
||||
.byte 0
|
||||
.byte 0
|
||||
#endif
|
||||
.Laligned:
|
||||
#if XCHAL_HAVE_DENSITY
|
||||
_movi.n a8, 0 # set up for the maximum loop count
|
||||
#else
|
||||
_movi a8, 0 # set up for the maximum loop count
|
||||
#endif
|
||||
loop a8, .Lz3 # loop forever (almost anyway)
|
||||
l32i a8, a3, 0 # get word from src
|
||||
addi a3, a3, 4 # advance src pointer
|
||||
bnone a8, a4, .Lz0 # if byte 0 is zero
|
||||
bnone a8, a5, .Lz1 # if byte 1 is zero
|
||||
bnone a8, a6, .Lz2 # if byte 2 is zero
|
||||
s32i a8, a10, 0 # store word to dst
|
||||
bnone a8, a7, .Lz3 # if byte 3 is zero
|
||||
addi a10, a10, 4 # advance dst pointer
|
||||
|
||||
#else /* !XCHAL_HAVE_LOOPS */
|
||||
|
||||
1: addi a10, a10, 4 # advance dst pointer
|
||||
.Laligned:
|
||||
l32i a8, a3, 0 # get word from src
|
||||
addi a3, a3, 4 # advance src pointer
|
||||
bnone a8, a4, .Lz0 # if byte 0 is zero
|
||||
bnone a8, a5, .Lz1 # if byte 1 is zero
|
||||
bnone a8, a6, .Lz2 # if byte 2 is zero
|
||||
s32i a8, a10, 0 # store word to dst
|
||||
bany a8, a7, 1b # if byte 3 is zero
|
||||
#endif /* !XCHAL_HAVE_LOOPS */
|
||||
|
||||
.Lz3: /* Byte 3 is zero. */
|
||||
RET(16)
|
||||
|
||||
.Lz0: /* Byte 0 is zero. */
|
||||
#if XCHAL_HAVE_BE
|
||||
movi a8, 0
|
||||
#endif
|
||||
s8i a8, a10, 0
|
||||
RET(16)
|
||||
|
||||
.Lz1: /* Byte 1 is zero. */
|
||||
#if XCHAL_HAVE_BE
|
||||
extui a8, a8, 16, 16
|
||||
#endif
|
||||
s16i a8, a10, 0
|
||||
RET(16)
|
||||
|
||||
.Lz2: /* Byte 2 is zero. */
|
||||
#if XCHAL_HAVE_BE
|
||||
extui a8, a8, 16, 16
|
||||
#endif
|
||||
s16i a8, a10, 0
|
||||
movi a8, 0
|
||||
s8i a8, a10, 2
|
||||
RET(16)
|
||||
|
||||
#if 1
|
||||
/* For now just use byte copy loop for the unaligned destination case. */
|
||||
|
||||
.align 4
|
||||
#if XCHAL_HAVE_LOOPS
|
||||
#if XCHAL_HAVE_DENSITY
|
||||
/* (2 mod 4) alignment for loop instruction */
|
||||
#else
|
||||
/* (1 mod 4) alignment for loop instruction */
|
||||
.byte 0
|
||||
.byte 0
|
||||
#endif
|
||||
#endif
|
||||
.Ldstunaligned:
|
||||
|
||||
#if XCHAL_HAVE_LOOPS
|
||||
#if XCHAL_HAVE_DENSITY
|
||||
_movi.n a8, 0 # set up for the maximum loop count
|
||||
#else
|
||||
_movi a8, 0 # set up for the maximum loop count
|
||||
#endif
|
||||
loop a8, 2f # loop forever (almost anyway)
|
||||
#endif
|
||||
1: l8ui a8, a3, 0
|
||||
addi a3, a3, 1
|
||||
s8i a8, a10, 0
|
||||
addi a10, a10, 1
|
||||
#if XCHAL_HAVE_LOOPS
|
||||
beqz a8, 2f
|
||||
#else
|
||||
bnez a8, 1b
|
||||
#endif
|
||||
2: RET(16)
|
||||
|
||||
#else /* 0 */
|
||||
|
||||
/* This code is not functional yet. */
|
||||
|
||||
.Ldstunaligned:
|
||||
l32i a9, a2, 0 # load word from dst
|
||||
#if XCHAL_HAVE_BE
|
||||
ssa8b a9 # rotate by dst alignment so that
|
||||
src a9, a9, a9 # shift in loop will put back in place
|
||||
ssa8l a9 # shift left by byte*8
|
||||
#else
|
||||
ssa8l a9 # rotate by dst alignment so that
|
||||
src a9, a9, a9 # shift in loop will put back in place
|
||||
ssa8b a9 # shift left by 32-byte*8
|
||||
#endif
|
||||
|
||||
/* dst is word-aligned; src is unaligned. */
|
||||
|
||||
.Ldstunalignedloop:
|
||||
l32i a8, a3, 0 # get word from src
|
||||
/* 1-cycle interlock */
|
||||
bnone a8, a4, .Lu0 # if byte 0 is zero
|
||||
bnone a8, a5, .Lu1 # if byte 1 is zero
|
||||
bnone a8, a6, .Lu2 # if byte 2 is zero
|
||||
src a9, a8, a9 # combine last word and this word
|
||||
s32i a9, a10, 0 # store word to dst
|
||||
bnone a8, a7, .Lu3 # if byte 3 is nonzero, iterate
|
||||
l32i a9, a3, 4 # get word from src
|
||||
addi a3, a3, 8 # advance src pointer
|
||||
bnone a9, a4, .Lu4 # if byte 0 is zero
|
||||
bnone a9, a5, .Lu5 # if byte 1 is zero
|
||||
bnone a9, a6, .Lu6 # if byte 2 is zero
|
||||
src a8, a9, a8 # combine last word and this word
|
||||
s32i a8, a10, 4 # store word to dst
|
||||
addi a10, a10, 8 # advance dst pointer
|
||||
bany a8, a7, .Ldstunalignedloop # if byte 3 is nonzero, iterate
|
||||
|
||||
/* Byte 7 is zero. */
|
||||
.Lu7: RET(16)
|
||||
|
||||
.Lu0: /* Byte 0 is zero. */
|
||||
#if XCHAL_HAVE_BE
|
||||
movi a8, 0
|
||||
#endif
|
||||
s8i a8, a10, 0
|
||||
RET(16)
|
||||
|
||||
.Lu1: /* Byte 1 is zero. */
|
||||
#if XCHAL_HAVE_BE
|
||||
extui a8, a8, 16, 16
|
||||
#endif
|
||||
s16i a8, a10, 0
|
||||
RET(16)
|
||||
|
||||
.Lu2: /* Byte 2 is zero. */
|
||||
s16i a8, a10, 0
|
||||
movi a8, 0
|
||||
s8i a8, a10, 2
|
||||
RET(16)
|
||||
|
||||
#endif /* 0 */
|
||||
.end schedule
|
||||
|
||||
.size strcpy, . - strcpy
|
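Editor's note: the aligned loop above is the classic word-at-a-time copy. Each source word is tested byte by byte against the MASKn constants before being stored whole, so the store never runs past the terminator. A hedged C sketch of the same control flow, for illustration only (strcpy_aligned_le is not part of the commit; little-endian byte order, both pointers word-aligned, strict-aliasing caveats ignored for clarity):

#include <stdint.h>

static char *strcpy_aligned_le(uint32_t *dst, const uint32_t *src)
{
  char *d0 = (char *)dst;

  for (; ; src++, dst++)
    {
      uint32_t w = *src;

      if ((w & 0x000000ff) == 0)      /* byte 0 zero: store 1 byte  */
        {
          *(char *)dst = 0;
          break;
        }

      if ((w & 0x0000ff00) == 0)      /* byte 1 zero: store 2 bytes */
        {
          *(uint16_t *)dst = (uint16_t)w;
          break;
        }

      if ((w & 0x00ff0000) == 0)      /* byte 2 zero: 2 bytes + NUL */
        {
          *(uint16_t *)dst = (uint16_t)w;
          ((char *)dst)[2] = 0;
          break;
        }

      *dst = w;                       /* whole word is safe to store */
      if ((w & 0xff000000) == 0)      /* byte 3 was the NUL: done    */
        {
          break;
        }
    }

  return d0;
}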
123
libs/libc/machine/xtensa/arch_strlen.S
Normal file
@ -0,0 +1,123 @@
/****************************************************************************
 * libs/libc/machine/xtensa/arch_strlen.S
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership. The
 * ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the
 * License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 ****************************************************************************/

/****************************************************************************
 * Included Files
 ****************************************************************************/

#include "xtensa_asm.h"

#include <arch/chip/core-isa.h>
#include <arch/xtensa/xtensa_abi.h>

/****************************************************************************
 * Public Functions
 ****************************************************************************/

        .section .text
        .begin schedule
        .align  4
        .literal_position
        .global strlen
        .type   strlen, @function
strlen:
        ENTRY(16)
        /* a2 = s */

        addi    a3, a2, -4      # because we overincrement at the end
        movi    a4, MASK0
        movi    a5, MASK1
        movi    a6, MASK2
        movi    a7, MASK3
        bbsi.l  a2, 0, .L1mod2
        bbsi.l  a2, 1, .L2mod4
        j       .Laligned

.L1mod2:        # address is odd
        l8ui    a8, a3, 4       # get byte 0
        addi    a3, a3, 1       # advance string pointer
        beqz    a8, .Lz3        # if byte 0 is zero
        bbci.l  a3, 1, .Laligned        # if string pointer is now word-aligned

.L2mod4:        # address is 2 mod 4
        addi    a3, a3, 2       # advance ptr for aligned access
        l32i    a8, a3, 0       # get word with first two bytes of string
        bnone   a8, a6, .Lz2    # if byte 2 (of word, not string) is zero
        bany    a8, a7, .Laligned       # if byte 3 (of word, not string) is nonzero

        /* Byte 3 is zero. */
        addi    a3, a3, 3       # point to zero byte
        sub     a2, a3, a2      # subtract to get length
        RET(16)


        /* String is word-aligned. */

        .align  4
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
        /* (2 mod 4) alignment for loop instruction */
#else
        /* (1 mod 4) alignment for loop instruction */
        .byte   0
        .byte   0
#endif
#endif
.Laligned:
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
        _movi.n a8, 0           # set up for the maximum loop count
#else
        _movi   a8, 0           # set up for the maximum loop count
#endif
        loop    a8, .Lz3        # loop forever (almost anyway)
#endif
1:      l32i    a8, a3, 4       # get next word of string
        addi    a3, a3, 4       # advance string pointer
        bnone   a8, a4, .Lz0    # if byte 0 is zero
        bnone   a8, a5, .Lz1    # if byte 1 is zero
        bnone   a8, a6, .Lz2    # if byte 2 is zero
#if XCHAL_HAVE_LOOPS
        bnone   a8, a7, .Lz3    # if byte 3 is zero
#else
        bany    a8, a7, 1b      # repeat if byte 3 is non-zero
#endif

.Lz3:   /* Byte 3 is zero. */
        addi    a3, a3, 3       # point to zero byte
        /* Fall through.... */

.Lz0:   /* Byte 0 is zero. */
        sub     a2, a3, a2      # subtract to get length
        RET(16)

.Lz1:   /* Byte 1 is zero. */
        addi    a3, a3, 1       # point to zero byte
        sub     a2, a3, a2      # subtract to get length
        RET(16)

.Lz2:   /* Byte 2 is zero. */
        addi    a3, a3, 2       # point to zero byte
        sub     a2, a3, a2      # subtract to get length
        RET(16)

.end schedule

        .size   strlen, . - strlen
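Editor's note: in the assembly above the string pointer is biased by -4 so that "l32i a8, a3, 4" followed by "addi a3, a3, 4" can share a single loop body; the C restatement below drops that register trick but keeps the per-byte mask tests. Illustration only (strlen_aligned_le is not part of the commit; little-endian byte order, s word-aligned):

#include <stddef.h>
#include <stdint.h>

static size_t strlen_aligned_le(const char *s)
{
  const uint32_t *p = (const uint32_t *)s;  /* assumes s is word-aligned */
  size_t len;
  uint32_t w;

  for (len = 0; ; p++, len += 4)
    {
      w = *p;                                /* next word of the string */

      if ((w & 0x000000ff) == 0) return len;     /* byte 0 is the NUL */
      if ((w & 0x0000ff00) == 0) return len + 1; /* byte 1 is the NUL */
      if ((w & 0x00ff0000) == 0) return len + 2; /* byte 2 is the NUL */
      if ((w & 0xff000000) == 0) return len + 3; /* byte 3 is the NUL */
    }
}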
265
libs/libc/machine/xtensa/arch_strncpy.S
Normal file
@ -0,0 +1,265 @@
/****************************************************************************
 * libs/libc/machine/xtensa/arch_strncpy.S
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership. The
 * ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the
 * License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 ****************************************************************************/

/****************************************************************************
 * Included Files
 ****************************************************************************/

#include "xtensa_asm.h"

#include <arch/chip/core-isa.h>
#include <arch/xtensa/xtensa_abi.h>

/****************************************************************************
 * Public Functions
 ****************************************************************************/

        .section .text
        .begin schedule
        .align  4
        .literal_position
__strncpy_aux:

.Lsrc1mod2:     # src address is odd
        l8ui    a8, a3, 0       # get byte 0
        addi    a3, a3, 1       # advance src pointer
        s8i     a8, a10, 0      # store byte 0
        addi    a4, a4, -1      # decrement n
        beqz    a4, .Lret       # if n is zero
        addi    a10, a10, 1     # advance dst pointer
        beqz    a8, .Lfill      # if byte 0 is zero
        bbci.l  a3, 1, .Lsrcaligned     # if src is now word-aligned

.Lsrc2mod4:     # src address is 2 mod 4
        l8ui    a8, a3, 0       # get byte 0
        addi    a4, a4, -1      # decrement n
        s8i     a8, a10, 0      # store byte 0
        beqz    a4, .Lret       # if n is zero
        addi    a10, a10, 1     # advance dst pointer
        beqz    a8, .Lfill      # if byte 0 is zero
        l8ui    a8, a3, 1       # get byte 1
        addi    a3, a3, 2       # advance src pointer
        s8i     a8, a10, 0      # store byte 1
        addi    a4, a4, -1      # decrement n
        beqz    a4, .Lret       # if n is zero
        addi    a10, a10, 1     # advance dst pointer
        bnez    a8, .Lsrcaligned
        j       .Lfill

.Lret:
        RET(16)

        .align  4
        .global strncpy
        .type   strncpy, @function
strncpy:
        ENTRY(16)
        /* a2 = dst, a3 = src */

        mov     a10, a2         # leave dst in return value register
        beqz    a4, .Lret       # if n is zero

        movi    a11, MASK0
        movi    a5, MASK1
        movi    a6, MASK2
        movi    a7, MASK3
        bbsi.l  a3, 0, .Lsrc1mod2
        bbsi.l  a3, 1, .Lsrc2mod4
.Lsrcaligned:

        /* Check if the destination is aligned. */
        movi    a8, 3
        bnone   a10, a8, .Laligned

        j       .Ldstunaligned


        /* Fill the dst with zeros -- n is at least 1. */

.Lfill:
        movi    a9, 0
        bbsi.l  a10, 0, .Lfill1mod2
        bbsi.l  a10, 1, .Lfill2mod4
.Lfillaligned:
        blti    a4, 4, .Lfillcleanup

        /* Loop filling complete words with zero. */
#if XCHAL_HAVE_LOOPS

        srai    a8, a4, 2
        loop    a8, 1f
        s32i    a9, a10, 0
        addi    a10, a10, 4

1:      slli    a8, a8, 2
        sub     a4, a4, a8

#else /* !XCHAL_HAVE_LOOPS */

1:      s32i    a9, a10, 0
        addi    a10, a10, 4
        addi    a4, a4, -4
        bgei    a4, 4, 1b

#endif /* !XCHAL_HAVE_LOOPS */

        beqz    a4, 2f

.Lfillcleanup:
        /* Fill leftover (1 to 3) bytes with zero. */
        s8i     a9, a10, 0      # store byte 0
        addi    a4, a4, -1      # decrement n
        addi    a10, a10, 1
        bnez    a4, .Lfillcleanup

2:      RET(16)

.Lfill1mod2:    # dst address is odd
        s8i     a9, a10, 0      # store byte 0
        addi    a4, a4, -1      # decrement n
        beqz    a4, 2b          # if n is zero
        addi    a10, a10, 1     # advance dst pointer
        bbci.l  a10, 1, .Lfillaligned   # if dst is now word-aligned

.Lfill2mod4:    # dst address is 2 mod 4
        s8i     a9, a10, 0      # store byte 0
        addi    a4, a4, -1      # decrement n
        beqz    a4, 2b          # if n is zero
        s8i     a9, a10, 1      # store byte 1
        addi    a4, a4, -1      # decrement n
        beqz    a4, 2b          # if n is zero
        addi    a10, a10, 2     # advance dst pointer
        j       .Lfillaligned


        /* dst is word-aligned; src is word-aligned; n is at least 1. */

        .align  4
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
        /* (2 mod 4) alignment for loop instruction */
#else
        /* (1 mod 4) alignment for loop instruction */
        .byte   0
        .byte   0
#endif
#endif
.Laligned:
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
        _movi.n a8, 0           # set up for the maximum loop count
#else
        _movi   a8, 0           # set up for the maximum loop count
#endif
        loop    a8, 1f          # loop forever (almost anyway)
        blti    a4, 5, .Ldstunaligned   # n is near limit; do one at a time
        l32i    a8, a3, 0       # get word from src
        addi    a3, a3, 4       # advance src pointer
        bnone   a8, a11, .Lz0   # if byte 0 is zero
        bnone   a8, a5, .Lz1    # if byte 1 is zero
        bnone   a8, a6, .Lz2    # if byte 2 is zero
        s32i    a8, a10, 0      # store word to dst
        addi    a4, a4, -4      # decrement n
        addi    a10, a10, 4     # advance dst pointer
        bnone   a8, a7, .Lfill  # if byte 3 is zero
1:

#else /* !XCHAL_HAVE_LOOPS */

1:      blti    a4, 5, .Ldstunaligned   # n is near limit; do one at a time
        l32i    a8, a3, 0       # get word from src
        addi    a3, a3, 4       # advance src pointer
        bnone   a8, a11, .Lz0   # if byte 0 is zero
        bnone   a8, a5, .Lz1    # if byte 1 is zero
        bnone   a8, a6, .Lz2    # if byte 2 is zero
        s32i    a8, a10, 0      # store word to dst
        addi    a4, a4, -4      # decrement n
        addi    a10, a10, 4     # advance dst pointer
        bany    a8, a7, 1b      # repeat if no byte is zero
#endif /* !XCHAL_HAVE_LOOPS */

        j       .Lfill

.Lz0:   /* Byte 0 is zero. */
#if XCHAL_HAVE_BE
        movi    a8, 0
#endif
        s8i     a8, a10, 0
        addi    a4, a4, -1      # decrement n
        addi    a10, a10, 1     # advance dst pointer
        j       .Lfill

.Lz1:   /* Byte 1 is zero. */
#if XCHAL_HAVE_BE
        extui   a8, a8, 16, 16
#endif
        s16i    a8, a10, 0
        addi    a4, a4, -2      # decrement n
        addi    a10, a10, 2     # advance dst pointer
        j       .Lfill

.Lz2:   /* Byte 2 is zero. */
#if XCHAL_HAVE_BE
        extui   a8, a8, 16, 16
#endif
        s16i    a8, a10, 0
        movi    a8, 0
        s8i     a8, a10, 2
        addi    a4, a4, -3      # decrement n
        addi    a10, a10, 3     # advance dst pointer
        j       .Lfill

        .align  4
#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
        /* (2 mod 4) alignment for loop instruction */
#else
        /* (1 mod 4) alignment for loop instruction */
        .byte   0
        .byte   0
#endif
#endif
.Ldstunaligned:

#if XCHAL_HAVE_LOOPS
#if XCHAL_HAVE_DENSITY
        _movi.n a8, 0           # set up for the maximum loop count
#else
        _movi   a8, 0           # set up for the maximum loop count
#endif
        loop    a8, 2f          # loop forever (almost anyway)
#endif
1:      l8ui    a8, a3, 0
        addi    a3, a3, 1
        s8i     a8, a10, 0
        addi    a4, a4, -1
        beqz    a4, 3f
        addi    a10, a10, 1
#if XCHAL_HAVE_LOOPS
        beqz    a8, 2f
#else
        bnez    a8, 1b
#endif
2:      j       .Lfill

3:      RET(16)
.end schedule

        .size   strncpy, . - strncpy
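Editor's note: unlike strcpy, strncpy must keep writing zeros after the terminator until n bytes have been stored, which is what the .Lfill path above does: byte stores to align dst, word stores for the bulk, then a 1-to-3 byte cleanup. A hedged C sketch of that stage, for illustration only (strncpy_fill is not part of the commit):

#include <stddef.h>
#include <stdint.h>

static void strncpy_fill(char *dst, size_t n)
{
  /* Align dst to a word boundary one byte at a time. */

  while (n > 0 && ((uintptr_t)dst & 3) != 0)
    {
      *dst++ = 0;
      n--;
    }

  /* Fill whole words (mirrors the s32i loop above). */

  while (n >= 4)
    {
      *(uint32_t *)dst = 0;
      dst += 4;
      n -= 4;
    }

  /* Fill the leftover 1 to 3 bytes (mirrors .Lfillcleanup). */

  while (n > 0)
    {
      *dst++ = 0;
      n--;
    }
}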
62
libs/libc/machine/xtensa/xtensa_asm.h
Normal file
@ -0,0 +1,62 @@
/****************************************************************************
 * libs/libc/machine/xtensa/xtensa_asm.h
 *
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership. The
 * ASF licenses this file to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance with the
 * License. You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 * License for the specific language governing permissions and limitations
 * under the License.
 *
 ****************************************************************************/

/****************************************************************************
 * Included Files
 ****************************************************************************/

#include <arch/chip/core-isa.h>

/****************************************************************************
 * Assembly Language Macros
 ****************************************************************************/

        .macro  src_b   r, w0, w1
#if XCHAL_HAVE_BE
        src     \r, \w0, \w1
#else
        src     \r, \w1, \w0
#endif
        .endm

        .macro  ssa8    r
#if XCHAL_HAVE_BE
        ssa8b   \r
#else
        ssa8l   \r
#endif
        .endm

/****************************************************************************
 * Pre-processor Macros
 ****************************************************************************/

#if XCHAL_HAVE_BE
#  define MASK0 0xff000000
#  define MASK1 0x00ff0000
#  define MASK2 0x0000ff00
#  define MASK3 0x000000ff
#else
#  define MASK0 0x000000ff
#  define MASK1 0x0000ff00
#  define MASK2 0x00ff0000
#  define MASK3 0xff000000
#endif
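Editor's note: the MASKn constants are the heart of every routine in this commit. MASKn selects the n-th byte in memory order regardless of endianness, so (word & MASKn) == 0 holds exactly when string byte n is NUL -- the test that the bnone/bany instructions perform on a loaded word. A hedged C restatement, for illustration only (byte_is_zero is not part of the commit; XCHAL_HAVE_BE comes from core-isa.h):

#include <stdint.h>

static int byte_is_zero(uint32_t word, unsigned n)
{
#if XCHAL_HAVE_BE
  /* Big-endian: byte 0 of memory is the most significant byte. */

  uint32_t mask = 0xff000000u >> (8 * n);
#else
  /* Little-endian: byte 0 of memory is the least significant byte. */

  uint32_t mask = 0x000000ffu << (8 * n);
#endif

  return (word & mask) == 0;      /* true iff string byte n is NUL */
}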