/****************************************************************************
* libs/libc/machine/xtensa/arch_memcpy.S
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
****************************************************************************/
/****************************************************************************
* Included Files
****************************************************************************/
#include "xtensa_asm.h"
#include <arch/chip/core-isa.h>
#include <arch/xtensa/xtensa_abi.h>
#include "libc.h"
#ifdef LIBC_BUILD_MEMCPY
/****************************************************************************
* Pre-processor Macros
****************************************************************************/
/* Set to 1 when running on ISS (simulator) with the
   lint or ferret client, or 0 to save a few cycles. */
#define SIM_CHECKS_ALIGNMENT 0
/****************************************************************************
* Public Functions
****************************************************************************/
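/* Copy strategy (a summary of the code below): align the destination
   first (.Ldst1mod2, .Ldst2mod4), then move 16 bytes per iteration with
   word loads and stores; if the source stays unaligned, .Lsrcunaligned
   stitches words together with SAR/src_b funnel shifts, and very short
   copies fall back to the byte loop at .Lbytecopy. */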
.section .text
.begin schedule
.literal_position
.local .Ldst1mod2
.local .Ldst2mod4
.local .Lbytecopy
.align 4
.global memcpy
.type memcpy, @function
memcpy:
ENTRY(16)
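/* ENTRY/RET come from the included xtensa_abi.h and expand to the
   prologue/epilogue for the configured ABI (windowed or CALL0) with a
   16-byte frame. */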
/* a2 = dst, a3 = src, a4 = len */
mov a5, a2 # copy dst so that a2 is return value
bbsi.l a2, 0, .Ldst1mod2
bbsi.l a2, 1, .Ldst2mod4
.Ldstaligned:
/* Get number of loop iterations with 16B per iteration. */
srli a7, a4, 4
/* Check if source is aligned. */
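/* Shifting left by 30 keeps only the two low address bits, so a8 is
   nonzero exactly when the source is not word-aligned. */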
slli a8, a3, 30
bnez a8, .Lsrcunaligned
/* Destination and source are word-aligned, use word copy. */
#if XCHAL_HAVE_LOOPS
loopnez a7, 2f
#else
beqz a7, 2f
slli a8, a7, 4
add a8, a8, a3 # a8 = end of last 16B source chunk
#endif
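/* 16 bytes per iteration: each store writes a word loaded two
   instructions earlier, interleaving loads and stores. */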
1: l32i a6, a3, 0
l32i a7, a3, 4
s32i a6, a5, 0
l32i a6, a3, 8
s32i a7, a5, 4
l32i a7, a3, 12
s32i a6, a5, 8
addi a3, a3, 16
s32i a7, a5, 12
addi a5, a5, 16
#if !XCHAL_HAVE_LOOPS
bltu a3, a8, 1b
#endif
/* Copy any leftover pieces smaller than 16B. */
2: bbci.l a4, 3, 3f
/* Copy 8 bytes. */
l32i a6, a3, 0
l32i a7, a3, 4
addi a3, a3, 8
s32i a6, a5, 0
s32i a7, a5, 4
addi a5, a5, 8
3: bbsi.l a4, 2, 4f
bbsi.l a4, 1, 5f
bbsi.l a4, 0, 6f
RET(16)
# .align 4
/* Copy 4 bytes. */
4: l32i a6, a3, 0
addi a3, a3, 4
s32i a6, a5, 0
addi a5, a5, 4
bbsi.l a4, 1, 5f
bbsi.l a4, 0, 6f
RET(16)
/* Copy 2 bytes. */
5: l16ui a6, a3, 0
addi a3, a3, 2
s16i a6, a5, 0
addi a5, a5, 2
bbsi.l a4, 0, 6f
RET(16)
/* Copy 1 byte. */
6: l8ui a6, a3, 0
s8i a6, a5, 0
.Ldone:
RET(16)
/* Destination is aligned; source is unaligned. */
# .align 4
.Lsrcunaligned:
/* Avoid loading anything for zero-length copies. */
beqz a4, .Ldone
/* Copy 16 bytes per iteration for word-aligned dst and unaligned src. */
ssa8 a3 # set shift amount from byte offset
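/* ssa8/src_b are endianness-aware macros (see the included
   xtensa_asm.h): ssa8 loads SAR from the low bits of the source
   address, and src_b funnel-shifts two adjacent aligned words so each
   store writes one aligned destination word. */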
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
srli a11, a8, 30 # save unalignment offset for below
sub a3, a3, a11 # align a3
#endif
l32i a6, a3, 0 # load first word
#if XCHAL_HAVE_LOOPS
loopnez a7, 2f
#else
beqz a7, 2f
slli a10, a7, 4
add a10, a10, a3 # a10 = end of last 16B source chunk
#endif
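/* Each iteration loads four aligned words and rotates them through
   a6-a9, combining consecutive words with src_b before the aligned
   stores. */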
1: l32i a7, a3, 4
l32i a8, a3, 8
src_b a6, a6, a7
s32i a6, a5, 0
l32i a9, a3, 12
src_b a7, a7, a8
s32i a7, a5, 4
l32i a6, a3, 16
src_b a8, a8, a9
s32i a8, a5, 8
addi a3, a3, 16
src_b a9, a9, a6
s32i a9, a5, 12
addi a5, a5, 16
#if !XCHAL_HAVE_LOOPS
bltu a3, a10, 1b
#endif
2: bbci.l a4, 3, 3f
/* Copy 8 bytes. */
l32i a7, a3, 4
l32i a8, a3, 8
src_b a6, a6, a7
s32i a6, a5, 0
addi a3, a3, 8
src_b a7, a7, a8
s32i a7, a5, 4
addi a5, a5, 8
mov a6, a8
3: bbci.l a4, 2, 4f
/* Copy 4 bytes. */
l32i a7, a3, 4
addi a3, a3, 4
src_b a6, a6, a7
s32i a6, a5, 0
addi a5, a5, 4
mov a6, a7
4:
#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
add a3, a3, a11 # readjust a3 with correct misalignment
#endif
bbsi.l a4, 1, 5f
bbsi.l a4, 0, 6f
RET(16)
/* Copy 2 bytes. */
5: l8ui a6, a3, 0
l8ui a7, a3, 1
addi a3, a3, 2
s8i a6, a5, 0
s8i a7, a5, 1
addi a5, a5, 2
bbsi.l a4, 0, 6f
RET(16)
/* Copy 1 byte. */
6: l8ui a6, a3, 0
s8i a6, a5, 0
RET(16)
# .align XCHAL_INST_FETCH_WIDTH
__memcpy_aux:
/* Skip bytes to get proper alignment for three-byte loop */
# .skip XCHAL_INST_FETCH_WIDTH - 3
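/* Byte-at-a-time copy, reached only from the short-copy checks in
   .Ldst1mod2 and .Ldst2mod4. */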
.Lbytecopy:
#if XCHAL_HAVE_LOOPS
loopnez a4, 2f
#else
beqz a4, 2f
add a7, a3, a4 # a7 = end address for source
#endif
1: l8ui a6, a3, 0
addi a3, a3, 1
s8i a6, a5, 0
addi a5, a5, 1
#if !XCHAL_HAVE_LOOPS
bltu a3, a7, 1b
#endif
2: RET(16)
/* Destination is unaligned. */
# .align 4
.Ldst1mod2: # dst is only byte aligned
/* Do short copies byte-by-byte. */
bltui a4, 7, .Lbytecopy
/* Copy 1 byte. */
l8ui a6, a3, 0
addi a3, a3, 1
addi a4, a4, -1
s8i a6, a5, 0
addi a5, a5, 1
/* Return to main algorithm if dst is now aligned. */
bbci.l a5, 1, .Ldstaligned
.Ldst2mod4: # dst has 16-bit alignment
/* Do short copies byte-by-byte. */
bltui a4, 6, .Lbytecopy
/* Copy 2 bytes. */
l8ui a6, a3, 0
l8ui a7, a3, 1
addi a3, a3, 2
addi a4, a4, -2
s8i a6, a5, 0
s8i a7, a5, 1
addi a5, a5, 2
/* dst is now aligned; return to main algorithm. */
j .Ldstaligned
.end schedule
.size memcpy, . - memcpy
#endif