nuttx/libc/machine/arm/armv7-r/gnu/arch_memcpy.S

462 lines
13 KiB
ArmAsm
Raw Normal View History

/************************************************************************************
* libc/machine/arm/armv7-r/gnu/arch_memcpy.S
2017-01-20 21:35:27 +01:00
* ARMv7-R optimized memcpy.
*
2017-01-20 21:35:27 +01:00
* Adapted for use with ARMv7-R and NuttX by:
*
* Copyright (C) 2017 Gregory Nutt. All rights reserved.
* Author: Gregory Nutt <gnutt@nuttx.org>
*
* Based on the ARMv7-M version contributed by Mike Smith. Apparently in the public
* domain and is re-released here under the modified BSD license:
*
* Obtained via a posting on the Stellaris forum:
* http://e2e.ti.com/support/microcontrollers/\
* stellaris_arm_cortex-m3_microcontroller/f/473/t/44360.aspx
*
* Posted by rocksoft on Jul 24, 2008 10:19 AM
*
* Hi,
*
* I recently finished a "memcpy" replacement and thought it might be useful for
* others...
*
* I've put some instructions and the code here:
*
* http://www.rock-software.net/downloads/memcpy/
*
* Hope it works for you as well as it did for me.
*
* Liam.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name NuttX nor the names of its contributors may be
* used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
************************************************************************************/
/************************************************************************************
* Public Symbols
************************************************************************************/
/* Make the memcpy label visible to the linker so other objects can call it */
.global memcpy
/* Use the unified ARM/Thumb assembler syntax for every instruction below */
.syntax unified
/* Logical source file name recorded by the assembler for this object */
.file "arch_memcpy.S"
/************************************************************************************
* .text
************************************************************************************/
.text
/************************************************************************************
* Private Constant Data
************************************************************************************/
/* We have 16 possible alignment combinations of src and dst, this jump table
* directs the copy operation
*
* Bits: Src=00, Dst=00 - Long to Long copy
* Bits: Src=00, Dst=01 - Long to Byte before half word
* Bits: Src=00, Dst=10 - Long to Half word
* Bits: Src=00, Dst=11 - Long to Byte before long word
* Bits: Src=01, Dst=00 - Byte before half word to long
* Bits: Src=01, Dst=01 - Byte before half word to byte before half word -
* Same alignment
* Bits: Src=01, Dst=10 - Byte before half word to half word
* Bits: Src=01, Dst=11 - Byte before half word to byte before long word
* Bits: Src=10, Dst=00 - Half word to long word
* Bits: Src=10, Dst=01 - Half word to byte before half word
* Bits: Src=10, Dst=10 - Half word to half word - Same Alignment
* Bits: Src=10, Dst=11 - Half word to byte before long word
* Bits: Src=11, Dst=00 - Byte before long word to long word
* Bits: Src=11, Dst=01 - Byte before long word to byte before half word
* Bits: Src=11, Dst=10 - Byte before long word to half word
* Bits: Src=11, Dst=11 - Byte before long word to Byte before long word -
* Same alignment
*/
/* Emit one dispatch slot: the distance, counted in 32-bit words, from the
 * MEM_DataCopyJump anchor to the named copy routine, stored in a single byte.
 */

	.macro	mem_datacopy_slot routine
	.byte	(\routine - MEM_DataCopyJump) >> 2
	.endm

/* Alignment dispatch table.
 * Slot index = (source address bits [1:0] << 2) | destination address bits [1:0]
 */

MEM_DataCopyTable:
	mem_datacopy_slot MEM_DataCopy0		/* Src=00, Dst=00 */
	mem_datacopy_slot MEM_DataCopy1		/* Src=00, Dst=01 */
	mem_datacopy_slot MEM_DataCopy2		/* Src=00, Dst=10 */
	mem_datacopy_slot MEM_DataCopy3		/* Src=00, Dst=11 */
	mem_datacopy_slot MEM_DataCopy4		/* Src=01, Dst=00 */
	mem_datacopy_slot MEM_DataCopy5		/* Src=01, Dst=01 */
	mem_datacopy_slot MEM_DataCopy6		/* Src=01, Dst=10 */
	mem_datacopy_slot MEM_DataCopy7		/* Src=01, Dst=11 */
	mem_datacopy_slot MEM_DataCopy8		/* Src=10, Dst=00 */
	mem_datacopy_slot MEM_DataCopy9		/* Src=10, Dst=01 */
	mem_datacopy_slot MEM_DataCopy10	/* Src=10, Dst=10 */
	mem_datacopy_slot MEM_DataCopy11	/* Src=10, Dst=11 */
	mem_datacopy_slot MEM_DataCopy12	/* Src=11, Dst=00 */
	mem_datacopy_slot MEM_DataCopy13	/* Src=11, Dst=01 */
	mem_datacopy_slot MEM_DataCopy14	/* Src=11, Dst=10 */
	mem_datacopy_slot MEM_DataCopy15	/* Src=11, Dst=11 */

	.align	2
/* Tail dispatch table for the word-aligned copy, indexed by the number of
 * whole 32-bit words still to copy (0..9).  Each byte is the distance, in
 * 32-bit words, from the MEM_LongCopyJump anchor to the code handling that
 * case.  Index 0 (nothing left) lands on MEM_LongCopyEnd; index n+1 lands on
 * MEM_LongCopyJump<n>, which copies n+1 words (4*(n+1) bytes).
 */

MEM_LongCopyTable:
	.byte	(MEM_LongCopyEnd - MEM_LongCopyJump) >> 2

	.irp	slot, 0, 1, 2, 3, 4, 5, 6, 7, 8
	.byte	(MEM_LongCopyJump\slot - MEM_LongCopyJump) >> 2
	.endr
/************************************************************************************
* Public Functions
************************************************************************************/
/************************************************************************************
* Name: memcpy
*
* Description:
* Optimized "general" copy routine
*
* Input Parameters:
* r0 = destination, r1 = source, r2 = length
*
* Returned Value:
* r0 = destination r1-r3 burned
*
************************************************************************************/
/* C-callable entry point: r0 = destination, r1 = source, r2 = length.
 *
 * The copy itself is done by _do_memcpy, which advances r0 while it works.
 * The original destination pointer is therefore parked on the stack around
 * the call and handed back in r0 as the return value.
 *
 * The symbol is typed as a function so that the linker and debug tools treat
 * it as code rather than as an untyped label.
 */

	.align	4
	.type	memcpy, %function
memcpy:
	push	{r0, lr}		/* Save dst (return value) and return address */
	bl	_do_memcpy		/* r0, r1, r2 are passed through unchanged */
	pop	{r0, pc}		/* r0 = original dst, then return to caller */
/************************************************************************************
 * Name: _do_memcpy
 *
 * Description:
 *   Copy worker called from memcpy.  It dispatches on the low two address bits
 *   of source and destination (through MEM_DataCopyTable) to a routine that
 *   first byte-copies until the source is word aligned and then moves whole
 *   words, followed by a tail of up to 3 single bytes.
 *
 * Input Parameters:
 *   r0 = destination, r1 = source, r2 = length in bytes (unsigned)
 *
 * Returned Value:
 *   None.  On return r0 and r1 have been advanced past the copied data.
 *   r2, r3 and r14 are burned; r4-r12 are saved and restored on the stack.
 *
 * Notes:
 *   - The length is an unsigned quantity, so all length tests use the
 *     unsigned conditions LO/HS.  r2 is only ever decremented by an amount
 *     that has just been checked to be available, so it never wraps.
 *   - The mixed-alignment paths split a loaded word into bytes/half words
 *     starting with the least significant part, i.e. they assume a
 *     little-endian data layout.
 *
 ************************************************************************************/

	.align	4
_do_memcpy:
	push	{r14}
	push	{r4}

	/* Quickly divert very short copies to the byte tail.  Everything after
	 * this point may assume that at least 4 bytes remain.
	 */

	cmp	r2, #4
	blo	MEM_DataCopyBytes

	/* Table index: bits [1:0] = destination alignment, bits [3:2] = source
	 * alignment.
	 */

	and	r14, r0, #3			/* Get destination alignment bits */
	bfi	r14, r1, #2, #2			/* Get source alignment bits */

	ldr	r3, =MEM_DataCopyTable		/* Jump table base address */
	ldrb	r4, [r3, r14]			/* Word offset for this alignment combination */

	ldr	r3, =MEM_DataCopyJump		/* Base of branch table anchor */
	add	r3, r3, r4, lsl #2		/* Absolute address of logic */
	bx	r3

	/* Data copy branch table anchor */

	.align	4
MEM_DataCopyJump:

	/* Bits: Src=01, Dst=01 - Byte before half word to byte before half word -
	 * Same alignment.  3 bytes to copy for long word aligning; falls through.
	 */

MEM_DataCopy5:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub	r2, r2, #0x01

	/* Bits: Src=10, Dst=10 - Half word to half word - Same alignment.
	 * 2 bytes to copy for long word aligning; falls through.
	 */

MEM_DataCopy10:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub	r2, r2, #0x01

	/* Bits: Src=11, Dst=11 - Byte before long word to byte before long word -
	 * Same alignment.  1 byte to copy for long word aligning; falls through.
	 */

MEM_DataCopy15:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub	r2, r2, #0x01

	/* Bits: Src=00, Dst=00 - Long to long copy */

MEM_DataCopy0:

	/* Save the registers used as copy buffer.  They are restored at
	 * MEM_LongCopyEnd, which every path through this routine reaches.
	 */

	push	{r5-r12}

	/* Check for short word-aligned copy (less than one 40 byte burst) */

	cmp	r2, #0x28
	blo	MEM_DataCopy0_2

	/* Bulk copy loop: 10 words (40 bytes) per iteration */

MEM_DataCopy0_1:
	ldmia	r1!, {r3-r12}
	stmia	r0!, {r3-r12}
	sub	r2, r2, #0x28
	cmp	r2, #0x28
	bhs	MEM_DataCopy0_1

	/* Copy remaining long words.  r2 < 40 here, so r2 >> 2 is 0..9 and
	 * selects one of the 10 entries of MEM_LongCopyTable.
	 */

MEM_DataCopy0_2:
	ldr	r14, =MEM_LongCopyTable		/* Jump table base address */
	lsr	r11, r2, #2			/* Convert byte count to word count */
	add	r14, r14, r11			/* Jump table offset address */
	ldrb	r3, [r14]			/* Word offset from branch table anchor */

	ldr	r11, =MEM_LongCopyJump		/* Address of branch table anchor */
	add	r11, r11, r3, lsl #2		/* Absolute address into branch table */
	bx	r11				/* Go there */

	/* Long word copy branch table anchor */

MEM_LongCopyJump:
MEM_LongCopyJump0:
	ldr	r3, [r1], #0x04			/* 4 bytes remain */
	str	r3, [r0], #0x04
	b	MEM_LongCopyEnd

MEM_LongCopyJump1:
	ldmia	r1!, {r3-r4}			/* 8 bytes remain */
	stmia	r0!, {r3-r4}
	b	MEM_LongCopyEnd

MEM_LongCopyJump2:
	ldmia	r1!, {r3-r5}			/* 12 bytes remain */
	stmia	r0!, {r3-r5}
	b	MEM_LongCopyEnd

MEM_LongCopyJump3:
	ldmia	r1!, {r3-r6}			/* 16 bytes remain */
	stmia	r0!, {r3-r6}
	b	MEM_LongCopyEnd

MEM_LongCopyJump4:
	ldmia	r1!, {r3-r7}			/* 20 bytes remain */
	stmia	r0!, {r3-r7}
	b	MEM_LongCopyEnd

MEM_LongCopyJump5:
	ldmia	r1!, {r3-r8}			/* 24 bytes remain */
	stmia	r0!, {r3-r8}
	b	MEM_LongCopyEnd

MEM_LongCopyJump6:
	ldmia	r1!, {r3-r9}			/* 28 bytes remain */
	stmia	r0!, {r3-r9}
	b	MEM_LongCopyEnd

MEM_LongCopyJump7:
	ldmia	r1!, {r3-r10}			/* 32 bytes remain */
	stmia	r0!, {r3-r10}
	b	MEM_LongCopyEnd

MEM_LongCopyJump8:
	ldmia	r1!, {r3-r11}			/* 36 bytes remain */
	stmia	r0!, {r3-r11}

MEM_LongCopyEnd:
	pop	{r5-r12}
	and	r2, r2, #0x03			/* All the longs have been copied */

	/* Deal with up to 3 remaining bytes.  Shared exit for every path: r4
	 * and the return address pushed on entry are still on the stack here.
	 */

MEM_DataCopyBytes:
	pop	{r4}
	cmp	r2, #0x00
	it	eq
	popeq	{pc}

	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	subs	r2, r2, #0x01
	it	eq
	popeq	{pc}

	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	subs	r2, r2, #0x01
	it	eq
	popeq	{pc}

	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	pop	{pc}

	.align	4

	/* Bits: Src=01, Dst=11 - Byte before half word to byte before long word
	 * 3 bytes to copy for long word aligning the source; falls through.
	 */

MEM_DataCopy7:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub	r2, r2, #0x01

	/* Bits: Src=10, Dst=00 - Half word to long word
	 * 2 bytes to copy for long word aligning the source; falls through.
	 */

MEM_DataCopy8:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub	r2, r2, #0x01

	/* Bits: Src=11, Dst=01 - Byte before long word to byte before half word
	 * 1 byte to copy for long word aligning the source; falls through.
	 */

MEM_DataCopy13:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub	r2, r2, #0x01

	/* Bits: Src=00, Dst=10 - Long to half word */

MEM_DataCopy2:
	cmp	r2, #0x28
	blo	MEM_DataCopy2_1

	/* Save regs */

	push	{r5-r12}

	/* Bulk copy loop: read 10 words, write one half word, 9 re-packed words
	 * and a final half word (2 + 36 + 2 = 40 bytes).  After the first half
	 * word store the destination is word aligned for the stmia.
	 */

MEM_DataCopy2_2:
	ldmia	r1!, {r3-r12}

	strh	r3, [r0], #0x02

	lsr	r3, r3, #0x10
	bfi	r3, r4, #0x10, #0x10
	lsr	r4, r4, #0x10
	bfi	r4, r5, #0x10, #0x10
	lsr	r5, r5, #0x10
	bfi	r5, r6, #0x10, #0x10
	lsr	r6, r6, #0x10
	bfi	r6, r7, #0x10, #0x10
	lsr	r7, r7, #0x10
	bfi	r7, r8, #0x10, #0x10
	lsr	r8, r8, #0x10
	bfi	r8, r9, #0x10, #0x10
	lsr	r9, r9, #0x10
	bfi	r9, r10, #0x10, #0x10
	lsr	r10, r10, #0x10
	bfi	r10, r11, #0x10, #0x10
	lsr	r11, r11, #0x10
	bfi	r11, r12, #0x10, #0x10

	stmia	r0!, {r3-r11}

	lsr	r12, r12, #0x10
	strh	r12, [r0], #0x02

	sub	r2, r2, #0x28
	cmp	r2, #0x28
	bhs	MEM_DataCopy2_2

	pop	{r5-r12}

	/* Read longs and write 2 x half words */

MEM_DataCopy2_1:
	cmp	r2, #4
	blo	MEM_DataCopyBytes

	ldr	r3, [r1], #0x04
	strh	r3, [r0], #0x02
	lsr	r3, r3, #0x10
	strh	r3, [r0], #0x02
	sub	r2, r2, #0x04
	b	MEM_DataCopy2

	/* Bits: Src=01, Dst=00 - Byte before half word to long
	 * Bits: Src=01, Dst=10 - Byte before half word to half word
	 * 3 bytes to copy for long word aligning the source; falls through.
	 */

MEM_DataCopy4:
MEM_DataCopy6:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub	r2, r2, #0x01

	/* Bits: Src=10, Dst=01 - Half word to byte before half word
	 * Bits: Src=10, Dst=11 - Half word to byte before long word
	 * 2 bytes to copy for long word aligning the source; falls through.
	 */

MEM_DataCopy9:
MEM_DataCopy11:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub	r2, r2, #0x01

	/* Bits: Src=11, Dst=00 - Byte before long word to long word
	 * Bits: Src=11, Dst=10 - Byte before long word to half word
	 * 1 byte to copy for long word aligning the source; falls through.
	 */

MEM_DataCopy12:
MEM_DataCopy14:
	ldrb	r3, [r1], #0x01
	strb	r3, [r0], #0x01
	sub	r2, r2, #0x01

	/* Bits: Src=00, Dst=01 - Long to byte before half word
	 * Bits: Src=00, Dst=11 - Long to byte before long word
	 *
	 * Read longs, write B->H->B.  The destination address is odd here, so
	 * the middle half word store lands on an even address.
	 */

MEM_DataCopy1:
MEM_DataCopy3:
	cmp	r2, #4
	blo	MEM_DataCopyBytes

	ldr	r3, [r1], #0x04
	strb	r3, [r0], #0x01
	lsr	r3, r3, #0x08
	strh	r3, [r0], #0x02
	lsr	r3, r3, #0x10
	strb	r3, [r0], #0x01
	sub	r2, r2, #0x04
	b	MEM_DataCopy3

	.size	memcpy, .-memcpy
.end