nuttx/arch/arm/src/armv7-a/arm_memcpy.S

418 lines
12 KiB
ArmAsm

/************************************************************************************
* nuttx/arch/arm/src/armv7-a/arm_memcpy.S
*
* ARMv7-A optimised memcpy, based on the ARMv7-M version contributed by Mike Smith.
* Apparently in the public domain and is re-released here under the modified BSD
* license:
*
* Obtained via a posting on the Stellaris forum:
* http://e2e.ti.com/support/microcontrollers/\
* stellaris_arm_cortex-m3_microcontroller/f/473/t/44360.aspx
*
* Posted by rocksoft on Jul 24, 2008 10:19 AM
*
* Hi,
*
* I recently finished a "memcpy" replacement and thought it might be useful for
* others...
*
* I've put some instructions and the code here:
*
* http://www.rock-software.net/downloads/memcpy/
*
* Hope it works for you as well as it did for me.
*
* Liam.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
*
* 1. Redistributions of source code must retain the above copyright
* notice, this list of conditions and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in
* the documentation and/or other materials provided with the
* distribution.
* 3. Neither the name NuttX nor the names of its contributors may be
* used to endorse or promote products derived from this software
* without specific prior written permission.
*
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
* "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
* LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
* FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
* COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
* BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
* OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
* AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
* LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
* ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
* POSSIBILITY OF SUCH DAMAGE.
*
************************************************************************************/
/************************************************************************************
* Global Symbols
************************************************************************************/
.global memcpy
.syntax unified
.thumb
.file "arm_memcpy.S"
/************************************************************************************
* .text
************************************************************************************/
.text
/************************************************************************************
* Private Constant Data
************************************************************************************/
/* We have 16 possible alignment combinations of src and dst, this jump table
* directs the copy operation
*
* Bits: Src=00, Dst=00 - Long to Long copy
* Bits: Src=00, Dst=01 - Long to Byte before half word
* Bits: Src=00, Dst=10 - Long to Half word
* Bits: Src=00, Dst=11 - Long to Byte before long word
* Bits: Src=01, Dst=00 - Byte before half word to long
* Bits: Src=01, Dst=01 - Byte before half word to byte before half word -
* Same alignment
* Bits: Src=01, Dst=10 - Byte before half word to half word
* Bits: Src=01, Dst=11 - Byte before half word to byte before long word
* Bits: Src=10, Dst=00 - Half word to long word
* Bits: Src=10, Dst=01 - Half word to byte before half word
* Bits: Src=10, Dst=10 - Half word to half word - Same Alignment
* Bits: Src=10, Dst=11 - Half word to byte before long word
* Bits: Src=11, Dst=00 - Byte before long word to long word
* Bits: Src=11, Dst=01 - Byte before long word to byte before half word
* Bits: Src=11, Dst=11 - Byte before long word to half word
* Bits: Src=11, Dst=11 - Byte before long word to Byte before long word -
* Same alignment
*/
MEM_DataCopyTable:
.byte (MEM_DataCopy0 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy1 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy2 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy3 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy4 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy5 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy6 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy7 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy8 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy9 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy10 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy11 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy12 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy13 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy14 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy15 - MEM_DataCopyJump) >> 1
.align 2
MEM_LongCopyTable:
.byte (MEM_LongCopyEnd - MEM_LongCopyJump) >> 1 /* 0 bytes left */
.byte 0 /* 4 bytes left */
.byte (1 * 10) >> 1 /* 8 bytes left */
.byte (2 * 10) >> 1 /* 12 bytes left */
.byte (3 * 10) >> 1 /* 16 bytes left */
.byte (4 * 10) >> 1 /* 20 bytes left */
.byte (5 * 10) >> 1 /* 24 bytes left */
.byte (6 * 10) >> 1 /* 28 bytes left */
.byte (7 * 10) >> 1 /* 32 bytes left */
.byte (8 * 10) >> 1 /* 36 bytes left */
.align 2
/************************************************************************************
* Public Functions
************************************************************************************/
/************************************************************************************
* Name: memcpy
*
* Description:
* Optimised "general" copy routine
*
* Input Parameters:
* r0 = destination, r1 = source, r2 = length
*
************************************************************************************/
.thumb_func
memcpy:
push {r14}
/* This allows the inner workings to "assume" a minimum amount of bytes */
/* Quickly check for very short copies */
cmp r2, #4
blt.n MEM_DataCopyBytes
and r14, r0, #3 /* Get destination alignment bits */
bfi r14, r1, #2, #2 /* Get source alignment bits */
ldr r3, =MEM_DataCopyTable /* Jump table base */
tbb [r3, r14] /* Perform jump on src/dst alignment bits */
MEM_DataCopyJump:
.align 4
/* Bits: Src=01, Dst=01 - Byte before half word to byte before half word - Same alignment
* 3 bytes to read for long word aligning
*/
MEM_DataCopy5:
ldrb r3, [r1], #0x01
strb r3, [r0], #0x01
sub r2, r2, #0x01
/* Bits: Src=10, Dst=10 - Half word to half word - Same Alignment
* 2 bytes to read for long word aligning
*/
MEM_DataCopy10:
ldrb r3, [r1], #0x01
strb r3, [r0], #0x01
sub r2, r2, #0x01
/* Bits: Src=11, Dst=11 - Byte before long word to Byte before long word - Same alignment
* 1 bytes to read for long word aligning
*/
MEM_DataCopy15:
ldrb r3, [r1], #0x01
strb r3, [r0], #0x01
sub r2, r2, #0x01
/* Bits: Src=00, Dst=00 - Long to Long copy */
MEM_DataCopy0:
/* Save regs that may be used by memcpy */
push {r4-r12}
/* Check for short word-aligned copy */
cmp r2, #0x28
blt.n MEM_DataCopy0_2
/* Bulk copy loop */
MEM_DataCopy0_1:
ldmia r1!, {r3-r12}
stmia r0!, {r3-r12}
sub r2, r2, #0x28
cmp r2, #0x28
bge.n MEM_DataCopy0_1
/* Copy remaining long words */
MEM_DataCopy0_2:
/* Copy remaining long words */
ldr r14, =MEM_LongCopyTable
lsr r11, r2, #0x02
tbb [r14, r11]
/* longword copy branch table anchor */
MEM_LongCopyJump:
ldr.w r3, [r1], #0x04 /* 4 bytes remain */
str.w r3, [r0], #0x04
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r4} /* 8 bytes remain */
stmia.w r0!, {r3-r4}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r5} /* 12 bytes remain */
stmia.w r0!, {r3-r5}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r6} /* 16 bytes remain */
stmia.w r0!, {r3-r6}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r7} /* 20 bytes remain */
stmia.w r0!, {r3-r7}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r8} /* 24 bytes remain */
stmia.w r0!, {r3-r8}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r9} /* 28 bytes remain */
stmia.w r0!, {r3-r9}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r10} /* 32 bytes remain */
stmia.w r0!, {r3-r10}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r11} /* 36 bytes remain */
stmia.w r0!, {r3-r11}
MEM_LongCopyEnd:
pop {r4-r12}
and r2, r2, #0x03 /* All the longs have been copied */
/* Deal with up to 3 remaining bytes */
MEM_DataCopyBytes:
/* Deal with up to 3 remaining bytes */
cmp r2, #0x00
it eq
popeq {pc}
ldrb r3, [r1], #0x01
strb r3, [r0], #0x01
subs r2, r2, #0x01
it eq
popeq {pc}
ldrb r3, [r1], #0x01
strb r3, [r0], #0x01
subs r2, r2, #0x01
it eq
popeq {pc}
ldrb r3, [r1], #0x01
strb r3, [r0], #0x01
pop {pc}
.align 4
/* Bits: Src=01, Dst=11 - Byte before half word to byte before long word
* 3 bytes to read for long word aligning the source
*/
MEM_DataCopy7:
ldrb r3, [r1], #0x01
strb r3, [r0], #0x01
sub r2, r2, #0x01
/* Bits: Src=10, Dst=00 - Half word to long word
* 2 bytes to read for long word aligning the source
*/
MEM_DataCopy8:
ldrb r3, [r1], #0x01
strb r3, [r0], #0x01
sub r2, r2, #0x01
/* Bits: Src=11, Dst=01 - Byte before long word to byte before half word
* 1 byte to read for long word aligning the source
*/
MEM_DataCopy13:
ldrb r3, [r1], #0x01
strb r3, [r0], #0x01
sub r2, r2, #0x01
/* Bits: Src=00, Dst=10 - Long to Half word */
MEM_DataCopy2:
cmp r2, #0x28
blt.n MEM_DataCopy2_1
/* Save regs */
push {r4-r12}
/* Bulk copy loop */
MEM_DataCopy2_2:
ldmia r1!, {r3-r12}
strh r3, [r0], #0x02
lsr r3, r3, #0x10
bfi r3, r4, #0x10, #0x10
lsr r4, r4, #0x10
bfi r4, r5, #0x10, #0x10
lsr r5, r5, #0x10
bfi r5, r6, #0x10, #0x10
lsr r6, r6, #0x10
bfi r6, r7, #0x10, #0x10
lsr r7, r7, #0x10
bfi r7, r8, #0x10, #0x10
lsr r8, r8, #0x10
bfi r8, r9, #0x10, #0x10
lsr r9, r9, #0x10
bfi r9, r10, #0x10, #0x10
lsr r10, r10, #0x10
bfi r10, r11, #0x10, #0x10
lsr r11, r11, #0x10
bfi r11, r12, #0x10, #0x10
stmia r0!, {r3-r11}
lsr r12, r12, #0x10
strh r12, [r0], #0x02
sub r2, r2, #0x28
cmp r2, #0x28
bge.n MEM_DataCopy2_2
pop {r4-r12}
MEM_DataCopy2_1: /* Read longs and write 2 x half words */
cmp r2, #4
blt.n MEM_DataCopyBytes
ldr r3, [r1], #0x04
strh r3, [r0], #0x02
lsr r3, r3, #0x10
strh r3, [r0], #0x02
sub r2, r2, #0x04
b.n MEM_DataCopy2
/* Bits: Src=01, Dst=00 - Byte before half word to long
* Bits: Src=01, Dst=10 - Byte before half word to half word
* 3 bytes to read for long word aligning the source
*/
MEM_DataCopy4:
MEM_DataCopy6:
/* Read B and write B */
ldrb r3, [r1], #0x01
strb r3, [r0], #0x01
sub r2, r2, #0x01
/* Bits: Src=10, Dst=01 - Half word to byte before half word
* Bits: Src=10, Dst=11 - Half word to byte before long word
* 2 bytes to read for long word aligning the source
*/
MEM_DataCopy9:
MEM_DataCopy11:
ldrb r3, [r1], #0x01
strb r3, [r0], #0x01
sub r2, r2, #0x01
/* Bits: Src=11, Dst=00 -chm Byte before long word to long word
* Bits: Src=11, Dst=11 - Byte before long word to half word
* 1 byte to read for long word aligning the source
*/
MEM_DataCopy12:
MEM_DataCopy14:
/* Read B and write B */
ldrb r3, [r1], #0x01
strb r3, [r0], #0x01
sub r2, r2, #0x01
/* Bits: Src=00, Dst=01 - Long to Byte before half word
* Bits: Src=00, Dst=11 - Long to Byte before long word
*/
MEM_DataCopy1: /* Read longs, write B->H->B */
MEM_DataCopy3:
cmp r2, #4
blt MEM_DataCopyBytes
ldr r3, [r1], #0x04
strb r3, [r0], #0x01
lsr r3, r3, #0x08
strh r3, [r0], #0x02
lsr r3, r3, #0x10
strb r3, [r0], #0x01
sub r2, r2, #0x04
b.n MEM_DataCopy3
.size memcpy, .-memcpy
.end