libc: Fix ARMv7-A/R memcpy assembly.

This commit is contained in:
Gregory Nutt
2017-01-20 13:33:50 -06:00
parent be5ba90d4f
commit a8a66667c1
4 changed files with 195 additions and 174 deletions
+1 -4
View File
@@ -6,9 +6,6 @@
config ARMV7A_MEMCPY
bool "Enable optimized memcpy() for ARMv7-A"
select LIBC_ARCH_MEMCPY
depends on ARM_TOOLCHAIN_GNU && EXPERIMENTAL
depends on ARM_TOOLCHAIN_GNU
---help---
Enable optimized ARMv7-A specific memcpy() library function
Marked EXPERIMENTAL because it did not build for me the last time I
tried.
+97 -77
View File
@@ -58,15 +58,6 @@
************************************************************************************/
.global memcpy
/* Tell the assembler which Cortex-A core to target, based on the configured CPU.
 * If none of these options is defined, no .cpu directive is emitted here.
 */
#if defined(CONFIG_ARCH_CORTEXA5)
.cpu cortex-a5
#elif defined(CONFIG_ARCH_CORTEXA8)
.cpu cortex-a8
#elif defined(CONFIG_ARCH_CORTEXA9)
.cpu cortex-a9
#endif
.syntax unified
.file "arch_memcpy.S"
@@ -104,16 +95,16 @@
*/
MEM_DataCopyTable:
.byte (MEM_DataCopy0 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy1 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy2 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy3 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy4 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy5 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy6 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy7 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy8 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy9 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy0 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy1 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy2 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy3 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy4 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy5 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy6 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy7 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy8 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy9 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy10 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy11 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy12 - MEM_DataCopyJump) >> 1
@@ -124,18 +115,16 @@ MEM_DataCopyTable:
.align 2
MEM_LongCopyTable:
.byte (MEM_LongCopyEnd - MEM_LongCopyJump) >> 1 /* 0 bytes left */
.byte 0 /* 4 bytes left */
.byte (1 * 10) >> 1 /* 8 bytes left */
.byte (2 * 10) >> 1 /* 12 bytes left */
.byte (3 * 10) >> 1 /* 16 bytes left */
.byte (4 * 10) >> 1 /* 20 bytes left */
.byte (5 * 10) >> 1 /* 24 bytes left */
.byte (6 * 10) >> 1 /* 28 bytes left */
.byte (7 * 10) >> 1 /* 32 bytes left */
.byte (8 * 10) >> 1 /* 36 bytes left */
.align 2
/* One entry per count of whole words still to be copied (index = bytes left / 4).
 * Each entry is the distance from the MEM_LongCopyJump anchor to the matching copy
 * stanza, stored in halfword units (byte offset >> 1) so that it fits in one byte.
 * The dispatch code shifts the value left by one before adding it to the anchor.
 */
.byte (MEM_LongCopyEnd - MEM_LongCopyJump) >> 1 /* 0 bytes left */
.byte (MEM_LongCopyJump0 - MEM_LongCopyJump) >> 1 /* 4 bytes left */
.byte (MEM_LongCopyJump1 - MEM_LongCopyJump) >> 1 /* 8 bytes left */
.byte (MEM_LongCopyJump2 - MEM_LongCopyJump) >> 1 /* 12 bytes left */
.byte (MEM_LongCopyJump3 - MEM_LongCopyJump) >> 1 /* 16 bytes left */
.byte (MEM_LongCopyJump4 - MEM_LongCopyJump) >> 1 /* 20 bytes left */
.byte (MEM_LongCopyJump5 - MEM_LongCopyJump) >> 1 /* 24 bytes left */
.byte (MEM_LongCopyJump6 - MEM_LongCopyJump) >> 1 /* 28 bytes left */
.byte (MEM_LongCopyJump7 - MEM_LongCopyJump) >> 1 /* 32 bytes left */
.byte (MEM_LongCopyJump8 - MEM_LongCopyJump) >> 1 /* 36 bytes left */
/************************************************************************************
* Public Functions
@@ -167,20 +156,27 @@ memcpy:
_do_memcpy:
push {r14}
push {r4}
/* This allows the inner workings to "assume" a minimum amount of bytes */
/* Quickly check for very short copies */
cmp r2, #4
blt.n MEM_DataCopyBytes
blt MEM_DataCopyBytes
and r14, r0, #3 /* Get destination alignment bits */
bfi r14, r1, #2, #2 /* Get source alignment bits */
ldr r3, =MEM_DataCopyTable /* Jump table base */
tbb [r3, r14] /* Perform jump on src/dst alignment bits */
MEM_DataCopyJump:
ldr r3, =MEM_DataCopyTable /* Jump table base address */
ldrb r4, [r3, r14] /* Hword offset for this alignment combination */
ldr r3, =MEM_DataCopyJump /* Base of branch table anchor */
add r3, r3, r4, lsl #1 /* Absolute address of logic */
bx r3
/* data copy branch table anchor */
.align 4
MEM_DataCopyJump:
/* Bits: Src=01, Dst=01 - Byte before half word to byte before half word - Same alignment
* 3 bytes to read for long word aligning
@@ -214,12 +210,12 @@ MEM_DataCopy15:
MEM_DataCopy0:
/* Save regs that may be used by memcpy */
push {r4-r12}
push {r5-r12}
/* Check for short word-aligned copy */
cmp r2, #0x28
blt.n MEM_DataCopy0_2
blt MEM_DataCopy0_2
/* Bulk copy loop */
@@ -228,49 +224,69 @@ MEM_DataCopy0_1:
stmia r0!, {r3-r12}
sub r2, r2, #0x28
cmp r2, #0x28
bge.n MEM_DataCopy0_1
bge MEM_DataCopy0_1
/* Copy remaining long words */
MEM_DataCopy0_2:
/* Copy remaining long words */
ldr r14, =MEM_LongCopyTable
lsr r11, r2, #0x02
tbb [r14, r11]
/* Dispatch on the number of whole words left to copy (fewer than ten are expected
 * here, because the bulk loop above only exits once r2 < 0x28).  The table holds
 * halfword offsets relative to MEM_LongCopyJump, so the fetched byte is doubled
 * before being added to the anchor address.  r3, r11 and r14 are used as scratch.
 */
ldr r14, =MEM_LongCopyTable /* Jump table base address */
lsr r11, r2, #2 /* Convert byte count to word count */
add r14, r14, r11 /* Jump table offset address */
ldrb r3, [r14] /* HWord offset from branch table anchor */
ldr r11, =MEM_LongCopyJump /* Address of branch table anchor */
add r11, r11, r3, lsl #1 /* Absolute address into branch table */
bx r11 /* Go there */
/* longword copy branch table anchor */
MEM_LongCopyJump:
ldr.w r3, [r1], #0x04 /* 4 bytes remain */
str.w r3, [r0], #0x04
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r4} /* 8 bytes remain */
stmia.w r0!, {r3-r4}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r5} /* 12 bytes remain */
stmia.w r0!, {r3-r5}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r6} /* 16 bytes remain */
stmia.w r0!, {r3-r6}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r7} /* 20 bytes remain */
stmia.w r0!, {r3-r7}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r8} /* 24 bytes remain */
stmia.w r0!, {r3-r8}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r9} /* 28 bytes remain */
stmia.w r0!, {r3-r9}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r10} /* 32 bytes remain */
stmia.w r0!, {r3-r10}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r11} /* 36 bytes remain */
stmia.w r0!, {r3-r11}
MEM_LongCopyJump0:
ldr r3, [r1], #0x04 /* 4 bytes remain */
str r3, [r0], #0x04
b MEM_LongCopyEnd
MEM_LongCopyJump1:
ldmia r1!, {r3-r4} /* 8 bytes remain */
stmia r0!, {r3-r4}
b MEM_LongCopyEnd
MEM_LongCopyJump2:
ldmia r1!, {r3-r5} /* 12 bytes remain */
stmia r0!, {r3-r5}
b MEM_LongCopyEnd
MEM_LongCopyJump3:
ldmia r1!, {r3-r6} /* 16 bytes remain */
stmia r0!, {r3-r6}
b MEM_LongCopyEnd
MEM_LongCopyJump4:
ldmia r1!, {r3-r7} /* 20 bytes remain */
stmia r0!, {r3-r7}
b MEM_LongCopyEnd
MEM_LongCopyJump5:
ldmia r1!, {r3-r8} /* 24 bytes remain */
stmia r0!, {r3-r8}
b MEM_LongCopyEnd
MEM_LongCopyJump6:
ldmia r1!, {r3-r9} /* 28 bytes remain */
stmia r0!, {r3-r9}
b MEM_LongCopyEnd
MEM_LongCopyJump7:
ldmia r1!, {r3-r10} /* 32 bytes remain */
stmia r0!, {r3-r10}
b MEM_LongCopyEnd
MEM_LongCopyJump8:
ldmia r1!, {r3-r11} /* 36 bytes remain */
stmia r0!, {r3-r11}
MEM_LongCopyEnd:
pop {r4-r12}
pop {r5-r12}
and r2, r2, #0x03 /* All the longs have been copied */
/* Deal with up to 3 remaining bytes */
@@ -278,19 +294,23 @@ MEM_LongCopyEnd:
MEM_DataCopyBytes:
/* Deal with up to 3 remaining bytes */
/* Common exit path: copies at most three trailing bytes from [r1] to [r0], one at a
 * time, then returns by popping the saved return address directly into pc.
 * On entry r2 holds the number of bytes still to copy; r3 is used as scratch.
 */
pop {r4} /* Undo the push {r4} done at _do_memcpy entry */
cmp r2, #0x00
it eq /* IT prefix: needed for the conditional pop when assembled as Thumb-2 */
popeq {pc} /* Nothing left to copy: return */
ldrb r3, [r1], #0x01 /* First trailing byte, post-incrementing both pointers */
strb r3, [r0], #0x01
subs r2, r2, #0x01 /* Sets Z when that was the last byte */
it eq
popeq {pc}
ldrb r3, [r1], #0x01 /* Second trailing byte */
strb r3, [r0], #0x01
subs r2, r2, #0x01
it eq
popeq {pc}
ldrb r3, [r1], #0x01 /* Third trailing byte; no further count check is made */
strb r3, [r0], #0x01
pop {pc} /* Return via the r14 value pushed at _do_memcpy entry */
@@ -328,11 +348,11 @@ MEM_DataCopy13:
MEM_DataCopy2:
cmp r2, #0x28
blt.n MEM_DataCopy2_1
blt MEM_DataCopy2_1
/* Save regs */
push {r4-r12}
push {r5-r12}
/* Bulk copy loop */
@@ -365,18 +385,18 @@ MEM_DataCopy2_2:
sub r2, r2, #0x28
cmp r2, #0x28
bge.n MEM_DataCopy2_2
pop {r4-r12}
bge MEM_DataCopy2_2
pop {r5-r12}
MEM_DataCopy2_1: /* Read longs and write 2 x half words */
cmp r2, #4
blt.n MEM_DataCopyBytes
blt MEM_DataCopyBytes
ldr r3, [r1], #0x04
strh r3, [r0], #0x02
lsr r3, r3, #0x10
strh r3, [r0], #0x02
sub r2, r2, #0x04
b.n MEM_DataCopy2
b MEM_DataCopy2
/* Bits: Src=01, Dst=00 - Byte before half word to long
* Bits: Src=01, Dst=10 - Byte before half word to half word
@@ -430,7 +450,7 @@ MEM_DataCopy3:
lsr r3, r3, #0x10
strb r3, [r0], #0x01
sub r2, r2, #0x04
b.n MEM_DataCopy3
b MEM_DataCopy3
.size memcpy, .-memcpy
.end
-10
View File
@@ -57,18 +57,8 @@
************************************************************************************/
.global memcpy
.syntax unified
.thumb
/* Tell the assembler which Cortex-M core to target, based on the configured CPU.
 * If none of these options is defined, no .cpu directive is emitted here.
 */
#if defined(CONFIG_ARCH_CORTEXM3)
.cpu cortex-m3
#elif defined(CONFIG_ARCH_CORTEXM4)
.cpu cortex-m4
#elif defined(CONFIG_ARCH_CORTEXM7)
.cpu cortex-m7
#endif
.file "arch_memcpy.S"
/************************************************************************************
+97 -83
View File
@@ -58,21 +58,6 @@
************************************************************************************/
.global memcpy
/* Tell the assembler which Cortex-R core to target, based on the configured CPU.
 * If none of these options is defined, no .cpu directive is emitted here.
 */
#if defined(CONFIG_ARCH_CORTEXR4)
.cpu cortex-r4
#elif defined(CONFIG_ARCH_CORTEXR4F)
.cpu cortex-r4f
#elif defined(CONFIG_ARCH_CORTEXR5)
.cpu cortex-r5
/* NOTE(review): this test previously read CONFIG_ARCH_CORTEXR6F; assumed to be a
 * typo for CORTEXR5F given the cortex-r5f target below - confirm against Kconfig.
 */
#elif defined(CONFIG_ARCH_CORTEXR5F)
.cpu cortex-r5f
#elif defined(CONFIG_ARCH_CORTEXR7)
.cpu cortex-r7
#elif defined(CONFIG_ARCH_CORTEXR7F)
.cpu cortex-r7f
#endif
.syntax unified
.file "arch_memcpy.S"
@@ -110,16 +95,16 @@ endif
*/
MEM_DataCopyTable:
.byte (MEM_DataCopy0 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy1 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy2 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy3 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy4 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy5 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy6 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy7 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy8 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy9 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy0 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy1 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy2 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy3 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy4 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy5 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy6 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy7 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy8 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy9 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy10 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy11 - MEM_DataCopyJump) >> 1
.byte (MEM_DataCopy12 - MEM_DataCopyJump) >> 1
@@ -130,18 +115,16 @@ MEM_DataCopyTable:
.align 2
MEM_LongCopyTable:
.byte (MEM_LongCopyEnd - MEM_LongCopyJump) >> 1 /* 0 bytes left */
.byte 0 /* 4 bytes left */
.byte (1 * 10) >> 1 /* 8 bytes left */
.byte (2 * 10) >> 1 /* 12 bytes left */
.byte (3 * 10) >> 1 /* 16 bytes left */
.byte (4 * 10) >> 1 /* 20 bytes left */
.byte (5 * 10) >> 1 /* 24 bytes left */
.byte (6 * 10) >> 1 /* 28 bytes left */
.byte (7 * 10) >> 1 /* 32 bytes left */
.byte (8 * 10) >> 1 /* 36 bytes left */
.align 2
/* One entry per count of whole words still to be copied (index = bytes left / 4).
 * Each entry is the distance from the MEM_LongCopyJump anchor to the matching copy
 * stanza, stored in halfword units (byte offset >> 1) so that it fits in one byte.
 * The dispatch code shifts the value left by one before adding it to the anchor.
 */
.byte (MEM_LongCopyEnd - MEM_LongCopyJump) >> 1 /* 0 bytes left */
.byte (MEM_LongCopyJump0 - MEM_LongCopyJump) >> 1 /* 4 bytes left */
.byte (MEM_LongCopyJump1 - MEM_LongCopyJump) >> 1 /* 8 bytes left */
.byte (MEM_LongCopyJump2 - MEM_LongCopyJump) >> 1 /* 12 bytes left */
.byte (MEM_LongCopyJump3 - MEM_LongCopyJump) >> 1 /* 16 bytes left */
.byte (MEM_LongCopyJump4 - MEM_LongCopyJump) >> 1 /* 20 bytes left */
.byte (MEM_LongCopyJump5 - MEM_LongCopyJump) >> 1 /* 24 bytes left */
.byte (MEM_LongCopyJump6 - MEM_LongCopyJump) >> 1 /* 28 bytes left */
.byte (MEM_LongCopyJump7 - MEM_LongCopyJump) >> 1 /* 32 bytes left */
.byte (MEM_LongCopyJump8 - MEM_LongCopyJump) >> 1 /* 36 bytes left */
/************************************************************************************
* Public Functions
@@ -173,20 +156,27 @@ memcpy:
_do_memcpy:
push {r14}
push {r4}
/* This allows the inner workings to "assume" a minimum amount of bytes */
/* Quickly check for very short copies */
cmp r2, #4
blt.n MEM_DataCopyBytes
blt MEM_DataCopyBytes
and r14, r0, #3 /* Get destination alignment bits */
bfi r14, r1, #2, #2 /* Get source alignment bits */
ldr r3, =MEM_DataCopyTable /* Jump table base */
tbb [r3, r14] /* Perform jump on src/dst alignment bits */
MEM_DataCopyJump:
ldr r3, =MEM_DataCopyTable /* Jump table base address */
ldrb r4, [r3, r14] /* Hword offset for this alignment combination */
ldr r3, =MEM_DataCopyJump /* Base of branch table anchor */
add r3, r3, r4, lsl #1 /* Absolute address of logic */
bx r3
/* data copy branch table anchor */
.align 4
MEM_DataCopyJump:
/* Bits: Src=01, Dst=01 - Byte before half word to byte before half word - Same alignment
* 3 bytes to read for long word aligning
@@ -220,12 +210,12 @@ MEM_DataCopy15:
MEM_DataCopy0:
/* Save regs that may be used by memcpy */
push {r4-r12}
push {r5-r12}
/* Check for short word-aligned copy */
cmp r2, #0x28
blt.n MEM_DataCopy0_2
blt MEM_DataCopy0_2
/* Bulk copy loop */
@@ -234,49 +224,69 @@ MEM_DataCopy0_1:
stmia r0!, {r3-r12}
sub r2, r2, #0x28
cmp r2, #0x28
bge.n MEM_DataCopy0_1
bge MEM_DataCopy0_1
/* Copy remaining long words */
MEM_DataCopy0_2:
/* Copy remaining long words */
ldr r14, =MEM_LongCopyTable
lsr r11, r2, #0x02
tbb [r14, r11]
/* Dispatch on the number of whole words left to copy (fewer than ten are expected
 * here, because the bulk loop above only exits once r2 < 0x28).  The table holds
 * halfword offsets relative to MEM_LongCopyJump, so the fetched byte is doubled
 * before being added to the anchor address.  r3, r11 and r14 are used as scratch.
 */
ldr r14, =MEM_LongCopyTable /* Jump table base address */
lsr r11, r2, #2 /* Convert byte count to word count */
add r14, r14, r11 /* Jump table offset address */
ldrb r3, [r14] /* HWord offset from branch table anchor */
ldr r11, =MEM_LongCopyJump /* Address of branch table anchor */
add r11, r11, r3, lsl #1 /* Absolute address into branch table */
bx r11 /* Go there */
/* longword copy branch table anchor */
MEM_LongCopyJump:
ldr.w r3, [r1], #0x04 /* 4 bytes remain */
str.w r3, [r0], #0x04
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r4} /* 8 bytes remain */
stmia.w r0!, {r3-r4}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r5} /* 12 bytes remain */
stmia.w r0!, {r3-r5}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r6} /* 16 bytes remain */
stmia.w r0!, {r3-r6}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r7} /* 20 bytes remain */
stmia.w r0!, {r3-r7}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r8} /* 24 bytes remain */
stmia.w r0!, {r3-r8}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r9} /* 28 bytes remain */
stmia.w r0!, {r3-r9}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r10} /* 32 bytes remain */
stmia.w r0!, {r3-r10}
b.n MEM_LongCopyEnd
ldmia.w r1!, {r3-r11} /* 36 bytes remain */
stmia.w r0!, {r3-r11}
MEM_LongCopyJump0:
ldr r3, [r1], #0x04 /* 4 bytes remain */
str r3, [r0], #0x04
b MEM_LongCopyEnd
MEM_LongCopyJump1:
ldmia r1!, {r3-r4} /* 8 bytes remain */
stmia r0!, {r3-r4}
b MEM_LongCopyEnd
MEM_LongCopyJump2:
ldmia r1!, {r3-r5} /* 12 bytes remain */
stmia r0!, {r3-r5}
b MEM_LongCopyEnd
MEM_LongCopyJump3:
ldmia r1!, {r3-r6} /* 16 bytes remain */
stmia r0!, {r3-r6}
b MEM_LongCopyEnd
MEM_LongCopyJump4:
ldmia r1!, {r3-r7} /* 20 bytes remain */
stmia r0!, {r3-r7}
b MEM_LongCopyEnd
MEM_LongCopyJump5:
ldmia r1!, {r3-r8} /* 24 bytes remain */
stmia r0!, {r3-r8}
b MEM_LongCopyEnd
MEM_LongCopyJump6:
ldmia r1!, {r3-r9} /* 28 bytes remain */
stmia r0!, {r3-r9}
b MEM_LongCopyEnd
MEM_LongCopyJump7:
ldmia r1!, {r3-r10} /* 32 bytes remain */
stmia r0!, {r3-r10}
b MEM_LongCopyEnd
MEM_LongCopyJump8:
ldmia r1!, {r3-r11} /* 36 bytes remain */
stmia r0!, {r3-r11}
MEM_LongCopyEnd:
pop {r4-r12}
pop {r5-r12}
and r2, r2, #0x03 /* All the longs have been copied */
/* Deal with up to 3 remaining bytes */
@@ -284,19 +294,23 @@ MEM_LongCopyEnd:
MEM_DataCopyBytes:
/* Deal with up to 3 remaining bytes */
/* Common exit path: copies at most three trailing bytes from [r1] to [r0], one at a
 * time, then returns by popping the saved return address directly into pc.
 * On entry r2 holds the number of bytes still to copy; r3 is used as scratch.
 */
pop {r4} /* Undo the push {r4} done at _do_memcpy entry */
cmp r2, #0x00
it eq /* IT prefix: needed for the conditional pop when assembled as Thumb-2 */
popeq {pc} /* Nothing left to copy: return */
ldrb r3, [r1], #0x01 /* First trailing byte, post-incrementing both pointers */
strb r3, [r0], #0x01
subs r2, r2, #0x01 /* Sets Z when that was the last byte */
it eq
popeq {pc}
ldrb r3, [r1], #0x01 /* Second trailing byte */
strb r3, [r0], #0x01
subs r2, r2, #0x01
it eq
popeq {pc}
ldrb r3, [r1], #0x01 /* Third trailing byte; no further count check is made */
strb r3, [r0], #0x01
pop {pc} /* Return via the r14 value pushed at _do_memcpy entry */
@@ -334,11 +348,11 @@ MEM_DataCopy13:
MEM_DataCopy2:
cmp r2, #0x28
blt.n MEM_DataCopy2_1
blt MEM_DataCopy2_1
/* Save regs */
push {r4-r12}
push {r5-r12}
/* Bulk copy loop */
@@ -371,18 +385,18 @@ MEM_DataCopy2_2:
sub r2, r2, #0x28
cmp r2, #0x28
bge.n MEM_DataCopy2_2
pop {r4-r12}
bge MEM_DataCopy2_2
pop {r5-r12}
MEM_DataCopy2_1: /* Read longs and write 2 x half words */
cmp r2, #4
blt.n MEM_DataCopyBytes
blt MEM_DataCopyBytes
ldr r3, [r1], #0x04
strh r3, [r0], #0x02
lsr r3, r3, #0x10
strh r3, [r0], #0x02
sub r2, r2, #0x04
b.n MEM_DataCopy2
b MEM_DataCopy2
/* Bits: Src=01, Dst=00 - Byte before half word to long
* Bits: Src=01, Dst=10 - Byte before half word to half word
@@ -436,7 +450,7 @@ MEM_DataCopy3:
lsr r3, r3, #0x10
strb r3, [r0], #0x01
sub r2, r2, #0x04
b.n MEM_DataCopy3
b MEM_DataCopy3
.size memcpy, .-memcpy
.end