arm: memcpy: add NEON paths for aligned copies

Add dedicated NEON implementations for mutually aligned medium and long memcpy copies when building with __ARM_NEON__. These paths use NEON multi-register loads and stores while preserving the existing VFP implementation for non-NEON VFP configurations.

NEON builds also define USE_VFP, so select the NEON implementation explicitly before falling back to VFP. Apply the same aligned-copy optimization to the armv7-a, armv7-r, and armv8-r implementations.

Signed-off-by: yaojiaqi <yaojiaqi@lixiang.com>
This commit is contained in:
yaojiaqi
2026-05-06 15:51:13 +08:00
committed by Xiang Xiao
parent 9c1b2e12ce
commit 131f7f75ed
3 changed files with 165 additions and 9 deletions
+55 -3
View File
@@ -260,7 +260,7 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
cmp tmp1, tmp2 cmp tmp1, tmp2
bne .Lcpy_notaligned bne .Lcpy_notaligned
#ifdef USE_VFP #if defined(USE_VFP) && !defined(USE_NEON)
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
that the FP pipeline is much better at streaming loads and that the FP pipeline is much better at streaming loads and
stores. This is outside the critical loop. */ stores. This is outside the critical loop. */
@@ -290,7 +290,40 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
bge .Lcpy_body_long bge .Lcpy_body_long
.Lcpy_body_medium: /* Count in tmp2. */ .Lcpy_body_medium: /* Count in tmp2. */
#ifdef USE_VFP #ifdef USE_NEON
/* Use NEON multi-register transfers with destination alignment
hints for aligned copies. */
1:
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
pld [src, #(prefetch_lines * 64)]
subs tmp2, tmp2, #64
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
bge 1b
tst tmp2, #0x3f
beq .Ldone
.Ltail63aligned: /* Count in tmp2. */
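/* Copy the remaining whole doublewords (tmp2 & 0x38 bytes) with NEON
8-byte vld1/vst1 pairs. The add to pc below is a computed jump that
skips (56 - (tmp2 & 0x38)) bytes of the unrolled sequence, so each
vld1/vst1 pair must remain exactly 8 bytes of code copying 8 bytes of
data. PC_OFFSET and INSN_SIZE are defined elsewhere in this file. */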
and tmp1, tmp2, #0x38
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
add pc, pc, tmp1
vld1.8 {d0}, [src]! /* 14 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 12 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 10 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 8 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 6 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 4 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 2 words to go. */
vst1.8 {d0}, [dst]!
#elif defined(USE_VFP)
1: 1:
vldr d0, [src, #0] vldr d0, [src, #0]
subs tmp2, tmp2, #64 subs tmp2, tmp2, #64
@@ -411,7 +444,26 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
/* Long copy. We know that there's at least (prefetch_lines * 64) /* Long copy. We know that there's at least (prefetch_lines * 64)
bytes to go. */ bytes to go. */
#ifdef USE_VFP #ifdef USE_NEON
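/* Use NEON multi-register transfers with prefetching for long
copies. Preload the first five 64-byte blocks of the source up front
(offsets 0 to 256); the loop then copies 64 bytes per iteration and
keeps prefetching prefetch_lines * 64 bytes ahead of src.
NOTE(review): the five preloads are hard-coded; presumably they are
meant to track prefetch_lines - confirm if that constant changes. */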
pld [src, #0]
pld [src, #64]
pld [src, #128]
pld [src, #192]
pld [src, #256]
1:
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
pld [src, #(prefetch_lines * 64)]
subs tmp2, tmp2, #64
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
bge 1b
tst tmp2, #0x3f
beq .Ldone
b .Ltail63aligned
#elif defined(USE_VFP)
/* Don't use PLD. Instead, read some data in advance of the current /* Don't use PLD. Instead, read some data in advance of the current
copy position into a register. This should act like a PLD copy position into a register. This should act like a PLD
operation but we won't have to repeat the transfer. */ operation but we won't have to repeat the transfer. */
+55 -3
View File
@@ -258,7 +258,7 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
cmp tmp1, tmp2 cmp tmp1, tmp2
bne .Lcpy_notaligned bne .Lcpy_notaligned
#ifdef USE_VFP #if defined(USE_VFP) && !defined(USE_NEON)
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
that the FP pipeline is much better at streaming loads and that the FP pipeline is much better at streaming loads and
stores. This is outside the critical loop. */ stores. This is outside the critical loop. */
@@ -288,7 +288,40 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
bge .Lcpy_body_long bge .Lcpy_body_long
.Lcpy_body_medium: /* Count in tmp2. */ .Lcpy_body_medium: /* Count in tmp2. */
#ifdef USE_VFP #ifdef USE_NEON
/* Use NEON multi-register transfers with destination alignment
hints for aligned copies. */
1:
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
pld [src, #(prefetch_lines * 64)]
subs tmp2, tmp2, #64
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
bge 1b
tst tmp2, #0x3f
beq .Ldone
.Ltail63aligned: /* Count in tmp2. */
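/* Copy the remaining whole doublewords (tmp2 & 0x38 bytes) with NEON
8-byte vld1/vst1 pairs. The add to pc below is a computed jump that
skips (56 - (tmp2 & 0x38)) bytes of the unrolled sequence, so each
vld1/vst1 pair must remain exactly 8 bytes of code copying 8 bytes of
data. PC_OFFSET and INSN_SIZE are defined elsewhere in this file. */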
and tmp1, tmp2, #0x38
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
add pc, pc, tmp1
vld1.8 {d0}, [src]! /* 14 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 12 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 10 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 8 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 6 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 4 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 2 words to go. */
vst1.8 {d0}, [dst]!
#elif defined(USE_VFP)
1: 1:
vldr d0, [src, #0] vldr d0, [src, #0]
subs tmp2, tmp2, #64 subs tmp2, tmp2, #64
@@ -409,7 +442,26 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
/* Long copy. We know that there's at least (prefetch_lines * 64) /* Long copy. We know that there's at least (prefetch_lines * 64)
bytes to go. */ bytes to go. */
#ifdef USE_VFP #ifdef USE_NEON
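/* Use NEON multi-register transfers with prefetching for long
copies. Preload the first five 64-byte blocks of the source up front
(offsets 0 to 256); the loop then copies 64 bytes per iteration and
keeps prefetching prefetch_lines * 64 bytes ahead of src.
NOTE(review): the five preloads are hard-coded; presumably they are
meant to track prefetch_lines - confirm if that constant changes. */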
pld [src, #0]
pld [src, #64]
pld [src, #128]
pld [src, #192]
pld [src, #256]
1:
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
pld [src, #(prefetch_lines * 64)]
subs tmp2, tmp2, #64
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
bge 1b
tst tmp2, #0x3f
beq .Ldone
b .Ltail63aligned
#elif defined(USE_VFP)
/* Don't use PLD. Instead, read some data in advance of the current /* Don't use PLD. Instead, read some data in advance of the current
copy position into a register. This should act like a PLD copy position into a register. This should act like a PLD
operation but we won't have to repeat the transfer. */ operation but we won't have to repeat the transfer. */
+55 -3
View File
@@ -258,7 +258,7 @@ def_fn memcpy p2align=6
cmp tmp1, tmp2 cmp tmp1, tmp2
bne .Lcpy_notaligned bne .Lcpy_notaligned
#ifdef USE_VFP #if defined(USE_VFP) && !defined(USE_NEON)
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show /* Magic dust alert! Force VFP on Cortex-A9. Experiments show
that the FP pipeline is much better at streaming loads and that the FP pipeline is much better at streaming loads and
stores. This is outside the critical loop. */ stores. This is outside the critical loop. */
@@ -288,7 +288,40 @@ def_fn memcpy p2align=6
bge .Lcpy_body_long bge .Lcpy_body_long
.Lcpy_body_medium: /* Count in tmp2. */ .Lcpy_body_medium: /* Count in tmp2. */
#ifdef USE_VFP #ifdef USE_NEON
/* Use NEON multi-register transfers with destination alignment
hints for aligned copies. */
1:
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
pld [src, #(prefetch_lines * 64)]
subs tmp2, tmp2, #64
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
bge 1b
tst tmp2, #0x3f
beq .Ldone
.Ltail63aligned: /* Count in tmp2. */
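/* Copy the remaining whole doublewords (tmp2 & 0x38 bytes) with NEON
8-byte vld1/vst1 pairs. The add to pc below is a computed jump that
skips (56 - (tmp2 & 0x38)) bytes of the unrolled sequence, so each
vld1/vst1 pair must remain exactly 8 bytes of code copying 8 bytes of
data. PC_OFFSET and INSN_SIZE are defined elsewhere in this file. */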
and tmp1, tmp2, #0x38
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
add pc, pc, tmp1
vld1.8 {d0}, [src]! /* 14 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 12 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 10 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 8 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 6 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 4 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 2 words to go. */
vst1.8 {d0}, [dst]!
#elif defined(USE_VFP)
1: 1:
vldr d0, [src, #0] vldr d0, [src, #0]
subs tmp2, tmp2, #64 subs tmp2, tmp2, #64
@@ -409,7 +442,26 @@ def_fn memcpy p2align=6
/* Long copy. We know that there's at least (prefetch_lines * 64) /* Long copy. We know that there's at least (prefetch_lines * 64)
bytes to go. */ bytes to go. */
#ifdef USE_VFP #ifdef USE_NEON
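/* Use NEON multi-register transfers with prefetching for long
copies. Preload the first five 64-byte blocks of the source up front
(offsets 0 to 256); the loop then copies 64 bytes per iteration and
keeps prefetching prefetch_lines * 64 bytes ahead of src.
NOTE(review): the five preloads are hard-coded; presumably they are
meant to track prefetch_lines - confirm if that constant changes. */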
pld [src, #0]
pld [src, #64]
pld [src, #128]
pld [src, #192]
pld [src, #256]
1:
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
pld [src, #(prefetch_lines * 64)]
subs tmp2, tmp2, #64
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
bge 1b
tst tmp2, #0x3f
beq .Ldone
b .Ltail63aligned
#elif defined(USE_VFP)
/* Don't use PLD. Instead, read some data in advance of the current /* Don't use PLD. Instead, read some data in advance of the current
copy position into a register. This should act like a PLD copy position into a register. This should act like a PLD
operation but we won't have to repeat the transfer. */ operation but we won't have to repeat the transfer. */