arm: memcpy: add NEON paths for aligned copies

Add dedicated NEON implementations for mutually aligned medium and long memcpy copies when building with __ARM_NEON__. These paths use NEON multi-register loads and stores while preserving the existing VFP implementation for non-NEON VFP configurations.

NEON builds also define USE_VFP, so select the NEON implementation explicitly before falling back to VFP. Apply the same aligned-copy optimization to the armv7-a, armv7-r, and armv8-r implementations.

Signed-off-by: yaojiaqi <yaojiaqi@lixiang.com>
This commit is contained in:
yaojiaqi
2026-05-06 15:51:13 +08:00
committed by Xiang Xiao
parent 9c1b2e12ce
commit 131f7f75ed
3 changed files with 165 additions and 9 deletions
+55 -3
View File
@@ -260,7 +260,7 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
cmp tmp1, tmp2
bne .Lcpy_notaligned
#ifdef USE_VFP
#if defined(USE_VFP) && !defined(USE_NEON)
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
that the FP pipeline is much better at streaming loads and
stores. This is outside the critical loop. */
@@ -290,7 +290,40 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
bge .Lcpy_body_long
.Lcpy_body_medium: /* Count in tmp2. */
#ifdef USE_VFP
#ifdef USE_NEON
/* Use NEON multi-register transfers with destination alignment
hints for aligned copies. */
/* Medium-copy loop: each pass loads 64 bytes into d0-d7 with two
32-byte vld1 instructions and writes them out with two 32-byte
vst1 instructions. The "!" writeback advances src and dst, so no
separate pointer updates are needed. ALIGN is a macro defined
outside this hunk; presumably it attaches the alignment qualifier
to dst - confirm against its definition. */
1:
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
/* src has already been advanced by 64 at this point, so the
prefetch offset is relative to the next block to be loaded. */
pld [src, #(prefetch_lines * 64)]
/* The flags set by this subs are consumed by the bge below; the
two vst1 instructions in between do not modify the flags. */
subs tmp2, tmp2, #64
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
bge 1b
/* Branch to .Ldone when the low six bits of tmp2 are all clear,
i.e. there is no sub-64-byte remainder left to copy. */
tst tmp2, #0x3f
beq .Ldone
.Ltail63aligned: /* Count in tmp2. */
/* Use NEON 8-byte vld1/vst1 for the tail. */
/* Computed jump into the unrolled ladder of seven 8-byte copies
below. tmp1 = tmp2 & 0x38 is the byte count of the whole 8-byte
units still to copy (0..56); it is subtracted from 56 so that a
larger remainder enters the ladder earlier and executes more
load/store pairs. The jump only lands on a pair boundary if every
vld1/vst1 pair occupies 8 bytes of code. PC_OFFSET and INSN_SIZE
are defined outside this hunk; presumably they correct for the
value read from pc and the size of the add itself - confirm. */
and tmp1, tmp2, #0x38
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
add pc, pc, tmp1
vld1.8 {d0}, [src]! /* 14 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 12 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 10 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 8 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 6 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 4 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 2 words to go. */
vst1.8 {d0}, [dst]!
/* NOTE(review): bits 0-2 of tmp2 are not consumed in this span;
presumably the code following the conditional block copies the
final 0-7 bytes - confirm. */
#elif defined(USE_VFP)
1:
vldr d0, [src, #0]
subs tmp2, tmp2, #64
@@ -411,7 +444,26 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
/* Long copy. We know that there's at least (prefetch_lines * 64)
bytes to go. */
#ifdef USE_VFP
#ifdef USE_NEON
/* Use NEON multi-register transfers with prefetching for long
copies. */
/* Issue prefetch hints for five consecutive 64-byte offsets from
src before entering the loop.
NOTE(review): the count of five is hard-coded here while the loop
uses prefetch_lines (defined outside this hunk) - confirm that the
two agree. */
pld [src, #0]
pld [src, #64]
pld [src, #128]
pld [src, #192]
pld [src, #256]
/* Main loop: 64 bytes per pass through d0-d7. The "!" writeback
advances src and dst. ALIGN is a macro defined outside this hunk;
presumably it attaches the alignment qualifier to dst - confirm. */
1:
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
/* src has already been advanced by 64 at this point, so the
prefetch offset is relative to the next block to be loaded. */
pld [src, #(prefetch_lines * 64)]
/* The flags set by this subs are consumed by the bge below; the
two vst1 instructions in between do not modify the flags. */
subs tmp2, tmp2, #64
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
bge 1b
/* Branch to .Ldone when the low six bits of tmp2 are all clear;
otherwise hand the remainder to the .Ltail63aligned code. */
tst tmp2, #0x3f
beq .Ldone
b .Ltail63aligned
#elif defined(USE_VFP)
/* Don't use PLD. Instead, read some data in advance of the current
copy position into a register. This should act like a PLD
operation but we won't have to repeat the transfer. */
+55 -3
View File
@@ -258,7 +258,7 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
cmp tmp1, tmp2
bne .Lcpy_notaligned
#ifdef USE_VFP
#if defined(USE_VFP) && !defined(USE_NEON)
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
that the FP pipeline is much better at streaming loads and
stores. This is outside the critical loop. */
@@ -288,7 +288,40 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
bge .Lcpy_body_long
.Lcpy_body_medium: /* Count in tmp2. */
#ifdef USE_VFP
#ifdef USE_NEON
/* Use NEON multi-register transfers with destination alignment
hints for aligned copies. */
/* Medium-copy loop: each pass loads 64 bytes into d0-d7 with two
32-byte vld1 instructions and writes them out with two 32-byte
vst1 instructions. The "!" writeback advances src and dst, so no
separate pointer updates are needed. ALIGN is a macro defined
outside this hunk; presumably it attaches the alignment qualifier
to dst - confirm against its definition. */
1:
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
/* src has already been advanced by 64 at this point, so the
prefetch offset is relative to the next block to be loaded. */
pld [src, #(prefetch_lines * 64)]
/* The flags set by this subs are consumed by the bge below; the
two vst1 instructions in between do not modify the flags. */
subs tmp2, tmp2, #64
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
bge 1b
/* Branch to .Ldone when the low six bits of tmp2 are all clear,
i.e. there is no sub-64-byte remainder left to copy. */
tst tmp2, #0x3f
beq .Ldone
.Ltail63aligned: /* Count in tmp2. */
/* Use NEON 8-byte vld1/vst1 for the tail. */
/* Computed jump into the unrolled ladder of seven 8-byte copies
below. tmp1 = tmp2 & 0x38 is the byte count of the whole 8-byte
units still to copy (0..56); it is subtracted from 56 so that a
larger remainder enters the ladder earlier and executes more
load/store pairs. The jump only lands on a pair boundary if every
vld1/vst1 pair occupies 8 bytes of code. PC_OFFSET and INSN_SIZE
are defined outside this hunk; presumably they correct for the
value read from pc and the size of the add itself - confirm. */
and tmp1, tmp2, #0x38
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
add pc, pc, tmp1
vld1.8 {d0}, [src]! /* 14 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 12 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 10 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 8 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 6 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 4 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 2 words to go. */
vst1.8 {d0}, [dst]!
/* NOTE(review): bits 0-2 of tmp2 are not consumed in this span;
presumably the code following the conditional block copies the
final 0-7 bytes - confirm. */
#elif defined(USE_VFP)
1:
vldr d0, [src, #0]
subs tmp2, tmp2, #64
@@ -409,7 +442,26 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
/* Long copy. We know that there's at least (prefetch_lines * 64)
bytes to go. */
#ifdef USE_VFP
#ifdef USE_NEON
/* Use NEON multi-register transfers with prefetching for long
copies. */
/* Issue prefetch hints for five consecutive 64-byte offsets from
src before entering the loop.
NOTE(review): the count of five is hard-coded here while the loop
uses prefetch_lines (defined outside this hunk) - confirm that the
two agree. */
pld [src, #0]
pld [src, #64]
pld [src, #128]
pld [src, #192]
pld [src, #256]
/* Main loop: 64 bytes per pass through d0-d7. The "!" writeback
advances src and dst. ALIGN is a macro defined outside this hunk;
presumably it attaches the alignment qualifier to dst - confirm. */
1:
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
/* src has already been advanced by 64 at this point, so the
prefetch offset is relative to the next block to be loaded. */
pld [src, #(prefetch_lines * 64)]
/* The flags set by this subs are consumed by the bge below; the
two vst1 instructions in between do not modify the flags. */
subs tmp2, tmp2, #64
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
bge 1b
/* Branch to .Ldone when the low six bits of tmp2 are all clear;
otherwise hand the remainder to the .Ltail63aligned code. */
tst tmp2, #0x3f
beq .Ldone
b .Ltail63aligned
#elif defined(USE_VFP)
/* Don't use PLD. Instead, read some data in advance of the current
copy position into a register. This should act like a PLD
operation but we won't have to repeat the transfer. */
+55 -3
View File
@@ -258,7 +258,7 @@ def_fn memcpy p2align=6
cmp tmp1, tmp2
bne .Lcpy_notaligned
#ifdef USE_VFP
#if defined(USE_VFP) && !defined(USE_NEON)
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
that the FP pipeline is much better at streaming loads and
stores. This is outside the critical loop. */
@@ -288,7 +288,40 @@ def_fn memcpy p2align=6
bge .Lcpy_body_long
.Lcpy_body_medium: /* Count in tmp2. */
#ifdef USE_VFP
#ifdef USE_NEON
/* Use NEON multi-register transfers with destination alignment
hints for aligned copies. */
/* Medium-copy loop: each pass loads 64 bytes into d0-d7 with two
32-byte vld1 instructions and writes them out with two 32-byte
vst1 instructions. The "!" writeback advances src and dst, so no
separate pointer updates are needed. ALIGN is a macro defined
outside this hunk; presumably it attaches the alignment qualifier
to dst - confirm against its definition. */
1:
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
/* src has already been advanced by 64 at this point, so the
prefetch offset is relative to the next block to be loaded. */
pld [src, #(prefetch_lines * 64)]
/* The flags set by this subs are consumed by the bge below; the
two vst1 instructions in between do not modify the flags. */
subs tmp2, tmp2, #64
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
bge 1b
/* Branch to .Ldone when the low six bits of tmp2 are all clear,
i.e. there is no sub-64-byte remainder left to copy. */
tst tmp2, #0x3f
beq .Ldone
.Ltail63aligned: /* Count in tmp2. */
/* Use NEON 8-byte vld1/vst1 for the tail. */
/* Computed jump into the unrolled ladder of seven 8-byte copies
below. tmp1 = tmp2 & 0x38 is the byte count of the whole 8-byte
units still to copy (0..56); it is subtracted from 56 so that a
larger remainder enters the ladder earlier and executes more
load/store pairs. The jump only lands on a pair boundary if every
vld1/vst1 pair occupies 8 bytes of code. PC_OFFSET and INSN_SIZE
are defined outside this hunk; presumably they correct for the
value read from pc and the size of the add itself - confirm. */
and tmp1, tmp2, #0x38
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
add pc, pc, tmp1
vld1.8 {d0}, [src]! /* 14 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 12 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 10 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 8 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 6 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 4 words to go. */
vst1.8 {d0}, [dst]!
vld1.8 {d0}, [src]! /* 2 words to go. */
vst1.8 {d0}, [dst]!
/* NOTE(review): bits 0-2 of tmp2 are not consumed in this span;
presumably the code following the conditional block copies the
final 0-7 bytes - confirm. */
#elif defined(USE_VFP)
1:
vldr d0, [src, #0]
subs tmp2, tmp2, #64
@@ -409,7 +442,26 @@ def_fn memcpy p2align=6
/* Long copy. We know that there's at least (prefetch_lines * 64)
bytes to go. */
#ifdef USE_VFP
#ifdef USE_NEON
/* Use NEON multi-register transfers with prefetching for long
copies. */
/* Issue prefetch hints for five consecutive 64-byte offsets from
src before entering the loop.
NOTE(review): the count of five is hard-coded here while the loop
uses prefetch_lines (defined outside this hunk) - confirm that the
two agree. */
pld [src, #0]
pld [src, #64]
pld [src, #128]
pld [src, #192]
pld [src, #256]
/* Main loop: 64 bytes per pass through d0-d7. The "!" writeback
advances src and dst. ALIGN is a macro defined outside this hunk;
presumably it attaches the alignment qualifier to dst - confirm. */
1:
vld1.8 {d0-d3}, [src]!
vld1.8 {d4-d7}, [src]!
/* src has already been advanced by 64 at this point, so the
prefetch offset is relative to the next block to be loaded. */
pld [src, #(prefetch_lines * 64)]
/* The flags set by this subs are consumed by the bge below; the
two vst1 instructions in between do not modify the flags. */
subs tmp2, tmp2, #64
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
bge 1b
/* Branch to .Ldone when the low six bits of tmp2 are all clear;
otherwise hand the remainder to the .Ltail63aligned code. */
tst tmp2, #0x3f
beq .Ldone
b .Ltail63aligned
#elif defined(USE_VFP)
/* Don't use PLD. Instead, read some data in advance of the current
copy position into a register. This should act like a PLD
operation but we won't have to repeat the transfer. */