mirror of
https://github.com/apache/nuttx.git
synced 2026-05-29 20:56:47 +08:00
arm: memcpy: add NEON paths for aligned copies
Add dedicated NEON implementations of the mutually aligned medium and long memcpy copy paths when building with __ARM_NEON__. These paths use NEON multi-register loads and stores (vld1/vst1), while the existing VFP implementation is preserved for VFP configurations without NEON. Because NEON builds also define USE_VFP, the preprocessor conditionals test USE_NEON first and fall back to the VFP code only through "#elif defined(USE_VFP)"; the VFP-only Cortex-A9 setup block is likewise guarded with "!defined(USE_NEON)". Apply the same aligned-copy optimization to the armv7-a, armv7-r, and armv8-r implementations. Signed-off-by: yaojiaqi <yaojiaqi@lixiang.com>
This commit is contained in:
@@ -260,7 +260,7 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
|
|||||||
cmp tmp1, tmp2
|
cmp tmp1, tmp2
|
||||||
bne .Lcpy_notaligned
|
bne .Lcpy_notaligned
|
||||||
|
|
||||||
#ifdef USE_VFP
|
#if defined(USE_VFP) && !defined(USE_NEON)
|
||||||
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
|
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
|
||||||
that the FP pipeline is much better at streaming loads and
|
that the FP pipeline is much better at streaming loads and
|
||||||
stores. This is outside the critical loop. */
|
stores. This is outside the critical loop. */
|
||||||
@@ -290,7 +290,40 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
|
|||||||
bge .Lcpy_body_long
|
bge .Lcpy_body_long
|
||||||
|
|
||||||
.Lcpy_body_medium: /* Count in tmp2. */
|
.Lcpy_body_medium: /* Count in tmp2. */
|
||||||
#ifdef USE_VFP
|
#ifdef USE_NEON
|
||||||
|
/* Use NEON multi-register transfers with destination alignment
|
||||||
|
hints for aligned copies. */
|
||||||
|
1:
|
||||||
|
vld1.8 {d0-d3}, [src]!
|
||||||
|
vld1.8 {d4-d7}, [src]!
|
||||||
|
pld [src, #(prefetch_lines * 64)]
|
||||||
|
subs tmp2, tmp2, #64
|
||||||
|
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
|
||||||
|
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
|
||||||
|
bge 1b
|
||||||
|
tst tmp2, #0x3f
|
||||||
|
beq .Ldone
|
||||||
|
|
||||||
|
.Ltail63aligned: /* Count in tmp2. */
|
||||||
|
/* Use NEON 8-byte vld1/vst1 for the tail. */
|
||||||
|
and tmp1, tmp2, #0x38
|
||||||
|
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
|
||||||
|
add pc, pc, tmp1
|
||||||
|
vld1.8 {d0}, [src]! /* 14 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 12 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 10 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 8 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 6 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 4 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 2 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
#elif defined(USE_VFP)
|
||||||
1:
|
1:
|
||||||
vldr d0, [src, #0]
|
vldr d0, [src, #0]
|
||||||
subs tmp2, tmp2, #64
|
subs tmp2, tmp2, #64
|
||||||
@@ -411,7 +444,26 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
|
|||||||
|
|
||||||
/* Long copy. We know that there's at least (prefetch_lines * 64)
|
/* Long copy. We know that there's at least (prefetch_lines * 64)
|
||||||
bytes to go. */
|
bytes to go. */
|
||||||
#ifdef USE_VFP
|
#ifdef USE_NEON
|
||||||
|
/* Use NEON multi-register transfers with prefetching for long
|
||||||
|
copies. */
|
||||||
|
pld [src, #0]
|
||||||
|
pld [src, #64]
|
||||||
|
pld [src, #128]
|
||||||
|
pld [src, #192]
|
||||||
|
pld [src, #256]
|
||||||
|
1:
|
||||||
|
vld1.8 {d0-d3}, [src]!
|
||||||
|
vld1.8 {d4-d7}, [src]!
|
||||||
|
pld [src, #(prefetch_lines * 64)]
|
||||||
|
subs tmp2, tmp2, #64
|
||||||
|
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
|
||||||
|
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
|
||||||
|
bge 1b
|
||||||
|
tst tmp2, #0x3f
|
||||||
|
beq .Ldone
|
||||||
|
b .Ltail63aligned
|
||||||
|
#elif defined(USE_VFP)
|
||||||
/* Don't use PLD. Instead, read some data in advance of the current
|
/* Don't use PLD. Instead, read some data in advance of the current
|
||||||
copy position into a register. This should act like a PLD
|
copy position into a register. This should act like a PLD
|
||||||
operation but we won't have to repeat the transfer. */
|
operation but we won't have to repeat the transfer. */
|
||||||
|
|||||||
@@ -258,7 +258,7 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
|
|||||||
cmp tmp1, tmp2
|
cmp tmp1, tmp2
|
||||||
bne .Lcpy_notaligned
|
bne .Lcpy_notaligned
|
||||||
|
|
||||||
#ifdef USE_VFP
|
#if defined(USE_VFP) && !defined(USE_NEON)
|
||||||
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
|
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
|
||||||
that the FP pipeline is much better at streaming loads and
|
that the FP pipeline is much better at streaming loads and
|
||||||
stores. This is outside the critical loop. */
|
stores. This is outside the critical loop. */
|
||||||
@@ -288,7 +288,40 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
|
|||||||
bge .Lcpy_body_long
|
bge .Lcpy_body_long
|
||||||
|
|
||||||
.Lcpy_body_medium: /* Count in tmp2. */
|
.Lcpy_body_medium: /* Count in tmp2. */
|
||||||
#ifdef USE_VFP
|
#ifdef USE_NEON
|
||||||
|
/* Use NEON multi-register transfers with destination alignment
|
||||||
|
hints for aligned copies. */
|
||||||
|
1:
|
||||||
|
vld1.8 {d0-d3}, [src]!
|
||||||
|
vld1.8 {d4-d7}, [src]!
|
||||||
|
pld [src, #(prefetch_lines * 64)]
|
||||||
|
subs tmp2, tmp2, #64
|
||||||
|
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
|
||||||
|
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
|
||||||
|
bge 1b
|
||||||
|
tst tmp2, #0x3f
|
||||||
|
beq .Ldone
|
||||||
|
|
||||||
|
.Ltail63aligned: /* Count in tmp2. */
|
||||||
|
/* Use NEON 8-byte vld1/vst1 for the tail. */
|
||||||
|
and tmp1, tmp2, #0x38
|
||||||
|
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
|
||||||
|
add pc, pc, tmp1
|
||||||
|
vld1.8 {d0}, [src]! /* 14 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 12 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 10 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 8 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 6 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 4 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 2 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
#elif defined(USE_VFP)
|
||||||
1:
|
1:
|
||||||
vldr d0, [src, #0]
|
vldr d0, [src, #0]
|
||||||
subs tmp2, tmp2, #64
|
subs tmp2, tmp2, #64
|
||||||
@@ -409,7 +442,26 @@ def_fn ARCH_LIBCFUN(memcpy) p2align=6
|
|||||||
|
|
||||||
/* Long copy. We know that there's at least (prefetch_lines * 64)
|
/* Long copy. We know that there's at least (prefetch_lines * 64)
|
||||||
bytes to go. */
|
bytes to go. */
|
||||||
#ifdef USE_VFP
|
#ifdef USE_NEON
|
||||||
|
/* Use NEON multi-register transfers with prefetching for long
|
||||||
|
copies. */
|
||||||
|
pld [src, #0]
|
||||||
|
pld [src, #64]
|
||||||
|
pld [src, #128]
|
||||||
|
pld [src, #192]
|
||||||
|
pld [src, #256]
|
||||||
|
1:
|
||||||
|
vld1.8 {d0-d3}, [src]!
|
||||||
|
vld1.8 {d4-d7}, [src]!
|
||||||
|
pld [src, #(prefetch_lines * 64)]
|
||||||
|
subs tmp2, tmp2, #64
|
||||||
|
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
|
||||||
|
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
|
||||||
|
bge 1b
|
||||||
|
tst tmp2, #0x3f
|
||||||
|
beq .Ldone
|
||||||
|
b .Ltail63aligned
|
||||||
|
#elif defined(USE_VFP)
|
||||||
/* Don't use PLD. Instead, read some data in advance of the current
|
/* Don't use PLD. Instead, read some data in advance of the current
|
||||||
copy position into a register. This should act like a PLD
|
copy position into a register. This should act like a PLD
|
||||||
operation but we won't have to repeat the transfer. */
|
operation but we won't have to repeat the transfer. */
|
||||||
|
|||||||
@@ -258,7 +258,7 @@ def_fn memcpy p2align=6
|
|||||||
cmp tmp1, tmp2
|
cmp tmp1, tmp2
|
||||||
bne .Lcpy_notaligned
|
bne .Lcpy_notaligned
|
||||||
|
|
||||||
#ifdef USE_VFP
|
#if defined(USE_VFP) && !defined(USE_NEON)
|
||||||
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
|
/* Magic dust alert! Force VFP on Cortex-A9. Experiments show
|
||||||
that the FP pipeline is much better at streaming loads and
|
that the FP pipeline is much better at streaming loads and
|
||||||
stores. This is outside the critical loop. */
|
stores. This is outside the critical loop. */
|
||||||
@@ -288,7 +288,40 @@ def_fn memcpy p2align=6
|
|||||||
bge .Lcpy_body_long
|
bge .Lcpy_body_long
|
||||||
|
|
||||||
.Lcpy_body_medium: /* Count in tmp2. */
|
.Lcpy_body_medium: /* Count in tmp2. */
|
||||||
#ifdef USE_VFP
|
#ifdef USE_NEON
|
||||||
|
/* Use NEON multi-register transfers with destination alignment
|
||||||
|
hints for aligned copies. */
|
||||||
|
1:
|
||||||
|
vld1.8 {d0-d3}, [src]!
|
||||||
|
vld1.8 {d4-d7}, [src]!
|
||||||
|
pld [src, #(prefetch_lines * 64)]
|
||||||
|
subs tmp2, tmp2, #64
|
||||||
|
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
|
||||||
|
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
|
||||||
|
bge 1b
|
||||||
|
tst tmp2, #0x3f
|
||||||
|
beq .Ldone
|
||||||
|
|
||||||
|
.Ltail63aligned: /* Count in tmp2. */
|
||||||
|
/* Use NEON 8-byte vld1/vst1 for the tail. */
|
||||||
|
and tmp1, tmp2, #0x38
|
||||||
|
rsb tmp1, tmp1, #(56 - PC_OFFSET + INSN_SIZE)
|
||||||
|
add pc, pc, tmp1
|
||||||
|
vld1.8 {d0}, [src]! /* 14 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 12 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 10 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 8 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 6 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 4 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
vld1.8 {d0}, [src]! /* 2 words to go. */
|
||||||
|
vst1.8 {d0}, [dst]!
|
||||||
|
#elif defined(USE_VFP)
|
||||||
1:
|
1:
|
||||||
vldr d0, [src, #0]
|
vldr d0, [src, #0]
|
||||||
subs tmp2, tmp2, #64
|
subs tmp2, tmp2, #64
|
||||||
@@ -409,7 +442,26 @@ def_fn memcpy p2align=6
|
|||||||
|
|
||||||
/* Long copy. We know that there's at least (prefetch_lines * 64)
|
/* Long copy. We know that there's at least (prefetch_lines * 64)
|
||||||
bytes to go. */
|
bytes to go. */
|
||||||
#ifdef USE_VFP
|
#ifdef USE_NEON
|
||||||
|
/* Use NEON multi-register transfers with prefetching for long
|
||||||
|
copies. */
|
||||||
|
pld [src, #0]
|
||||||
|
pld [src, #64]
|
||||||
|
pld [src, #128]
|
||||||
|
pld [src, #192]
|
||||||
|
pld [src, #256]
|
||||||
|
1:
|
||||||
|
vld1.8 {d0-d3}, [src]!
|
||||||
|
vld1.8 {d4-d7}, [src]!
|
||||||
|
pld [src, #(prefetch_lines * 64)]
|
||||||
|
subs tmp2, tmp2, #64
|
||||||
|
vst1.8 {d0-d3}, [ALIGN(dst, 64)]!
|
||||||
|
vst1.8 {d4-d7}, [ALIGN(dst, 64)]!
|
||||||
|
bge 1b
|
||||||
|
tst tmp2, #0x3f
|
||||||
|
beq .Ldone
|
||||||
|
b .Ltail63aligned
|
||||||
|
#elif defined(USE_VFP)
|
||||||
/* Don't use PLD. Instead, read some data in advance of the current
|
/* Don't use PLD. Instead, read some data in advance of the current
|
||||||
copy position into a register. This should act like a PLD
|
copy position into a register. This should act like a PLD
|
||||||
operation but we won't have to repeat the transfer. */
|
operation but we won't have to repeat the transfer. */
|
||||||
|
|||||||
Reference in New Issue
Block a user