diff --git a/libs/libc/machine/arm/armv7-m/gnu/arch_memcpy.S b/libs/libc/machine/arm/armv7-m/gnu/arch_memcpy.S
index 6c1b62f7fb2..7e7c752c3b2 100644
--- a/libs/libc/machine/arm/armv7-m/gnu/arch_memcpy.S
+++ b/libs/libc/machine/arm/armv7-m/gnu/arch_memcpy.S
@@ -1,429 +1,329 @@
-/************************************************************************************
- * libs/libc/machine/arm/armv7-m/gnu/arch_memcpy.S
- *
- * armv7m-optimized memcpy, contributed by Mike Smith.  Apparently in the public
- * domain and is re-released here under the modified BSD license:
- *
- * Obtained via a posting on the Stellaris forum:
- *  http://e2e.ti.com/support/microcontrollers/\
- *       stellaris_arm_cortex-m3_microcontroller/f/473/t/44360.aspx
- *
- * Posted by rocksoft on Jul 24, 2008 10:19 AM
- *
- *   Hi,
- *
- *   I recently finished a "memcpy" replacement and thought it might be useful for
- *   others...
- *
- *   I've put some instructions and the code here:
- *
- *   http://www.rock-software.net/downloads/memcpy/
- *
- *   Hope it works for you as well as it did for me.
- *
- *   Liam.
+/*
+ * Copyright (c) 2013 ARM Ltd
+ * All rights reserved.
  *
  * Redistribution and use in source and binary forms, with or without
  * modification, are permitted provided that the following conditions
  * are met:
- *
  * 1. Redistributions of source code must retain the above copyright
  *    notice, this list of conditions and the following disclaimer.
  * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in
- *    the documentation and/or other materials provided with the
- *    distribution.
- * 3. Neither the name NuttX nor the names of its contributors may be
- *    used to endorse or promote products derived from this software
- *    without specific prior written permission.
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ * 3. The name of the company may not be used to endorse or promote
+ *    products derived from this software without specific prior written
+ *    permission.
  *
- * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
- * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
- * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
- * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
- * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
- * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
- * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS
- * OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
- * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
- * ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
- * POSSIBILITY OF SUCH DAMAGE.
- *
- ************************************************************************************/
+ * THIS SOFTWARE IS PROVIDED BY ARM LTD ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL ARM LTD BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED
+ * TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+ * PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
 
-/************************************************************************************
- * Public Symbols
- ************************************************************************************/
+/* This memcpy routine is optimised for Cortex-M3/M4 cores with/without
+   unaligned access.
 
-	.global		memcpy
-	.syntax		unified
-	.thumb
-	.file		"arch_memcpy.S"
+   If compiled with GCC, this file should be enclosed within following
+   pre-processing check:
+   if defined (__ARM_ARCH_7M__) || defined (__ARM_ARCH_7EM__)
 
-/************************************************************************************
- * .text
- ************************************************************************************/
+   Prototype: void *memcpy (void *dst, const void *src, size_t count);
 
+   The job will be done in 5 steps.
+   Step 1: Align src/dest pointers; use the misaligned copy if both cannot be aligned
+   Step 2: Repeatedly copy big blocks of __OPT_BIG_BLOCK_SIZE bytes
+   Step 3: Repeatedly copy mid blocks of __OPT_MID_BLOCK_SIZE bytes
+   Step 4: Copy word by word
+   Step 5: Copy byte by byte
+
+   Tunable options:
+     __OPT_BIG_BLOCK_SIZE: Size of big block in bytes.  Default to 64.
+     __OPT_MID_BLOCK_SIZE: Size of mid block in bytes.  Default to 16.
+ */
+#ifndef __OPT_BIG_BLOCK_SIZE	/* tunable: bytes copied per big-block loop iteration */
+#define __OPT_BIG_BLOCK_SIZE (4 * 16)	/* 16 words of 4 bytes = 64 bytes */
+#endif	/* __OPT_BIG_BLOCK_SIZE */
+
+#ifndef __OPT_MID_BLOCK_SIZE	/* tunable: bytes copied per mid-block loop iteration */
+#define __OPT_MID_BLOCK_SIZE (4 * 4)	/* 4 words of 4 bytes = 16 bytes */
+#endif	/* __OPT_MID_BLOCK_SIZE */
+
+#if __OPT_BIG_BLOCK_SIZE == 16	/* .irp lists one byte offset per word of the block */
+#define BEGIN_UNROLL_BIG_BLOCK \
+  .irp offset, 0,4,8,12
+#elif __OPT_BIG_BLOCK_SIZE == 32
+#define BEGIN_UNROLL_BIG_BLOCK \
+  .irp offset, 0,4,8,12,16,20,24,28
+#elif __OPT_BIG_BLOCK_SIZE == 64
+#define BEGIN_UNROLL_BIG_BLOCK \
+  .irp offset, 0,4,8,12,16,20,24,28,32,36,40,44,48,52,56,60
+#else	/* only 16, 32 and 64 bytes have an unroll list */
+#error "Illegal __OPT_BIG_BLOCK_SIZE"
+#endif	/* BEGIN_UNROLL_BIG_BLOCK selection */
+
+#if __OPT_MID_BLOCK_SIZE == 8	/* .irp lists one byte offset per word of the block */
+#define BEGIN_UNROLL_MID_BLOCK \
+  .irp offset, 0,4
+#elif __OPT_MID_BLOCK_SIZE == 16
+#define BEGIN_UNROLL_MID_BLOCK \
+  .irp offset, 0,4,8,12
+#else	/* only 8 and 16 bytes have an unroll list */
+#error "Illegal __OPT_MID_BLOCK_SIZE"
+#endif	/* BEGIN_UNROLL_MID_BLOCK selection */
+
+#define END_UNROLL .endr	/* closes the .irp opened by BEGIN_UNROLL_*_BLOCK */
+
+	.syntax unified
 	.text
-
-/************************************************************************************
- * Private Constant Data
- ************************************************************************************/
-
-/* We have 16 possible alignment combinations of src and dst, this jump table
- * directs the copy operation
- *
- * Bits:  Src=00, Dst=00 - Long to Long copy
- * Bits:  Src=00, Dst=01 - Long to Byte before half word
- * Bits:  Src=00, Dst=10 - Long to Half word
- * Bits:  Src=00, Dst=11 - Long to Byte before long word
- * Bits:  Src=01, Dst=00 - Byte before half word to long
- * Bits:  Src=01, Dst=01 - Byte before half word to byte before half word -
- *                         Same alignment
- * Bits:  Src=01, Dst=10 - Byte before half word to half word
- * Bits:  Src=01, Dst=11 - Byte before half word to byte before long word
- * Bits:  Src=10, Dst=00 - Half word to long word
- * Bits:  Src=10, Dst=01 - Half word to byte before half word
- * Bits:  Src=10, Dst=10 - Half word to half word - Same Alignment
- * Bits:  Src=10, Dst=11 - Half word to byte before long word
- * Bits:  Src=11, Dst=00 - Byte before long word to long word
- * Bits:  Src=11, Dst=01 - Byte before long word to byte before half word
- * Bits:  Src=11, Dst=11 - Byte before long word to half word
- * Bits:  Src=11, Dst=11 - Byte before long word to Byte before long word -
- *                         Same alignment
- */
-
-MEM_DataCopyTable:
-	.byte	(MEM_DataCopy0 - MEM_DataCopyJump) >> 1
-	.byte	(MEM_DataCopy1 - MEM_DataCopyJump) >> 1
-	.byte	(MEM_DataCopy2 - MEM_DataCopyJump) >> 1
-	.byte	(MEM_DataCopy3 - MEM_DataCopyJump) >> 1
-	.byte	(MEM_DataCopy4 - MEM_DataCopyJump) >> 1
-	.byte	(MEM_DataCopy5 - MEM_DataCopyJump) >> 1
-	.byte	(MEM_DataCopy6 - MEM_DataCopyJump) >> 1
-	.byte	(MEM_DataCopy7 - MEM_DataCopyJump) >> 1
-	.byte	(MEM_DataCopy8 - MEM_DataCopyJump) >> 1
-	.byte	(MEM_DataCopy9 - MEM_DataCopyJump) >> 1
-	.byte	(MEM_DataCopy10 - MEM_DataCopyJump) >> 1
-	.byte	(MEM_DataCopy11 - MEM_DataCopyJump) >> 1
-	.byte	(MEM_DataCopy12 - MEM_DataCopyJump) >> 1
-	.byte	(MEM_DataCopy13 - MEM_DataCopyJump) >> 1
-	.byte	(MEM_DataCopy14 - MEM_DataCopyJump) >> 1
-	.byte	(MEM_DataCopy15 - MEM_DataCopyJump) >> 1
-
-	.align 2
-
-MEM_LongCopyTable:
-	.byte	(MEM_LongCopyEnd - MEM_LongCopyJump) >> 1	/* 0 bytes left */
-	.byte	0					/* 4 bytes left */
-	.byte	(1 * 10) >> 1		/* 8 bytes left */
-	.byte	(2 * 10) >> 1		/* 12 bytes left */
-	.byte	(3 * 10) >> 1		/* 16 bytes left */
-	.byte	(4 * 10) >> 1		/* 20 bytes left */
-	.byte	(5 * 10) >> 1		/* 24 bytes left */
-	.byte	(6 * 10) >> 1		/* 28 bytes left */
-	.byte	(7 * 10) >> 1		/* 32 bytes left */
-	.byte	(8 * 10) >> 1		/* 36 bytes left */
-
-	.align 2
-
-/************************************************************************************
- * Public Functions
- ************************************************************************************/
-/************************************************************************************
- * Name: memcpy
- *
- * Description:
- *   Optimized "general" copy routine
- *
- * Input Parameters:
- *   r0 = destination, r1 = source, r2 = length
- *
- * Returned Value:
- *   r0 = destination r1-r3 burned
- *
- ************************************************************************************/
-
-	.align 4
+	.align	2
+	.global	memcpy
+	.thumb
 	.thumb_func
-
+	.type	memcpy, %function
 memcpy:
-	push	{r14}
-	push    {r0}
-	bl      _do_memcpy
-	pop     {r0}
-	pop     {pc}
+	@ r0: dst
+	@ r1: src
+	@ r2: len
+#ifdef __ARM_FEATURE_UNALIGNED
+	/* When unaligned access is supported, ip is not otherwise used in
+	   the function body, so it keeps dst for the return value.  */
+	mov	ip, r0
+#else
+	push	{r0}
+#endif
+	orr	r3, r1, r0
+	ands	r3, r3, #3
+	bne	.Lmisaligned_copy
 
-	.align 4
+.Lbig_block:
+	subs	r2, __OPT_BIG_BLOCK_SIZE
+	blo	.Lmid_block
 
-	.thumb_func
-_do_memcpy:
-	push    {r14}
+	/* Kernel loop for big block copy */
+	.align 2
+.Lbig_block_loop:	/* one iteration copies __OPT_BIG_BLOCK_SIZE bytes through r3 */
+	BEGIN_UNROLL_BIG_BLOCK	/* .irp: emit the ldr/str pair once per word of the block */
+#ifdef __ARM_ARCH_7EM__
+	ldr	r3, [r1], #4	/* post-indexed: r1 and r0 advance by 4 per word */
+	str	r3, [r0], #4
+	END_UNROLL
+#else /* __ARM_ARCH_7M__ */
+	ldr	r3, [r1, \offset]	/* fixed byte offset taken from the .irp list */
+	str	r3, [r0, \offset]
+	END_UNROLL
+	adds	r0, __OPT_BIG_BLOCK_SIZE	/* advance both pointers once per block */
+	adds	r1, __OPT_BIG_BLOCK_SIZE
+#endif
+	subs	r2, __OPT_BIG_BLOCK_SIZE	/* r2 stays one block below the bytes left (biased at .Lbig_block) */
+	bhs .Lbig_block_loop	/* no borrow: another full block remains */
 
-	/* This allows the inner workings to "assume" a minimum amount of bytes */
-	/* Quickly check for very short copies */
+.Lmid_block:
+	adds	r2, __OPT_BIG_BLOCK_SIZE - __OPT_MID_BLOCK_SIZE
+	blo	.Lcopy_word_by_word
 
-	cmp		r2, #4
-	blt.n	MEM_DataCopyBytes
+	/* Kernel loop for mid-block copy */
+	.align 2
+.Lmid_block_loop:	/* one iteration copies __OPT_MID_BLOCK_SIZE bytes through r3 */
+	BEGIN_UNROLL_MID_BLOCK	/* .irp: emit the ldr/str pair once per word of the block */
+#ifdef __ARM_ARCH_7EM__
+	ldr	r3, [r1], #4	/* post-indexed: r1 and r0 advance by 4 per word */
+	str	r3, [r0], #4
+	END_UNROLL
+#else /* __ARM_ARCH_7M__ */
+	ldr	r3, [r1, \offset]	/* fixed byte offset taken from the .irp list */
+	str	r3, [r0, \offset]
+	END_UNROLL
+	adds    r0, __OPT_MID_BLOCK_SIZE	/* advance both pointers once per block */
+	adds    r1, __OPT_MID_BLOCK_SIZE
+#endif
+	subs	r2, __OPT_MID_BLOCK_SIZE	/* r2 stays one mid block below the bytes left (biased at .Lmid_block) */
+	bhs	.Lmid_block_loop	/* no borrow: another full mid block remains */
 
-	and		r14, r0, #3		 		/* Get destination alignment bits */
-	bfi		r14, r1, #2, #2	 		/* Get source alignment bits */
-	ldr		r3, =MEM_DataCopyTable	/* Jump table base */
-	tbb		[r3, r14]		   		/* Perform jump on src/dst alignment bits */
-MEM_DataCopyJump:
+.Lcopy_word_by_word:
+	adds	r2, __OPT_MID_BLOCK_SIZE - 4
+	blo	.Lcopy_less_than_4
 
-	.align 4
+	/* Kernel loop for small block copy */
+	.align 2
+.Lcopy_word_by_word_loop:
+	ldr	r3, [r1], #4
+	str	r3, [r0], #4
+	subs	r2, #4
+	bhs	.Lcopy_word_by_word_loop
 
-/* Bits:  Src=01, Dst=01 - Byte before half word to byte before half word - Same alignment
- * 3 bytes to read for long word aligning
- */
+.Lcopy_less_than_4:
+	adds	r2, #4
+	beq	.Ldone
 
-MEM_DataCopy5:
-	ldrb	r3, [r1], #0x01
-	strb	r3, [r0], #0x01
-	sub		r2, r2, #0x01
+	lsls	r2, r2, #31
+	itt ne
+	ldrbne  r3, [r1], #1
+	strbne  r3, [r0], #1
 
-/* Bits:  Src=10, Dst=10 - Half word to half word - Same Alignment
- * 2 bytes to read for long word aligning
- */
+	bcc	.Ldone
+#ifdef __ARM_FEATURE_UNALIGNED
+	ldrh	r3, [r1]
+	strh	r3, [r0]
+#else
+	ldrb	r3, [r1]
+	strb	r3, [r0]
+	ldrb	r3, [r1, #1]
+	strb	r3, [r0, #1]
+#endif /* __ARM_FEATURE_UNALIGNED */
 
-MEM_DataCopy10:
-	ldrb	r3, [r1], #0x01
-	strb	r3, [r0], #0x01
-	sub		r2, r2, #0x01
+.Ldone:
+#ifdef __ARM_FEATURE_UNALIGNED
+	mov	r0, ip
+#else
+	pop	{r0}
+#endif
+	bx	lr
 
-/* Bits:  Src=11, Dst=11 - Byte before long word to Byte before long word - Same alignment
- * 1 bytes to read for long word aligning
- */
+	.align 2
+.Lmisaligned_copy:
+#ifdef __ARM_FEATURE_UNALIGNED
+	/* Define label DST_ALIGNED to BIG_BLOCK.  It will go to aligned copy
+	   once destination is adjusted to aligned.  */
+#define Ldst_aligned Lbig_block
 
-MEM_DataCopy15:
-	ldrb	r3, [r1], #0x01
-	strb	r3, [r0], #0x01
-	sub		r2, r2, #0x01
+	/* Copy word by word using LDR when alignment can be done in hardware,
+	i.e., unaligned-access trapping is off, so LDR and STR may be unaligned.  */
 
-/* Bits:  Src=00, Dst=00 - Long to Long copy */
+	cmp	r2, #8
+	blo	.Lbyte_copy
 
-MEM_DataCopy0:
-	/* Save regs that may be used by memcpy */
+	/* if src is aligned, just go to the big block loop.  */
+	lsls	r3, r1, #30
+	beq	.Ldst_aligned
+#else
+	/* If len < 12, the misalignment adjustment has more overhead than
+	a plain byte-by-byte copy.  Also, len must be >= 8 to guarantee that
+	the code afterward works correctly.  */
+	cmp	r2, #12
+	blo	.Lbyte_copy
+#endif /* __ARM_FEATURE_UNALIGNED */
 
-	push	{r4-r12}
+	/* Align dst only, not trying to align src.  That is because
+	handling an aligned src with a misaligned dst needs more overhead than
+	the other way round.  The worst case is when the initial src is aligned:
+	an additional copy of up to 4 bytes will then be executed, which is
+	acceptable.  */
 
-	/* Check for short word-aligned copy */
+	ands	r3, r0, #3
+	beq	.Ldst_aligned
 
-	cmp		r2, #0x28
-	blt.n	MEM_DataCopy0_2
+	rsb	r3, #4
+	subs	r2, r3
 
-	/* Bulk copy loop */
+	lsls    r3, r3, #31
+	itt ne
+	ldrbne  r3, [r1], #1
+	strbne  r3, [r0], #1
 
-MEM_DataCopy0_1:
-	ldmia	r1!, {r3-r12}
-	stmia	r0!, {r3-r12}
-	sub		r2, r2, #0x28
-	cmp		r2, #0x28
-	bge.n	MEM_DataCopy0_1
+	bcc .Ldst_aligned
 
-	/* Copy remaining long words */
+#ifdef __ARM_FEATURE_UNALIGNED
+	ldrh    r3, [r1], #2
+	strh    r3, [r0], #2
+	b	.Ldst_aligned
+#else
+	ldrb    r3, [r1], #1
+	strb    r3, [r0], #1
+	ldrb    r3, [r1], #1
+	strb    r3, [r0], #1
+	/* Now that dst is aligned */
+.Ldst_aligned:
+	/* If r1 is aligned now, r0 and r1 had the same misalignment
+	and are both aligned now.  Go to the aligned copy.  */
+	ands	r3, r1, #3
+	beq	.Lbig_block
 
-MEM_DataCopy0_2:
-	/* Copy remaining long words */
+	/* dst is aligned, but src isn't.  Misaligned copy.  */
 
-	ldr		r14, =MEM_LongCopyTable
-	lsr		r11, r2, #0x02
-	tbb		[r14, r11]
+	push	{r4, r5}
+	subs	r2, #4
 
-	/* longword copy branch table anchor */
+	/* Move r1 back by its misalignment (r3) to make r1 aligned.
+	Since r1 must be restored to the unaligned address after the loop,
+	keep (4 - misalignment) in ip and subtract it from r1 afterward.  */
+	subs	r1, r3
+	rsb	ip, r3, #4
 
-MEM_LongCopyJump:
-	ldr.w	r3, [r1], #0x04		/* 4 bytes remain */
-	str.w	r3, [r0], #0x04
-	b.n		MEM_LongCopyEnd
-	ldmia.w	r1!, {r3-r4}		/* 8 bytes remain */
-	stmia.w	r0!, {r3-r4}
-	b.n		MEM_LongCopyEnd
-	ldmia.w	r1!, {r3-r5}		/* 12 bytes remain */
-	stmia.w	r0!, {r3-r5}
-	b.n		MEM_LongCopyEnd
-	ldmia.w	r1!, {r3-r6}		/* 16 bytes remain */
-	stmia.w	r0!, {r3-r6}
-	b.n		MEM_LongCopyEnd
-	ldmia.w	r1!, {r3-r7}		/* 20 bytes remain */
-	stmia.w	r0!, {r3-r7}
-	b.n		MEM_LongCopyEnd
-	ldmia.w	r1!, {r3-r8}		/* 24 bytes remain */
-	stmia.w	r0!, {r3-r8}
-	b.n		MEM_LongCopyEnd
-	ldmia.w	r1!, {r3-r9}		/* 28 bytes remain */
-	stmia.w	r0!, {r3-r9}
-	b.n		MEM_LongCopyEnd
-	ldmia.w	r1!, {r3-r10}		/* 32 bytes remain */
-	stmia.w	r0!, {r3-r10}
-	b.n		MEM_LongCopyEnd
-	ldmia.w	r1!, {r3-r11}		/* 36 bytes remain */
-	stmia.w	r0!, {r3-r11}
+	/* Pre-load one word */
+	ldr	r4, [r1], #4
 
-MEM_LongCopyEnd:
-	pop		{r4-r12}
-	and		r2, r2, #0x03		/* All the longs have been copied */
+	cmp	r3, #2
+	beq	.Lmisaligned_copy_2_2
+	cmp	r3, #3
+	beq	.Lmisaligned_copy_3_1
 
-	/* Deal with up to 3 remaining bytes */
+	.macro mis_src_copy shift	/* shift = 8 * (src misalignment in bytes); dst is aligned, r4 is preloaded */
+1:	/* loop top: r4 = previously loaded aligned source word */
+#ifdef __ARM_BIG_ENDIAN
+	lsls	r4, r4, \shift	/* big-endian: discard the bytes that precede the source position */
+#else
+	lsrs	r4, r4, \shift	/* little-endian: discard the bytes that precede the source position */
+#endif
+	ldr	r3, [r1], #4	/* next aligned source word */
+#ifdef __ARM_BIG_ENDIAN
+	lsrs	r5, r3, 32-\shift	/* leading bytes of the new word, moved into the vacated lanes */
+#else
+	lsls	r5, r3, 32-\shift	/* leading bytes of the new word, moved into the vacated lanes */
+#endif
+	orr	r4, r4, r5	/* r4 = one complete destination word */
+	str	r4, [r0], #4
+	mov	r4, r3	/* the new word becomes the previous word */
+	subs	r2, #4	/* r2 is biased by -4 before the macro is entered */
+	bhs	1b	/* repeat while at least 4 bytes remain */
+	.endm
 
-MEM_DataCopyBytes:
-	/* Deal with up to 3 remaining bytes */
+.Lmisaligned_copy_1_3:
+	mis_src_copy shift=8
+	b	.Lsrc_misaligned_tail
 
-	cmp		r2, #0x00
-	it		eq
-	popeq	{pc}
-	ldrb	r3, [r1], #0x01
-	strb	r3, [r0], #0x01
-	subs	r2, r2, #0x01
-	it		eq
-	popeq	{pc}
-	ldrb	r3, [r1], #0x01
-	strb	r3, [r0], #0x01
-	subs	r2, r2, #0x01
-	it		eq
-	popeq	{pc}
-	ldrb	r3, [r1], #0x01
-	strb	r3, [r0], #0x01
-	pop		{pc}
+.Lmisaligned_copy_3_1:
+	mis_src_copy shift=24
+	b	.Lsrc_misaligned_tail
 
- .align 4
+.Lmisaligned_copy_2_2:
+	/* For 2_2 misalignment, ldr is still faster than 2 x ldrh.  */
+	mis_src_copy shift=16
 
-/* Bits:  Src=01, Dst=11 - Byte before half word to byte before long word
- * 3 bytes to read for long word aligning the source
- */
+.Lsrc_misaligned_tail:
+	adds	r2, #4
+	subs	r1, ip
+	pop	{r4, r5}
 
-MEM_DataCopy7:
-	ldrb	r3, [r1], #0x01
-	strb	r3, [r0], #0x01
-	sub		r2, r2, #0x01
+#endif /* __ARM_FEATURE_UNALIGNED */
 
-/* Bits:  Src=10, Dst=00 - Half word to long word
- * 2 bytes to read for long word aligning the source
- */
+.Lbyte_copy:
+	subs	r2, #4
+	blo	.Lcopy_less_than_4
 
-MEM_DataCopy8:
-	ldrb	r3, [r1], #0x01
-	strb	r3, [r0], #0x01
-	sub		r2, r2, #0x01
+.Lbyte_copy_loop:	/* entered with r2 = remaining bytes - 4, which is >= 0 here */
+	subs    r2, #1	/* sets the flags tested by bhs below */
+	ldrb    r3, [r1], #1	/* ldrb/strb leave the flags untouched */
+	strb    r3, [r0], #1
+	bhs	.Lbyte_copy_loop	/* loop until the subs borrows; 3 bytes are left for the code that follows */
 
-/* Bits:  Src=11, Dst=01 - Byte before long word to byte before half word
- * 1 byte to read for long word aligning the source
- */
+	ldrb	r3, [r1]
+	strb	r3, [r0]
+	ldrb	r3, [r1, #1]
+	strb	r3, [r0, #1]
+	ldrb	r3, [r1, #2]
+	strb	r3, [r0, #2]
 
-MEM_DataCopy13:
-	ldrb	r3, [r1], #0x01
-	strb	r3, [r0], #0x01
-	sub		r2, r2, #0x01
+#ifdef __ARM_FEATURE_UNALIGNED
+	mov	r0, ip
+#else
+	pop	{r0}
+#endif
+	bx	lr
 
-/* Bits:  Src=00, Dst=10 - Long to Half word */
-
-MEM_DataCopy2:
-	cmp		r2, #0x28
-	blt.n	MEM_DataCopy2_1
-
-	/* Save regs */
-
-	push	{r4-r12}
-
-	/* Bulk copy loop */
-
-MEM_DataCopy2_2:
-	ldmia	r1!, {r3-r12}
-
-	strh	r3, [r0], #0x02
-
-	lsr		r3, r3, #0x10
-	bfi		r3, r4, #0x10, #0x10
-	lsr		r4, r4, #0x10
-	bfi		r4, r5, #0x10, #0x10
-	lsr		r5, r5, #0x10
-	bfi		r5, r6, #0x10, #0x10
-	lsr		r6, r6, #0x10
-	bfi		r6, r7, #0x10, #0x10
-	lsr		r7, r7, #0x10
-	bfi		r7, r8, #0x10, #0x10
-	lsr		r8, r8, #0x10
-	bfi		r8, r9, #0x10, #0x10
-	lsr		r9, r9, #0x10
-	bfi		r9, r10, #0x10, #0x10
-	lsr		r10, r10, #0x10
-	bfi		r10, r11, #0x10, #0x10
-	lsr		r11, r11, #0x10
-	bfi		r11, r12, #0x10, #0x10
-	stmia	r0!, {r3-r11}
-	lsr		r12, r12, #0x10
-	strh	r12, [r0], #0x02
-
-	sub		r2, r2, #0x28
-	cmp		r2, #0x28
-	bge.n	MEM_DataCopy2_2
-	pop		{r4-r12}
-
-MEM_DataCopy2_1: /* Read longs and write 2 x half words */
-	cmp		r2, #4
-	blt.n	MEM_DataCopyBytes
-	ldr		r3, [r1], #0x04
-	strh	r3, [r0], #0x02
-	lsr		r3, r3, #0x10
-	strh	r3, [r0], #0x02
-	sub		r2, r2, #0x04
-	b.n		MEM_DataCopy2
-
-/* Bits:  Src=01, Dst=00 - Byte before half word to long
- * Bits:  Src=01, Dst=10 - Byte before half word to half word
- * 3 bytes to read for long word aligning the source
- */
-
-MEM_DataCopy4:
-MEM_DataCopy6:
-	/* Read B and write B */
-
-	ldrb	r3, [r1], #0x01
-	strb	r3, [r0], #0x01
-	sub		r2, r2, #0x01
-
-/* Bits:  Src=10, Dst=01 - Half word to byte before half word
- * Bits:  Src=10, Dst=11 - Half word to byte before long word
- * 2 bytes to read for long word aligning the source
- */
-
-MEM_DataCopy9:
-MEM_DataCopy11:
-	ldrb	r3, [r1], #0x01
-	strb	r3, [r0], #0x01
-	sub		r2, r2, #0x01
-
-/* Bits:  Src=11, Dst=00 -chm Byte before long word to long word
- * Bits:  Src=11, Dst=11 - Byte before long word to half word
- * 1 byte to read for long word aligning the source
- */
-
-MEM_DataCopy12:
-MEM_DataCopy14:
-	/* Read B and write B */
-
-	ldrb	r3, [r1], #0x01
-	strb	r3, [r0], #0x01
-	sub		r2, r2, #0x01
-
-/* Bits:  Src=00, Dst=01 - Long to Byte before half word
- * Bits:  Src=00, Dst=11 - Long to Byte before long word
- */
-
-MEM_DataCopy1: /* Read longs, write B->H->B */
-MEM_DataCopy3:
-	cmp		r2, #4
-	blt		MEM_DataCopyBytes
-	ldr		r3, [r1], #0x04
-	strb	r3, [r0], #0x01
-	lsr		r3, r3, #0x08
-	strh	r3, [r0], #0x02
-	lsr		r3, r3, #0x10
-	strb	r3, [r0], #0x01
-	sub		r2, r2, #0x04
-	b.n		MEM_DataCopy3
-
-	.size	memcpy, .-memcpy
-	.end
+	.size	memcpy, .-memcpy
\ No newline at end of file