libc:machine:xtensa:add xtensa libc implement

N/A Signed-off-by: zhuyanlin <zhuyanlin1@xiaomi.com>
2026-05-20 12:33:27 +08:00 · 2021-10-28 11:56:18 +08:00
parent 580d17cc02
commit cfcff5f570
10 changed files with 2472 additions and 2 deletions
@@ -2,3 +2,46 @@
 # For a description of the syntax of this configuration file,
 # see the file kconfig-language.txt in the NuttX tools repository.
 #
+
+config XTENSA_MEMCPY
+        bool "Enable optimized memcpy() for XTENSA"
+        select LIBC_ARCH_MEMCPY
+        ---help---
+                Enable optimized XTENSA specific memcpy() library function
+
+config XTENSA_MEMMOVE
+        bool "Enable optimized memmove() for XTENSA"
+        select LIBC_ARCH_MEMMOVE
+        ---help---
+                Enable optimized XTENSA specific memmove() library function
+
+config XTENSA_MEMSET
+        bool "Enable optimized memset() for XTENSA"
+        select LIBC_ARCH_MEMSET
+        ---help---
+                Enable optimized XTENSA specific memset() library function
+
+config XTENSA_STRCMP
+        bool "Enable optimized strcmp() for XTENSA"
+        select LIBC_ARCH_STRCMP
+        ---help---
+                Enable optimized XTENSA specific strcmp() library function
+
+config XTENSA_STRCPY
+        bool "Enable optimized strcpy() for XTENSA"
+        select LIBC_ARCH_STRCPY
+        ---help---
+                Enable optimized XTENSA specific strcpy() library function
+
+config XTENSA_STRLEN
+        bool "Enable optimized strlen() for XTENSA"
+        select LIBC_ARCH_STRLEN
+        ---help---
+                Enable optimized XTENSA specific strlen() library function
+
+config XTENSA_STRNCPY
+        bool "Enable optimized strncpy() for XTENSA"
+        select LIBC_ARCH_STRNCPY
+        ---help---
+                Enable optimized XTENSA specific strncpy() library function
+
@@ -19,10 +19,37 @@
 ############################################################################

 ifeq ($(CONFIG_LIBC_ARCH_ELF),y)
-
 CSRCS += arch_elf.c
+endif
+
+ifeq ($(CONFIG_XTENSA_MEMCPY),y)
+ASRCS += arch_memcpy.S
+endif
+
+ifeq ($(CONFIG_XTENSA_MEMMOVE),y)
+ASRCS += arch_memmove.S
+endif
+
+ifeq ($(CONFIG_XTENSA_MEMSET),y)
+ASRCS += arch_memset.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRCPY),y)
+ASRCS += arch_strcpy.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRLEN),y)
+ASRCS += arch_strlen.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRNCPY),y)
+ASRCS += arch_strncpy.S
+endif
+
+ifeq ($(CONFIG_XTENSA_STRCMP),y)
+ASRCS += arch_strcmp.S
+endif

 DEPPATH += --dep-path machine/xtensa
 VPATH += :machine/xtensa

-endif
@@ -0,0 +1,281 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_memcpy.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Pre-processor Macros
+ ****************************************************************************/
+
+/* set to 1 when running on ISS (simulator) with the
+   lint or ferret client, or 0 to save a few cycles */
+
+#define SIM_CHECKS_ALIGNMENT  0
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+  .section .text
+  .begin schedule
+  .literal_position
+
+  .local  .Ldst1mod2
+  .local  .Ldst2mod4
+  .local  .Lbytecopy
+
+  .align  4
+  .global memcpy
+  .type memcpy, @function
+memcpy:
+  ENTRY(16)
+  /* a2 = dst, a3 = src, a4 = len */
+
+  mov a5, a2    # copy dst so that a2 is return value
+  bbsi.l  a2, 0, .Ldst1mod2
+  bbsi.l  a2, 1, .Ldst2mod4
+.Ldstaligned:
+
+  /* Get number of loop iterations with 16B per iteration.  */
+  srli  a7, a4, 4
+
+  /* Check if source is aligned.  */
+  slli  a8, a3, 30
+  bnez  a8, .Lsrcunaligned
+
+  /* Destination and source are word-aligned, use word copy.  */
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, 2f
+#else
+  beqz  a7, 2f
+  slli  a8, a7, 4
+  add a8, a8, a3  # a8 = end of last 16B source chunk
+#endif
+1:  l32i  a6, a3, 0
+  l32i  a7, a3, 4
+  s32i  a6, a5, 0
+  l32i  a6, a3, 8
+
+  s32i  a7, a5, 4
+  l32i  a7, a3, 12
+  s32i  a6, a5, 8
+  addi  a3, a3, 16
+  s32i  a7, a5, 12
+  addi  a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+  bltu  a3, a8, 1b
+#endif
+
+  /* Copy any leftover pieces smaller than 16B.  */
+2:  bbci.l  a4, 3, 3f
+
+  /* Copy 8 bytes.  */
+  l32i  a6, a3, 0
+  l32i  a7, a3, 4
+  addi  a3, a3, 8
+  s32i  a6, a5, 0
+  s32i  a7, a5, 4
+  addi  a5, a5, 8
+
+3:  bbsi.l  a4, 2, 4f
+  bbsi.l  a4, 1, 5f
+  bbsi.l  a4, 0, 6f
+  RET(16)
+
+  # .align 4
+  /* Copy 4 bytes.  */
+4:  l32i  a6, a3, 0
+  addi  a3, a3, 4
+  s32i  a6, a5, 0
+  addi  a5, a5, 4
+  bbsi.l  a4, 1, 5f
+  bbsi.l  a4, 0, 6f
+  RET(16)
+
+  /* Copy 2 bytes.  */
+5:  l16ui a6, a3, 0
+  addi  a3, a3, 2
+  s16i  a6, a5, 0
+  addi  a5, a5, 2
+  bbsi.l  a4, 0, 6f
+  RET(16)
+
+  /* Copy 1 byte.  */
+6:  l8ui  a6, a3, 0
+  s8i a6, a5, 0
+
+.Ldone:
+  RET(16)
+
+
+/* Destination is aligned; source is unaligned.  */
+
+  # .align 4
+.Lsrcunaligned:
+  /* Avoid loading anything for zero-length copies.  */
+  beqz  a4, .Ldone
+
+  /* Copy 16 bytes per iteration for word-aligned dst and
+     unaligned src.  */
+  ssa8  a3    # set shift amount from byte offset
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  srli    a11, a8, 30     # save unalignment offset for below
+  sub a3, a3, a11 # align a3
+#endif
+  l32i  a6, a3, 0 # load first word
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, 2f
+#else
+  beqz  a7, 2f
+  slli  a10, a7, 4
+  add a10, a10, a3  # a10 = end of last 16B source chunk
+#endif
+1:  l32i  a7, a3, 4
+  l32i  a8, a3, 8
+  src_b a6, a6, a7
+  s32i  a6, a5, 0
+  l32i  a9, a3, 12
+  src_b a7, a7, a8
+  s32i  a7, a5, 4
+  l32i  a6, a3, 16
+  src_b a8, a8, a9
+  s32i  a8, a5, 8
+  addi  a3, a3, 16
+  src_b a9, a9, a6
+  s32i  a9, a5, 12
+  addi  a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+  bltu  a3, a10, 1b
+#endif
+
+2:  bbci.l  a4, 3, 3f
+
+  /* Copy 8 bytes.  */
+  l32i  a7, a3, 4
+  l32i  a8, a3, 8
+  src_b a6, a6, a7
+  s32i  a6, a5, 0
+  addi  a3, a3, 8
+  src_b a7, a7, a8
+  s32i  a7, a5, 4
+  addi  a5, a5, 8
+  mov a6, a8
+
+3:  bbci.l  a4, 2, 4f
+
+  /* Copy 4 bytes.  */
+  l32i  a7, a3, 4
+  addi  a3, a3, 4
+  src_b a6, a6, a7
+  s32i  a6, a5, 0
+  addi  a5, a5, 4
+  mov a6, a7
+4:
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  add a3, a3, a11 # readjust a3 with correct misalignment
+#endif
+  bbsi.l  a4, 1, 5f
+  bbsi.l  a4, 0, 6f
+  RET(16)
+
+  /* Copy 2 bytes.  */
+5:  l8ui  a6, a3, 0
+  l8ui  a7, a3, 1
+  addi  a3, a3, 2
+  s8i a6, a5, 0
+  s8i a7, a5, 1
+  addi  a5, a5, 2
+  bbsi.l  a4, 0, 6f
+  RET(16)
+
+  /* Copy 1 byte.  */
+6:  l8ui  a6, a3, 0
+  s8i a6, a5, 0
+  RET(16)
+
+
+  # .align XCHAL_INST_FETCH_WIDTH
+__memcpy_aux:
+
+  /* Byte-at-a-time copy of a4 bytes from a3 to a5; the fetch-width padding for the loop is disabled (commented out) */
+# .skip XCHAL_INST_FETCH_WIDTH - 3
+
+.Lbytecopy:
+#if XCHAL_HAVE_LOOPS
+  loopnez a4, 2f  # zero-overhead loop over a4 bytes; skipped when a4 == 0
+#else
+  beqz  a4, 2f    # nothing to copy
+  add a7, a3, a4  # a7 = end address for source
+#endif
+1:  l8ui  a6, a3, 0 # a6 = next source byte
+  addi  a3, a3, 1   # advance src
+  s8i a6, a5, 0     # store it to dst
+  addi  a5, a5, 1   # advance dst
+#if !XCHAL_HAVE_LOOPS
+  bltu  a3, a7, 1b  # repeat until src reaches the end address
+#endif
+2:  RET(16)         # a2 still holds the original dst (return value)
+
+
+/* Destination is unaligned.  */
+
+  # .align 4
+.Ldst1mod2: # dst is only byte aligned
+
+  /* Do short copies byte-by-byte.  */
+  bltui a4, 7, .Lbytecopy
+
+  /* Copy 1 byte.  */
+  l8ui  a6, a3, 0
+  addi  a3, a3, 1
+  addi  a4, a4, -1
+  s8i a6, a5, 0
+  addi  a5, a5, 1
+
+  /* Return to main algorithm if dst is now aligned.  */
+  bbci.l  a5, 1, .Ldstaligned
+
+.Ldst2mod4: # dst has 16-bit alignment
+
+  /* Do short copies byte-by-byte.  */
+  bltui a4, 6, .Lbytecopy
+
+  /* Copy 2 bytes.  */
+  l8ui  a6, a3, 0
+  l8ui  a7, a3, 1
+  addi  a3, a3, 2
+  addi  a4, a4, -2
+  s8i a6, a5, 0
+  s8i a7, a5, 1
+  addi  a5, a5, 2
+
+  /* dst is now aligned; return to main algorithm.  */
+  j .Ldstaligned
+
+  .end schedule
+
+  .size memcpy, . - memcpy
@@ -0,0 +1,480 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_memmove.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Pre-processor Macros
+ ****************************************************************************/
+
+/* set to 1 when running on ISS (simulator) with the
+   lint or ferret client, or 0 to save a few cycles */
+
+#define SIM_CHECKS_ALIGNMENT  0
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+  .text
+  .begin schedule
+  .global memmove
+
+/*
+ * Byte by byte copy
+ */
+  .align  4
+  .byte 0   # 1 mod 4 alignment for LOOPNEZ
+        # (0 mod 4 alignment for LBEG)
+.Lbytecopy:
+#if XCHAL_HAVE_LOOPS
+  loopnez a4, .Lbytecopydone
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a4, .Lbytecopydone
+  add a7, a3, a4  # a7 = end address for source
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lnextbyte:
+  l8ui  a6, a3, 0
+  addi  a3, a3, 1
+  s8i a6, a5, 0
+  addi  a5, a5, 1
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a7, .Lnextbyte # continue loop if $a3:src != $a7:src_end
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lbytecopydone:
+  RET(16)
+
+/*
+ * Destination is unaligned
+ */
+
+  .align  4
+.Ldst1mod2: # dst is only byte aligned
+  _bltui  a4, 7, .Lbytecopy # do short copies byte by byte
+
+  # copy 1 byte
+  l8ui  a6, a3,  0
+  addi  a3, a3,  1
+  addi  a4, a4, -1
+  s8i a6, a5,  0
+  addi  a5, a5,  1
+  _bbci.l a5, 1, .Ldstaligned # if dst is now aligned, then
+          # return to main algorithm
+.Ldst2mod4: # dst 16-bit aligned
+  # copy 2 bytes
+  _bltui  a4, 6, .Lbytecopy # do short copies byte by byte
+  l8ui  a6, a3,  0
+  l8ui  a7, a3,  1
+  addi  a3, a3,  2
+  addi  a4, a4, -2
+  s8i a6, a5,  0
+  s8i a7, a5,  1
+  addi  a5, a5,  2
+  j .Ldstaligned  # dst is now aligned, return to main algorithm
+
+.Lcommon:
+  bbsi.l  a2, 0, .Ldst1mod2 # if dst is 1 mod 2
+  bbsi.l  a2, 1, .Ldst2mod4 # if dst is 2 mod 4
+.Ldstaligned: # return here from .Ldst?mod? once dst is aligned
+  srli  a7, a4, 4 # number of loop iterations with 16B
+        # per iteration
+  movi  a8, 3   # if source is not aligned,
+  bany  a3, a8, .Lsrcunaligned  # then use shifting copy
+  /*
+   * Destination and source are word-aligned, use word copy.
+   */
+  # copy 16 bytes per iteration for word-aligned dst and word-aligned src
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, .Loop1done
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a7, .Loop1done
+  slli  a8, a7, 4
+  add a8, a8, a3  # a8 = end of last 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop1:
+  l32i  a6, a3,  0
+  l32i  a7, a3,  4
+  s32i  a6, a5,  0
+  l32i  a6, a3,  8
+  s32i  a7, a5,  4
+  l32i  a7, a3, 12
+  s32i  a6, a5,  8
+  addi  a3, a3, 16
+  s32i  a7, a5, 12
+  addi  a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a8, .Loop1  # continue loop if a3:src != a8:src_end
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop1done:
+  bbci.l  a4, 3, .L2
+  # copy 8 bytes
+  l32i  a6, a3,  0
+  l32i  a7, a3,  4
+  addi  a3, a3,  8
+  s32i  a6, a5,  0
+  s32i  a7, a5,  4
+  addi  a5, a5,  8
+.L2:
+  bbsi.l  a4, 2, .L3
+  bbsi.l  a4, 1, .L4
+  bbsi.l  a4, 0, .L5
+  RET(16)
+.L3:
+  # copy 4 bytes
+  l32i  a6, a3,  0
+  addi  a3, a3,  4
+  s32i  a6, a5,  0
+  addi  a5, a5,  4
+  bbsi.l  a4, 1, .L4
+  bbsi.l  a4, 0, .L5
+  RET(16)
+.L4:
+  # copy 2 bytes
+  l16ui a6, a3,  0
+  addi  a3, a3,  2
+  s16i  a6, a5,  0
+  addi  a5, a5,  2
+  bbsi.l  a4, 0, .L5
+  RET(16)
+.L5:
+  # copy 1 byte
+  l8ui  a6, a3,  0
+  s8i a6, a5,  0
+  RET(16)
+
+/*
+ * Destination is aligned, Source is unaligned
+ */
+
+  .align  4
+.Lsrcunaligned:
+  _beqz a4, .Ldone  # avoid loading anything for zero-length copies
+  # copy 16 bytes per iteration for word-aligned dst and unaligned src
+  ssa8  a3    # set shift amount from byte offset
+
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  and a11, a3, a8 # save unalignment offset for below
+  sub a3, a3, a11 # align a3
+#endif
+  l32i  a6, a3, 0 # load first word
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, .Loop2done
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a7, .Loop2done
+  slli  a10, a7, 4
+  add a10, a10, a3  # a10 = end of last 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop2:
+  l32i  a7, a3,  4
+  l32i  a8, a3,  8
+  src_b a6, a6, a7
+  s32i  a6, a5,  0
+  l32i  a9, a3, 12
+  src_b a7, a7, a8
+  s32i  a7, a5,  4
+  l32i  a6, a3, 16
+  src_b a8, a8, a9
+  s32i  a8, a5,  8
+  addi  a3, a3, 16
+  src_b a9, a9, a6
+  s32i  a9, a5, 12
+  addi  a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a10, .Loop2 # continue loop if a3:src != a10:src_end
+#endif /* !XCHAL_HAVE_LOOPS */
+.Loop2done:
+  bbci.l  a4, 3, .L12
+  # copy 8 bytes
+  l32i  a7, a3,  4
+  l32i  a8, a3,  8
+  src_b a6, a6, a7
+  s32i  a6, a5,  0
+  addi  a3, a3,  8
+  src_b a7, a7, a8
+  s32i  a7, a5,  4
+  addi  a5, a5,  8
+  mov a6, a8
+.L12:
+  bbci.l  a4, 2, .L13
+  # copy 4 bytes
+  l32i  a7, a3,  4
+  addi  a3, a3,  4
+  src_b a6, a6, a7
+  s32i  a6, a5,  0
+  addi  a5, a5,  4
+  mov a6, a7
+.L13:
+#if XCHAL_UNALIGNED_LOAD_EXCEPTION || SIM_CHECKS_ALIGNMENT
+  add a3, a3, a11 # readjust a3 with correct misalignment
+#endif
+  bbsi.l  a4, 1, .L14
+  bbsi.l  a4, 0, .L15
+.Ldone: RET(16)
+.L14:
+  # copy 2 bytes
+  l8ui  a6, a3,  0
+  l8ui  a7, a3,  1
+  addi  a3, a3,  2
+  s8i a6, a5,  0
+  s8i a7, a5,  1
+  addi  a5, a5,  2
+  bbsi.l  a4, 0, .L15
+  RET(16)
+.L15:
+  # copy 1 byte
+  l8ui  a6, a3,  0
+  s8i a6, a5,  0
+  RET(16)
+
+/*
+ * Byte by byte copy
+ */
+  .align  4
+  .byte 0   # 1 mod 4 alignment for LOOPNEZ
+        # (0 mod 4 alignment for LBEG)
+.Lbackbytecopy:
+#if XCHAL_HAVE_LOOPS
+  loopnez a4, .Lbackbytecopydone
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a4, .Lbackbytecopydone
+  sub a7, a3, a4  # a7 = start address for source
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lbacknextbyte:
+  addi  a3, a3, -1
+  l8ui  a6, a3, 0
+  addi  a5, a5, -1
+  s8i a6, a5, 0
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a7, .Lbacknextbyte # continue loop if
+               # $a3:src != $a7:src_start
+#endif /* !XCHAL_HAVE_LOOPS */
+.Lbackbytecopydone:
+  RET(16)
+
+/*
+ * Destination is unaligned
+ */
+
+  .align  4
+.Lbackdst1mod2: # dst is only byte aligned
+  _bltui  a4, 7, .Lbackbytecopy # do short copies byte by byte
+
+  # copy 1 byte
+  addi  a3, a3, -1
+  l8ui  a6, a3,  0
+  addi  a5, a5, -1
+  s8i a6, a5,  0
+  addi  a4, a4, -1
+  _bbci.l a5, 1, .Lbackdstaligned # if dst is now aligned, then
+          # return to main algorithm
+.Lbackdst2mod4: # dst 16-bit aligned
+  # copy 2 bytes
+  _bltui  a4, 6, .Lbackbytecopy # do short copies byte by byte
+  addi  a3, a3, -2
+  l8ui  a6, a3,  0
+  l8ui  a7, a3,  1
+  addi  a5, a5, -2
+  s8i a6, a5,  0
+  s8i a7, a5,  1
+  addi  a4, a4, -2
+  j .Lbackdstaligned  # dst is now aligned,
+          # return to main algorithm
+
+  .align  4
+  .type memmove, @function
+memmove:
+  ENTRY(16)
+  /* void *memmove(dst, src, len): a2 = dst (also returned), a3 = src, a4 = len */
+  mov a5, a2    # copy dst so that a2 is return value
+.Lmovecommon:
+  sub a6, a5, a3          # a6 = dst - src, wraps when dst is below src
+  bgeu  a6, a4, .Lcommon  # unsigned (dst - src) >= len: ascending copy is safe
+  /* dst starts inside [src, src + len): copy descending from the end */
+  add a5, a5, a4          # a5 = one past the last dst byte
+  add a3, a3, a4          # a3 = one past the last src byte
+  /* bring the end of dst to a word boundary before the word copy */
+  bbsi.l  a5, 0, .Lbackdst1mod2 # if dst is 1 mod 2
+  bbsi.l  a5, 1, .Lbackdst2mod4 # if dst is 2 mod 4
+.Lbackdstaligned: # return here from .Lbackdst?mod? once dst is aligned
+  srli  a7, a4, 4 # number of loop iterations with 16B
+        # per iteration
+  movi  a8, 3   # if source is not aligned,
+  bany  a3, a8, .Lbacksrcunaligned  # then use shifting copy
+  /*
+   * Destination and source are word-aligned, use word copy.
+   */
+  # copy 16 bytes per iteration for word-aligned dst and word-aligned src
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, .backLoop1done
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a7, .backLoop1done
+  slli  a8, a7, 4
+  sub a8, a3, a8  # a8 = start of first 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop1:
+  addi  a3, a3, -16
+  l32i  a7, a3, 12
+  l32i  a6, a3,  8
+  addi  a5, a5, -16
+  s32i  a7, a5, 12
+  l32i  a7, a3,  4
+  s32i  a6, a5,  8
+  l32i  a6, a3,  0
+  s32i  a7, a5,  4
+  s32i  a6, a5,  0
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a8, .backLoop1  # continue loop if a3:src != a8:src_start
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop1done:
+  bbci.l  a4, 3, .Lback2
+  # copy 8 bytes
+  addi  a3, a3, -8
+  l32i  a6, a3,  0
+  l32i  a7, a3,  4
+  addi  a5, a5, -8
+  s32i  a6, a5,  0
+  s32i  a7, a5,  4
+.Lback2:
+  bbsi.l  a4, 2, .Lback3
+  bbsi.l  a4, 1, .Lback4
+  bbsi.l  a4, 0, .Lback5
+  RET(16)
+.Lback3:
+  # copy 4 bytes
+  addi  a3, a3, -4
+  l32i  a6, a3,  0
+  addi  a5, a5, -4
+  s32i  a6, a5,  0
+  bbsi.l  a4, 1, .Lback4
+  bbsi.l  a4, 0, .Lback5
+  RET(16)
+.Lback4:
+  # copy 2 bytes
+  addi  a3, a3, -2
+  l16ui a6, a3,  0
+  addi  a5, a5, -2
+  s16i  a6, a5,  0
+  bbsi.l  a4, 0, .Lback5
+  RET(16)
+.Lback5:
+  # copy 1 byte
+  addi  a3, a3, -1
+  l8ui  a6, a3,  0
+  addi  a5, a5, -1
+  s8i a6, a5,  0
+  RET(16)
+
+/*
+ * Destination is aligned, Source is unaligned
+ */
+
+  .align  4
+.Lbacksrcunaligned:
+  _beqz a4, .Lbackdone  # avoid loading anything for zero-length copies
+  /* Backward shifting copy: dst (a5) is word-aligned, src (a3) is not.
+   * On entry a7 = len / 16 and a8 = 3 (set at .Lbackdstaligned).  The
+   * source pointer is always rounded down to a word boundary, with the
+   * dropped byte offset kept in a11, so every l32i below is aligned on
+   * any core configuration.  This used to be selected by re-#defining
+   * SIM_CHECKS_ALIGNMENT to 1 here, which conflicted with the value 0
+   * defined at the top of the file; the emitted code is unchanged.  */
+  ssa8  a3    # set shift amount from byte offset
+  and a11, a3, a8 # save unalignment offset for below
+  sub a3, a3, a11 # align a3
+  l32i  a6, a3, 0 # load first word
+#if XCHAL_HAVE_LOOPS
+  loopnez a7, .backLoop2done
+#else /* !XCHAL_HAVE_LOOPS */
+  beqz  a7, .backLoop2done
+  slli  a10, a7, 4
+  sub a10, a3, a10  # a10 = start of first 16B source chunk
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop2:
+  addi  a3, a3, -16
+  l32i  a7, a3, 12
+  l32i  a8, a3,  8
+  addi  a5, a5, -16
+  src_b a6, a7, a6
+  s32i  a6, a5, 12
+  l32i  a9, a3,  4
+  src_b a7, a8, a7
+  s32i  a7, a5,  8
+  l32i  a6, a3,  0
+  src_b a8, a9, a8
+  s32i  a8, a5,  4
+  src_b a9, a6, a9
+  s32i  a9, a5,  0
+#if !XCHAL_HAVE_LOOPS
+  bne a3, a10, .backLoop2 # continue loop if a3:src != a10:src_start
+#endif /* !XCHAL_HAVE_LOOPS */
+.backLoop2done:
+  bbci.l  a4, 3, .Lback12
+  # copy 8 bytes
+  addi  a3, a3, -8
+  l32i  a7, a3,  4
+  l32i  a8, a3,  0
+  addi  a5, a5, -8
+  src_b a6, a7, a6
+  s32i  a6, a5,  4
+  src_b a7, a8, a7
+  s32i  a7, a5,  0
+  mov a6, a8
+.Lback12:
+  bbci.l  a4, 2, .Lback13
+  # copy 4 bytes
+  addi  a3, a3, -4
+  l32i  a7, a3,  0
+  addi  a5, a5, -4
+  src_b a6, a7, a6
+  s32i  a6, a5,  0
+  mov a6, a7
+.Lback13:
+  /* restore the byte offset dropped above; the tail copies use byte loads */
+  add a3, a3, a11 # readjust a3 with correct misalignment
+  bbsi.l  a4, 1, .Lback14
+  bbsi.l  a4, 0, .Lback15
+.Lbackdone:
+  RET(16)
+.Lback14:
+  # copy 2 bytes
+  addi  a3, a3, -2
+  l8ui  a6, a3,  0
+  l8ui  a7, a3,  1
+  addi  a5, a5, -2
+  s8i a6, a5,  0
+  s8i a7, a5,  1
+  bbsi.l  a4, 0, .Lback15
+  RET(16)
+.Lback15:
+  # copy 1 byte
+  addi  a3, a3, -1
+  addi  a5, a5, -1
+  l8ui  a6, a3,  0
+  s8i a6, a5,  0
+  RET(16)
+
+  .end schedule
+  .size memmove, . - memmove
@@ -0,0 +1,179 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_memset.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+/* void *memset (void *dst, int c, size_t length)
+
+   The algorithm is as follows:
+
+   Create a word with c in all byte positions.
+
+   If the destination is aligned, set 16B chunks with a loop, and then
+   finish up with 8B, 4B, 2B, and 1B stores conditional on the length.
+
+   If the destination is unaligned, align it by conditionally
+   setting 1B and/or 2B and then go to aligned case.
+
+   This code tries to use fall-through branches for the common
+   case of an aligned destination (except for the branches to
+   the alignment labels).  */
+
+
+/* Entry point, destination alignment fix-up and word-aligned fill.  */
+
+	.section .text
+	.begin schedule
+	.literal_position
+
+	.local	.Lbyteset
+	.local	.Ldst1mod2
+	.local	.Ldst2mod4
+
+	.align	4
+	.global	memset
+	.type	memset, @function
+memset:
+  ENTRY(16)
+	/* a2 = dst, a3 = c, a4 = length */
+
+	/* Duplicate character into all bytes of word.  */
+	extui	a3, a3, 0, 8
+	slli	a7, a3, 8
+	or	a3, a3, a7
+	slli	a7, a3, 16
+	or	a3, a3, a7
+
+	mov	a5, a2		// copy dst so that a2 is return value
+
+	/* Check if dst is unaligned.  */
+	bbsi.l	a2, 0, .Ldst1mod2
+	bbsi.l	a2, 1, .Ldst2mod4
+	j	.Ldstaligned
+
+.Ldst1mod2: // dst is only byte aligned
+
+	/* Do short sizes byte-by-byte.  */
+	bltui	a4, 8, .Lbyteset
+
+	/* Set 1 byte.  */
+	s8i	a3, a5, 0
+	addi	a5, a5, 1
+	addi	a4, a4, -1
+
+	/* Now retest if dst is aligned.  */
+	bbci.l	a5, 1, .Ldstaligned
+
+.Ldst2mod4: // dst has 16-bit alignment
+
+	/* Do short sizes byte-by-byte.  */
+	bltui	a4, 8, .Lbyteset
+
+	/* Set 2 bytes.  */
+	s16i	a3, a5, 0
+	addi	a5, a5, 2
+	addi	a4, a4, -2
+
+	/* dst is now aligned; fall through to main algorithm */
+
+.Ldstaligned:
+
+	/* Get number of loop iterations with 16B per iteration.  */
+	srli	a7, a4, 4
+
+	/* Destination is word-aligned.  */
+#if XCHAL_HAVE_LOOPS
+	loopnez	a7, 2f
+#else
+	beqz	a7, 2f
+	slli	a6, a7, 4
+	add	a6, a6, a5	// a6 = end of last 16B chunk
+#endif
+	/* Set 16 bytes per iteration.  */
+1:	s32i	a3, a5, 0
+	s32i	a3, a5, 4
+	s32i	a3, a5, 8
+	s32i	a3, a5, 12
+	addi	a5, a5, 16
+#if !XCHAL_HAVE_LOOPS
+	bltu	a5, a6, 1b
+#endif
+
+	/* Set any leftover pieces smaller than 16B.  */
+2:	bbci.l	a4, 3, 3f
+
+	/* Set 8 bytes.  */
+	s32i	a3, a5, 0
+	s32i	a3, a5, 4
+	addi	a5, a5, 8
+
+3:	bbci.l	a4, 2, 4f
+
+	/* Set 4 bytes.  */
+	s32i	a3, a5, 0
+	addi	a5, a5, 4
+
+4:	bbci.l	a4, 1, 5f
+
+	/* Set 2 bytes.  */
+	s16i	a3, a5, 0
+	addi	a5, a5, 2
+
+5:	bbci.l	a4, 0, 6f
+
+	/* Set 1 byte.  */
+	s8i	a3, a5, 0
+6:	RET(16)
+
+
+	// .align	XCHAL_INST_FETCH_WIDTH
+__memset_aux:
+
+	/* Byte-at-a-time fill of a4 bytes at a5 with the low byte of a3; the fetch-width padding for the loop is disabled (commented out) */
+// .skip XCHAL_INST_FETCH_WIDTH - 3
+
+.Lbyteset:
+#if XCHAL_HAVE_LOOPS
+	loopnez	a4, 2f		// zero-overhead loop over a4 bytes; skipped when a4 == 0
+#else
+	beqz	a4, 2f		// nothing to set
+	add	a6, a5, a4	// a6 = ending address
+#endif
+1:	s8i	a3, a5, 0	// store one fill byte
+	addi	a5, a5, 1	// advance dst
+#if !XCHAL_HAVE_LOOPS
+	bltu	a5, a6, 1b	// repeat until dst reaches the ending address
+#endif
+2:	RET(16)			// a2 still holds the original dst (return value)
+
+	.end schedule
+
+	.size	memset, . - memset
@@ -0,0 +1,243 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_strcpy.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+  .section .text
+  .begin schedule
+  .align  4
+  .literal_position
+  .global strcpy
+  .type strcpy, @function
+strcpy:
+  ENTRY(16)
+  /* a2 = dst, a3 = src */
+
+  mov a10, a2   # leave dst in return value register
+  movi  a4, MASK0
+  movi  a5, MASK1
+  movi  a6, MASK2
+  movi  a7, MASK3
+  bbsi.l  a3, 0, .Lsrc1mod2
+  bbsi.l  a3, 1, .Lsrc2mod4
+.Lsrcaligned:
+
+  /* Check if the destination is aligned.  */
+  movi  a8, 3
+  bnone a10, a8, .Laligned
+
+  j .Ldstunaligned
+
+.Lsrc1mod2: # src address is odd: copy one byte to reach 2-byte alignment
+  l8ui  a8, a3, 0 # get byte 0
+  addi  a3, a3, 1 # advance src pointer
+  s8i a8, a10, 0  # store byte 0
+  beqz  a8, 1f    # done if byte 0 was the terminator (already stored)
+  addi  a10, a10, 1 # advance dst pointer
+  bbci.l  a3, 1, .Lsrcaligned # if src is now word-aligned
+
+.Lsrc2mod4: # src address is 2 mod 4: copy two bytes to reach word alignment
+  l8ui  a8, a3, 0 # get byte 0
+  /* 1-cycle interlock */
+  s8i a8, a10, 0  # store byte 0
+  beqz  a8, 1f    # done if byte 0 was the terminator (already stored)
+  l8ui  a8, a3, 1 # get byte 1
+  addi  a3, a3, 2 # advance src pointer
+  s8i a8, a10, 1  # store byte 1
+  addi  a10, a10, 2 # advance dst pointer
+  bnez  a8, .Lsrcaligned # byte 1 nonzero: continue with word-aligned src
+1:  RET(16)       # a2 still holds the original dst (return value)
+
+
+/* dst is word-aligned; src is word-aligned.  */
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+.Laligned:
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, .Lz3  # loop forever (almost anyway)
+  l32i  a8, a3, 0 # get word from src
+  addi  a3, a3, 4 # advance src pointer
+  bnone a8, a4, .Lz0  # if byte 0 is zero
+  bnone a8, a5, .Lz1  # if byte 1 is zero
+  bnone a8, a6, .Lz2  # if byte 2 is zero
+  s32i  a8, a10, 0  # no zero in bytes 0-2: store the whole word to dst
+  bnone a8, a7, .Lz3  # done if byte 3 is zero (word already stored)
+  addi  a10, a10, 4 # advance dst pointer
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+1:  addi  a10, a10, 4 # advance dst pointer
+.Laligned:
+  l32i  a8, a3, 0 # get word from src
+  addi  a3, a3, 4 # advance src pointer
+  bnone a8, a4, .Lz0  # if byte 0 is zero
+  bnone a8, a5, .Lz1  # if byte 1 is zero
+  bnone a8, a6, .Lz2  # if byte 2 is zero
+  s32i  a8, a10, 0  # no zero in bytes 0-2: store the whole word to dst
+  bany  a8, a7, 1b  # loop while byte 3 is nonzero (1b advances dst first)
+#endif /* !XCHAL_HAVE_LOOPS */
+
+.Lz3: /* Byte 3 is zero: the word holding the terminator is already stored.  */
+  RET(16)
+
+.Lz0: /* Byte 0 is zero.  */
+#if XCHAL_HAVE_BE
+  movi  a8, 0
+#endif
+  s8i a8, a10, 0
+  RET(16)
+
+.Lz1: /* Byte 1 is zero.  */
+#if XCHAL_HAVE_BE
+        extui   a8, a8, 16, 16
+#endif
+  s16i  a8, a10, 0
+  RET(16)
+
+.Lz2: /* Byte 2 is zero.  */
+#if XCHAL_HAVE_BE
+        extui   a8, a8, 16, 16
+#endif
+  s16i  a8, a10, 0
+  movi  a8, 0
+  s8i a8, a10, 2
+  RET(16)
+
+#if 1
+/* For now just use byte copy loop for the unaligned destination case.  */
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+#endif
+.Ldstunaligned:
+
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, 2f    # loop forever (almost anyway)
+#endif
+1:  l8ui  a8, a3, 0
+  addi  a3, a3, 1
+  s8i a8, a10, 0
+  addi  a10, a10, 1
+#if XCHAL_HAVE_LOOPS
+  beqz  a8, 2f
+#else
+  bnez  a8, 1b
+#endif
+2:  RET(16)
+
+#else /* 0 */
+
+/* This code is not functional yet.  */
+
+.Ldstunaligned:
+  l32i  a9, a2, 0 # load word from dst
+#if XCHAL_HAVE_BE
+  ssa8b a9    # rotate by dst alignment so that
+  src a9, a9, a9  # shift in loop will put back in place
+  ssa8l a9    # shift left by byte*8
+#else
+  ssa8l a9    # rotate by dst alignment so that
+  src a9, a9, a9  # shift in loop will put back in place
+  ssa8b a9    # shift left by 32-byte*8
+#endif
+
+/* dst is word-aligned; src is unaligned.  */
+
+.Ldstunalignedloop:
+  l32i  a8, a3, 0 # get word from src
+  /* 1-cycle interlock */
+  bnone a8, a4, .Lu0  # if byte 0 is zero
+  bnone a8, a5, .Lu1  # if byte 1 is zero
+  bnone a8, a6, .Lu2  # if byte 2 is zero
+  src a9, a8, a9  # combine last word and this word
+  s32i  a9, a10, 0  # store word to dst
+  bnone a8, a7, .Lu3  # if byte 3 is nonzero, iterate
+  l32i  a9, a3, 4 # get word from src
+  addi  a3, a3, 8 # advance src pointer
+  bnone a9, a4, .Lu4  # if byte 0 is zero
+  bnone a9, a5, .Lu5  # if byte 1 is zero
+  bnone a9, a6, .Lu6  # if byte 2 is zero
+  src a8, a9, a8  # combine last word and this word
+  s32i  a8, a10, 4  # store word to dst
+  addi  a10, a10, 8 # advance dst pointer
+  bany  a8, a7, .Ldstunalignedloop # if byte 3 is nonzero, iterate
+
+  /* Byte 7 is zero.  */
+.Lu7: RET(16)
+
+.Lu0: /* Byte 0 is zero.  */
+#if XCHAL_HAVE_BE
+  movi  a8, 0
+#endif
+  s8i a8, a10, 0
+  RET(16)
+
+.Lu1: /* Byte 1 is zero.  */
+#if XCHAL_HAVE_BE
+  extui   a8, a8, 16, 16
+#endif
+  s16i  a8, a10, 0
+  RET(16)
+
+.Lu2: /* Byte 2 is zero.  */
+  s16i  a8, a10, 0
+  movi  a8, 0
+  s8i a8, a10, 2
+  RET(16)
+
+#endif /* 0 */
+  .end schedule
+
+  .size strcpy, . - strcpy
@@ -0,0 +1,123 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_strlen.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+/****************************************************************************
+ * Name: strlen
+ *
+ * Description:
+ *   Return the number of bytes that precede the terminating NUL of the
+ *   string at s.  An unaligned head is consumed first; after that the
+ *   string is scanned one aligned 32-bit word at a time and every byte
+ *   lane of the word is tested against MASK0..MASK3.
+ *
+ * Input Parameters:
+ *   a2 - s, the address of the string
+ *
+ * Returned Value:
+ *   a2 - the length of the string
+ *
+ * Register Usage:
+ *   a3    - biased scan pointer (starts at s - 4, the word loop loads
+ *           from a3 + 4)
+ *   a4-a7 - MASK0..MASK3
+ *   a8    - most recently loaded byte/word; also the loop count setup
+ *
+ ****************************************************************************/
+
+  .section .text
+  .begin schedule
+  .align  4
+  .literal_position
+  .global strlen
+  .type strlen, @function
+strlen:
+  ENTRY(16)
+  /* a2 = s on entry; the length is returned in a2 */
+
+  addi  a3, a2, -4  # a3 = s - 4: bias, the word loop loads from a3 + 4
+  movi  a4, MASK0 # a4 selects byte 0 of a loaded word
+  movi  a5, MASK1 # a5 selects byte 1
+  movi  a6, MASK2 # a6 selects byte 2
+  movi  a7, MASK3 # a7 selects byte 3
+  bbsi.l  a2, 0, .L1mod2 # if s is odd
+  bbsi.l  a2, 1, .L2mod4 # if s is 2 mod 4
+  j .Laligned # s is already word-aligned
+
+.L1mod2: # address is odd
+  l8ui  a8, a3, 4 # get first byte of string (a3 + 4 == s)
+  addi  a3, a3, 1 # advance string pointer
+  beqz  a8, .Lz3  # first byte is NUL: .Lz3 adds 3, giving a3 == s
+  bbci.l  a3, 1, .Laligned # if string pointer is now word-aligned
+
+.L2mod4: # address is 2 mod 4
+  addi  a3, a3, 2 # advance ptr for aligned access
+  l32i  a8, a3, 0 # aligned word; only its bytes 2 and 3 are unscanned
+  bnone a8, a6, .Lz2  # if byte 2 (of word, not string) is zero
+  bany  a8, a7, .Laligned # if byte 3 (of word, not string) is nonzero
+
+  /* Byte 3 is zero.  */
+  addi  a3, a3, 3 # point to zero byte
+  sub a2, a3, a2  # subtract to get length
+  RET(16)
+
+
+/* String is word-aligned.  The padding below only positions the loop
+ * instruction at the alignment named in the comments; it is never
+ * executed because the preceding code ends with RET.
+ */
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+#endif
+.Laligned:
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, .Lz3  # loop forever (almost anyway)
+#endif
+1:  l32i  a8, a3, 4 # get next word of string
+  addi  a3, a3, 4 # advance string pointer; a3 now addresses that word
+  bnone a8, a4, .Lz0  # if byte 0 is zero
+  bnone a8, a5, .Lz1  # if byte 1 is zero
+  bnone a8, a6, .Lz2  # if byte 2 is zero
+#if XCHAL_HAVE_LOOPS
+  /* Last instruction of the hardware loop.  .Lz3 is also the loop end
+   * address; per the Xtensa ISA loop option a taken branch to the loop
+   * end leaves the loop, while falling through starts the next iteration.
+   */
+
+  bnone a8, a7, .Lz3  # if byte 3 is zero
+#else
+  bany  a8, a7, 1b  # repeat if byte 3 is non-zero
+#endif
+
+  /* Exit stubs: a3 addresses the word that holds the NUL; add the byte
+   * index of the NUL and subtract the original string address.
+   */
+
+.Lz3: /* Byte 3 is zero.  */
+  addi  a3, a3, 3 # point to zero byte
+  /* Fall through....  */
+
+.Lz0: /* Byte 0 is zero.  */
+  sub a2, a3, a2  # subtract to get length
+  RET(16)
+
+.Lz1: /* Byte 1 is zero.  */
+  addi  a3, a3, 1 # point to zero byte
+  sub a2, a3, a2  # subtract to get length
+  RET(16)
+
+.Lz2: /* Byte 2 is zero.  */
+  addi  a3, a3, 2 # point to zero byte
+  sub a2, a3, a2  # subtract to get length
+  RET(16)
+
+  .end schedule
+
+  .size strlen, . - strlen
@@ -0,0 +1,265 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/arch_strncpy.S
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include "xtensa_asm.h"
+
+#include <arch/chip/core-isa.h>
+#include <arch/xtensa/xtensa_abi.h>
+
+/****************************************************************************
+ * Public Functions
+ ****************************************************************************/
+
+/****************************************************************************
+ * Name: strncpy
+ *
+ * Description:
+ *   Copy at most n bytes of the string at src to dst.  If a NUL is copied
+ *   before n bytes have been written, the remaining bytes of dst are
+ *   filled with zeros (.Lfill).  If n bytes are written without meeting a
+ *   NUL, the function returns without terminating dst.
+ *
+ * Input Parameters:
+ *   a2 - dst
+ *   a3 - src
+ *   a4 - n
+ *
+ * Returned Value:
+ *   a2 - dst (a2 is never modified; a10 is the working dst pointer)
+ *
+ * Register Usage:
+ *   a3         - src pointer
+ *   a4         - number of bytes still to be written
+ *   a5, a6, a7 - MASK1, MASK2, MASK3
+ *   a8         - scratch: loaded byte/word, loop count setup
+ *   a9         - zero, while filling
+ *   a10        - dst pointer
+ *   a11        - MASK0
+ *
+ ****************************************************************************/
+
+  .section .text
+.begin schedule
+  .align  4
+  .literal_position
+
+/* Head code for an unaligned src.  It is placed ahead of the entry point
+ * and is reached only through the alignment branches at the start of
+ * strncpy.  Bytes are copied one at a time until src is word-aligned, n
+ * runs out, or a NUL has been copied.
+ */
+
+__strncpy_aux:
+
+.Lsrc1mod2: # src address is odd
+  l8ui  a8, a3, 0 # get byte 0
+  addi  a3, a3, 1 # advance src pointer
+  s8i a8, a10, 0  # store byte 0
+  addi  a4, a4, -1  # decrement n
+  beqz    a4, .Lret       # if n is zero
+  addi  a10, a10, 1 # advance dst pointer
+  beqz  a8, .Lfill  # if byte 0 is zero
+  bbci.l  a3, 1, .Lsrcaligned # if src is now word-aligned
+
+.Lsrc2mod4: # src address is 2 mod 4
+  l8ui  a8, a3, 0 # get byte 0
+  addi  a4, a4, -1  # decrement n
+  s8i a8, a10, 0  # store byte 0
+  beqz    a4, .Lret       # if n is zero
+  addi  a10, a10, 1 # advance dst pointer
+  beqz  a8, .Lfill  # if byte 0 is zero
+  l8ui  a8, a3, 1 # get byte 1
+  addi  a3, a3, 2 # advance src pointer past both bytes
+  s8i a8, a10, 0  # store byte 1
+  addi  a4, a4, -1  # decrement n
+  beqz    a4, .Lret       # if n is zero
+  addi  a10, a10, 1 # advance dst pointer
+  bnez  a8, .Lsrcaligned # byte 1 is nonzero: src is now word-aligned
+  j .Lfill # byte 1 was the NUL: zero the rest of dst
+
+.Lret:
+  RET(16)
+
+  .align  4
+  .global strncpy
+  .type strncpy, @function
+strncpy:
+  ENTRY(16)
+  /* a2 = dst, a3 = src, a4 = n */
+
+  mov a10, a2   # leave dst in return value register
+  beqz    a4, .Lret       # if n is zero
+
+  movi  a11, MASK0 # a11 selects byte 0 of a loaded word
+  movi  a5, MASK1 # a5 selects byte 1
+  movi  a6, MASK2 # a6 selects byte 2
+  movi  a7, MASK3 # a7 selects byte 3
+  bbsi.l  a3, 0, .Lsrc1mod2 # if src is odd
+  bbsi.l  a3, 1, .Lsrc2mod4 # if src is 2 mod 4
+.Lsrcaligned:
+
+  /* Check if the destination is aligned.  */
+  movi  a8, 3
+  bnone a10, a8, .Laligned # low two bits of dst clear: word copy loop
+
+  j .Ldstunaligned # otherwise copy byte by byte
+
+
+/* Fill the dst with zeros -- n is at least 1.  */
+
+.Lfill:
+  movi  a9, 0
+  bbsi.l  a10, 0, .Lfill1mod2 # if dst is odd
+  bbsi.l  a10, 1, .Lfill2mod4 # if dst is 2 mod 4
+.Lfillaligned:
+  blti  a4, 4, .Lfillcleanup # less than one word left
+
+  /* Loop filling complete words with zero.  */
+#if XCHAL_HAVE_LOOPS
+
+  srai  a8, a4, 2 # a8 = number of complete words (n / 4)
+  loop  a8, 1f
+  s32i  a9, a10, 0
+  addi  a10, a10, 4
+
+1:  slli  a8, a8, 2 # a8 = number of bytes just filled
+  sub a4, a4, a8 # n = leftover bytes (0 to 3)
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+1:  s32i  a9, a10, 0
+  addi  a10, a10, 4
+  addi  a4, a4, -4
+  bgei    a4, 4, 1b
+
+#endif /* !XCHAL_HAVE_LOOPS */
+
+  beqz  a4, 2f # nothing left over
+
+.Lfillcleanup:
+  /* Fill leftover (1 to 3) bytes with zero.  */
+  s8i a9, a10, 0  # store byte 0
+  addi  a4, a4, -1  # decrement n
+  addi  a10, a10, 1
+  bnez    a4, .Lfillcleanup
+
+2:  RET(16)
+
+.Lfill1mod2: # dst address is odd
+  s8i a9, a10, 0  # store byte 0
+  addi  a4, a4, -1  # decrement n
+  beqz    a4, 2b    # if n is zero
+  addi    a10, a10, 1 # advance dst pointer
+  bbci.l  a10, 1, .Lfillaligned # if dst is now word-aligned
+
+.Lfill2mod4: # dst address is 2 mod 4
+  s8i a9, a10, 0  # store byte 0
+  addi  a4, a4, -1  # decrement n
+  beqz    a4, 2b    # if n is zero
+  s8i a9, a10, 1  # store byte 1
+  addi  a4, a4, -1  # decrement n
+  beqz    a4, 2b    # if n is zero
+  addi    a10, a10, 2 # advance dst pointer
+  j .Lfillaligned
+
+
+/* dst is word-aligned; src is word-aligned; n is at least 1.
+ *
+ * The word loop only runs while n is at least 5, so every branch from it
+ * to .Lfill (after storing 1 to 4 bytes) still leaves n at 1 or more.
+ * blti is a signed compare, so a count with bit 31 set also takes the
+ * byte-wise path.
+ */
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+#endif
+.Laligned:
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, 1f    # loop forever (almost anyway)
+  blti  a4, 5, .Ldstunaligned # n is near limit; do one at a time
+  l32i  a8, a3, 0 # get word from src
+  addi  a3, a3, 4 # advance src pointer
+  bnone a8, a11, .Lz0 # if byte 0 is zero
+  bnone a8, a5, .Lz1  # if byte 1 is zero
+  bnone a8, a6, .Lz2  # if byte 2 is zero
+  s32i  a8, a10, 0  # store word to dst
+  addi  a4, a4, -4  # decrement n
+  addi  a10, a10, 4 # advance dst pointer
+  bnone a8, a7, .Lfill  # if byte 3 is zero
+1:
+
+#else /* !XCHAL_HAVE_LOOPS */
+
+1:  blti  a4, 5, .Ldstunaligned # n is near limit; do one at a time
+  l32i  a8, a3, 0 # get word from src
+  addi  a3, a3, 4 # advance src pointer
+  bnone a8, a11, .Lz0 # if byte 0 is zero
+  bnone a8, a5, .Lz1  # if byte 1 is zero
+  bnone a8, a6, .Lz2  # if byte 2 is zero
+  s32i  a8, a10, 0  # store word to dst
+  addi  a4, a4, -4  # decrement n
+  addi  a10, a10, 4 # advance dst pointer
+  bany  a8, a7, 1b  # no zeroes
+#endif /* !XCHAL_HAVE_LOOPS */
+
+  /* Without hardware loops this is reached when byte 3 of the word just
+   * stored is zero; with hardware loops only when the loop count expires.
+   */
+
+  j .Lfill
+
+  /* The word in a8 contains a NUL.  Store the bytes up to and including
+   * the NUL, account for them in n and dst, then zero-fill the rest.
+   */
+
+.Lz0: /* Byte 0 is zero.  */
+#if XCHAL_HAVE_BE
+  movi  a8, 0 # big-endian: byte 0 is not in the low 8 bits stored by s8i
+#endif
+  s8i a8, a10, 0
+  addi  a4, a4, -1  # decrement n
+  addi  a10, a10, 1 # advance dst pointer
+  j .Lfill
+
+.Lz1: /* Byte 1 is zero.  */
+#if XCHAL_HAVE_BE
+        extui   a8, a8, 16, 16 # big-endian: move bytes 0 and 1 to the low half
+#endif
+  s16i  a8, a10, 0 # store bytes 0 and 1
+  addi  a4, a4, -2  # decrement n
+  addi  a10, a10, 2 # advance dst pointer
+  j .Lfill
+
+.Lz2: /* Byte 2 is zero.  */
+#if XCHAL_HAVE_BE
+  extui   a8, a8, 16, 16 # big-endian: move bytes 0 and 1 to the low half
+#endif
+  s16i  a8, a10, 0 # store bytes 0 and 1
+  movi  a8, 0
+  s8i a8, a10, 2 # store the NUL as byte 2
+  addi  a4, a4, -3  # decrement n
+  addi  a10, a10, 3 # advance dst pointer
+  j .Lfill
+
+/* Byte-at-a-time copy.  Used when dst is not word-aligned and also by the
+ * word loop above once fewer than 5 bytes remain; n is at least 1 here.
+ */
+
+  .align  4
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  /* (2 mod 4) alignment for loop instruction */
+#else
+  /* (1 mod 4) alignment for loop instruction */
+  .byte 0
+  .byte 0
+#endif
+#endif
+.Ldstunaligned:
+
+#if XCHAL_HAVE_LOOPS
+#if XCHAL_HAVE_DENSITY
+  _movi.n a8, 0   # set up for the maximum loop count
+#else
+  _movi a8, 0   # set up for the maximum loop count
+#endif
+  loop  a8, 2f    # loop forever (almost anyway)
+#endif
+1:  l8ui  a8, a3, 0 # get next byte from src
+  addi  a3, a3, 1 # advance src pointer
+  s8i a8, a10, 0 # store it to dst
+  addi  a4, a4, -1 # decrement n
+  beqz  a4, 3f # n exhausted: return without filling
+  addi  a10, a10, 1 # advance dst pointer
+#if XCHAL_HAVE_LOOPS
+  /* Label 2 is the loop end; per the Xtensa ISA loop option a taken
+   * branch to the loop end leaves the loop, falling through repeats it.
+   */
+
+  beqz  a8, 2f
+#else
+  bnez  a8, 1b
+#endif
+2:  j .Lfill # a NUL was copied and n is not zero: zero the rest
+
+3:  RET(16)
+.end schedule
+
+  .size strncpy, . - strncpy
@@ -0,0 +1,62 @@
+/****************************************************************************
+ * libs/libc/machine/xtensa/xtensa_asm.h
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.  The
+ * ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the
+ * License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.  See the
+ * License for the specific language governing permissions and limitations
+ * under the License.
+ *
+ ****************************************************************************/
+
+#ifndef __LIBS_LIBC_MACHINE_XTENSA_XTENSA_ASM_H
+#define __LIBS_LIBC_MACHINE_XTENSA_XTENSA_ASM_H
+
+/* The include guard matters for this header: it defines assembler macros,
+ * and the assembler rejects a second .macro definition of the same name
+ * if the file is included twice in one translation unit.
+ */
+
+/****************************************************************************
+ * Included Files
+ ****************************************************************************/
+
+#include <arch/chip/core-isa.h>
+
+/****************************************************************************
+ * Assembly Language Macros
+ ****************************************************************************/
+
+/* src_b r, w0, w1
+ *
+ * Endian-neutral wrapper around SRC (funnel shift right by SAR of a
+ * register pair into r).  The operand order is swapped with the
+ * configured byte order: w0 is the first (high) SRC operand on big-endian
+ * cores and the second (low) SRC operand on little-endian cores.
+ * NOTE(review): presumably used together with ssa8 to extract an
+ * unaligned word from two consecutive aligned words; the callers are not
+ * part of this file.
+ */
+
+  .macro  src_b r, w0, w1
+#if XCHAL_HAVE_BE
+  src \r, \w0, \w1
+#else
+  src \r, \w1, \w0
+#endif
+  .endm
+
+/* ssa8 r
+ *
+ * Endian-neutral wrapper that sets SAR from the address in r for a
+ * byte-granular shift: SSA8B on big-endian cores, SSA8L on little-endian
+ * cores.
+ */
+
+  .macro  ssa8  r
+#if XCHAL_HAVE_BE
+  ssa8b \r
+#else
+  ssa8l \r
+#endif
+  .endm
+
+/****************************************************************************
+ * Pre-processor Macros
+ ****************************************************************************/
+
+/* MASKn selects byte n of a 32-bit word loaded from memory, where byte 0
+ * is the byte at the lowest address.  The bit positions therefore depend
+ * on the configured byte order.
+ */
+
+#if XCHAL_HAVE_BE
+#  define MASK0 0xff000000
+#  define MASK1 0x00ff0000
+#  define MASK2 0x0000ff00
+#  define MASK3 0x000000ff
+#else
+#  define MASK0 0x000000ff
+#  define MASK1 0x0000ff00
+#  define MASK2 0x00ff0000
+#  define MASK3 0xff000000
+#endif
+
+#endif /* __LIBS_LIBC_MACHINE_XTENSA_XTENSA_ASM_H */
+