libc/arm: optimize crc32/crc32c for arm

Optimize crc32 standard(poly:0x04C11DB7) and crc32
castagnoli(poly:0x1EDC6F41) with arm crc32 extension instructions.

For example, computing the standard crc32 of 1812 bytes of data previously
took 118 us using the crc32 lookup table; with this optimization it takes 14 us.

Performance improved ~700%

Signed-off-by: Jinliang Li <lijinliang1@lixiang.com>
This commit is contained in:
Jinliang Li
2025-03-19 22:48:47 +08:00
committed by Alan C. Assis
parent 586d216a40
commit d5db7d1cee
6 changed files with 223 additions and 68 deletions
+6
View File
@@ -434,6 +434,12 @@ config ARCH_HAVE_FORK
bool bool
default n default n
config ARCH_HAVE_CRC32
bool
default n
---help---
Architecture supports CRC32 instruction
config ARCH_HAVE_FPU config ARCH_HAVE_FPU
bool bool
default n default n
+4
View File
@@ -68,4 +68,8 @@ if(CONFIG_PROFILE_MINI)
list(APPEND SRCS gnu/mcount.S) list(APPEND SRCS gnu/mcount.S)
endif() endif()
if(CONFIG_LIBC_ARCH_CRC32)
list(APPEND SRCS arch_crc32.c)
endif()
target_sources(c PRIVATE ${SRCS}) target_sources(c PRIVATE ${SRCS})
+7
View File
@@ -31,3 +31,10 @@ endif
if ARCH_ARMV8R if ARCH_ARMV8R
source "libs/libc/machine/arm/armv8-r/Kconfig" source "libs/libc/machine/arm/armv8-r/Kconfig"
endif endif
config LIBC_ARCH_CRC32
bool "Enable optimized crc32 for ARM"
default n
depends on ARCH_HAVE_CRC32
---help---
Enable optimized crc32 library functions that use the ARM CRC32 extension instructions
+4
View File
@@ -66,6 +66,10 @@ ifeq ($(CONFIG_PROFILE_MINI),y)
ASRCS += mcount.S ASRCS += mcount.S
endif endif
ifeq ($(CONFIG_LIBC_ARCH_CRC32),y)
CSRCS += arch_crc32.c
endif
ifeq ($(CONFIG_ARCH_TOOLCHAIN_GNU),y) ifeq ($(CONFIG_ARCH_TOOLCHAIN_GNU),y)
DEPPATH += --dep-path machine/arm/gnu DEPPATH += --dep-path machine/arm/gnu
VPATH += :machine/arm/gnu VPATH += :machine/arm/gnu
+115
View File
@@ -0,0 +1,115 @@
/****************************************************************************
* libs/libc/machine/arm/arch_crc32.c
*
* SPDX-License-Identifier: Apache-2.0
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership. The
* ASF licenses this file to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance with the
* License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
* WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
* License for the specific language governing permissions and limitations
* under the License.
*
****************************************************************************/
/****************************************************************************
* Included Files
****************************************************************************/
#include <stdint.h>
#include <string.h>
#include <sys/types.h>
#include <arm_acle.h>
/****************************************************************************
* Public Functions
****************************************************************************/
/****************************************************************************
 * Name: crc32part
 *
 * Description:
 *   Continue a CRC-32 calculation over part of a buffer using the ARM CRC32
 *   instructions.  CRC-32 polynomial 0x04C11DB7 (bit-reflected 0xEDB88320).
 *
 *   The running value is passed straight to the CRC instructions; no
 *   initial or final inversion is applied here.
 *
 * Input Parameters:
 *   src      - Data to be folded into the CRC.  No particular alignment is
 *              required.
 *   len      - Number of bytes available at 'src'.
 *   crc32val - CRC value accumulated so far.
 *
 * Returned Value:
 *   The CRC value after all 'len' bytes have been processed.  When 'len'
 *   is zero, 'crc32val' is returned unchanged.
 *
 ****************************************************************************/

uint32_t crc32part(FAR const uint8_t *src, size_t len, uint32_t crc32val)
{
  /* Process the bulk of the buffer eight bytes at a time.
   *
   * Each chunk is copied into a local with memcpy() rather than read
   * through a casted pointer:  'src' may have any alignment and a direct
   * wide load through a uint64_t pointer is undefined behavior (alignment
   * and aliasing) and also discards the const qualifier.  The copy has a
   * fixed size, so the compiler is free to emit a single load when the
   * target permits unaligned access.
   *
   * NOTE(review): the multi-byte CRC intrinsics presumably consume the
   * least significant byte first, so the result matches a byte-wise CRC
   * only on little-endian targets -- confirm before enabling this on a
   * big-endian build.
   */

  while (len >= sizeof(uint64_t))
    {
      uint64_t data;

      memcpy(&data, src, sizeof(data));
      crc32val = __crc32d(crc32val, data);
      src += sizeof(data);
      len -= sizeof(data);
    }

  /* At most seven bytes remain:  fold them in as 4 + 2 + 1 */

  if (len >= sizeof(uint32_t))
    {
      uint32_t data;

      memcpy(&data, src, sizeof(data));
      crc32val = __crc32w(crc32val, data);
      src += sizeof(data);
      len -= sizeof(data);
    }

  if (len >= sizeof(uint16_t))
    {
      uint16_t data;

      memcpy(&data, src, sizeof(data));
      crc32val = __crc32h(crc32val, data);
      src += sizeof(data);
      len -= sizeof(data);
    }

  if (len > 0)
    {
      crc32val = __crc32b(crc32val, *src);
    }

  return crc32val;
}
/****************************************************************************
 * Name: crc32part_c
 *
 * Description:
 *   Continue a CRC-32C calculation over part of a buffer using the ARM
 *   CRC32 instructions.  CRC-32C (Castagnoli) polynomial 0x1EDC6F41.
 *
 *   The running value is passed straight to the CRC instructions; no
 *   initial or final inversion is applied here.
 *
 * Input Parameters:
 *   src      - Data to be folded into the CRC.  No particular alignment is
 *              required.
 *   len      - Number of bytes available at 'src'.
 *   crc32val - CRC value accumulated so far.
 *
 * Returned Value:
 *   The CRC value after all 'len' bytes have been processed.  When 'len'
 *   is zero, 'crc32val' is returned unchanged.
 *
 ****************************************************************************/

uint32_t crc32part_c(FAR const uint8_t *src, size_t len, uint32_t crc32val)
{
  /* Process the bulk of the buffer eight bytes at a time.
   *
   * Each chunk is copied into a local with memcpy() rather than read
   * through a casted pointer:  'src' may have any alignment and a direct
   * wide load through a uint64_t pointer is undefined behavior (alignment
   * and aliasing) and also discards the const qualifier.  The copy has a
   * fixed size, so the compiler is free to emit a single load when the
   * target permits unaligned access.
   *
   * NOTE(review): the multi-byte CRC intrinsics presumably consume the
   * least significant byte first, so the result matches a byte-wise CRC
   * only on little-endian targets -- confirm before enabling this on a
   * big-endian build.
   */

  while (len >= sizeof(uint64_t))
    {
      uint64_t data;

      memcpy(&data, src, sizeof(data));
      crc32val = __crc32cd(crc32val, data);
      src += sizeof(data);
      len -= sizeof(data);
    }

  /* At most seven bytes remain:  fold them in as 4 + 2 + 1 */

  if (len >= sizeof(uint32_t))
    {
      uint32_t data;

      memcpy(&data, src, sizeof(data));
      crc32val = __crc32cw(crc32val, data);
      src += sizeof(data);
      len -= sizeof(data);
    }

  if (len >= sizeof(uint16_t))
    {
      uint16_t data;

      memcpy(&data, src, sizeof(data));
      crc32val = __crc32ch(crc32val, data);
      src += sizeof(data);
      len -= sizeof(data);
    }

  if (len > 0)
    {
      crc32val = __crc32cb(crc32val, *src);
    }

  return crc32val;
}
+87 -68
View File
@@ -1,4 +1,4 @@
/************************************************************************************************ /****************************************************************************
* libs/libc/misc/lib_crc32.c * libs/libc/misc/lib_crc32.c
* *
* SPDX-License-Identifier: Apache-2.0 * SPDX-License-Identifier: Apache-2.0
@@ -18,110 +18,127 @@
* License for the specific language governing permissions and limitations * License for the specific language governing permissions and limitations
* under the License. * under the License.
* *
***********************************************************************************************/ ****************************************************************************/
/* The logic in this file was developed by Gary S. Brown: /* The logic in this file was developed by Gary S. Brown:
* *
* COPYRIGHT (C) 1986 Gary S. Brown. You may use this program, or code or tables * COPYRIGHT (C) 1986 Gary S. Brown. You may use this program, or code or
* extracted from it, as desired without restriction. * tables extracted from it, as desired without restriction.
* *
* First, the polynomial itself and its table of feedback terms. The polynomial is: * First, the polynomial itself and its table of feedback terms. The
* polynomial is:
* *
* X^32+X^26+X^23+X^22+X^16+X^12+X^11+X^10+X^8+X^7+X^5+X^4+X^2+X^1+X^0 * X^32+X^26+X^23+X^22+X^16+X^12+X^11+X^10+X^8+X^7+X^5+X^4+X^2+X^1+X^0
* *
* Note that we take it "backwards" and put the highest-order term in the lowest-order bit. * Note that we take it "backwards" and put the highest-order term in the
* The X^32 term is "implied"; the LSB is the X^31 term, etc. The X^0 term (usually shown * lowest-order bit. The X^32 term is "implied"; the LSB is the X^31 term,
* as "+1") results in the MSB being 1 * etc. The X^0 term (usually shown as "+1") results in the MSB being 1
* *
* Note that the usual hardware shift register implementation, which is what we're using * Note that the usual hardware shift register implementation, which is what
* (we're merely optimizing it by doing eight-bit chunks at a time) shifts bits into the * we're using (we're merely optimizing it by doing eight-bit chunks at a
* lowest-order term. In our implementation, that means shifting towards the right. Why * time) shifts bits into the lowest-order term. In our implementation, that
* do we do it this way? Because the calculated CRC must be transmitted in order from * means shifting towards the right. Why do we do it this way? Because the
* highest-order term to lowest-order term. UARTs transmit characters in order from LSB * calculated CRC must be transmitted in order from
* to MSB. By storing the CRC this way we hand it to the UART in the order low-byte to * highest-order term to lowest-order term. UARTs transmit characters in
* high-byte; the UART sends each low-bit to hight-bit; and the result is transmission bit * order from LSB to MSB. By storing the CRC this way we hand it to the UART
* by bit from highest- to lowest-order term without requiring any bit shuffling on our * in the order low-byte to high-byte; the UART sends each low-bit to
* part. Reception works similarly * hight-bit; and the result is transmission bit by bit from highest- to
* lowest-order term without requiring any bit shuffling on our part.
* Reception works similarly
* *
* The feedback terms table consists of 256, 32-bit entries. Notes * The feedback terms table consists of 256, 32-bit entries. Notes
* *
* - The table can be generated at runtime if desired; code to do so is shown later. It * - The table can be generated at runtime if desired; code to do so is shown
* might not be obvious, but the feedback terms simply represent the results of eight * later. It might not be obvious, but the feedback terms simply represent
* shift/xor operations for all combinations of data and CRC register values * the results of eight shift/xor operations for all combinations of data
* and CRC register values
* *
* - The values must be right-shifted by eight bits by the updcrc logic; the shift must * - The values must be right-shifted by eight bits by the updcrc logic; the
* be u_(bring in zeroes). On some hardware you could probably optimize the shift in * shift must be u_(bring in zeroes). On some hardware you could probably
* assembler by using byte-swap instructions polynomial $edb88320 * optimize the shift in assembler by using byte-swap instructions
************************************************************************************************/ * polynomial $edb88320
****************************************************************************/
/************************************************************************************************ /****************************************************************************
* Included Files * Included Files
************************************************************************************************/ ****************************************************************************/
#include <sys/types.h> #include <sys/types.h>
#include <stdint.h> #include <stdint.h>
#include <nuttx/crc32.h> #include <nuttx/crc32.h>
/************************************************************************************************ /****************************************************************************
* Private Data * Private Data
************************************************************************************************/ ****************************************************************************/
#ifndef CONFIG_LIBC_ARCH_CRC32
#ifdef CONFIG_LIBC_CRC32_SLOW #ifdef CONFIG_LIBC_CRC32_SLOW
# define LIBC_CRC32_POLY 0xedb88320 # define LIBC_CRC32_POLY 0xedb88320
#else #else
static const uint32_t crc32_tab[] = static const uint32_t crc32_tab[] =
{ {
0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f, 0xe963a535, 0x9e6495a3, 0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0xe963a535, 0x9e6495a3, 0x0edb8832, 0x79dcb8a4, 0xe0d5e91e, 0x97d2d988,
0x1db71064, 0x6ab020f2, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7, 0x09b64c2b, 0x7eb17cbd, 0xe7b82d07, 0x90bf1d91, 0x1db71064, 0x6ab020f2,
0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9, 0xfa0f3d63, 0x8d080df5, 0xf3b97148, 0x84be41de, 0x1adad47d, 0x6ddde4eb, 0xf4d4b551, 0x83d385c7,
0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x136c9856, 0x646ba8c0, 0xfd62f97a, 0x8a65c9ec, 0x14015c4f, 0x63066cd9,
0x35b5a8fa, 0x42b2986c, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59, 0xfa0f3d63, 0x8d080df5, 0x3b6e20c8, 0x4c69105e, 0xd56041e4, 0xa2677172,
0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423, 0xcfba9599, 0xb8bda50f, 0x3c03e4d1, 0x4b04d447, 0xd20d85fd, 0xa50ab56b, 0x35b5a8fa, 0x42b2986c,
0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924, 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0xdbbbc9d6, 0xacbcf940, 0x32d86ce3, 0x45df5c75, 0xdcd60dcf, 0xabd13d59,
0x76dc4190, 0x01db7106, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433, 0x26d930ac, 0x51de003a, 0xc8d75180, 0xbfd06116, 0x21b4f4b5, 0x56b3c423,
0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d, 0x91646c97, 0xe6635c01, 0xcfba9599, 0xb8bda50f, 0x2802b89e, 0x5f058808, 0xc60cd9b2, 0xb10be924,
0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e, 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x2f6f7c87, 0x58684c11, 0xc1611dab, 0xb6662d3d, 0x76dc4190, 0x01db7106,
0x65b0d9c6, 0x12b7e950, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65, 0x98d220bc, 0xefd5102a, 0x71b18589, 0x06b6b51f, 0x9fbfe4a5, 0xe8b8d433,
0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7, 0xa4d1c46d, 0xd3d6f4fb, 0x7807c9a2, 0x0f00f934, 0x9609a88e, 0xe10e9818, 0x7f6a0dbb, 0x086d3d2d,
0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0, 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x91646c97, 0xe6635c01, 0x6b6b51f4, 0x1c6c6162, 0x856530d8, 0xf262004e,
0x5005713c, 0x270241aa, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f, 0x6c0695ed, 0x1b01a57b, 0x8208f4c1, 0xf50fc457, 0x65b0d9c6, 0x12b7e950,
0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81, 0xb7bd5c3b, 0xc0ba6cad, 0x8bbeb8ea, 0xfcb9887c, 0x62dd1ddf, 0x15da2d49, 0x8cd37cf3, 0xfbd44c65,
0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a, 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0x4db26158, 0x3ab551ce, 0xa3bc0074, 0xd4bb30e2, 0x4adfa541, 0x3dd895d7,
0xe3630b12, 0x94643b84, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1, 0xa4d1c46d, 0xd3d6f4fb, 0x4369e96a, 0x346ed9fc, 0xad678846, 0xda60b8d0,
0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb, 0x196c3671, 0x6e6b06e7, 0x44042d73, 0x33031de5, 0xaa0a4c5f, 0xdd0d7cc9, 0x5005713c, 0x270241aa,
0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xbe0b1010, 0xc90c2086, 0x5768b525, 0x206f85b3, 0xb966d409, 0xce61e49f,
0xd6d6a3e8, 0xa1d1937e, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b, 0x5edef90e, 0x29d9c998, 0xb0d09822, 0xc7d7a8b4, 0x59b33d17, 0x2eb40d81,
0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55, 0x316e8eef, 0x4669be79, 0xb7bd5c3b, 0xc0ba6cad, 0xedb88320, 0x9abfb3b6, 0x03b6e20c, 0x74b1d29a,
0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236, 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xead54739, 0x9dd277af, 0x04db2615, 0x73dc1683, 0xe3630b12, 0x94643b84,
0xc5ba3bbe, 0xb2bd0b28, 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d, 0x0d6d6a3e, 0x7a6a5aa8, 0xe40ecf0b, 0x9309ff9d, 0x0a00ae27, 0x7d079eb1,
0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f, 0x72076785, 0x05005713, 0xf00f9344, 0x8708a3d2, 0x1e01f268, 0x6906c2fe, 0xf762575d, 0x806567cb,
0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38, 0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x196c3671, 0x6e6b06e7, 0xfed41b76, 0x89d32be0, 0x10da7a5a, 0x67dd4acc,
0x86d3d2d4, 0xf1d4e242, 0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777, 0xf9b9df6f, 0x8ebeeff9, 0x17b7be43, 0x60b08ed5, 0xd6d6a3e8, 0xa1d1937e,
0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69, 0x616bffd3, 0x166ccf45, 0x38d8c2c4, 0x4fdff252, 0xd1bb67f1, 0xa6bc5767, 0x3fb506dd, 0x48b2364b,
0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2, 0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xd80d2bda, 0xaf0a1b4c, 0x36034af6, 0x41047a60, 0xdf60efc3, 0xa867df55,
0xaed16a4a, 0xd9d65adc, 0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9, 0x316e8eef, 0x4669be79, 0xcb61b38c, 0xbc66831a, 0x256fd2a0, 0x5268e236,
0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693, 0x54de5729, 0x23d967bf, 0xcc0c7795, 0xbb0b4703, 0x220216b9, 0x5505262f, 0xc5ba3bbe, 0xb2bd0b28,
0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d 0x2bb45a92, 0x5cb36a04, 0xc2d7ffa7, 0xb5d0cf31, 0x2cd99e8b, 0x5bdeae1d,
0x9b64c2b0, 0xec63f226, 0x756aa39c, 0x026d930a, 0x9c0906a9, 0xeb0e363f,
0x72076785, 0x05005713, 0x95bf4a82, 0xe2b87a14, 0x7bb12bae, 0x0cb61b38,
0x92d28e9b, 0xe5d5be0d, 0x7cdcefb7, 0x0bdbdf21, 0x86d3d2d4, 0xf1d4e242,
0x68ddb3f8, 0x1fda836e, 0x81be16cd, 0xf6b9265b, 0x6fb077e1, 0x18b74777,
0x88085ae6, 0xff0f6a70, 0x66063bca, 0x11010b5c, 0x8f659eff, 0xf862ae69,
0x616bffd3, 0x166ccf45, 0xa00ae278, 0xd70dd2ee, 0x4e048354, 0x3903b3c2,
0xa7672661, 0xd06016f7, 0x4969474d, 0x3e6e77db, 0xaed16a4a, 0xd9d65adc,
0x40df0b66, 0x37d83bf0, 0xa9bcae53, 0xdebb9ec5, 0x47b2cf7f, 0x30b5ffe9,
0xbdbdf21c, 0xcabac28a, 0x53b39330, 0x24b4a3a6, 0xbad03605, 0xcdd70693,
0x54de5729, 0x23d967bf, 0xb3667a2e, 0xc4614ab8, 0x5d681b02, 0x2a6f2b94,
0xb40bbe37, 0xc30c8ea1, 0x5a05df1b, 0x2d02ef8d
}; };
#endif #endif
/************************************************************************************************ /****************************************************************************
* Public Functions * Public Functions
************************************************************************************************/ ****************************************************************************/
/************************************************************************************************ /****************************************************************************
* Name: crc32part * Name: crc32part
* *
* Description: * Description:
* Continue CRC calculation on a part of the buffer. * Continue CRC calculation on a part of the buffer.
* *
************************************************************************************************/ ****************************************************************************/
uint32_t crc32part(FAR const uint8_t *src, size_t len, uint32_t crc32val) uint32_t crc32part(FAR const uint8_t *src, size_t len, uint32_t crc32val)
{ {
size_t i; size_t i;
#ifdef CONFIG_LIBC_CRC32_SLOW #ifdef CONFIG_LIBC_CRC32_SLOW
for (i = 0; i < len; i++) for (i = 0; i < len; i++)
{ {
@@ -149,13 +166,15 @@ uint32_t crc32part(FAR const uint8_t *src, size_t len, uint32_t crc32val)
return crc32val; return crc32val;
} }
/************************************************************************************************ #endif /* CONFIG_LIBC_ARCH_CRC32 */
/****************************************************************************
* Name: crc32 * Name: crc32
* *
* Description: * Description:
* Return a 32-bit CRC of the contents of the 'src' buffer, length 'len' * Return a 32-bit CRC of the contents of the 'src' buffer, length 'len'
* *
************************************************************************************************/ ****************************************************************************/
uint32_t crc32(FAR const uint8_t *src, size_t len) uint32_t crc32(FAR const uint8_t *src, size_t len)
{ {