+ * 5 | ...
+ */
+
+ /*
+ * load {d2a, d2b, 0, 0, d2a, d2b, 0, 0}
+ */
+ stateVec1 = vldrhq_gather_shifted_offset((float16_t const *) scratch, reshufledIdxVec);
+ stateVec1 = vfmaq(stateVec1, stateVec0, aCoeffs);
+ stateVec1 = vfmaq(stateVec1, inVec, bCoeffs);
+ *(f16x8_t *) scratch = stateVec1;
+
+ pIn = pIn + 2;
+ sample--;
+ }
+
+ /*
+ * Store the updated state variables back into the state array
+ */
+ *pState++ = vgetq_lane(stateVec1, 0);
+ *pState++ = vgetq_lane(stateVec1, 1);
+ *pState++ = vgetq_lane(stateVec1, 2);
+ *pState++ = vgetq_lane(stateVec1, 3);
+
+ /*
+ * The current stage output is given as the input to the next stage
+ */
+ pIn = pDst;
+ /*
+ * Reset the output working pointer
+ */
+ pOut = pDst;
+ /*
+ * decrement the loop counter
+ */
+ stage--;
+ }
+ while (stage > 0U);
+}
+#else
+LOW_OPTIMIZATION_ENTER
+void arm_biquad_cascade_stereo_df2T_f16(
+ const arm_biquad_cascade_stereo_df2T_instance_f16 * S,
+ const float16_t * pSrc,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ const float16_t *pIn = pSrc; /* Source pointer */
+ float16_t *pOut = pDst; /* Destination pointer */
+ float16_t *pState = S->pState; /* State pointer */
+ const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ _Float16 acc1a, acc1b; /* Accumulator */
+ _Float16 b0, b1, b2, a1, a2; /* Filter coefficients */
+ _Float16 Xn1a, Xn1b; /* Temporary input */
+ _Float16 d1a, d2a, d1b, d2b; /* State variables */
+ uint32_t sample, stage = S->numStages; /* Loop counters */
+
+ do
+ {
+ /* Reading the coefficients */
+ b0 = pCoeffs[0];
+ b1 = pCoeffs[1];
+ b2 = pCoeffs[2];
+ a1 = pCoeffs[3];
+ a2 = pCoeffs[4];
+
+ /* Reading the state values */
+ d1a = pState[0];
+ d2a = pState[1];
+ d1b = pState[2];
+ d2b = pState[3];
+
+ pCoeffs += 5U;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 8 outputs at a time */
+ sample = blockSize >> 3U;
+
+ while (sample > 0U) {
+ /* y[n] = b0 * x[n] + d1 */
+ /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+ /* d2 = b2 * x[n] + a2 * y[n] */
+
+/* 1 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+/* 2 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+/* 3 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+/* 4 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+/* 5 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+/* 6 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+/* 7 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+/* 8 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+ /* decrement loop counter */
+ sample--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ sample = blockSize & 0x7U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ sample = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (sample > 0U) {
+ /* Read the input */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ /* y[n] = b0 * x[n] + d1 */
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ /* Every time after the output is computed state should be updated. */
+ /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ /* d2 = b2 * x[n] + a2 * y[n] */
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+ /* decrement loop counter */
+ sample--;
+ }
+
+ /* Store the updated state variables back into the state array */
+ pState[0] = d1a;
+ pState[1] = d2a;
+
+ pState[2] = d1b;
+ pState[3] = d2b;
+
+ pState += 4U;
+
+ /* The current stage output is given as the input to the next stage */
+ pIn = pDst;
+
+ /* Reset the output working pointer */
+ pOut = pDst;
+
+ /* Decrement the loop counter */
+ stage--;
+
+ } while (stage > 0U);
+
+}
+LOW_OPTIMIZATION_EXIT
+#endif /* #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+ @} end of BiquadCascadeDF2T group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c
new file mode 100644
index 0000000000..ee0ac093b7
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_f32.c
@@ -0,0 +1,420 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_biquad_cascade_stereo_df2T_f32.c
+ * Description: Processing function for floating-point transposed direct form II Biquad cascade filter. 2 channels
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+*/
+
+/**
+ @addtogroup BiquadCascadeDF2T
+ @{
+ */
+
+/**
+ @brief Processing function for the floating-point transposed direct form II Biquad cascade filter.
+ @param[in] S points to an instance of the filter data structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+ */
+#if (defined(ARM_MATH_MVEF) && defined(ARM_MATH_HELIUM_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include "arm_helium_utils.h"
+
+void arm_biquad_cascade_stereo_df2T_f32(
+ const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ const float32_t *pIn = pSrc; /* source pointer */
+ float32_t *pOut = pDst; /* destination pointer */
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
+ float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
+ uint32_t sample, stage = S->numStages; /* loop counters */
+ float32_t scratch[6];
+ uint32x4_t loadIdxVec;
+ f32x4_t aCoeffs, bCoeffs;
+ f32x4_t stateVec0, stateVec1;
+ f32x4_t inVec;
+ uint32_t startIdx = 0;
+
+ /*
+ * {0, 1, 0, 1} generator
+ */
+ loadIdxVec = viwdupq_u32(&startIdx, 2, 1);
+
+ /*
+ * scratch top clearing
+ * layout : [d1a d1b d2a d2b 0 0]
+ */
+ scratch[4] = 0.0f;
+ scratch[5] = 0.0f;
+
+ do
+ {
+ /*
+ * Reading the coefficients
+ */
+ b0 = *pCoeffs++;
+ b1 = *pCoeffs++;
+ b2 = *pCoeffs++;
+ a1 = *pCoeffs++;
+ a2 = *pCoeffs++;
+
+ /*
+ * aCoeffs = {a1 a1 a2 a2}
+ */
+ aCoeffs = vdupq_n_f32(a1);
+ aCoeffs = vsetq_lane(a2, aCoeffs, 2);
+ aCoeffs = vsetq_lane(a2, aCoeffs, 3);
+
+ /*
+ * bCoeffs = {b1 b1 b2 b2}
+ */
+ bCoeffs = vdupq_n_f32(b1);
+ bCoeffs = vsetq_lane(b2, bCoeffs, 2);
+ bCoeffs = vsetq_lane(b2, bCoeffs, 3);
+
+ /*
+ * Reading the state values
+ * Save into scratch
+ */
+ *(f32x4_t *) scratch = *(f32x4_t *) pState;
+
+ sample = blockSize;
+
+ while (sample > 0U)
+ {
+ /*
+ * step 1
+ *
+ * 0 | acc1a = xn1a * b0 + d1a
+ * 1 | acc1b = xn1b * b0 + d1b
+ * 2 | acc1a = xn1a * b0 + d1a
+ * 3 | acc1b = xn1b * b0 + d1b
+ */
+ /*
+ * load {d1a, d1b, d1a, d1b}
+ */
+ stateVec0 = (f32x4_t)vldrwq_gather_shifted_offset((uint32_t const *) scratch, loadIdxVec);
+ /*
+ * load {in0 in1 in0 in1}
+ */
+ inVec = (f32x4_t)vldrwq_gather_shifted_offset((uint32_t const *) pIn, loadIdxVec);
+
+ stateVec0 = vfmaq(stateVec0, inVec, b0);
+ *pOut++ = vgetq_lane(stateVec0, 0);
+ *pOut++ = vgetq_lane(stateVec0, 1);
+
+ /*
+ * step 2
+ *
+ * 0 | d1a = b1 * xn1a + a1 * acc1a + d2a
+ * 1 | d1b = b1 * xn1b + a1 * acc1b + d2b
+ * 2 | d2a = b2 * xn1a + a2 * acc1a + 0
+ * 3 | d2b = b2 * xn1b + a2 * acc1b + 0
+ */
+
+ /*
+ * load {d2a, d2b, 0, 0}
+ */
+ stateVec1 = *(f32x4_t *) & scratch[2];
+ stateVec1 = vfmaq(stateVec1, stateVec0, aCoeffs);
+ stateVec1 = vfmaq(stateVec1, inVec, bCoeffs);
+ *(f32x4_t *) scratch = stateVec1;
+
+ pIn = pIn + 2;
+ sample--;
+ }
+
+ /*
+ * Store the updated state variables back into the state array
+ */
+ vst1q(pState, stateVec1);
+ pState += 4;
+
+ /*
+ * The current stage output is given as the input to the next stage
+ */
+ pIn = pDst;
+ /*
+ * Reset the output working pointer
+ */
+ pOut = pDst;
+ /*
+ * decrement the loop counter
+ */
+ stage--;
+ }
+ while (stage > 0U);
+}
+
+#else
+LOW_OPTIMIZATION_ENTER
+void arm_biquad_cascade_stereo_df2T_f32(
+ const arm_biquad_cascade_stereo_df2T_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ const float32_t *pIn = pSrc; /* Source pointer */
+ float32_t *pOut = pDst; /* Destination pointer */
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t acc1a, acc1b; /* Accumulator */
+ float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
+ float32_t Xn1a, Xn1b; /* Temporary input */
+ float32_t d1a, d2a, d1b, d2b; /* State variables */
+ uint32_t sample, stage = S->numStages; /* Loop counters */
+
+ do
+ {
+ /* Reading the coefficients */
+ b0 = pCoeffs[0];
+ b1 = pCoeffs[1];
+ b2 = pCoeffs[2];
+ a1 = pCoeffs[3];
+ a2 = pCoeffs[4];
+
+ /* Reading the state values */
+ d1a = pState[0];
+ d2a = pState[1];
+ d1b = pState[2];
+ d2b = pState[3];
+
+ pCoeffs += 5U;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 8 outputs at a time */
+ sample = blockSize >> 3U;
+
+ while (sample > 0U) {
+ /* y[n] = b0 * x[n] + d1 */
+ /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+ /* d2 = b2 * x[n] + a2 * y[n] */
+
+/* 1 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+/* 2 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+/* 3 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+/* 4 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+/* 5 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+/* 6 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+/* 7 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+/* 8 */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+ /* decrement loop counter */
+ sample--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ sample = blockSize & 0x7U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ sample = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (sample > 0U) {
+ /* Read the input */
+ Xn1a = *pIn++; /* Channel a */
+ Xn1b = *pIn++; /* Channel b */
+
+ /* y[n] = b0 * x[n] + d1 */
+ acc1a = (b0 * Xn1a) + d1a;
+ acc1b = (b0 * Xn1b) + d1b;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = acc1a;
+ *pOut++ = acc1b;
+
+ /* Every time after the output is computed state should be updated. */
+ /* d1 = b1 * x[n] + a1 * y[n] + d2 */
+ d1a = ((b1 * Xn1a) + (a1 * acc1a)) + d2a;
+ d1b = ((b1 * Xn1b) + (a1 * acc1b)) + d2b;
+
+ /* d2 = b2 * x[n] + a2 * y[n] */
+ d2a = (b2 * Xn1a) + (a2 * acc1a);
+ d2b = (b2 * Xn1b) + (a2 * acc1b);
+
+ /* decrement loop counter */
+ sample--;
+ }
+
+ /* Store the updated state variables back into the state array */
+ pState[0] = d1a;
+ pState[1] = d2a;
+
+ pState[2] = d1b;
+ pState[3] = d2b;
+
+ pState += 4U;
+
+ /* The current stage output is given as the input to the next stage */
+ pIn = pDst;
+
+ /* Reset the output working pointer */
+ pOut = pDst;
+
+ /* Decrement the loop counter */
+ stage--;
+
+ } while (stage > 0U);
+
+}
+LOW_OPTIMIZATION_EXIT
+#endif /* (defined(ARM_MATH_MVEF) && defined(ARM_MATH_HELIUM_EXPERIMENTAL)) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of BiquadCascadeDF2T group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f16.c
new file mode 100644
index 0000000000..633fbfef28
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f16.c
@@ -0,0 +1,90 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_biquad_cascade_stereo_df2T_init_f16.c
+ * Description: Initialization function for floating-point transposed direct form II Biquad cascade filter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup BiquadCascadeDF2T
+ @{
+ */
+
+/**
+ @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+ @param[in,out] S points to an instance of the filter data structure.
+ @param[in] numStages number of 2nd order stages in the filter.
+ @param[in] pCoeffs points to the filter coefficients.
+ @param[in] pState points to the state buffer.
+ @return none
+
+ @par Coefficient and State Ordering
+ The coefficients are stored in the array pCoeffs in the following order:
+
+ {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+
+ @par
+ where b1x and a1x are the coefficients for the first stage,
+ b2x and a2x are the coefficients for the second stage,
+ and so on. The pCoeffs array contains a total of 5*numStages values.
+ @par
+ The pState is a pointer to state array.
+ Each Biquad stage has 2 state variables d1, and d2 for each channel.
+ The 4 state variables (d1 and d2 for each of the 2 channels) for stage 1 are first, then the 4 state variables for stage 2, and so on.
+ The state array has a total length of 4*numStages values.
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
+ */
+
+void arm_biquad_cascade_stereo_df2T_init_f16(
+ arm_biquad_cascade_stereo_df2T_instance_f16 * S,
+ uint8_t numStages,
+ const float16_t * pCoeffs,
+ float16_t * pState)
+{
+ /* Record the number of 2nd-order stages in the cascade */
+ S->numStages = numStages;
+
+ /* Coefficients are referenced, not copied: 5 values per stage {b0, b1, b2, a1, a2} */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer; size is always 4 * numStages (d1 and d2 for each of the 2 channels) */
+ memset(pState, 0, (4U * (uint32_t) numStages) * sizeof(float16_t));
+
+ /* State buffer is referenced, not copied; updated by the processing function */
+ S->pState = pState;
+}
+
+/**
+ @} end of BiquadCascadeDF2T group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f32.c
new file mode 100644
index 0000000000..6932b9a48b
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_biquad_cascade_stereo_df2T_init_f32.c
@@ -0,0 +1,86 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_biquad_cascade_stereo_df2T_init_f32.c
+ * Description: Initialization function for floating-point transposed direct form II Biquad cascade filter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup BiquadCascadeDF2T
+ @{
+ */
+
+/**
+ @brief Initialization function for the floating-point transposed direct form II Biquad cascade filter.
+ @param[in,out] S points to an instance of the filter data structure.
+ @param[in] numStages number of 2nd order stages in the filter.
+ @param[in] pCoeffs points to the filter coefficients.
+ @param[in] pState points to the state buffer.
+ @return none
+
+ @par Coefficient and State Ordering
+ The coefficients are stored in the array pCoeffs in the following order:
+
+ {b10, b11, b12, a11, a12, b20, b21, b22, a21, a22, ...}
+
+ @par
+ where b1x and a1x are the coefficients for the first stage,
+ b2x and a2x are the coefficients for the second stage,
+ and so on. The pCoeffs array contains a total of 5*numStages values.
+ @par
+ The pState is a pointer to state array.
+ Each Biquad stage has 2 state variables d1, and d2 for each channel.
+ The 4 state variables (d1 and d2 for each of the 2 channels) for stage 1 are first, then the 4 state variables for stage 2, and so on.
+ The state array has a total length of 4*numStages values.
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
+ */
+
+void arm_biquad_cascade_stereo_df2T_init_f32(
+ arm_biquad_cascade_stereo_df2T_instance_f32 * S,
+ uint8_t numStages,
+ const float32_t * pCoeffs,
+ float32_t * pState)
+{
+ /* Record the number of 2nd-order stages in the cascade */
+ S->numStages = numStages;
+
+ /* Coefficients are referenced, not copied: 5 values per stage {b0, b1, b2, a1, a2} */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer; size is always 4 * numStages (d1 and d2 for each of the 2 channels) */
+ memset(pState, 0, (4U * (uint32_t) numStages) * sizeof(float32_t));
+
+ /* State buffer is referenced, not copied; updated by the processing function */
+ S->pState = pState;
+}
+
+/**
+ @} end of BiquadCascadeDF2T group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_f32.c
new file mode 100644
index 0000000000..0943864517
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_f32.c
@@ -0,0 +1,964 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_f32.c
+ * Description: Convolution of floating-point sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @defgroup Conv Convolution
+
+ Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.
+ Convolution is similar to correlation and is frequently used in filtering and data analysis.
+ The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
+ The library also provides fast versions of the Q15 and Q31 functions.
+
+ @par Algorithm
+ Let a[n] and b[n] be sequences of length srcALen and
+ srcBLen samples respectively. Then the convolution
+
+ c[n] = a[n] * b[n]
+
+ @par
+ is defined as
+ \image html ConvolutionEquation.gif
+ @par
+ Note that c[n] is of length srcALen + srcBLen - 1 and is defined over the interval n=0, 1, 2, ..., srcALen + srcBLen - 2.
+ pSrcA points to the first input vector of length srcALen and
+ pSrcB points to the second input vector of length srcBLen.
+ The output result is written to pDst and the calling function must allocate srcALen+srcBLen-1 words for the result.
+ @par
+ Conceptually, when two signals a[n] and b[n] are convolved,
+ the signal b[n] slides over a[n].
+ For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
+ @par
+ Note that convolution is a commutative operation:
+
+ a[n] * b[n] = b[n] * a[n].
+
+ @par
+ This means that switching the A and B arguments to the convolution functions has no effect.
+
+ @par Fixed-Point Behavior
+ Convolution requires summing up a large number of intermediate products.
+ As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
+ Refer to the function specific documentation below for further details of the particular algorithm used.
+
+ @par Fast Versions
+ Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of conv and the design requires
+ the input signals should be scaled down to avoid intermediate overflows.
+
+ @par Opt Versions
+ Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
+ These versions are optimised in cycles and consumes more memory (Scratch memory) compared to Q15 and Q7 versions
+ */
+
+/**
+ @addtogroup Conv
+ @{
+ */
+
+/**
+ @brief Convolution of floating-point sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ @return none
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_filtering.h"
+
+
+void arm_conv_f32(
+ const float32_t * pSrcA,
+ uint32_t srcALen,
+ const float32_t * pSrcB,
+ uint32_t srcBLen,
+ float32_t * pDst)
+{
+ const float32_t *pIn1 = pSrcA; /* inputA pointer */
+ const float32_t *pIn2 = pSrcB; /* inputB pointer */
+ /*
+ * Loop to perform MAC operations according to correlation equation
+ */
+ const float32_t *pX;
+ const float32_t *pY;
+ const float32_t *pA;
+ const float32_t *pB;
+ int32_t i = 0U, j = 0; /* loop counters */
+ int32_t block1, block2, block3;
+ uint32_t vddupStartIdx = 3;
+ uint32x4_t decrIdxVec = vddupq_u32(vddupStartIdx, 1);
+
+ if (srcALen < srcBLen)
+ {
+ /*
+ * Initialization to inputB pointer
+ */
+ pIn1 = pSrcB;
+ /*
+ * Initialization to the end of inputA pointer
+ */
+ pIn2 = pSrcA;
+ /*
+ * Swapping the lengths
+ */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+ }
+
+ block1 = srcBLen - 1;
+ block2 = srcALen - srcBLen + 1;
+ block3 = srcBLen - 1;
+
+ pA = pIn1;
+ pB = pIn2 - 3;
+
+ for (i = 0; i <= block1 - 2; i += 2)
+ {
+ uint32_t count = i + 1;
+ float32_t acc0;
+ float32_t acc1;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 2 accumulators per loop
+ * size is incrementing for successive accumulators
+ * Y pointer is incrementing for successive accumulators
+ */
+ MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count);
+
+ *pDst++ = acc0;
+ *pDst++ = acc1;
+ pB += 2;
+ }
+
+ for (; i < block1; i++)
+ {
+ uint32_t count = i + 1;
+ float32_t acc;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count);
+
+ *pDst++ = acc;
+ pB++;
+ }
+
+ for (i = 0; i <= block2 - 2; i += 2)
+ {
+ uint32_t count = srcBLen;
+ float32_t acc0 = 0;
+ float32_t acc1 = 0;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 2 accumulators per loop
+ * size is fixed for all accumulators
+ * X pointer is incrementing for successive accumulators
+ */
+ MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, count);
+ *pDst++ = acc0;
+ *pDst++ = acc1;
+ pA += 2;
+ }
+ if (block2 & 1)
+ {
+ uint32_t count = srcBLen;
+ float32_t acc = 0;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count);
+
+ *pDst++ = acc;
+ pA++;
+ }
+
+ for (i = block3; i >= 2; i -= 2)
+ {
+ int32_t count = i;
+ float32_t acc0;
+ float32_t acc1;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 2 accumulators per loop
+ * size is decrementing for successive accumulators
+ * X pointer is incrementing for successive accumulators
+ */
+ MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count);
+
+ *pDst++ = acc0;
+ *pDst++ = acc1;
+ pA += 2;
+ }
+ for (; i >= 1; i--)
+ {
+ int32_t count = i;
+ float32_t acc;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CONV_SINGLE_F32(acc, pX, pY, count);
+
+ *pDst++ = acc;
+ pA++;
+ }
+}
+#else
+void arm_conv_f32(
+  const float32_t * pSrcA,
+        uint32_t srcALen,
+  const float32_t * pSrcB,
+        uint32_t srcBLen,
+        float32_t * pDst)
+{
+
+#if defined(ARM_MATH_DSP)
+
+  const float32_t *pIn1;                               /* InputA pointer */
+  const float32_t *pIn2;                               /* InputB pointer */
+        float32_t *pOut = pDst;                        /* Output pointer */
+  const float32_t *px;                                 /* Intermediate inputA pointer */
+  const float32_t *py;                                 /* Intermediate inputB pointer */
+  const float32_t *pSrc1, *pSrc2;                      /* Intermediate pointers */
+        float32_t sum;                                 /* Accumulators */
+        uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
+        uint32_t j, k, count, blkCnt;                  /* Loop counters */
+
+
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+  float32_t acc0, acc1, acc2, acc3,c0;                 /* Accumulators */
+#if !defined(ARM_MATH_NEON)
+  float32_t x0, x1, x2, x3;                            /* Temporary variables to hold state and coefficient values */
+#endif
+#endif
+
+  /* The algorithm implementation is based on the lengths of the inputs. */
+  /* srcB is always made to slide across srcA. */
+  /* So srcBLen is always considered as shorter or equal to srcALen */
+  if (srcALen >= srcBLen)
+  {
+    /* Initialization of inputA pointer */
+    pIn1 = pSrcA;
+
+    /* Initialization of inputB pointer */
+    pIn2 = pSrcB;
+  }
+  else
+  {
+    /* Initialization of inputA pointer */
+    pIn1 = pSrcB;
+
+    /* Initialization of inputB pointer */
+    pIn2 = pSrcA;
+
+    /* srcBLen is always considered as shorter or equal to srcALen */
+    j = srcBLen;
+    srcBLen = srcALen;
+    srcALen = j;
+  }
+
+  /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+  /* The function is internally
+   * divided into three stages according to the number of multiplications that has to be
+   * taken place between inputA samples and inputB samples. In the first stage of the
+   * algorithm, the multiplications increase by one for every iteration.
+   * In the second stage of the algorithm, srcBLen number of multiplications are done.
+   * In the third stage of the algorithm, the multiplications decrease by one
+   * for every iteration. */
+
+  /* The algorithm is implemented in three stages.
+     The loop counters of each stage is initiated here. */
+  blockSize1 = srcBLen - 1U;
+  blockSize2 = srcALen - (srcBLen - 1U);
+  blockSize3 = blockSize1;
+
+  /* --------------------------
+   * Initializations of stage1
+   * -------------------------*/
+
+  /* sum = x[0] * y[0]
+   * sum = x[0] * y[1] + x[1] * y[0]
+   * ....
+   * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+   */
+
+  /* In this stage the MAC operations are increased by 1 for every iteration.
+     The count variable holds the number of MAC operations performed */
+  count = 1U;
+
+  /* Working pointer of inputA */
+  px = pIn1;
+
+  /* Working pointer of inputB */
+  py = pIn2;
+
+
+  /* ------------------------
+   * Stage1 process
+   * ----------------------*/
+#if defined(ARM_MATH_NEON)
+  float32x4_t vec1;
+  float32x4_t vec2;
+  float32x4_t res = vdupq_n_f32(0) ;
+  float32x2_t accum = vdup_n_f32(0);
+#endif /* #if defined(ARM_MATH_NEON) */
+
+  /* The first stage starts here */
+  while (blockSize1 > 0U)
+  {
+    /* Accumulator is made zero for every iteration */
+    sum = 0.0f;
+
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+    /* Loop unrolling: Compute 4 outputs at a time */
+    k = count >> 2U;
+
+#if defined(ARM_MATH_NEON)
+    res = vdupq_n_f32(0) ;
+    accum = vdup_n_f32(0);
+
+    /* Compute 4 MACs simultaneously. */
+    k = count >> 2U;
+
+    /* First part of the processing.  Compute 4 MACs at a time.
+     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+
+    while (k > 0U)
+    {
+      vec1 = vld1q_f32(px);
+      vec2 = vld1q_f32(py-3);
+      vec2 = vrev64q_f32(vec2);
+      vec2 = vcombine_f32(vget_high_f32(vec2), vget_low_f32(vec2));
+
+      res = vmlaq_f32(res,vec1, vec2);
+
+      /* Increment pointers */
+      px += 4;
+      py -= 4;
+
+      /* Decrement the loop counter */
+      k--;
+    }
+
+    accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
+    sum += accum[0] + accum[1];
+
+    /* If the count is not a multiple of 4, compute any remaining MACs here.
+     ** No loop unrolling is used. */
+    k = count & 3;
+#else
+    while (k > 0U)
+    {
+      /* x[0] * y[srcBLen - 1] */
+      sum += *px++ * *py--;
+
+      /* x[1] * y[srcBLen - 2] */
+      sum += *px++ * *py--;
+
+      /* x[2] * y[srcBLen - 3] */
+      sum += *px++ * *py--;
+
+      /* x[3] * y[srcBLen - 4] */
+      sum += *px++ * *py--;
+
+      /* Decrement loop counter */
+      k--;
+    }
+
+    /* Loop unrolling: Compute remaining MACs */
+    k = count % 0x4U;
+
+#endif /* #if defined(ARM_MATH_NEON) */
+
+#else /* defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
+    /* Initialize k with number of samples */
+    k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
+
+    while (k > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      sum += *px++ * *py--;
+
+      /* Decrement loop counter */
+      k--;
+    }
+
+    /* Store the result in the accumulator in the destination buffer. */
+    *pOut++ = sum;
+
+    /* Update the inputA and inputB pointers for next MAC calculation */
+    py = pIn2 + count;
+    px = pIn1;
+
+    /* Increment MAC count */
+    count++;
+
+    /* Decrement loop counter */
+    blockSize1--;
+  }
+
+  /* --------------------------
+   * Initializations of stage2
+   * ------------------------*/
+
+  /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+   * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen]   * y[0]
+   * ....
+   * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+   */
+
+  /* Working pointer of inputA */
+  px = pIn1;
+
+  /* Working pointer of inputB */
+  pSrc2 = pIn2 + (srcBLen - 1U);
+  py = pSrc2;
+
+  /* count is index by which the pointer pIn1 to be incremented */
+  count = 0U;
+
+  /* -------------------
+   * Stage2 process
+   * ------------------*/
+
+  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+   * So, to loop unroll over blockSize2,
+   * srcBLen should be greater than or equal to 4 */
+  if (srcBLen >= 4U)
+  {
+
+#if defined(ARM_MATH_NEON)
+    float32x4_t c;
+    float32x4_t x1v;
+    float32x4_t x2v;
+    float32x4_t x;
+    float32x4_t res = vdupq_n_f32(0) ;
+#endif /* #if defined(ARM_MATH_NEON) */
+
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+    blkCnt = blockSize2 >> 2U;
+
+    while (blkCnt > 0U)
+    {
+      /* Set all accumulators to zero */
+      acc0 = 0.0f;
+      acc1 = 0.0f;
+      acc2 = 0.0f;
+      acc3 = 0.0f;
+
+      /* Apply loop unrolling and compute 4 MACs simultaneously. */
+      k = srcBLen >> 2U;
+
+#if defined(ARM_MATH_NEON)
+      res = vdupq_n_f32(0) ;
+
+      x1v = vld1q_f32(px);
+      x2v = vld1q_f32(px+4);
+
+      do
+      {
+        c = vld1q_f32(py-3);
+
+        px += 4;
+        x = x1v;
+        res = vmlaq_n_f32(res,x,c[3]);
+
+        x = vextq_f32(x1v,x2v,1);
+
+        res = vmlaq_n_f32(res,x,c[2]);
+
+        x = vextq_f32(x1v,x2v,2);
+
+        res = vmlaq_n_f32(res,x,c[1]);
+
+        x = vextq_f32(x1v,x2v,3);
+
+        res = vmlaq_n_f32(res,x,c[0]);
+
+        py -= 4;
+
+        x1v = x2v ;
+        x2v = vld1q_f32(px+4);
+
+      } while (--k);
+
+
+      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+       ** No loop unrolling is used. */
+      k = srcBLen & 0x3;
+
+      x1v = vld1q_f32(px);
+      px += 4;
+
+      while (k > 0U)
+      {
+        /* Read y[srcBLen - 5] sample */
+        c0 = *(py--);
+
+        res = vmlaq_n_f32(res,x1v,c0);
+
+        /* Reuse the present samples for the next MAC */
+        x1v[0] = x1v[1];
+        x1v[1] = x1v[2];
+        x1v[2] = x1v[3];
+
+        x1v[3] = *(px++);
+
+        /* Decrement the loop counter */
+        k--;
+      }
+
+      acc0 = res[0];
+      acc1 = res[1];
+      acc2 = res[2];
+      acc3 = res[3];
+
+#else
+      /* read x[0], x[1], x[2] samples */
+      x0 = *px++;
+      x1 = *px++;
+      x2 = *px++;
+
+      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
+       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+      do
+      {
+        /* Read y[srcBLen - 1] sample */
+        c0 = *py--;
+        /* Read x[3] sample */
+        x3 = *(px);
+
+        /* Perform the multiply-accumulate */
+        /* acc0 +=  x[0] * y[srcBLen - 1] */
+        acc0 += x0 * c0;
+        /* acc1 +=  x[1] * y[srcBLen - 1] */
+        acc1 += x1 * c0;
+        /* acc2 +=  x[2] * y[srcBLen - 1] */
+        acc2 += x2 * c0;
+        /* acc3 +=  x[3] * y[srcBLen - 1] */
+        acc3 += x3 * c0;
+
+        /* Read y[srcBLen - 2] sample */
+        c0 = *py--;
+        /* Read x[4] sample */
+        x0 = *(px + 1U);
+
+        /* Perform the multiply-accumulate */
+        /* acc0 +=  x[1] * y[srcBLen - 2] */
+        acc0 += x1 * c0;
+        /* acc1 +=  x[2] * y[srcBLen - 2] */
+        acc1 += x2 * c0;
+        /* acc2 +=  x[3] * y[srcBLen - 2] */
+        acc2 += x3 * c0;
+        /* acc3 +=  x[4] * y[srcBLen - 2] */
+        acc3 += x0 * c0;
+
+        /* Read y[srcBLen - 3] sample */
+        c0 = *py--;
+        /* Read x[5] sample */
+        x1 = *(px + 2U);
+
+        /* Perform the multiply-accumulate */
+        /* acc0 +=  x[2] * y[srcBLen - 3] */
+        acc0 += x2 * c0;
+        /* acc1 +=  x[3] * y[srcBLen - 3] */
+        acc1 += x3 * c0;
+        /* acc2 +=  x[4] * y[srcBLen - 3] */
+        acc2 += x0 * c0;
+        /* acc3 +=  x[5] * y[srcBLen - 3] */
+        acc3 += x1 * c0;
+
+        /* Read y[srcBLen - 4] sample */
+        c0 = *py--;
+        /* Read x[6] sample */
+        x2 = *(px + 3U);
+        px += 4U;
+
+        /* Perform the multiply-accumulate */
+        /* acc0 +=  x[3] * y[srcBLen - 4] */
+        acc0 += x3 * c0;
+        /* acc1 +=  x[4] * y[srcBLen - 4] */
+        acc1 += x0 * c0;
+        /* acc2 +=  x[5] * y[srcBLen - 4] */
+        acc2 += x1 * c0;
+        /* acc3 +=  x[6] * y[srcBLen - 4] */
+        acc3 += x2 * c0;
+
+      } while (--k);
+
+      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+       ** No loop unrolling is used. */
+      k = srcBLen % 0x4U;
+
+      while (k > 0U)
+      {
+        /* Read y[srcBLen - 5] sample */
+        c0 = *py--;
+        /* Read x[7] sample */
+        x3 = *px++;
+
+        /* Perform the multiply-accumulate */
+        /* acc0 +=  x[4] * y[srcBLen - 5] */
+        acc0 += x0 * c0;
+        /* acc1 +=  x[5] * y[srcBLen - 5] */
+        acc1 += x1 * c0;
+        /* acc2 +=  x[6] * y[srcBLen - 5] */
+        acc2 += x2 * c0;
+        /* acc3 +=  x[7] * y[srcBLen - 5] */
+        acc3 += x3 * c0;
+
+        /* Reuse the present samples for the next MAC */
+        x0 = x1;
+        x1 = x2;
+        x2 = x3;
+
+        /* Decrement the loop counter */
+        k--;
+      }
+#endif /* #if defined(ARM_MATH_NEON) */
+
+      /* Store the result in the accumulator in the destination buffer. */
+      *pOut++ = acc0;
+      *pOut++ = acc1;
+      *pOut++ = acc2;
+      *pOut++ = acc3;
+
+      /* Increment the pointer pIn1 index, count by 4 */
+      count += 4U;
+
+      /* Update the inputA and inputB pointers for next MAC calculation */
+      px = pIn1 + count;
+      py = pSrc2;
+
+      /* Decrement the loop counter */
+      blkCnt--;
+    }
+
+    /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+     ** No loop unrolling is used. */
+    blkCnt = blockSize2 % 0x4U;
+
+#else
+
+    /* Initialize blkCnt with number of samples */
+    blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined (ARM_MATH_NEON)*/
+
+    while (blkCnt > 0U)
+    {
+      /* Accumulator is made zero for every iteration */
+      sum = 0.0f;
+
+#if defined(ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL)
+      /* Loop unrolling: Compute 4 outputs at a time */
+      k = srcBLen >> 2U;
+
+#if defined (ARM_MATH_NEON)
+      float32x4_t res = vdupq_n_f32(0) ;
+      float32x4_t x = vdupq_n_f32(0) ;
+      float32x4_t y = vdupq_n_f32(0) ;
+      float32x2_t accum = vdup_n_f32(0) ;
+
+      /* First part of the processing.  Compute 4 MACs at a time.
+       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+      while (k > 0U)
+      {
+        x = vld1q_f32(px);
+        y = vld1q_f32(py-3);
+
+        y = vrev64q_f32(y);
+        y = vcombine_f32(vget_high_f32(y), vget_low_f32(y));
+
+        res = vmlaq_f32(res,x,y);
+
+        px += 4 ;
+        py -= 4 ;
+
+        /* Decrement the loop counter */
+        k--;
+      }
+
+      accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
+      sum += accum[0] + accum[1];
+
+      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+       ** No loop unrolling is used. */
+      k = srcBLen & 0x3U;
+
+#else
+      while (k > 0U)
+      {
+        /* Perform the multiply-accumulate */
+        sum += *px++ * *py--;
+        sum += *px++ * *py--;
+        sum += *px++ * *py--;
+        sum += *px++ * *py--;
+
+        /* Decrement loop counter */
+        k--;
+      }
+
+      /* Loop unrolling: Compute remaining MACs */
+      k = srcBLen % 0x4U;
+
+#endif /* if defined (ARM_MATH_NEON) */
+#else
+      /* Initialize k with number of samples */
+      k = srcBLen;
+
+#endif /* #if defined(ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL) */
+
+      while (k > 0U)
+      {
+        /* Perform the multiply-accumulate */
+        sum += *px++ * *py--;
+
+        /* Decrement the loop counter */
+        k--;
+      }
+
+      /* Store the result in the accumulator in the destination buffer. */
+      *pOut++ = sum;
+
+      /* Increment the MAC count */
+      count++;
+
+      /* Update the inputA and inputB pointers for next MAC calculation */
+      px = pIn1 + count;
+      py = pSrc2;
+
+      /* Decrement the loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    /* If the srcBLen is not a multiple of 4,
+     * the blockSize2 loop cannot be unrolled by 4 */
+    blkCnt = blockSize2;
+
+    while (blkCnt > 0U)
+    {
+      /* Accumulator is made zero for every iteration */
+      sum = 0.0f;
+
+      /* srcBLen number of MACS should be performed */
+      k = srcBLen;
+
+      while (k > 0U)
+      {
+        /* Perform the multiply-accumulate */
+        sum += *px++ * *py--;
+
+        /* Decrement the loop counter */
+        k--;
+      }
+
+      /* Store the result in the accumulator in the destination buffer. */
+      *pOut++ = sum;
+
+      /* Increment the MAC count */
+      count++;
+
+      /* Update the inputA and inputB pointers for next MAC calculation */
+      px = pIn1 + count;
+      py = pSrc2;
+
+      /* Decrement the loop counter */
+      blkCnt--;
+    }
+  }
+
+
+  /* --------------------------
+   * Initializations of stage3
+   * -------------------------*/
+
+  /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+   * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+   * ....
+   * sum +=  x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+   * sum +=  x[srcALen-1] * y[srcBLen-1]
+   */
+
+  /* In this stage the MAC operations are decreased by 1 for every iteration.
+     The blockSize3 variable holds the number of MAC operations performed */
+
+  /* Working pointer of inputA */
+  pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
+  px = pSrc1;
+
+  /* Working pointer of inputB */
+  pSrc2 = pIn2 + (srcBLen - 1U);
+  py = pSrc2;
+
+  /* -------------------
+   * Stage3 process
+   * ------------------*/
+  while (blockSize3 > 0U)
+  {
+    /* Accumulator is made zero for every iteration */
+    sum = 0.0f;
+
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+    /* Loop unrolling: Compute 4 outputs at a time */
+    k = blockSize3 >> 2U;
+
+#if defined(ARM_MATH_NEON)
+    float32x4_t res = vdupq_n_f32(0) ;
+    float32x4_t x = vdupq_n_f32(0) ;
+    float32x4_t y = vdupq_n_f32(0) ;
+    float32x2_t accum = vdup_n_f32(0) ;
+
+    while (k > 0U)
+    {
+      x = vld1q_f32(px);
+      y = vld1q_f32(py-3);
+
+      y = vrev64q_f32(y);
+      y = vcombine_f32(vget_high_f32(y), vget_low_f32(y));
+
+      res = vmlaq_f32(res,x,y);
+
+      px += 4 ;
+      py -= 4 ;
+
+      /* Decrement the loop counter */
+      k--;
+    }
+
+    accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
+    sum += accum[0] + accum[1];
+
+#else
+    while (k > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+      sum += *px++ * *py--;
+
+      /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+      sum += *px++ * *py--;
+
+      /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+      sum += *px++ * *py--;
+
+      /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+      sum += *px++ * *py--;
+
+      /* Decrement loop counter */
+      k--;
+    }
+#endif /* #if defined (ARM_MATH_NEON) */
+
+    /* Loop unrolling: Compute remaining MACs */
+    k = blockSize3 % 0x4U;
+#else
+
+    /* Initialize k with number of samples */
+    k = blockSize3;
+
+#endif /* #if defined (ARM_MATH_NEON) || defined (ARM_MATH_LOOPUNROLL)*/
+
+    while (k > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      /* sum +=  x[srcALen-1] * y[srcBLen-1] */
+      sum += *px++ * *py--;
+
+      /* Decrement loop counter */
+      k--;
+    }
+
+    /* Store the result in the accumulator in the destination buffer. */
+    *pOut++ = sum;
+
+    /* Update the inputA and inputB pointers for next MAC calculation */
+    px = ++pSrc1;
+    py = pSrc2;
+
+    /* Decrement the loop counter */
+    blockSize3--;
+  }
+
+#else
+/* alternate version for when ARM_MATH_DSP is not defined (e.g. CM0 family) */
+
+  const float32_t *pIn1 = pSrcA;                       /* InputA pointer */
+  const float32_t *pIn2 = pSrcB;                       /* InputB pointer */
+        float32_t sum;                                 /* Accumulator */
+        uint32_t i, j;                                 /* Loop counters */
+
+  /* Loop to calculate convolution for output length number of times */
+  for (i = 0U; i < (srcALen + srcBLen - 1U); i++)
+  {
+    /* Initialize sum with zero to carry out MAC operations */
+    sum = 0.0f;
+
+    /* Loop to perform MAC operations according to convolution equation */
+    for (j = 0U; j <= i; j++)
+    {
+      /* Check the array limitations */
+      if (((i - j) < srcBLen) && (j < srcALen))
+      {
+        /* z[i] += x[i-j] * y[j] */
+        sum += ( pIn1[j] * pIn2[i - j]);
+      }
+    }
+
+    /* Store the output in the destination buffer */
+    pDst[i] = sum;
+  }
+
+#endif /* #if defined(ARM_MATH_DSP) */
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of Conv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_opt_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_opt_q15.c
new file mode 100644
index 0000000000..06629f1acf
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_opt_q15.c
@@ -0,0 +1,366 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_fast_opt_q15.c
+ * Description: Fast Q15 Convolution
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Conv
+ @{
+ */
+
+/**
+ @brief Convolution of Q15 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1
+ @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2
+  @param[in]     pScratch2  points to scratch buffer of size min(srcALen, srcBLen)
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This fast version uses a 32-bit accumulator with 2.30 format.
+ The accumulator maintains full precision of the intermediate multiplication results
+ but provides only a single guard bit. There is no saturation on intermediate additions.
+ Thus, if the accumulator overflows it wraps around and distorts the result.
+ The input signals should be scaled down to avoid intermediate overflows.
+ Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
+ as maximum of min(srcALen, srcBLen) number of additions are carried internally.
+ The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
+
+ @remark
+ Refer to \ref arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
+ */
+
+void arm_conv_fast_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        q15_t * pScratch1,
+        q15_t * pScratch2)
+{
+        q31_t acc0;                                    /* Accumulators */
+  const q15_t *pIn1;                                   /* InputA pointer */
+  const q15_t *pIn2;                                   /* InputB pointer */
+        q15_t *pOut = pDst;                            /* Output pointer */
+        q15_t *pScr1 = pScratch1;                      /* Temporary pointer for scratch1 */
+        q15_t *pScr2 = pScratch2;                      /* Temporary pointer for scratch2 */
+  const q15_t *px;                                     /* Intermediate inputA pointer */
+        q15_t *py;                                     /* Intermediate inputB pointer */
+        uint32_t j, k, blkCnt;                         /* Loop counter */
+        uint32_t tapCnt;                               /* Loop count */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+        q31_t acc1, acc2, acc3;                        /* Accumulators */
+        q31_t x1, x2, x3;                              /* Temporary variables to hold state and coefficient values */
+        q31_t y1, y2;                                  /* State variables */
+#endif
+
+
+  /* The algorithm implementation is based on the lengths of the inputs. */
+  /* srcB is always made to slide across srcA. */
+  /* So srcBLen is always considered as shorter or equal to srcALen */
+  if (srcALen >= srcBLen)
+  {
+    /* Initialization of inputA pointer */
+    pIn1 = pSrcA;
+
+    /* Initialization of inputB pointer */
+    pIn2 = pSrcB;
+  }
+  else
+  {
+    /* Initialization of inputA pointer */
+    pIn1 = pSrcB;
+
+    /* Initialization of inputB pointer */
+    pIn2 = pSrcA;
+
+    /* srcBLen is always considered as shorter or equal to srcALen */
+    j = srcBLen;
+    srcBLen = srcALen;
+    srcALen = j;
+  }
+
+  /* Pointer to take end of scratch2 buffer */
+  pScr2 = pScratch2 + srcBLen - 1;
+
+  /* points to smaller length sequence */
+  px = pIn2;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  k = srcBLen >> 2U;
+
+  /* Copy smaller length input sequence in reverse order into second scratch buffer */
+  while (k > 0U)
+  {
+    /* copy second buffer in reversal manner */
+    *pScr2-- = *px++;
+    *pScr2-- = *px++;
+    *pScr2-- = *px++;
+    *pScr2-- = *px++;
+
+    /* Decrement loop counter */
+    k--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  k = srcBLen % 0x4U;
+
+#else
+
+  /* Initialize k with number of samples */
+  k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (k > 0U)
+  {
+    /* copy second buffer in reversal manner for remaining samples */
+    *pScr2-- = *px++;
+
+    /* Decrement loop counter */
+    k--;
+  }
+
+  /* Initialize temporary scratch pointer */
+  pScr1 = pScratch1;
+
+  /* Assuming scratch1 buffer is aligned by 32-bit */
+  /* Fill (srcBLen - 1U) zeros in scratch1 buffer */
+  arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+  /* Update temporary scratch pointer */
+  pScr1 += (srcBLen - 1U);
+
+  /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
+
+  /* Copy (srcALen) samples in scratch buffer */
+  arm_copy_q15(pIn1, pScr1, srcALen);
+
+  /* Update pointers */
+  pScr1 += srcALen;
+
+
+  /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
+  arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+  /* Update pointer */
+  pScr1 += (srcBLen - 1U);
+
+  /* Temporary pointer for scratch2 */
+  py = pScratch2;
+
+
+  /* Initialization of pIn2 pointer */
+  pIn2 = py;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = (srcALen + srcBLen - 1U) >> 2;
+
+  while (blkCnt > 0)
+  {
+    /* Initialize temporary scratch pointer as scratch1 */
+    pScr1 = pScratch1;
+
+    /* Clear Accumulators */
+    acc0 = 0;
+    acc1 = 0;
+    acc2 = 0;
+    acc3 = 0;
+
+    /* Read two samples from scratch1 buffer */
+    x1 = read_q15x2_ia (&pScr1);
+
+    /* Read next two samples from scratch1 buffer */
+    x2 = read_q15x2_ia (&pScr1);
+
+    tapCnt = (srcBLen) >> 2U;
+
+    while (tapCnt > 0U)
+    {
+
+      /* Read four samples from smaller buffer */
+      y1 = read_q15x2_ia ((q15_t **) &pIn2);
+      y2 = read_q15x2_ia ((q15_t **) &pIn2);
+
+      /* multiply and accumulate */
+      acc0 = __SMLAD(x1, y1, acc0);
+      acc2 = __SMLAD(x2, y1, acc2);
+
+      /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+      x3 = __PKHBT(x2, x1, 0);
+#else
+      x3 = __PKHBT(x1, x2, 0);
+#endif
+
+      /* multiply and accumulate */
+      acc1 = __SMLADX(x3, y1, acc1);
+
+      /* Read next two samples from scratch1 buffer */
+      x1 = read_q15x2_ia (&pScr1);
+
+      /* multiply and accumulate */
+      acc0 = __SMLAD(x2, y2, acc0);
+      acc2 = __SMLAD(x1, y2, acc2);
+
+      /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+      x3 = __PKHBT(x1, x2, 0);
+#else
+      x3 = __PKHBT(x2, x1, 0);
+#endif
+
+      acc3 = __SMLADX(x3, y1, acc3);
+      acc1 = __SMLADX(x3, y2, acc1);
+
+      x2 = read_q15x2_ia (&pScr1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+      x3 = __PKHBT(x2, x1, 0);
+#else
+      x3 = __PKHBT(x1, x2, 0);
+#endif
+
+      acc3 = __SMLADX(x3, y2, acc3);
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Update scratch pointer for remaining samples of smaller length sequence */
+    pScr1 -= 4U;
+
+    /* Apply the same for the remaining samples of the smaller length sequence */
+    tapCnt = (srcBLen) & 3U;
+
+    while (tapCnt > 0U)
+    {
+      /* accumulate the results */
+      acc0 += (*pScr1++ * *pIn2);
+      acc1 += (*pScr1++ * *pIn2);
+      acc2 += (*pScr1++ * *pIn2);
+      acc3 += (*pScr1++ * *pIn2++);
+
+      pScr1 -= 3U;
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    blkCnt--;
+
+    /* Store the results in the accumulators in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+    write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+    write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
+#else
+    write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+    write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    /* Initialization of inputB pointer */
+    pIn2 = py;
+
+    pScratch1 += 4U;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = (srcALen + srcBLen - 1U) & 0x3;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = (srcALen + srcBLen - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* Calculate convolution for remaining samples of Bigger length sequence */
+  while (blkCnt > 0)
+  {
+    /* Initialize temporary scratch pointer as scratch1 */
+    pScr1 = pScratch1;
+
+    /* Clear Accumulators */
+    acc0 = 0;
+
+    tapCnt = (srcBLen) >> 1U;
+
+    while (tapCnt > 0U)
+    {
+
+      /* Perform the multiply-accumulates */
+      acc0 += (*pScr1++ * *pIn2++);
+      acc0 += (*pScr1++ * *pIn2++);
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    tapCnt = (srcBLen) & 1U;
+
+    /* Apply the same for the remaining samples of the smaller length sequence */
+    while (tapCnt > 0U)
+    {
+
+      /* accumulate the results */
+      acc0 += (*pScr1++ * *pIn2++);
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    blkCnt--;
+
+    /* The result is in 2.30 format.  Convert to 1.15 with saturation.
+       Then store the output in the destination buffer. */
+    *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+    /* Initialization of inputB pointer */
+    pIn2 = py;
+
+    pScratch1 += 1U;
+  }
+
+}
+
+/**
+ @} end of Conv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q15.c
new file mode 100644
index 0000000000..ad109dc110
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q15.c
@@ -0,0 +1,663 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_fast_q15.c
+ * Description: Fast Q15 Convolution
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Conv
+ @{
+ */
+
+/**
+ @brief Convolution of Q15 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This fast version uses a 32-bit accumulator with 2.30 format.
+ The accumulator maintains full precision of the intermediate multiplication results
+ but provides only a single guard bit. There is no saturation on intermediate additions.
+ Thus, if the accumulator overflows it wraps around and distorts the result.
+ The input signals should be scaled down to avoid intermediate overflows.
+ Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
+ as maximum of min(srcALen, srcBLen) number of additions are carried internally.
+ The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
+
+ @remark
+ Refer to \ref arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
+ */
+
+void arm_conv_fast_q15(
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst)
+{
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ q15_t *pOut = pDst; /* Output pointer */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ const q15_t *px; /* Intermediate inputA pointer */
+ const q15_t *py; /* Intermediate inputB pointer */
+ const q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1U;
+ blockSize2 = srcALen - (srcBLen - 1U);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1U;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations less than 4 */
+ /* Second part of this stage computes the MAC operations greater than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ while ((count < 4U) && (blockSize1 > 0U))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over number of MAC operations between
+ * inputA samples and inputB samples */
+ k = count;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + count;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* The second part of the stage starts here */
+ /* The internal loop, over count, is unrolled by 4 */
+ /* To read the last two inputB samples using SIMD:
+ * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
+ py = py - 1;
+
+ while (blockSize1 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+ /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ * So, py is incremented by 1 */
+ py = py + 1U;
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + (count - 1U);
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* count is the index by which the pointer pIn1 is to be incremented */
+ count = 0U;
+
+ /* --------------------
+ * Stage2 process
+ * -------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ py = py - 1U;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1] samples */
+ x0 = read_q15x2 ((q15_t *) px);
+ /* read x[1], x[2] samples */
+ x1 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the last two inputB samples using SIMD:
+ * y[srcBLen - 1] and y[srcBLen - 2] */
+ c0 = read_q15x2_da ((q15_t **) &py);
+
+ /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
+ acc0 = __SMLADX(x0, c0, acc0);
+
+ /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
+ acc1 = __SMLADX(x1, c0, acc1);
+
+ /* Read x[2], x[3] */
+ x2 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[3], x[4] */
+ x3 = read_q15x2 ((q15_t *) px + 1);
+
+ /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
+ acc2 = __SMLADX(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
+ acc3 = __SMLADX(x3, c0, acc3);
+
+ /* Read y[srcBLen - 3] and y[srcBLen - 4] */
+ c0 = read_q15x2_da ((q15_t **) &py);
+
+ /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
+ acc0 = __SMLADX(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
+ acc1 = __SMLADX(x3, c0, acc1);
+
+ /* Read x[4], x[5] */
+ x0 = read_q15x2 ((q15_t *) px + 2);
+
+ /* Read x[5], x[6] */
+ x1 = read_q15x2 ((q15_t *) px + 3);
+ px += 4U;
+
+ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
+ acc2 = __SMLADX(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
+ acc3 = __SMLADX(x1, c0, acc3);
+
+ } while (--k);
+
+ /* For the next MAC operations, SIMD is not used
+ * So, the 16 bit pointer of inputB, py is updated */
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ if (k == 1U)
+ {
+ /* Read y[srcBLen - 5] */
+ c0 = *(py+1);
+
+#ifdef ARM_MATH_BIG_ENDIAN
+ c0 = c0 << 16U;
+#else
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7] */
+ x3 = read_q15x2 ((q15_t *) px);
+ px++;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc1 = __SMLAD(x1, c0, acc1);
+ acc2 = __SMLADX(x1, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ if (k == 2U)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = read_q15x2 ((q15_t *) py);
+
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[9] */
+ x2 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x0, c0, acc0);
+ acc1 = __SMLADX(x1, c0, acc1);
+ acc2 = __SMLADX(x3, c0, acc2);
+ acc3 = __SMLADX(x2, c0, acc3);
+ }
+
+ if (k == 3U)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = read_q15x2 ((q15_t *) py);
+
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[9] */
+ x2 = read_q15x2 ((q15_t *) px + 1);
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x0, c0, acc0);
+ acc1 = __SMLADX(x1, c0, acc1);
+ acc2 = __SMLADX(x3, c0, acc2);
+ acc3 = __SMLADX(x2, c0, acc3);
+
+ /* Read y[srcBLen - 7] */
+ c0 = *(py-1);
+#ifdef ARM_MATH_BIG_ENDIAN
+ c0 = c0 << 16U;
+#else
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[10] */
+ x3 = read_q15x2 ((q15_t *) px + 2);
+ px += 3U;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x1, c0, acc0);
+ acc1 = __SMLAD(x2, c0, acc1);
+ acc2 = __SMLADX(x2, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2_ia (&pOut, __PKHBT((acc0 >> 15), (acc1 >> 15), 16));
+ write_q15x2_ia (&pOut, __PKHBT((acc2 >> 15), (acc3 >> 15), 16));
+#else
+ write_q15x2_ia (&pOut, __PKHBT((acc1 >> 15), (acc0 >> 15), 16));
+ write_q15x2_ia (&pOut, __PKHBT((acc3 >> 15), (acc2 >> 15), 16));
+#endif /*#ifndef ARM_MATH_BIG_ENDIAN*/
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4U;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) *px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) *px++ * *py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q31_t) *px++ * *py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Increment MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The blockSize3 variable holds the number of MAC operations performed */
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ pIn2 = pSrc2 - 1U;
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations greater than 4 */
+ /* Second part of this stage computes the MAC operations less than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ j = blockSize3 >> 2U;
+
+ while ((j > 0U) && (blockSize3 > 0U))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = blockSize3 >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
+ * with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+ /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
+ * with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ * So, py is incremented by 1 */
+ py = py + 1U;
+
+ /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = blockSize3 % 0x4U;
+
+ while (k > 0U)
+ {
+ /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement loop counter */
+ blockSize3--;
+
+ j--;
+ }
+
+ /* The second part of the stage starts here */
+ /* SIMD is not used for the next MAC operations,
+ * so pointer py is updated to read only one sample at a time */
+ py = py + 1U;
+
+ while (blockSize3 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = blockSize3;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+}
+
+/**
+ @} end of Conv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q31.c
new file mode 100644
index 0000000000..133c322c2e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_fast_q31.c
@@ -0,0 +1,558 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_fast_q31.c
+ * Description: Fast Q31 Convolution
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Conv
+ @{
+ */
+
+/**
+ @brief Convolution of Q31 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence.
+ @param[in] srcALen length of the first input sequence.
+ @param[in] pSrcB points to the second input sequence.
+ @param[in] srcBLen length of the second input sequence.
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This function is optimized for speed at the expense of fixed-point precision and overflow protection.
+ The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
+ These intermediate results are accumulated in a 32-bit register in 2.30 format.
+ Finally, the accumulator is saturated and converted to a 1.31 result.
+ @par
+ The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
+ In order to avoid overflows completely the input signals must be scaled down.
+ Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
+ as maximum of min(srcALen, srcBLen) number of additions are carried internally.
+ @remark
+ Refer to \ref arm_conv_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
+ */
+
+void arm_conv_fast_q31(
+ const q31_t * pSrcA,
+ uint32_t srcALen,
+ const q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst)
+{
+ const q31_t *pIn1; /* InputA pointer */
+ const q31_t *pIn2; /* InputB pointer */
+ q31_t *pOut = pDst; /* Output pointer */
+ const q31_t *px; /* Intermediate inputA pointer */
+ const q31_t *py; /* Intermediate inputB pointer */
+ const q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1U;
+ blockSize2 = srcALen - (srcBLen - 1U);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1U;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while (blockSize1 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* x[0] * y[srcBLen - 1] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* x[1] * y[srcBLen - 2] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* x[2] * y[srcBLen - 3] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* x[3] * y[srcBLen - 4] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + count;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* count is the index by which the pointer pIn1 is to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[srcBLen - 1] sample */
+ c0 = *py--;
+ /* Read x[3] sample */
+ x3 = *px++;
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[0] * y[srcBLen - 1] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc1 += x[1] * y[srcBLen - 1] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc2 += x[2] * y[srcBLen - 1] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc3 += x[3] * y[srcBLen - 1] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+
+ /* Read y[srcBLen - 2] sample */
+ c0 = *py--;
+ /* Read x[4] sample */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[1] * y[srcBLen - 2] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc1 += x[2] * y[srcBLen - 2] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc2 += x[3] * y[srcBLen - 2] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
+ /* acc3 += x[4] * y[srcBLen - 2] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+
+ /* Read y[srcBLen - 3] sample */
+ c0 = *py--;
+ /* Read x[5] sample */
+ x1 = *px++;
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[2] * y[srcBLen - 3] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc1 += x[3] * y[srcBLen - 3] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
+ /* acc2 += x[4] * y[srcBLen - 3] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc3 += x[5] * y[srcBLen - 3] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
+
+
+ /* Read y[srcBLen - 4] sample */
+ c0 = *py--;
+ /* Read x[6] sample */
+ x2 = *px++;
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[3] * y[srcBLen - 4] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
+ /* acc1 += x[4] * y[srcBLen - 4] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc2 += x[5] * y[srcBLen - 4] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc3 += x[6] * y[srcBLen - 4] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
+
+
+ } while (--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *py--;
+ /* Read x[7] sample */
+ x3 = *px++;
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[srcBLen - 5] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc1 += x[5] * y[srcBLen - 5] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc2 += x[6] * y[srcBLen - 5] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc3 += x[7] * y[srcBLen - 5] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (acc0 << 1);
+ *pOut++ = (q31_t) (acc1 << 1);
+ *pOut++ = (q31_t) (acc2 << 1);
+ *pOut++ = (q31_t) (acc3 << 1);
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4U;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The blockSize3 variable holds the number of MAC operations performed */
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while (blockSize3 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = blockSize3 >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = blockSize3 % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blockSize3--;
+ }
+
+}
+
+/**
+ @} end of Conv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q15.c
new file mode 100644
index 0000000000..1f3efe00a1
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q15.c
@@ -0,0 +1,362 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_opt_q15.c
+ * Description: Convolution of Q15 sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Conv
+ @{
+ */
+
+/**
+ @brief Convolution of Q15 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both inputs are in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ This approach provides 33 guard bits and there is no risk of overflow.
+ The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
+ @remark
+ Refer to \ref arm_conv_fast_q15() for a faster but less precise version of this function.
+ */
+
+void arm_conv_opt_q15(
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
+{
+ q63_t acc0; /* Accumulators */
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ q15_t *pOut = pDst; /* Output pointer */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch2 */
+ const q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, k, blkCnt; /* Loop counter */
+ uint32_t tapCnt; /* Loop count */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc1, acc2, acc3; /* Accumulators */
+ q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
+ q31_t y1, y2; /* State variables */
+#endif
+
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Pointer to take end of scratch2 buffer */
+ pScr2 = pScratch2 + srcBLen - 1;
+
+ /* points to smaller length sequence */
+ px = pIn2;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Copy 4 samples at a time */
+ k = srcBLen >> 2U;
+
+ /* Copy smaller length input sequence in reverse order into second scratch buffer */
+ while (k > 0U)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Copy remaining samples */
+ k = srcBLen % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr2-- = *px++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Initialize temporary scratch pointer */
+ pScr1 = pScratch1;
+
+ /* Assuming scratch1 buffer is aligned by 32-bit */
+ /* Fill (srcBLen - 1U) zeros in scratch1 buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1U);
+
+ /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
+
+ /* Copy (srcALen) samples in scratch buffer */
+ arm_copy_q15(pIn1, pScr1, srcALen);
+
+ /* Update pointers */
+ pScr1 += srcALen;
+
+
+ /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+ /* Update pointer */
+ pScr1 += (srcBLen - 1U);
+
+ /* Temporary pointer for scratch2 */
+ py = pScratch2;
+
+
+ /* Initialization of pIn2 pointer */
+ pIn2 = py;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = (srcALen + srcBLen - 1U) >> 2;
+
+ while (blkCnt > 0)
+ {
+ /* Initialize temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumulators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* Read next two samples from scratch1 buffer */
+ x2 = read_q15x2_ia (&pScr1);
+
+ tapCnt = (srcBLen) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+
+ /* Read four samples from smaller buffer */
+ y1 = read_q15x2_ia ((q15_t **) &pIn2);
+ y2 = read_q15x2_ia ((q15_t **) &pIn2);
+
+ /* multiply and accumulate */
+ acc0 = __SMLALD(x1, y1, acc0);
+ acc2 = __SMLALD(x2, y1, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ /* multiply and accumulate */
+ acc1 = __SMLALDX(x3, y1, acc1);
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* multiply and accumulate */
+ acc0 = __SMLALD(x2, y2, acc0);
+ acc2 = __SMLALD(x1, y2, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLALDX(x3, y1, acc3);
+ acc1 = __SMLALDX(x3, y2, acc1);
+
+ x2 = read_q15x2_ia (&pScr1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLALDX(x3, y2, acc3);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Rewind scratch1 pointer: the unrolled loop above pre-read 4 samples ahead */
+ pScr1 -= 4U;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3U;
+
+ while (tapCnt > 0U)
+ {
+ /* accumulate the results */
+ acc0 += (*pScr1++ * *pIn2);
+ acc1 += (*pScr1++ * *pIn2);
+ acc2 += (*pScr1++ * *pIn2);
+ acc3 += (*pScr1++ * *pIn2++);
+
+ pScr1 -= 3U;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the results in the accumulators in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
+#else
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 4U;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = (srcALen + srcBLen - 1U) & 0x3;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (srcALen + srcBLen - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Calculate convolution for remaining samples of Bigger length sequence */
+ while (blkCnt > 0)
+ {
+ /* Initialize temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumulators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1U;
+
+ while (tapCnt > 0U)
+ {
+
+ /* Perform two multiply-accumulates per iteration */
+ acc0 += (*pScr1++ * *pIn2++);
+ acc0 += (*pScr1++ * *pIn2++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1U;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while (tapCnt > 0U)
+ {
+
+ /* accumulate the results */
+ acc0 += (*pScr1++ * *pIn2++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* The accumulated result is in 34.30 format. Convert to 1.15 with saturation.
+ Then store the output in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 1U;
+ }
+
+}
+
+/**
+ @} end of Conv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q7.c
new file mode 100644
index 0000000000..1f12f0d2b6
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_opt_q7.c
@@ -0,0 +1,360 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_opt_q7.c
+ * Description: Convolution of Q7 sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Conv
+ @{
+ */
+
+/**
+ @brief Convolution of Q7 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
+ The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
+ The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
+ */
+
+void arm_conv_opt_q7(
+ const q7_t * pSrcA,
+ uint32_t srcALen,
+ const q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
+{
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch2 */
+ q15_t x4; /* Temporary input variable */
+ q15_t *py; /* Temporary input2 pointer */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ const q7_t *pIn1, *pIn2; /* InputA and inputB pointer */
+ uint32_t j, k, blkCnt, tapCnt; /* Loop counter */
+ q31_t x1, x2, x3, y1; /* Temporary input variables */
+ const q7_t *px; /* Temporary input1 pointer */
+ q7_t *pOut = pDst; /* Output pointer */
+ q7_t out0, out1, out2, out3; /* Temporary variables */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* points to smaller length sequence */
+ px = pIn2 + srcBLen - 1;
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* copy second buffer in reversal manner */
+ x4 = (q15_t) *px--;
+ *pScr2++ = x4;
+ x4 = (q15_t) *px--;
+ *pScr2++ = x4;
+ x4 = (q15_t) *px--;
+ *pScr2++ = x4;
+ x4 = (q15_t) *px--;
+ *pScr2++ = x4;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ x4 = (q15_t) *px--;
+ *pScr2++ = x4;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Fill (srcBLen - 1U) zeros in scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1U);
+
+ /* Copy (srcALen) samples in scratch buffer */
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcALen >> 2U;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* copy bigger length sequence samples, widened to q15 */
+ x4 = (q15_t) *pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) *pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) *pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) *pIn1++;
+ *pScr1++ = x4;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcALen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* copy remaining samples of bigger length sequence */
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+ /* Update pointer */
+ pScr1 += (srcBLen - 1U);
+
+ /* Temporary pointer for scratch2 */
+ py = pScratch2;
+
+ /* Initialization of pIn2 pointer */
+ pIn2 = (q7_t *) py;
+
+ pScr2 = py;
+
+ /* Actual convolution process starts here */
+ blkCnt = (srcALen + srcBLen - 1U) >> 2U;
+
+ while (blkCnt > 0)
+ {
+ /* Initialize temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumulators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* Read next two samples from scratch1 buffer */
+ x2 = read_q15x2_ia (&pScr1);
+
+ tapCnt = (srcBLen) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read two samples from smaller buffer */
+ y1 = read_q15x2_ia (&pScr2);
+
+ /* multiply and accumulate */
+ acc0 = __SMLAD(x1, y1, acc0);
+ acc2 = __SMLAD(x2, y1, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ /* multiply and accumulate */
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+
+ /* Read next two samples from smaller buffer */
+ y1 = read_q15x2_ia (&pScr2);
+
+ acc0 = __SMLAD(x2, y1, acc0);
+
+ acc2 = __SMLAD(x1, y1, acc2);
+
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ x2 = read_q15x2_ia (&pScr1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Rewind scratch1 pointer: the unrolled loop above pre-read 4 samples ahead */
+ pScr1 -= 4U;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3U;
+
+ while (tapCnt > 0U)
+ {
+ /* accumulate the results */
+ acc0 += (*pScr1++ * *pScr2);
+ acc1 += (*pScr1++ * *pScr2);
+ acc2 += (*pScr1++ * *pScr2);
+ acc3 += (*pScr1++ * *pScr2++);
+
+ pScr1 -= 3U;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ out0 = (q7_t) (__SSAT(acc0 >> 7U, 8));
+ out1 = (q7_t) (__SSAT(acc1 >> 7U, 8));
+ out2 = (q7_t) (__SSAT(acc2 >> 7U, 8));
+ out3 = (q7_t) (__SSAT(acc3 >> 7U, 8));
+
+ write_q7x4_ia (&pOut, __PACKq7(out0, out1, out2, out3));
+
+ /* Initialization of inputB pointer */
+ pScr2 = py;
+
+ pScratch1 += 4U;
+ }
+
+ blkCnt = (srcALen + srcBLen - 1U) & 0x3;
+
+ /* Calculate convolution for remaining samples of Bigger length sequence */
+ while (blkCnt > 0)
+ {
+ /* Initialize temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumulators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1U;
+
+ while (tapCnt > 0U)
+ {
+ acc0 += (*pScr1++ * *pScr2++);
+ acc0 += (*pScr1++ * *pScr2++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1U;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while (tapCnt > 0U)
+ {
+ /* accumulate the results */
+ acc0 += (*pScr1++ * *pScr2++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
+
+ /* Initialization of inputB pointer */
+ pScr2 = py;
+
+ pScratch1 += 1U;
+ }
+
+}
+
+/**
+ @} end of Conv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_f32.c
new file mode 100644
index 0000000000..a8a9bd1a33
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_f32.c
@@ -0,0 +1,678 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_partial_f32.c
+ * Description: Partial convolution of floating-point sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @defgroup PartialConv Partial Convolution
+
+ Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated.
+ Each function has two additional arguments.
+ firstIndex specifies the starting index of the subset of output samples.
+ numPoints is the number of output samples to compute.
+ The function computes the output in the range
+ [firstIndex, ..., firstIndex+numPoints-1].
+ The output array pDst contains numPoints values.
+
+ The allowable range of output indices is [0 srcALen+srcBLen-2].
+ If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR.
+ Otherwise the functions return ARM_MATH_SUCCESS.
+ \note Refer to \ref arm_conv_f32() for details on fixed point behavior.
+
+ @par Fast Versions
+ Fast versions are supported for Q31 and Q15 of partial convolution.
+ Cycles for Fast versions are less compared to Q31 and Q15 of partial conv and the design requires
+ the input signals should be scaled down to avoid intermediate overflows.
+
+ @par Opt Versions
+ Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
+ These versions are optimised in cycles and consume more memory (scratch memory) compared to the Q15 and Q7 versions of partial convolution
+ */
+
+/**
+ @addtogroup PartialConv
+ @{
+ */
+
+/**
+ @brief Partial convolution of floating-point sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
+ */
+
+arm_status arm_conv_partial_f32(
+ const float32_t * pSrcA,
+ uint32_t srcALen,
+ const float32_t * pSrcB,
+ uint32_t srcBLen,
+ float32_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
+{
+#if defined (ARM_MATH_DSP)
+ const float32_t *pIn1 = pSrcA; /* InputA pointer */
+ const float32_t *pIn2 = pSrcB; /* InputB pointer */
+ float32_t *pOut = pDst; /* Output pointer */
+ const float32_t *px; /* Intermediate inputA pointer */
+ const float32_t *py; /* Intermediate inputB pointer */
+ const float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ float32_t sum; /* Accumulator */
+ uint32_t j, k, count, blkCnt, check;
+ int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ float32_t acc0, acc1, acc2, acc3; /* Accumulator */
+ float32_t x0, x1, x2, x3, c0; /* Temporary variables */
+#endif
+
+ /* Check for range of output samples to be calculated */
+ if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Conditions to check which loopCounter holds
+ * the first and last indices of the output samples to be calculated. */
+ check = firstIndex + numPoints;
+ blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
+ blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
+ blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t)numPoints) : 0;
+ blockSize2 = ((int32_t) check - blockSize3) - (blockSize1 + (int32_t) firstIndex);
+ blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* Set the output pointer to point to the firstIndex
+ * of the output sample to be calculated. */
+ pOut = pDst + firstIndex;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed.
+ Since the partial convolution starts from firstIndex
+ Number of Macs to be performed is firstIndex + 1 */
+ count = 1U + firstIndex;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc1 = pIn2 + firstIndex;
+ py = pSrc1;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while (blockSize1 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+ while (k > 0U)
+ {
+ /* x[0] * y[srcBLen - 1] */
+ sum += *px++ * *py--;
+
+ /* x[1] * y[srcBLen - 2] */
+ sum += *px++ * *py--;
+
+ /* x[2] * y[srcBLen - 3] */
+ sum += *px++ * *py--;
+
+ /* x[3] * y[srcBLen - 4] */
+ sum += *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc1;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
+ {
+ pSrc1 = pIn1 + firstIndex - srcBLen + 1;
+ }
+ else
+ {
+ pSrc1 = pIn1;
+ }
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = ((uint32_t) blockSize2 >> 2U);
+
+ while (blkCnt > 0U)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0.0f;
+ acc1 = 0.0f;
+ acc2 = 0.0f;
+ acc3 = 0.0f;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[srcBLen - 1] sample */
+ c0 = *py--;
+ /* Read x[3] sample */
+ x3 = *px++;
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[0] * y[srcBLen - 1] */
+ acc0 += x0 * c0;
+ /* acc1 += x[1] * y[srcBLen - 1] */
+ acc1 += x1 * c0;
+ /* acc2 += x[2] * y[srcBLen - 1] */
+ acc2 += x2 * c0;
+ /* acc3 += x[3] * y[srcBLen - 1] */
+ acc3 += x3 * c0;
+
+ /* Read y[srcBLen - 2] sample */
+ c0 = *py--;
+ /* Read x[4] sample */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[1] * y[srcBLen - 2] */
+ acc0 += x1 * c0;
+ /* acc1 += x[2] * y[srcBLen - 2] */
+ acc1 += x2 * c0;
+ /* acc2 += x[3] * y[srcBLen - 2] */
+ acc2 += x3 * c0;
+ /* acc3 += x[4] * y[srcBLen - 2] */
+ acc3 += x0 * c0;
+
+ /* Read y[srcBLen - 3] sample */
+ c0 = *py--;
+ /* Read x[5] sample */
+ x1 = *px++;
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[2] * y[srcBLen - 3] */
+ acc0 += x2 * c0;
+ /* acc1 += x[3] * y[srcBLen - 2] */
+ acc1 += x3 * c0;
+ /* acc2 += x[4] * y[srcBLen - 2] */
+ acc2 += x0 * c0;
+ /* acc3 += x[5] * y[srcBLen - 2] */
+ acc3 += x1 * c0;
+
+ /* Read y[srcBLen - 4] sample */
+ c0 = *py--;
+ /* Read x[6] sample */
+ x2 = *px++;
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[3] * y[srcBLen - 4] */
+ acc0 += x3 * c0;
+ /* acc1 += x[4] * y[srcBLen - 4] */
+ acc1 += x0 * c0;
+ /* acc2 += x[5] * y[srcBLen - 4] */
+ acc2 += x1 * c0;
+ /* acc3 += x[6] * y[srcBLen - 4] */
+ acc3 += x2 * c0;
+
+ } while (--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *py--;
+ /* Read x[7] sample */
+ x3 = *px++;
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[srcBLen - 5] */
+ acc0 += x0 * c0;
+ /* acc1 += x[5] * y[srcBLen - 5] */
+ acc1 += x1 * c0;
+ /* acc2 += x[6] * y[srcBLen - 5] */
+ acc2 += x2 * c0;
+ /* acc3 += x[7] * y[srcBLen - 5] */
+ acc3 += x3 * c0;
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = acc0;
+ *pOut++ = acc1;
+ *pOut++ = acc2;
+ *pOut++ = acc3;
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = (uint32_t) blockSize2 % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = srcBLen >> 2U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum += *px++ * *py--;
+ sum += *px++ * *py--;
+ sum += *px++ * *py--;
+ sum += *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = srcBLen % 0x4U;
+
+#else
+
+      /* Initialize k with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = (uint32_t) blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The blockSize3 variable holds the number of MAC operations performed */
+ count = srcBLen - 1U;
+
+ /* Working pointer of inputA */
+ if (firstIndex > srcALen)
+ {
+ pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
+ }
+ else
+ {
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+ }
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while (blockSize3 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+ while (k > 0U)
+ {
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ sum += *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum += *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ sum += *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum += *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+      /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum += *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+#else
+/* alternate version for CM0_FAMILY */
+
+ const float32_t *pIn1 = pSrcA; /* InputA pointer */
+ const float32_t *pIn2 = pSrcB; /* InputB pointer */
+ float32_t sum; /* Accumulator */
+ uint32_t i, j; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
+ /* Check for range of output samples to be calculated */
+ if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* Loop to calculate convolution for output length number of values */
+ for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0.0f;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0U; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if (((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ( pIn1[j] * pIn2[i - j]);
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ pDst[i] = sum;
+ }
+
+    /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+#endif /* defined(ARM_MATH_DSP) */
+}
+
+/**
+ @} end of PartialConv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c
new file mode 100644
index 0000000000..a6b9ecf3be
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c
@@ -0,0 +1,387 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_partial_fast_opt_q15.c
+ * Description: Fast Q15 Partial convolution
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup PartialConv
+ @{
+ */
+
+/**
+ @brief Partial convolution of Q15 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2
+ @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen)
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
+
+ @remark
+ Refer to \ref arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
+ */
+
+arm_status arm_conv_partial_fast_opt_q15(
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
+{
+ q15_t *pOut = pDst; /* Output pointer */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
+  q15_t *pScr2 = pScratch2;                      /* Temporary pointer for scratch2 */
+ q31_t acc0; /* Accumulator */
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ const q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, k, blkCnt; /* Loop counter */
+ uint32_t tapCnt; /* Loop count */
+ arm_status status; /* Status variable */
+ q31_t x1; /* Temporary variables to hold state and coefficient values */
+ q31_t y1; /* State variables */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc1, acc2, acc3; /* Accumulator */
+ q31_t x2, x3; /* Temporary variables to hold state and coefficient values */
+ q31_t y2; /* State variables */
+#endif
+
+ /* Check for range of output samples to be calculated */
+ if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Temporary pointer for scratch2 */
+ py = pScratch2;
+
+ /* pointer to take end of scratch2 buffer */
+ pScr2 = pScratch2 + srcBLen - 1;
+
+ /* points to smaller length sequence */
+ px = pIn2;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = srcBLen >> 2U;
+
+ /* Copy smaller length input sequence in reverse order into second scratch buffer */
+ while (k > 0U)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = srcBLen % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr2-- = *px++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+    /* Initialize temporary scratch pointer */
+ pScr1 = pScratch1;
+
+ /* Assuming scratch1 buffer is aligned by 32-bit */
+ /* Fill (srcBLen - 1U) zeros in scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1U);
+
+ /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
+
+ /* Copy (srcALen) samples in scratch buffer */
+ arm_copy_q15(pIn1, pScr1, srcALen);
+
+ /* Update pointers */
+ pScr1 += srcALen;
+
+ /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+ /* Update pointer */
+ pScr1 += (srcBLen - 1U);
+
+ /* Initialization of pIn2 pointer */
+ pIn2 = py;
+
+ pScratch1 += firstIndex;
+
+ pOut = pDst + firstIndex;
+
+ /* Actual convolution process starts here */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = (numPoints) >> 2;
+
+ while (blkCnt > 0)
+ {
+      /* Initialize temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+      /* Clear accumulators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* Read next two samples from scratch1 buffer */
+ x2 = read_q15x2_ia (&pScr1);
+
+ tapCnt = (srcBLen) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+
+ /* Read four samples from smaller buffer */
+ y1 = read_q15x2_ia ((q15_t **) &pIn2);
+ y2 = read_q15x2_ia ((q15_t **) &pIn2);
+
+ /* multiply and accumulate */
+ acc0 = __SMLAD(x1, y1, acc0);
+ acc2 = __SMLAD(x2, y1, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ /* multiply and accumulate */
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* multiply and accumulate */
+ acc0 = __SMLAD(x2, y2, acc0);
+ acc2 = __SMLAD(x1, y2, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+ acc1 = __SMLADX(x3, y2, acc1);
+
+ x2 = read_q15x2_ia (&pScr1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ /* multiply and accumulate */
+ acc3 = __SMLADX(x3, y2, acc3);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr1 -= 4U;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3U;
+
+ while (tapCnt > 0U)
+ {
+ /* accumulate the results */
+ acc0 += (*pScr1++ * *pIn2);
+ acc1 += (*pScr1++ * *pIn2);
+ acc2 += (*pScr1++ * *pIn2);
+ acc3 += (*pScr1++ * *pIn2++);
+
+ pScr1 -= 3U;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the results in the accumulators in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
+#else
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 4U;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numPoints & 0x3;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numPoints;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Calculate convolution for remaining samples of Bigger length sequence */
+ while (blkCnt > 0)
+ {
+      /* Initialize temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+      /* Clear accumulators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read next two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* Read two samples from smaller buffer */
+ y1 = read_q15x2_ia ((q15_t **) &pIn2);
+
+ /* multiply and accumulate */
+ acc0 = __SMLAD(x1, y1, acc0);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1U;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while (tapCnt > 0U)
+ {
+ /* accumulate the results */
+ acc0 += (*pScr1++ * *pIn2++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ ** Then store the output in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 1U;
+
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ @} end of PartialConv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q15.c
new file mode 100644
index 0000000000..3e928180b9
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q15.c
@@ -0,0 +1,707 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_partial_fast_q15.c
+ * Description: Fast Q15 Partial convolution
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup PartialConv
+ @{
+ */
+
+/**
+ @brief Partial convolution of Q15 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
+ @remark
+ Refer to \ref arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
+ */
+
+arm_status arm_conv_partial_fast_q15(
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
+{
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ q15_t *pOut = pDst; /* Output pointer */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
+ const q15_t *px; /* Intermediate inputA pointer */
+ const q15_t *py; /* Intermediate inputB pointer */
+ const q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary input variables */
+ uint32_t j, k, count, blkCnt, check;
+ int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
+
+ /* Check for range of output samples to be calculated */
+ if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Conditions to check which loopCounter holds
+ * the first and last indices of the output samples to be calculated. */
+ check = firstIndex + numPoints;
+ blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
+ blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
+ blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t) numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
+ blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* Set the output pointer to point to the firstIndex
+ * of the output sample to be calculated. */
+ pOut = pDst + firstIndex;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed.
+ Since the partial convolution starts from firstIndex
+ Number of Macs to be performed is firstIndex + 1 */
+ count = 1U + firstIndex;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + firstIndex;
+ py = pSrc2;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations less than 4 */
+ /* Second part of this stage computes the MAC operations greater than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ while ((count < 4U) && (blockSize1 > 0))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over number of MAC operations between
+ * inputA samples and inputB samples */
+ k = count;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* The second part of the stage starts here */
+ /* The internal loop, over count, is unrolled by 4 */
+    /* To read the last two inputB samples using SIMD:
+ * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
+ py = py - 1;
+
+ while (blockSize1 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+ /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ So, py is incremented by 1 */
+ py = py + 1U;
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ No loop unrolling is used. */
+ k = count % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2 - 1U;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
+ {
+ pSrc1 = pIn1 + firstIndex - srcBLen + 1;
+ }
+ else
+ {
+ pSrc1 = pIn1;
+ }
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* count is the index by which the pointer pIn1 to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = ((uint32_t) blockSize2 >> 2U);
+
+ while (blkCnt > 0U)
+ {
+ py = py - 1U;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+
+ /* read x[0], x[1] samples */
+ x0 = read_q15x2 ((q15_t *) px);
+ /* read x[1], x[2] samples */
+ x1 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
+
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the last two inputB samples using SIMD:
+ * y[srcBLen - 1] and y[srcBLen - 2] */
+ c0 = read_q15x2_da ((q15_t **) &py);
+
+ /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
+ acc0 = __SMLADX(x0, c0, acc0);
+
+ /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
+ acc1 = __SMLADX(x1, c0, acc1);
+
+ /* Read x[2], x[3] */
+ x2 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[3], x[4] */
+ x3 = read_q15x2 ((q15_t *) px + 1);
+
+ /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
+ acc2 = __SMLADX(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
+ acc3 = __SMLADX(x3, c0, acc3);
+
+ /* Read y[srcBLen - 3] and y[srcBLen - 4] */
+ c0 = read_q15x2_da ((q15_t **) &py);
+
+ /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
+ acc0 = __SMLADX(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
+ acc1 = __SMLADX(x3, c0, acc1);
+
+ /* Read x[4], x[5] */
+ x0 = read_q15x2 ((q15_t *) px + 2);
+
+ /* Read x[5], x[6] */
+ x1 = read_q15x2 ((q15_t *) px + 3);
+ px += 4U;
+
+ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
+ acc2 = __SMLADX(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
+ acc3 = __SMLADX(x1, c0, acc3);
+
+ } while (--k);
+
+ /* For the next MAC operations, SIMD is not used
+         So, the 16 bit pointer of inputB, py is updated */
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ if (k == 1U)
+ {
+ /* Read y[srcBLen - 5] */
+ c0 = *(py + 1);
+#ifdef ARM_MATH_BIG_ENDIAN
+ c0 = c0 << 16U;
+#else
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7] */
+ x3 = read_q15x2 ((q15_t *) px);
+ px++;
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLAD (x0, c0, acc0);
+ acc1 = __SMLAD (x1, c0, acc1);
+ acc2 = __SMLADX(x1, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ if (k == 2U)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = read_q15x2 ((q15_t *) py);
+
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[9] */
+ x2 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLADX(x0, c0, acc0);
+ acc1 = __SMLADX(x1, c0, acc1);
+ acc2 = __SMLADX(x3, c0, acc2);
+ acc3 = __SMLADX(x2, c0, acc3);
+ }
+
+ if (k == 3U)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = read_q15x2 ((q15_t *) py);
+
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[9] */
+ x2 = read_q15x2 ((q15_t *) px + 1);
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLADX(x0, c0, acc0);
+ acc1 = __SMLADX(x1, c0, acc1);
+ acc2 = __SMLADX(x3, c0, acc2);
+ acc3 = __SMLADX(x2, c0, acc3);
+
+ c0 = *(py-1);
+#ifdef ARM_MATH_BIG_ENDIAN
+ c0 = c0 << 16U;
+#else
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[10] */
+ x3 = read_q15x2 ((q15_t *) px + 2);
+ px += 3U;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x1, c0, acc0);
+ acc1 = __SMLAD (x2, c0, acc1);
+ acc2 = __SMLADX(x2, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ /* Store the results in the accumulators in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2_ia (&pOut, __PKHBT(acc0 >> 15, acc1 >> 15, 16));
+ write_q15x2_ia (&pOut, __PKHBT(acc2 >> 15, acc3 >> 15, 16));
+#else
+ write_q15x2_ia (&pOut, __PKHBT(acc1 >> 15, acc0 >> 15, 16));
+ write_q15x2_ia (&pOut, __PKHBT(acc3 >> 15, acc2 >> 15, 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ No loop unrolling is used. */
+ blkCnt = (uint32_t) blockSize2 % 0x4U;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) *px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
+ sum += ((q31_t) *px++ * *py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) *px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = (uint32_t) blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q31_t) *px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1U;
+
+ /* Working pointer of inputA */
+ if (firstIndex > srcALen)
+ {
+ pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
+ }
+ else
+ {
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+ }
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ pIn2 = pSrc2 - 1U;
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations greater than 4 */
+ /* Second part of this stage computes the MAC operations less than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ j = count >> 2U;
+
+ while ((j > 0U) && (blockSize3 > 0))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
+ * with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+ /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
+ * with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLADX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ So, py is incremented by 1 */
+ py = py + 1U;
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ No loop unrolling is used. */
+ k = count % 0x4U;
+
+ while (k > 0U)
+ {
+ /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+
+ j--;
+ }
+
+ /* The second part of the stage starts here */
+ /* SIMD is not used for the next MAC operations,
+ * so pointer py is updated to read only one sample at a time */
+ py = py + 1U;
+
+ while (blockSize3 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+}
+
+/**
+ @} end of PartialConv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q31.c
new file mode 100644
index 0000000000..65269bc95e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_fast_q31.c
@@ -0,0 +1,625 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_partial_fast_q31.c
+ * Description: Fast Q31 Partial convolution
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup PartialConv
+ @{
+ */
+
+/**
+ @brief Partial convolution of Q31 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
+
+ @remark
+ Refer to \ref arm_conv_partial_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.
+ */
+
+arm_status arm_conv_partial_fast_q31(
+ const q31_t * pSrcA,
+ uint32_t srcALen,
+ const q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
+{
+ const q31_t *pIn1; /* InputA pointer */
+ const q31_t *pIn2; /* InputB pointer */
+ q31_t *pOut = pDst; /* Output pointer */
+ const q31_t *px; /* Intermediate inputA pointer */
+ const q31_t *py; /* Intermediate inputB pointer */
+ const q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t sum; /* Accumulators */
+ uint32_t j, k, count, check, blkCnt;
+ int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ q31_t x0, x1, x2, x3, c0;
+#endif
+
+ /* Check for range of output samples to be calculated */
+ if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Conditions to check which loopCounter holds
+ * the first and last indices of the output samples to be calculated. */
+ check = firstIndex + numPoints;
+ blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
+ blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
+ blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t)numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
+ blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* Set the output pointer to point to the firstIndex
+ * of the output sample to be calculated. */
+ pOut = pDst + firstIndex;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed.
+ Since the partial convolution starts from firstIndex
+ Number of Macs to be performed is firstIndex + 1 */
+ count = 1U + firstIndex;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + firstIndex;
+ py = pSrc2;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while (blockSize1 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+ while (k > 0U)
+ {
+ /* x[0] * y[srcBLen - 1] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* x[1] * y[srcBLen - 2] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* x[2] * y[srcBLen - 3] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* x[3] * y[srcBLen - 4] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
+ {
+ pSrc1 = pIn1 + firstIndex - srcBLen + 1;
+ }
+ else
+ {
+ pSrc1 = pIn1;
+ }
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = ((uint32_t) blockSize2 >> 2U);
+
+ while (blkCnt > 0U)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[srcBLen - 1] sample */
+ c0 = *py--;
+ /* Read x[3] sample */
+ x3 = *px++;
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[0] * y[srcBLen - 1] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc1 += x[1] * y[srcBLen - 1] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc2 += x[2] * y[srcBLen - 1] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc3 += x[3] * y[srcBLen - 1] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+ /* Read y[srcBLen - 2] sample */
+ c0 = *py--;
+ /* Read x[4] sample */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[1] * y[srcBLen - 2] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc1 += x[2] * y[srcBLen - 2] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc2 += x[3] * y[srcBLen - 2] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
+ /* acc3 += x[4] * y[srcBLen - 2] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+ /* Read y[srcBLen - 3] sample */
+ c0 = *py--;
+ /* Read x[5] sample */
+ x1 = *px++;
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[2] * y[srcBLen - 3] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc1 += x[3] * y[srcBLen - 3] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
+ /* acc2 += x[4] * y[srcBLen - 3] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc3 += x[5] * y[srcBLen - 3] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
+
+ /* Read y[srcBLen - 4] sample */
+ c0 = *py--;
+ /* Read x[6] sample */
+ x2 = *px++;
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[3] * y[srcBLen - 4] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
+ /* acc1 += x[4] * y[srcBLen - 4] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc2 += x[5] * y[srcBLen - 4] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc3 += x[6] * y[srcBLen - 4] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
+
+ } while (--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *py--;
+ /* Read x[7] sample */
+ x3 = *px++;
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[srcBLen - 5] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc1 += x[5] * y[srcBLen - 5] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc2 += x[6] * y[srcBLen - 5] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc3 += x[7] * y[srcBLen - 5] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (acc0 << 1);
+ *pOut++ = (q31_t) (acc1 << 1);
+ *pOut++ = (q31_t) (acc2 << 1);
+ *pOut++ = (q31_t) (acc3 << 1);
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = (uint32_t) blockSize2 % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = srcBLen >> 2U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = srcBLen % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = (uint32_t) blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1U;
+
+ /* Working pointer of inputA */
+ if (firstIndex > srcALen)
+ {
+ pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
+ }
+ else
+ {
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+ }
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while (blockSize3 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+ while (k > 0U)
+ {
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) *px++ * (*py--))) >> 32);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+}
+
+/**
+ @} end of PartialConv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q15.c
new file mode 100644
index 0000000000..7a9d6b1892
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q15.c
@@ -0,0 +1,386 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_partial_opt_q15.c
+ * Description: Partial convolution of Q15 sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup PartialConv
+ @{
+ */
+
+/**
+ @brief Partial convolution of Q15 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @param[in] pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ @param[in] pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
+
+ @remark
+ Refer to \ref arm_conv_partial_fast_q15() for a faster but less precise version of this function.
+ */
+
+arm_status arm_conv_partial_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        uint32_t firstIndex,
+        uint32_t numPoints,
+        q15_t * pScratch1,
+        q15_t * pScratch2)
+{
+
+ q15_t *pOut = pDst; /* Output pointer */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch2 */
+ q63_t acc0; /* Accumulator */
+ q31_t x1; /* Temporary variables to hold state and coefficient values */
+ q31_t y1; /* State variables */
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ const q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, k, blkCnt; /* Loop counter */
+ uint32_t tapCnt; /* Loop count */
+ arm_status status; /* Status variable */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc1, acc2, acc3; /* Accumulator */
+ q31_t x2, x3; /* Temporary variables to hold state and coefficient values */
+ q31_t y2; /* State variables */
+#endif
+
+ /* Check for range of output samples to be calculated */
+ if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Temporary pointer for scratch2 */
+ py = pScratch2;
+
+ /* pointer to take end of scratch2 buffer */
+ pScr2 = pScratch2 + srcBLen - 1;
+
+ /* points to smaller length sequence */
+ px = pIn2;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = srcBLen >> 2U;
+
+ /* Copy smaller length input sequence in reverse order into second scratch buffer */
+ while (k > 0U)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = srcBLen % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr2-- = *px++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Initialize temporary scratch pointer */
+ pScr1 = pScratch1;
+
+ /* Assuming scratch1 buffer is aligned by 32-bit */
+ /* Fill (srcBLen - 1U) zeros in scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1U);
+
+ /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
+
+ /* Copy (srcALen) samples in scratch buffer */
+ arm_copy_q15(pIn1, pScr1, srcALen);
+
+ /* Update pointers */
+ pScr1 += srcALen;
+
+ /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+ /* Update pointer */
+ pScr1 += (srcBLen - 1U);
+
+ /* Initialization of pIn2 pointer */
+ pIn2 = py;
+
+ pScratch1 += firstIndex;
+
+ pOut = pDst + firstIndex;
+
+ /* Actual convolution process starts here */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = (numPoints) >> 2;
+
+ while (blkCnt > 0)
+ {
+ /* Initialize temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumulators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* Read next two samples from scratch1 buffer */
+ x2 = read_q15x2_ia (&pScr1);
+
+ tapCnt = (srcBLen) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+
+ /* Read four samples from smaller buffer */
+ y1 = read_q15x2_ia ((q15_t **) &pIn2);
+ y2 = read_q15x2_ia ((q15_t **) &pIn2);
+
+ /* multiply and accumulate */
+ acc0 = __SMLALD(x1, y1, acc0);
+ acc2 = __SMLALD(x2, y1, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ /* multiply and accumulate */
+ acc1 = __SMLALDX(x3, y1, acc1);
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* multiply and accumulate */
+ acc0 = __SMLALD(x2, y2, acc0);
+ acc2 = __SMLALD(x1, y2, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLALDX(x3, y1, acc3);
+ acc1 = __SMLALDX(x3, y2, acc1);
+
+ x2 = read_q15x2_ia (&pScr1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLALDX(x3, y2, acc3);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr1 -= 4U;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3U;
+
+ while (tapCnt > 0U)
+ {
+ /* accumulate the results */
+ acc0 += (*pScr1++ * *pIn2);
+ acc1 += (*pScr1++ * *pIn2);
+ acc2 += (*pScr1++ * *pIn2);
+ acc3 += (*pScr1++ * *pIn2++);
+
+ pScr1 -= 3U;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the results in the accumulators in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
+#else
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 4U;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numPoints & 0x3;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numPoints;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Calculate convolution for remaining samples of Bigger length sequence */
+ while (blkCnt > 0)
+ {
+ /* Initialize temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumulators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read next two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* Read two samples from smaller buffer */
+ y1 = read_q15x2_ia ((q15_t **) &pIn2);
+
+ acc0 = __SMLALD(x1, y1, acc0);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1U;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while (tapCnt > 0U)
+ {
+ /* accumulate the results */
+ acc0 += (*pScr1++ * *pIn2++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ ** Then store the output in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 1U;
+
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ @} end of PartialConv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q7.c
new file mode 100644
index 0000000000..a5c16c50a9
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_opt_q7.c
@@ -0,0 +1,390 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_partial_opt_q7.c
+ * Description: Partial convolution of Q7 sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup PartialConv
+ @{
+ */
+
+/**
+ @brief Partial convolution of Q7 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
+ */
+
+arm_status arm_conv_partial_opt_q7(
+ const q7_t * pSrcA,
+ uint32_t srcALen,
+ const q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
+{
+ q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */
+ q15_t x4; /* Temporary input variable */
+ const q7_t *pIn1, *pIn2; /* InputA and inputB pointer */
+ uint32_t j, k, blkCnt, tapCnt; /* Loop counter */
+ const q7_t *px; /* Temporary input1 pointer */
+ q15_t *py; /* Temporary input2 pointer */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulator */
+ q31_t x1, x2, x3, y1; /* Temporary input variables */
+ arm_status status;
+ q7_t *pOut = pDst; /* Output pointer */
+ q7_t out0, out1, out2, out3; /* Temporary variables */
+
+ /* Check for range of output samples to be calculated */
+ if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* pointer to take end of scratch2 buffer */
+ pScr2 = pScratch2;
+
+ /* points to smaller length sequence */
+ px = pIn2 + srcBLen - 1;
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* copy second buffer in reversal manner */
+ x4 = (q15_t) *px--;
+ *pScr2++ = x4;
+ x4 = (q15_t) *px--;
+ *pScr2++ = x4;
+ x4 = (q15_t) *px--;
+ *pScr2++ = x4;
+ x4 = (q15_t) *px--;
+ *pScr2++ = x4;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ x4 = (q15_t) *px--;
+ *pScr2++ = x4;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+    /* Initialize temporary scratch pointer */
+ pScr1 = pScratch1;
+
+ /* Fill (srcBLen - 1U) zeros in scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1U);
+
+ /* Copy (srcALen) samples in scratch buffer */
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcALen >> 2U;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* copy second buffer in reversal manner */
+ x4 = (q15_t) *pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) *pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) *pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) *pIn1++;
+ *pScr1++ = x4;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcALen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ x4 = (q15_t) *pIn1++;
+ *pScr1++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+ /* Update pointer */
+ pScr1 += (srcBLen - 1U);
+
+
+ /* Temporary pointer for scratch2 */
+ py = pScratch2;
+
+ /* Initialization of pIn2 pointer */
+ pIn2 = (q7_t *) py;
+
+ pScr2 = py;
+
+ pOut = pDst + firstIndex;
+
+ pScratch1 += firstIndex;
+
+ /* Actual convolution process starts here */
+ blkCnt = (numPoints) >> 2;
+
+ while (blkCnt > 0)
+ {
+ /* Initialize temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumulators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* Read next two samples from scratch1 buffer */
+ x2 = read_q15x2_ia (&pScr1);
+
+ tapCnt = (srcBLen) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read four samples from smaller buffer */
+ y1 = read_q15x2_ia (&pScr2);
+
+ /* multiply and accumulate */
+ acc0 = __SMLAD(x1, y1, acc0);
+ acc2 = __SMLAD(x2, y1, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ /* multiply and accumulate */
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+
+ /* Read four samples from smaller buffer */
+ y1 = read_q15x2_ia (&pScr2);
+
+ acc0 = __SMLAD(x2, y1, acc0);
+
+ acc2 = __SMLAD(x1, y1, acc2);
+
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ x2 = read_q15x2_ia (&pScr1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr1 -= 4U;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3U;
+
+ while (tapCnt > 0U)
+ {
+ /* accumulate the results */
+ acc0 += (*pScr1++ * *pScr2);
+ acc1 += (*pScr1++ * *pScr2);
+ acc2 += (*pScr1++ * *pScr2);
+ acc3 += (*pScr1++ * *pScr2++);
+
+ pScr1 -= 3U;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ out0 = (q7_t) (__SSAT(acc0 >> 7U, 8));
+ out1 = (q7_t) (__SSAT(acc1 >> 7U, 8));
+ out2 = (q7_t) (__SSAT(acc2 >> 7U, 8));
+ out3 = (q7_t) (__SSAT(acc3 >> 7U, 8));
+
+ write_q7x4_ia (&pOut, __PACKq7(out0, out1, out2, out3));
+
+ /* Initialization of inputB pointer */
+ pScr2 = py;
+
+ pScratch1 += 4U;
+ }
+
+ blkCnt = (numPoints) & 0x3;
+
+ /* Calculate convolution for remaining samples of Bigger length sequence */
+ while (blkCnt > 0)
+ {
+      /* Initialize temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+      /* Clear Accumulators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1U;
+
+ while (tapCnt > 0U)
+ {
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* Read two samples from smaller buffer */
+ y1 = read_q15x2_ia (&pScr2);
+
+ acc0 = __SMLAD(x1, y1, acc0);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1U;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while (tapCnt > 0U)
+ {
+
+ /* accumulate the results */
+ acc0 += (*pScr1++ * *pScr2++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
+
+ /* Initialization of inputB pointer */
+ pScr2 = py;
+
+ pScratch1 += 1U;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ return (status);
+}
+
+/**
+ @} end of PartialConv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q15.c
new file mode 100644
index 0000000000..cfab5168fe
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q15.c
@@ -0,0 +1,759 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_partial_q15.c
+ * Description: Partial convolution of Q15 sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup PartialConv
+ @{
+ */
+
+/**
+ @brief Partial convolution of Q15 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
+
+ @remark
+ Refer to \ref arm_conv_partial_fast_q15() for a faster but less precise version of this function.
+ @remark
+ Refer to \ref arm_conv_partial_opt_q15() for a faster implementation of this function using scratch buffers.
+ */
+
+arm_status arm_conv_partial_q15(
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
+{
+
+#if defined (ARM_MATH_DSP)
+
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ q15_t *pOut = pDst; /* Output pointer */
+ q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
+ const q15_t *px; /* Intermediate inputA pointer */
+ const q15_t *py; /* Intermediate inputB pointer */
+ const q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary input variables to hold state and coefficient values */
+ int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt, check;
+ arm_status status; /* Status of Partial convolution */
+
+ /* Check for range of output samples to be calculated */
+ if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Conditions to check which loopCounter holds
+ * the first and last indices of the output samples to be calculated. */
+ check = firstIndex + numPoints;
+ blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
+ blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
+ blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t)numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
+ blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* Set the output pointer to point to the firstIndex
+ * of the output sample to be calculated. */
+ pOut = pDst + firstIndex;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed.
+ Since the partial convolution starts from firstIndex
+ Number of Macs to be performed is firstIndex + 1 */
+ count = 1U + firstIndex;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + firstIndex;
+ py = pSrc2;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations less than 4 */
+ /* Second part of this stage computes the MAC operations greater than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ while ((count < 4U) && (blockSize1 > 0))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over number of MAC operations between
+ * inputA samples and inputB samples */
+ k = count;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* The second part of the stage starts here */
+ /* The internal loop, over count, is unrolled by 4 */
+ /* To, read the last two inputB samples using SIMD:
+ * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
+ py = py - 1;
+
+ while (blockSize1 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+ /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ * So, py is incremented by 1 */
+ py = py + 1U;
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ No loop unrolling is used. */
+ k = count % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2 - 1U;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+     * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
+ {
+ pSrc1 = pIn1 + firstIndex - srcBLen + 1;
+ }
+ else
+ {
+ pSrc1 = pIn1;
+ }
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* count is the index by which the pointer pIn1 to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = ((uint32_t) blockSize2 >> 2U);
+
+ while (blkCnt > 0U)
+ {
+ py = py - 1U;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+
+ /* read x[0], x[1] samples */
+ x0 = read_q15x2 ((q15_t *) px);
+ /* read x[1], x[2] samples */
+ x1 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
+
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the last two inputB samples using SIMD:
+ * y[srcBLen - 1] and y[srcBLen - 2] */
+ c0 = read_q15x2_da ((q15_t **) &py);
+
+ /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
+ acc0 = __SMLALDX(x0, c0, acc0);
+
+ /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
+ acc1 = __SMLALDX(x1, c0, acc1);
+
+ /* Read x[2], x[3] */
+ x2 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[3], x[4] */
+ x3 = read_q15x2 ((q15_t *) px + 1);
+
+ /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
+ acc2 = __SMLALDX(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
+ acc3 = __SMLALDX(x3, c0, acc3);
+
+ /* Read y[srcBLen - 3] and y[srcBLen - 4] */
+ c0 = read_q15x2_da ((q15_t **) &py);
+
+ /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
+ acc0 = __SMLALDX(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
+ acc1 = __SMLALDX(x3, c0, acc1);
+
+ /* Read x[4], x[5] */
+ x0 = read_q15x2 ((q15_t *) px + 2);
+
+ /* Read x[5], x[6] */
+ x1 = read_q15x2 ((q15_t *) px + 3);
+ px += 4U;
+
+ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
+ acc2 = __SMLALDX(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
+ acc3 = __SMLALDX(x1, c0, acc3);
+
+ } while (--k);
+
+ /* For the next MAC operations, SIMD is not used
+         * So, the 16 bit pointer of inputB, py is updated */
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ if (k == 1U)
+ {
+ /* Read y[srcBLen - 5] */
+ c0 = *(py+1);
+#ifdef ARM_MATH_BIG_ENDIAN
+ c0 = c0 << 16U;
+#else
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7] */
+ x3 = read_q15x2 ((q15_t *) px);
+ px++;
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALD (x0, c0, acc0);
+ acc1 = __SMLALD (x1, c0, acc1);
+ acc2 = __SMLALDX(x1, c0, acc2);
+ acc3 = __SMLALDX(x3, c0, acc3);
+ }
+
+ if (k == 2U)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = read_q15x2 ((q15_t *) py);
+
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[9] */
+ x2 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALDX(x0, c0, acc0);
+ acc1 = __SMLALDX(x1, c0, acc1);
+ acc2 = __SMLALDX(x3, c0, acc2);
+ acc3 = __SMLALDX(x2, c0, acc3);
+ }
+
+ if (k == 3U)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = read_q15x2 ((q15_t *) py);
+
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[9] */
+ x2 = read_q15x2 ((q15_t *) px + 1);
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALDX(x0, c0, acc0);
+ acc1 = __SMLALDX(x1, c0, acc1);
+ acc2 = __SMLALDX(x3, c0, acc2);
+ acc3 = __SMLALDX(x2, c0, acc3);
+
+ c0 = *(py-1);
+#ifdef ARM_MATH_BIG_ENDIAN
+ c0 = c0 << 16U;
+#else
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[10] */
+ x3 = read_q15x2 ((q15_t *) px + 2);
+ px += 3U;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALDX(x1, c0, acc0);
+ acc1 = __SMLALD (x2, c0, acc1);
+ acc2 = __SMLALDX(x2, c0, acc2);
+ acc3 = __SMLALDX(x3, c0, acc3);
+ }
+
+ /* Store the results in the accumulators in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
+#else
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ No loop unrolling is used. */
+ blkCnt = (uint32_t) blockSize2 % 0x4U;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = (uint32_t) blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1U;
+
+ /* Working pointer of inputA */
+ if (firstIndex > srcALen)
+ {
+ pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
+ }
+ else
+ {
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+ }
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ pIn2 = pSrc2 - 1U;
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations greater than 4 */
+ /* Second part of this stage computes the MAC operations less than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ j = count >> 2U;
+
+ while ((j > 0U) && (blockSize3 > 0))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
+ * with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+ /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
+ * with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ * So, py is incremented by 1 */
+ py = py + 1U;
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4U;
+
+ while (k > 0U)
+ {
+ /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement MAC count */
+ count--;
+
+ /* Decrement loop counter */
+ blockSize3--;
+
+ j--;
+ }
+
+ /* The second part of the stage starts here */
+ /* SIMD is not used for the next MAC operations,
+ * so pointer py is updated to read only one sample at a time */
+ py = py + 1U;
+
+ while (blockSize3 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+      /* Perform the remaining MAC operations without loop unrolling */
+ k = count;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+ const q15_t *pIn1 = pSrcA; /* InputA pointer */
+ const q15_t *pIn2 = pSrcB; /* InputB pointer */
+ q63_t sum; /* Accumulator */
+ uint32_t i, j; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
+
+ /* Check for range of output samples to be calculated */
+ if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* Loop to calculate convolution for output length number of values */
+ for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0U; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if (((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q31_t) pIn1[j] * pIn2[i - j]);
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U);
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
+
+/**
+ @} end of PartialConv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q31.c
new file mode 100644
index 0000000000..bcd52983c4
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q31.c
@@ -0,0 +1,640 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_partial_q31.c
+ * Description: Partial convolution of Q31 sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup PartialConv
+ @{
+ */
+
+/**
+ @brief Partial convolution of Q31 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
+
+ @remark
+ Refer to \ref arm_conv_partial_fast_q31() for a faster but less precise implementation of this function.
+ */
+
+arm_status arm_conv_partial_q31(
+ const q31_t * pSrcA,
+ uint32_t srcALen,
+ const q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
+{
+
+#if defined(ARM_MATH_DSP)
+
+ const q31_t *pIn1; /* InputA pointer */
+ const q31_t *pIn2; /* InputB pointer */
+ q31_t *pOut = pDst; /* Output pointer */
+ const q31_t *px; /* Intermediate inputA pointer */
+ const q31_t *py; /* Intermediate inputB pointer */
+ const q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q63_t sum; /* Accumulator (64-bit to hold 32x32 products without overflow) */
+ uint32_t j, k, count, blkCnt, check;
+ int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc0, acc1, acc2; /* Accumulators (three, matching the unroll factor of 3 below) */
+ q31_t x0, x1, x2, c0; /* Temporary variables */
+#endif
+
+ /* Check for range of output samples to be calculated */
+ if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Conditions to check which loopCounter holds
+ * the first and last indices of the output samples to be calculated. */
+ check = firstIndex + numPoints;
+ blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
+ blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
+ blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t)numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
+ blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* Set the output pointer to point to the firstIndex
+ * of the output sample to be calculated. */
+ pOut = pDst + firstIndex;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed.
+ Since the partial convolution starts from firstIndex
+ Number of Macs to be performed is firstIndex + 1 */
+ count = 1U + firstIndex;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + firstIndex;
+ py = pSrc2;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while (blockSize1 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+ while (k > 0U)
+ {
+ /* x[0] * y[srcBLen - 1] */
+ sum += (q63_t) *px++ * (*py--);
+
+ /* x[1] * y[srcBLen - 2] */
+ sum += (q63_t) *px++ * (*py--);
+
+ /* x[2] * y[srcBLen - 3] */
+ sum += (q63_t) *px++ * (*py--);
+
+ /* x[3] * y[srcBLen - 4] */
+ sum += (q63_t) *px++ * (*py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) *px++ * (*py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ /* The 2.62 product is converted back to 1.31 by discarding the low 31 bits. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
+ {
+ pSrc1 = pIn1 + firstIndex - srcBLen + 1;
+ }
+ else
+ {
+ pSrc1 = pIn1;
+ }
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 3 outputs at a time.
+ (Unroll factor is 3 here, not 4, because three q63 accumulators are carried.) */
+ blkCnt = blockSize2 / 3;
+
+ while (blkCnt > 0U)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+
+ /* read x[0], x[1] samples */
+ x0 = *px++;
+ x1 = *px++;
+
+ /* Apply loop unrolling and compute 3 MACs simultaneously. */
+ k = srcBLen / 3;
+
+ /* First part of the processing with loop unrolling. Compute 3 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 2 samples. */
+ do
+ {
+ /* Read y[srcBLen - 1] sample */
+ c0 = *(py);
+
+ /* Read x[2] sample */
+ x2 = *(px);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[0] * y[srcBLen - 1] */
+ acc0 += (q63_t) x0 * c0;
+ /* acc1 += x[1] * y[srcBLen - 1] */
+ acc1 += (q63_t) x1 * c0;
+ /* acc2 += x[2] * y[srcBLen - 1] */
+ acc2 += (q63_t) x2 * c0;
+
+ /* Read y[srcBLen - 2] sample */
+ c0 = *(py - 1U);
+
+ /* Read x[3] sample */
+ x0 = *(px + 1U);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[1] * y[srcBLen - 2] */
+ acc0 += (q63_t) x1 * c0;
+ /* acc1 += x[2] * y[srcBLen - 2] */
+ acc1 += (q63_t) x2 * c0;
+ /* acc2 += x[3] * y[srcBLen - 2] */
+ acc2 += (q63_t) x0 * c0;
+
+ /* Read y[srcBLen - 3] sample */
+ c0 = *(py - 2U);
+
+ /* Read x[4] sample */
+ x1 = *(px + 2U);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[2] * y[srcBLen - 3] */
+ acc0 += (q63_t) x2 * c0;
+ /* acc1 += x[3] * y[srcBLen - 3] */
+ acc1 += (q63_t) x0 * c0;
+ /* acc2 += x[4] * y[srcBLen - 3] */
+ acc2 += (q63_t) x1 * c0;
+
+
+ px += 3U;
+
+ py -= 3U;
+
+ } while (--k);
+
+ /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen - (3 * (srcBLen / 3));
+
+ while (k > 0U)
+ {
+ /* Read the next y sample */
+ c0 = *py--;
+ /* Read the next x sample */
+ x2 = *px++;
+
+ /* Perform the multiply-accumulates for the three adjacent outputs */
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (acc0 >> 31);
+ *pOut++ = (q31_t) (acc1 >> 31);
+ *pOut++ = (q31_t) (acc2 >> 31);
+
+ /* Increment the pointer pIn1 index, count by 3 */
+ count += 3U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize2 - 3 * (blockSize2 / 3);
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = srcBLen >> 2U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) *px++ * (*py--);
+ sum += (q63_t) *px++ * (*py--);
+ sum += (q63_t) *px++ * (*py--);
+ sum += (q63_t) *px++ * (*py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = srcBLen % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Increment MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = (uint32_t) blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The blockSize3 variable holds the number of MAC operations performed */
+ count = srcBLen - 1U;
+
+ /* Working pointer of inputA.
+ When firstIndex lies beyond srcALen, stage3 starts relative to firstIndex
+ instead of the end of inputA. */
+ if (firstIndex > srcALen)
+ {
+ pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
+ }
+ else
+ {
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+ }
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while (blockSize3 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+ while (k > 0U)
+ {
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+#else
+/* Simple (non-DSP) version: direct O(srcALen*srcBLen) evaluation of the convolution sum */
+
+ const q31_t *pIn1 = pSrcA; /* InputA pointer */
+ const q31_t *pIn2 = pSrcB; /* InputB pointer */
+ q63_t sum; /* Accumulator */
+ uint32_t i, j; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
+
+ /* Check for range of output samples to be calculated */
+ if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* Loop to calculate convolution for output length number of values */
+ for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0U; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if (((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q63_t) pIn1[j] * pIn2[i - j]);
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ pDst[i] = (q31_t) (sum >> 31U);
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
+
+/**
+ @} end of PartialConv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q7.c
new file mode 100644
index 0000000000..116a8e696d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_partial_q7.c
@@ -0,0 +1,759 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_partial_q7.c
+ * Description: Partial convolution of Q7 sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup PartialConv
+ @{
+ */
+
+/**
+ @brief Partial convolution of Q7 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written
+ @param[in] firstIndex is the first output sample to start with
+ @param[in] numPoints is the number of output points to be computed
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : requested subset is not in the range [0 srcALen+srcBLen-2]
+
+ @remark
+ Refer to \ref arm_conv_partial_opt_q7() for a faster implementation of this function.
+ */
+
+arm_status arm_conv_partial_q7(
+ const q7_t * pSrcA,
+ uint32_t srcALen,
+ const q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
+{
+
+#if ARM_MATH_DSP
+
+ const q7_t *pIn1; /* InputA pointer */
+ const q7_t *pIn2; /* InputB pointer */
+ q7_t *pOut = pDst; /* Output pointer */
+ const q7_t *px; /* Intermediate inputA pointer */
+ const q7_t *py; /* Intermediate inputB pointer */
+ const q7_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t sum; /* Accumulator */
+ uint32_t j, k, count, blkCnt, check; /* Loop counters */
+ int32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc0, acc1, acc2, acc3; /* Accumulator */
+ q31_t input1, input2; /* Temporary input variables */
+ q15_t in1, in2; /* Temporary input variables */
+ q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */
+#endif
+
+ /* Check for range of output samples to be calculated */
+ if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Conditions to check which loopCounter holds
+ * the first and last indices of the output samples to be calculated. */
+ check = firstIndex + numPoints;
+ blockSize3 = ((int32_t)check > (int32_t)srcALen) ? (int32_t)check - (int32_t)srcALen : 0;
+ blockSize3 = ((int32_t)firstIndex > (int32_t)srcALen - 1) ? blockSize3 - (int32_t)firstIndex + (int32_t)srcALen : blockSize3;
+ blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1U)) ? blockSize1 : (int32_t)numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) + (int32_t) firstIndex);
+ blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* Set the output pointer to point to the firstIndex
+ * of the output sample to be calculated. */
+ pOut = pDst + firstIndex;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed.
+ Since the partial convolution starts from firstIndex
+ Number of Macs to be performed is firstIndex + 1 */
+ count = 1U + firstIndex;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + firstIndex;
+ py = pSrc2;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while (blockSize1 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+ while (k > 0U)
+ {
+ /* x[0] , x[1] */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[srcBLen - 1] , y[srcBLen - 2] */
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* x[0] * y[srcBLen - 1] */
+ /* x[1] * y[srcBLen - 2] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* x[2] , x[3] */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[srcBLen - 3] , y[srcBLen - 4] */
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* x[2] * y[srcBLen - 3] */
+ /* x[3] * y[srcBLen - 4] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ if ((int32_t)firstIndex - (int32_t)srcBLen + 1 > 0)
+ {
+ pSrc1 = pIn1 + firstIndex - srcBLen + 1;
+ }
+ else
+ {
+ pSrc1 = pIn1;
+ }
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* count is the index by which the pointer pIn1 to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = ((uint32_t) blockSize2 >> 2U);
+
+ while (blkCnt > 0U)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[srcBLen - 1] sample */
+ c0 = *py--;
+ /* Read y[srcBLen - 2] sample */
+ c1 = *py--;
+
+ /* Read x[3] sample */
+ x3 = *px++;
+
+ /* x[0] and x[1] are packed */
+ in1 = (q15_t) x0;
+ in2 = (q15_t) x1;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[srcBLen - 1] and y[srcBLen - 2] are packed */
+ in1 = (q15_t) c0;
+ in2 = (q15_t) c1;
+
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
+ acc0 = __SMLAD(input1, input2, acc0);
+
+ /* x[1] and x[2] are packed */
+ in1 = (q15_t) x1;
+ in2 = (q15_t) x2;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
+ acc1 = __SMLAD(input1, input2, acc1);
+
+ /* x[2] and x[3] are packed */
+ in1 = (q15_t) x2;
+ in2 = (q15_t) x3;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
+ acc2 = __SMLAD(input1, input2, acc2);
+
+ /* Read x[4] sample */
+ x0 = *px++;
+
+ /* x[3] and x[4] are packed */
+ in1 = (q15_t) x3;
+ in2 = (q15_t) x0;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
+ acc3 = __SMLAD(input1, input2, acc3);
+
+ /* Read y[srcBLen - 3] sample */
+ c0 = *py--;
+ /* Read y[srcBLen - 4] sample */
+ c1 = *py--;
+
+ /* Read x[5] sample */
+ x1 = *px++;
+
+ /* x[2] and x[3] are packed */
+ in1 = (q15_t) x2;
+ in2 = (q15_t) x3;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
+ in1 = (q15_t) c0;
+ in2 = (q15_t) c1;
+
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
+ acc0 = __SMLAD(input1, input2, acc0);
+
+ /* x[3] and x[4] are packed */
+ in1 = (q15_t) x3;
+ in2 = (q15_t) x0;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
+ acc1 = __SMLAD(input1, input2, acc1);
+
+ /* x[4] and x[5] are packed */
+ in1 = (q15_t) x0;
+ in2 = (q15_t) x1;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
+ acc2 = __SMLAD(input1, input2, acc2);
+
+ /* Read x[6] sample */
+ x2 = *px++;
+
+ /* x[5] and x[6] are packed */
+ in1 = (q15_t) x1;
+ in2 = (q15_t) x2;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
+ acc3 = __SMLAD(input1, input2, acc3);
+
+ } while (--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *py--;
+ /* Read x[7] sample */
+ x3 = *px++;
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[srcBLen - 5] */
+ acc0 += ((q31_t) x0 * c0);
+ /* acc1 += x[5] * y[srcBLen - 5] */
+ acc1 += ((q31_t) x1 * c0);
+ /* acc2 += x[6] * y[srcBLen - 5] */
+ acc2 += ((q31_t) x2 * c0);
+ /* acc3 += x[7] * y[srcBLen - 5] */
+ acc3 += ((q31_t) x3 * c0);
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8));
+ *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8));
+ *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8));
+ *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8));
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = (uint32_t) blockSize2 % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = srcBLen >> 2U;
+
+ while (k > 0U)
+ {
+ /* Reading two inputs of SrcA buffer and packing */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Reading two inputs of SrcB buffer and packing */
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Perform the multiply-accumulate */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Reading two inputs of SrcA buffer and packing */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Reading two inputs of SrcB buffer and packing */
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Perform the multiply-accumulate */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = srcBLen % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = (uint32_t) blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pSrc1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1U;
+
+ /* Working pointer of inputA */
+ if (firstIndex > srcALen)
+ {
+ pSrc1 = (pIn1 + firstIndex) - (srcBLen - 1U);
+ }
+ else
+ {
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+ }
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while (blockSize3 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+ while (k > 0U)
+ {
+ /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+#else
+/* alternate version for CM0_FAMILY */
+
+ const q7_t *pIn1 = pSrcA; /* InputA pointer */
+ const q7_t *pIn2 = pSrcB; /* InputB pointer */
+ q31_t sum; /* Accumulator */
+ uint32_t i, j; /* Loop counters */
+ arm_status status; /* Status of Partial convolution */
+
+ /* Check for range of output samples to be calculated */
+ if ((firstIndex + numPoints) > ((srcALen + (srcBLen - 1U))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* Loop to calculate convolution for output length number of values */
+ for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0U; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if (((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q15_t) pIn1[j] * (pIn2[i - j]));
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U);
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
+}
+
+/**
+ @} end of PartialConv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q15.c
new file mode 100644
index 0000000000..1508b41338
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q15.c
@@ -0,0 +1,858 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_q15.c
+ * Description: Convolution of Q15 sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Conv
+ @{
+ */
+
+/**
+ @brief Convolution of Q15 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both inputs are in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ This approach provides 33 guard bits and there is no risk of overflow.
+ The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
+
+ @remark
+ Refer to \ref arm_conv_fast_q15() for a faster but less precise version of this function.
+ @remark
+ Refer to \ref arm_conv_opt_q15() for a faster implementation of this function using scratch buffers.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include "arm_helium_utils.h"
+#include "arm_vec_filtering.h"
+
+void arm_conv_q15(
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst)
+{
+ const q15_t *pIn1 = pSrcA; /* inputA pointer */
+ const q15_t *pIn2 = pSrcB; /* inputB pointer */
+ /*
+ * Working pointers handed to the MVE convolution helper macros,
+ * which advance them internally while performing the MACs
+ */
+ const q15_t *pX;
+ const q15_t *pY;
+ const q15_t *pA;
+ const q15_t *pB;
+ int32_t i = 0U, j = 0; /* loop counters */
+ int32_t block1, block2, block3; /* lengths of the ramp-up / steady-state / ramp-down phases */
+
+
+ /* descending lane indices {7, 6, ..., 0}; consumed by the MVE_INTR_CONV_*
+ macros to read inputB in reverse order (see arm_vec_filtering.h) */
+ uint16x8_t decrIdxVec = vddupq_u16(7, 1);
+
+
+ if (srcALen < srcBLen)
+ {
+ /*
+ * Swap the inputs so that srcB is always the shorter sequence:
+ * convolution is commutative, and the phase lengths below assume it
+ */
+ pIn1 = pSrcB;
+ /*
+ * Initialization to the start of the shorter input
+ */
+ pIn2 = pSrcA;
+ /*
+ * Swapping the lengths
+ */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+ }
+
+ /* output counts per phase: srcBLen-1 growing MACs, srcALen-srcBLen+1
+ full-length MACs, then srcBLen-1 shrinking MACs (total srcALen+srcBLen-1) */
+ block1 = srcBLen - 1;
+ block2 = srcALen - srcBLen + 1;
+ block3 = srcBLen - 1;
+
+ pA = pIn1;
+ /* rewind 7 samples so an 8-lane reversed vector load lines up with y[0] */
+ pB = pIn2 - 7;
+
+ /* Phase 1: MAC count grows by one per output; compute two outputs
+ (sizes i+1 and i+2) per iteration, Y base advancing */
+ for (i = 0; i <= block1 - 2; i += 2)
+ {
+ uint32_t count = i + 1;
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+
+ pX = pA;
+ pY = pB;
+
+ MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count);
+ *pDst++ = (q15_t) acc0;
+ *pDst++ = (q15_t) acc1;
+ pB += 2;
+ }
+ /* Phase 1 remainder: at most one odd-sized output left */
+ for (; i < block1; i++)
+ {
+ uint32_t count = i + 1;
+ int64_t acc = 0LL;
+
+ pX = pA;
+ pY = pB;
+
+ MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count);
+ *pDst++ = (q15_t) acc;
+ pB++;
+ }
+
+ /* Phase 2: every output uses the full srcBLen MACs; X base advances */
+ for (i = 0; i <= block2 - 4; i += 4)
+ {
+ uint32_t count = srcBLen;
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+ int64_t acc2 = 0LL;
+ int64_t acc3 = 0LL;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 4 accumulators per loop
+ * size is fixed for all accumulators
+ * X pointer is incrementing for successive accumulators
+ */
+ MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, count);
+ *pDst++ = (q15_t) acc0;
+ *pDst++ = (q15_t) acc1;
+ *pDst++ = (q15_t) acc2;
+ *pDst++ = (q15_t) acc3;
+
+ pA += 4;
+ }
+ for (; i <= block2 - 2; i += 2)
+ {
+ uint32_t count = srcBLen;
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 2 accumulators per loop
+ * size is fixed for all accumulators
+ * X pointer is incrementing for successive accumulators
+ */
+ MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, count);
+ *pDst++ = (q15_t) acc0;
+ *pDst++ = (q15_t) acc1;
+
+ pA += 2;
+ }
+ if (block2 & 1)
+ {
+ uint32_t count = srcBLen;
+ int64_t acc = 0LL;
+
+ pX = pA;
+ pY = pB;
+
+ MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count);
+ *pDst++ = (q15_t) acc;
+ pA++;
+ }
+
+ /* Phase 3: MAC count shrinks by one per output. The dual macro computes
+ two outputs of sizes i and i-1, so it needs i >= 2; stopping at i == 2
+ (not i == 1) prevents a zero-length accumulation and an extra write
+ past pDst when block3 is odd. The single-accumulator loop below then
+ handles the final output of size 1. */
+ for (i = block3; i >= 2; i -= 2)
+ {
+ uint32_t count = i;
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+
+ pX = pA;
+ pY = pB;
+
+ MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count);
+ *pDst++ = (q15_t) acc0;
+ *pDst++ = (q15_t) acc1;
+ pA += 2;
+ }
+ for (; i >= 1; i--)
+ {
+ uint32_t count = i;
+ int64_t acc = 0LL;
+
+ pX = pA;
+ pY = pB;
+
+ MVE_INTR_CONV_SINGLE_Q15(acc, pX, pY, count);
+ *pDst++ = (q15_t) acc;
+ pA++;
+ }
+}
+#else
+void arm_conv_q15(
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst)
+{
+
+#if defined (ARM_MATH_DSP)
+
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ q15_t *pOut = pDst; /* Output pointer */
+ q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ const q15_t *px; /* Intermediate inputA pointer */
+ const q15_t *py; /* Intermediate inputB pointer */
+ const q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary input variables to hold state and coefficient values */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1U;
+ blockSize2 = srcALen - (srcBLen - 1U);
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1U;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations less than 4 */
+ /* Second part of this stage computes the MAC operations greater than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ while ((count < 4U) && (blockSize1 > 0U))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over number of MAC operations between
+ * inputA samples and inputB samples */
+ k = count;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + count;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* The second part of the stage starts here */
+ /* The internal loop, over count, is unrolled by 4 */
+ /* To, read the last two inputB samples using SIMD:
+ * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
+ py = py - 1;
+
+ while (blockSize1 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+ /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ * So, py is incremented by 1 */
+ py = py + 1U;
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + (count - 1U);
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* count is the index by which the pointer pIn1 to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize2 >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* step py back one sample so each 32-bit read fetches a (y[n], y[n-1]) pair */
+ py = py - 1U;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1] samples */
+ x0 = read_q15x2 ((q15_t *) px);
+
+ /* read x[1], x[2] samples */
+ x1 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the last two inputB samples using SIMD:
+ * y[srcBLen - 1] and y[srcBLen - 2] */
+ c0 = read_q15x2_da ((q15_t **) &py);
+
+ /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
+ acc0 = __SMLALDX(x0, c0, acc0);
+
+ /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
+ acc1 = __SMLALDX(x1, c0, acc1);
+
+ /* Read x[2], x[3] */
+ x2 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[3], x[4] */
+ x3 = read_q15x2 ((q15_t *) px + 1);
+
+ /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
+ acc2 = __SMLALDX(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
+ acc3 = __SMLALDX(x3, c0, acc3);
+
+ /* Read y[srcBLen - 3] and y[srcBLen - 4] */
+ c0 = read_q15x2_da ((q15_t **) &py);
+
+ /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
+ acc0 = __SMLALDX(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
+ acc1 = __SMLALDX(x3, c0, acc1);
+
+ /* Read x[4], x[5] */
+ x0 = read_q15x2 ((q15_t *) px + 2);
+
+ /* Read x[5], x[6] */
+ x1 = read_q15x2 ((q15_t *) px + 3);
+
+ px += 4U;
+
+ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
+ acc2 = __SMLALDX(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
+ acc3 = __SMLALDX(x1, c0, acc3);
+
+ } while (--k);
+
+ /* For the next MAC operations, SIMD is not used
+ * So, the 16 bit pointer of inputB, py is updated */
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ if (k == 1U)
+ {
+ /* Read y[srcBLen - 5] */
+ c0 = *(py + 1);
+#ifdef ARM_MATH_BIG_ENDIAN
+ c0 = c0 << 16U;
+#else
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
+ px++;
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALD(x0, c0, acc0);
+ acc1 = __SMLALD(x1, c0, acc1);
+ acc2 = __SMLALDX(x1, c0, acc2);
+ acc3 = __SMLALDX(x3, c0, acc3);
+ }
+
+ if (k == 2U)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = read_q15x2 ((q15_t *) py);
+
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[8], x[9] */
+ x2 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALDX(x0, c0, acc0);
+ acc1 = __SMLALDX(x1, c0, acc1);
+ acc2 = __SMLALDX(x3, c0, acc2);
+ acc3 = __SMLALDX(x2, c0, acc3);
+ }
+
+ if (k == 3U)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = read_q15x2 ((q15_t *) py);
+
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[8], x[9] */
+ x2 = read_q15x2 ((q15_t *) px + 1);
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALDX(x0, c0, acc0);
+ acc1 = __SMLALDX(x1, c0, acc1);
+ acc2 = __SMLALDX(x3, c0, acc2);
+ acc3 = __SMLALDX(x2, c0, acc3);
+
+ c0 = *(py-1);
+#ifdef ARM_MATH_BIG_ENDIAN
+ c0 = c0 << 16U;
+#else
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[9], x[10] */
+ x3 = read_q15x2 ((q15_t *) px + 2);
+ px += 3U;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALDX(x1, c0, acc0);
+ acc1 = __SMLALD(x2, c0, acc1);
+ acc2 = __SMLALDX(x2, c0, acc2);
+ acc3 = __SMLALDX(x3, c0, acc3);
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
+#else
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+ write_q15x2_ia (&pOut, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4U;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) ((q31_t) *px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The blockSize3 variable holds the number of MAC operations performed */
+ blockSize3 = srcBLen - 1U;
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ pIn2 = pSrc2 - 1U;
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations greater than 4 */
+ /* Second part of this stage computes the MAC operations less than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ j = blockSize3 >> 2U;
+
+ while ((j > 0U) && (blockSize3 > 0U))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = blockSize3 >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
+ * with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+ /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
+ * with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLALDX(read_q15x2_ia ((q15_t **) &px), read_q15x2_da ((q15_t **) &py), sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ * So, py is incremented by 1 */
+ py = py + 1U;
+
+ /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = blockSize3 % 0x4U;
+
+ while (k > 0U)
+ {
+ /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement loop counter */
+ blockSize3--;
+
+ j--;
+ }
+
+ /* The second part of the stage starts here */
+ /* SIMD is not used for the next MAC operations,
+ * so pointer py is updated to read only one sample at a time */
+ py = py + 1U;
+
+ while (blockSize3 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* blockSize3 (fewer than 4) MACs remain; no loop unrolling here */
+ k = blockSize3;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blockSize3--;
+ }
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+ const q15_t *pIn1 = pSrcA; /* InputA pointer */
+ const q15_t *pIn2 = pSrcB; /* InputB pointer */
+ q63_t sum; /* Accumulator */
+ uint32_t i, j; /* Loop counters */
+
+ /* Loop to calculate convolution for output length number of values */
+ for (i = 0; i < (srcALen + srcBLen - 1); i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0U; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if (((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q31_t) pIn1[j] * pIn2[i - j]);
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ pDst[i] = (q15_t) __SSAT((sum >> 15U), 16U);
+ }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Conv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q31.c
new file mode 100644
index 0000000000..ee37824fa8
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q31.c
@@ -0,0 +1,745 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_q31.c
+ * Description: Convolution of Q31 sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Conv
+ @{
+ */
+
+/**
+ @brief Convolution of Q31 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ There is no saturation on intermediate additions.
+ Thus, if the accumulator overflows it wraps around and distorts the result.
+ The input signals should be scaled down to avoid intermediate overflows.
+ Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
+ as maximum of min(srcALen, srcBLen) number of additions are carried internally.
+ The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+
+ @remark
+ Refer to \ref arm_conv_fast_q31() for a faster but less precise implementation of this function.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include "arm_helium_utils.h"
+#include "arm_vec_filtering.h"
+
+void arm_conv_q31(
+ const q31_t * pSrcA,
+ uint32_t srcALen,
+ const q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst)
+{
+ const q31_t *pIn1 = pSrcA; /* inputA pointer */
+ const q31_t *pIn2 = pSrcB; /* inputB pointer */
+ /*
+ * Loop to perform MAC operations according to convolution equation
+ */
+ const q31_t *pX;
+ const q31_t *pY;
+ const q31_t *pA;
+ const q31_t *pB;
+ int32_t i = 0U, j = 0; /* loop counters */
+ int32_t block1, block2, block3;
+ uint32_t vddupStartIdx = 3;
+ uint32x4_t decrIdxVec = vddupq_u32(vddupStartIdx, 1);
+
+ if (srcALen < srcBLen)
+ {
+ /*
+ * Initialization to inputB pointer
+ */
+ pIn1 = pSrcB;
+ /*
+ * Initialization to the end of inputA pointer
+ */
+ pIn2 = pSrcA;
+ /*
+ * Swapping the lengths
+ */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+ }
+
+ block1 = srcBLen - 1;
+ block2 = srcALen - srcBLen + 1;
+ block3 = srcBLen - 1;
+
+ pA = pIn1;
+ pB = pIn2 - 3;
+
+ for (i = 0; i <= block1 - 2; i += 2)
+ {
+ uint32_t count = i + 1;
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count);
+
+ *pDst++ = (q31_t) acc0;
+ *pDst++ = (q31_t) acc1;
+ pB += 2;
+ }
+ for (; i < block1; i++)
+ {
+ uint32_t count = i + 1;
+ int64_t acc = 0LL;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count);
+
+ *pDst++ = (q31_t) acc;
+ pB++;
+ }
+
+ for (i = 0; i <= block2 - 4; i += 4)
+ {
+ uint32_t count = srcBLen;
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+ int64_t acc2 = 0LL;
+ int64_t acc3 = 0LL;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 4 accumulators per loop
+ * size is fixed for all accumulators
+ * X pointer is incrementing for successive accumulators
+ */
+ MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, count);
+ *pDst++ = (q31_t) acc0;
+ *pDst++ = (q31_t) acc1;
+ *pDst++ = (q31_t) acc2;
+ *pDst++ = (q31_t) acc3;
+
+ pA += 4;
+ }
+
+ for (; i <= block2 - 2; i += 2)
+ {
+ uint32_t count = srcBLen;
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 2 accumulators per loop
+ * size is fixed for all accumulators
+ * X pointer is incrementing for successive accumulators
+ */
+ MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, count);
+ *pDst++ = (q31_t) acc0;
+ *pDst++ = (q31_t) acc1;
+
+ pA += 2;
+ }
+ if (block2 & 1)
+ {
+ uint32_t count = srcBLen;
+ int64_t acc = 0LL;
+
+ pX = pA;
+ pY = pB;
+
+ MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count);
+ *pDst++ = (q31_t) acc;
+ pA++;
+ }
+
+ for (i = block3; i >= 2; i -= 2)
+ {
+ uint32_t count = i;
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+
+ pX = pA;
+ pY = pB;
+
+ MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count);
+ *pDst++ = (q31_t) acc0;
+ *pDst++ = (q31_t) acc1;
+ pA += 2;
+ }
+
+ for (; i >= 1; i--)
+ {
+ uint32_t count = i;
+ int64_t acc = 0LL;
+
+ pX = pA;
+ pY = pB;
+
+ MVE_INTR_CONV_SINGLE_Q31(acc, pX, pY, count);
+ *pDst++ = (q31_t) acc;
+ pA++;
+ }
+
+}
+
+#else
+void arm_conv_q31(
+ const q31_t * pSrcA,
+ uint32_t srcALen,
+ const q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst)
+{
+
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
+
+ const q31_t *pIn1; /* InputA pointer */
+ const q31_t *pIn2; /* InputB pointer */
+ q31_t *pOut = pDst; /* Output pointer */
+ const q31_t *px; /* Intermediate inputA pointer */
+ const q31_t *py; /* Intermediate inputB pointer */
+ const q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q63_t sum; /* Accumulators */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc0, acc1, acc2; /* Accumulators */
+ q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */
+#endif
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1U;
+ blockSize2 = srcALen - (srcBLen - 1U);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1U;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while (blockSize1 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+ while (k > 0U)
+ {
+ /* x[0] * y[srcBLen - 1] */
+ sum += (q63_t) *px++ * (*py--);
+
+ /* x[1] * y[srcBLen - 2] */
+ sum += (q63_t) *px++ * (*py--);
+
+ /* x[2] * y[srcBLen - 3] */
+ sum += (q63_t) *px++ * (*py--);
+
+ /* x[3] * y[srcBLen - 4] */
+ sum += (q63_t) *px++ * (*py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + count;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unroll by 3 */
+ blkCnt = blockSize2 / 3;
+
+ while (blkCnt > 0U)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *px++;
+ x1 = *px++;
+
+ /* Apply loop unrolling and compute 3 MACs simultaneously. */
+ k = srcBLen / 3;
+
+ /* First part of the processing with loop unrolling. Compute 3 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 2 samples. */
+ do
+ {
+ /* Read y[srcBLen - 1] sample */
+ c0 = *(py);
+ /* Read x[2] sample */
+ x2 = *(px);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[0] * y[srcBLen - 1] */
+ acc0 += ((q63_t) x0 * c0);
+ /* acc1 += x[1] * y[srcBLen - 1] */
+ acc1 += ((q63_t) x1 * c0);
+ /* acc2 += x[2] * y[srcBLen - 1] */
+ acc2 += ((q63_t) x2 * c0);
+
+ /* Read y[srcBLen - 2] sample */
+ c0 = *(py - 1U);
+ /* Read x[3] sample */
+ x0 = *(px + 1U);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[1] * y[srcBLen - 2] */
+ acc0 += ((q63_t) x1 * c0);
+ /* acc1 += x[2] * y[srcBLen - 2] */
+ acc1 += ((q63_t) x2 * c0);
+ /* acc2 += x[3] * y[srcBLen - 2] */
+ acc2 += ((q63_t) x0 * c0);
+
+ /* Read y[srcBLen - 3] sample */
+ c0 = *(py - 2U);
+ /* Read x[4] sample */
+ x1 = *(px + 2U);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[2] * y[srcBLen - 3] */
+ acc0 += ((q63_t) x2 * c0);
+ /* acc1 += x[3] * y[srcBLen - 3] */
+ acc1 += ((q63_t) x0 * c0);
+ /* acc2 += x[4] * y[srcBLen - 3] */
+ acc2 += ((q63_t) x1 * c0);
+
+ /* update scratch pointers */
+ px += 3U;
+ py -= 3U;
+
+ } while (--k);
+
+ /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen - (3 * (srcBLen / 3));
+
+ while (k > 0U)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *py--;
+ /* Read x[7] sample */
+ x2 = *px++;
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[srcBLen - 5] */
+ acc0 += ((q63_t) x0 * c0);
+ /* acc1 += x[5] * y[srcBLen - 5] */
+ acc1 += ((q63_t) x1 * c0);
+ /* acc2 += x[6] * y[srcBLen - 5] */
+ acc2 += ((q63_t) x2 * c0);
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (acc0 >> 31);
+ *pOut++ = (q31_t) (acc1 >> 31);
+ *pOut++ = (q31_t) (acc2 >> 31);
+
+ /* Increment the pointer pIn1 index, count by 3 */
+ count += 3U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize2 - 3 * (blockSize2 / 3);
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = srcBLen >> 2U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) *px++ * *py--;
+ sum += (q63_t) *px++ * *py--;
+ sum += (q63_t) *px++ * *py--;
+ sum += (q63_t) *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = srcBLen % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Increment MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Increment MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The blockSize3 variable holds the number of MAC operations performed */
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while (blockSize3 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = blockSize3 >> 2U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = blockSize3 % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = blockSize3;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum += (q63_t) *px++ * *py--;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blockSize3--;
+ }
+
+#else
+/* alternate version for CM0_FAMILY */
+
+ const q31_t *pIn1 = pSrcA; /* InputA pointer */
+ const q31_t *pIn2 = pSrcB; /* InputB pointer */
+ q63_t sum; /* Accumulators */
+ uint32_t i, j; /* Loop counters */
+
+ /* Loop to calculate convolution for output length number of times */
+ for (i = 0U; i < (srcALen + srcBLen - 1U); i++)
+ {
+ /* Initialize sum with zero to carry out MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0U; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if (((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q63_t) pIn1[j] * pIn2[i - j]);
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ pDst[i] = (q31_t) (sum >> 31U);
+ }
+
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Conv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q7.c
new file mode 100644
index 0000000000..e29a16dd17
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_conv_q7.c
@@ -0,0 +1,860 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_conv_q7.c
+ * Description: Convolution of Q7 sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Conv
+ @{
+ */
+
+/**
+ @brief Convolution of Q7 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
+ The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
+ The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
+ @remark
+ Refer to \ref arm_conv_opt_q7() for a faster implementation of this function.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include "arm_helium_utils.h"
+
+#include "arm_vec_filtering.h"
+
+void arm_conv_q7(
+ const q7_t * pSrcA,
+ uint32_t srcALen,
+ const q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst)
+{
+ const q7_t *pIn1 = pSrcA; /* inputA pointer */
+ const q7_t *pIn2 = pSrcB; /* inputB pointer */
+ /*
+ * Loop to perform MAC operations according to convolution equation
+ */
+ const q7_t *pX;
+ const q7_t *pY;
+ const q7_t *pA;
+ const q7_t *pB;
+ int32_t i = 0U, j = 0; /* loop counters */
+ int32_t block1, block2, block3;
+ uint8_t vddupStartIdx = 15;
+ uint8x16_t decrIdxVec = vddupq_u8(vddupStartIdx, 1);
+
+ if (srcALen < srcBLen)
+ {
+ /*
+ * Initialization to inputB pointer
+ */
+ pIn1 = pSrcB;
+ /*
+ * Initialization to the end of inputA pointer
+ */
+ pIn2 = pSrcA;
+ /*
+ * Swapping the lengths
+ */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+ }
+
+ block1 = srcBLen - 1;
+ block2 = srcALen - srcBLen + 1;
+ block3 = srcBLen - 1;
+
+ pA = pIn1;
+ pB = pIn2 - 15;
+
+ for (i = 0; i <= block1 - 2; i += 2)
+ {
+ uint32_t count = i + 1;
+ int32_t acc0 = 0;
+ int32_t acc1 = 0;
+
+ pX = pA;
+ pY = pB;
+
+ MVE_INTR_CONV_DUAL_INC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count);
+ *pDst++ = (q7_t) acc0;
+ *pDst++ = (q7_t) acc1;
+ pB += 2;
+ }
+ for (; i < block1; i++)
+ {
+ uint32_t count = i + 1;
+ int32_t acc = 0;
+
+ pX = pA;
+ pY = pB;
+
+ MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count);
+ *pDst++ = (q7_t) acc;
+ pB++;
+ }
+
+ for (i = 0; i <= block2 - 4; i += 4)
+ {
+ uint32_t count = srcBLen;
+ int32_t acc0 = 0;
+ int32_t acc1 = 0;
+ int32_t acc2 = 0;
+ int32_t acc3 = 0;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 4 accumulators per loop
+ * size is fixed for all accumulators
+ * X pointer is incrementing for successive accumulators
+ */
+ MVE_INTR_CONV_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, count);
+ *pDst++ = (q7_t) acc0;
+ *pDst++ = (q7_t) acc1;
+ *pDst++ = (q7_t) acc2;
+ *pDst++ = (q7_t) acc3;
+ pA += 4;
+ }
+ for (; i <= block2 - 2; i += 2)
+ {
+ uint32_t count = srcBLen;
+ int32_t acc0 = 0;
+ int32_t acc1 = 0;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 2 accumulators per loop
+ * size is fixed for all accumulators
+ * X pointer is incrementing for successive accumulators
+ */
+ MVE_INTR_CONV_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, count);
+ *pDst++ = (q7_t) acc0;
+ *pDst++ = (q7_t) acc1;
+ pA += 2;
+ }
+ if (block2 & 1)
+ {
+ uint32_t count = srcBLen;
+ int32_t acc = 0;
+
+ pX = pA;
+ pY = pB;
+
+ MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count);
+ *pDst++ = (q7_t) acc;
+ pA++;
+ }
+
+ for (i = block3; i >= 2; i -= 2) /* pairs only; odd tail goes to the single-MAC loop below (see q31 variant) */
+ {
+ uint32_t count = i;
+ int32_t acc0 = 0;
+ int32_t acc1 = 0;
+
+ pX = pA;
+ pY = pB;
+
+ MVE_INTR_CONV_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count);
+ *pDst++ = (q7_t) acc0;
+ *pDst++ = (q7_t) acc1;
+ pA += 2;
+ }
+ for (; i >= 1; i--)
+ {
+ uint32_t count = i;
+ int32_t acc = 0;
+
+ pX = pA;
+ pY = pB;
+
+ MVE_INTR_CONV_SINGLE_Q7(acc, pX, pY, count);
+ *pDst++ = (q7_t) acc;
+ pA++;
+ }
+}
+
+#else
+void arm_conv_q7(
+ const q7_t * pSrcA,
+ uint32_t srcALen,
+ const q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst)
+{
+
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
+
+ const q7_t *pIn1; /* InputA pointer */
+ const q7_t *pIn2; /* InputB pointer */
+ q7_t *pOut = pDst; /* Output pointer */
+ const q7_t *px; /* Intermediate inputA pointer */
+ const q7_t *py; /* Intermediate inputB pointer */
+ const q7_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t sum; /* Accumulators */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ q31_t input1, input2; /* Temporary input variables */
+ q15_t in1, in2; /* Temporary input variables */
+ q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */
+#endif
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1U;
+ blockSize2 = srcALen - (srcBLen - 1U);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1U;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while (blockSize1 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+ while (k > 0U)
+ {
+ /* x[0] , x[1] */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* y[srcBLen - 1] , y[srcBLen - 2] */
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* x[0] * y[srcBLen - 1] */
+ /* x[1] * y[srcBLen - 2] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* x[2] , x[3] */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* y[srcBLen - 3] , y[srcBLen - 4] */
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* x[2] * y[srcBLen - 3] */
+ /* x[3] * y[srcBLen - 4] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q15_t) *px++ * *py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + count;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen] * y[srcBLen-1] + x[srcALen-srcBLen+1] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize2 >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[srcBLen - 1] sample */
+ c0 = *py--;
+ /* Read y[srcBLen - 2] sample */
+ c1 = *py--;
+
+ /* Read x[3] sample */
+ x3 = *px++;
+
+ /* x[0] and x[1] are packed */
+ in1 = (q15_t) x0;
+ in2 = (q15_t) x1;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* y[srcBLen - 1] and y[srcBLen - 2] are packed */
+ in1 = (q15_t) c0;
+ in2 = (q15_t) c1;
+
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
+ acc0 = __SMLAD(input1, input2, acc0);
+
+ /* x[1] and x[2] are packed */
+ in1 = (q15_t) x1;
+ in2 = (q15_t) x2;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
+ acc1 = __SMLAD(input1, input2, acc1);
+
+ /* x[2] and x[3] are packed */
+ in1 = (q15_t) x2;
+ in2 = (q15_t) x3;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
+ acc2 = __SMLAD(input1, input2, acc2);
+
+ /* Read x[4] sample */
+ x0 = *px++;
+
+ /* x[3] and x[4] are packed */
+ in1 = (q15_t) x3;
+ in2 = (q15_t) x0;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
+ acc3 = __SMLAD(input1, input2, acc3);
+
+ /* Read y[srcBLen - 3] sample */
+ c0 = *py--;
+ /* Read y[srcBLen - 4] sample */
+ c1 = *py--;
+
+ /* Read x[5] sample */
+ x1 = *px++;
+
+ /* x[2] and x[3] are packed */
+ in1 = (q15_t) x2;
+ in2 = (q15_t) x3;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
+ in1 = (q15_t) c0;
+ in2 = (q15_t) c1;
+
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
+ acc0 = __SMLAD(input1, input2, acc0);
+
+ /* x[3] and x[4] are packed */
+ in1 = (q15_t) x3;
+ in2 = (q15_t) x0;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
+ acc1 = __SMLAD(input1, input2, acc1);
+
+ /* x[4] and x[5] are packed */
+ in1 = (q15_t) x0;
+ in2 = (q15_t) x1;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
+ acc2 = __SMLAD(input1, input2, acc2);
+
+ /* Read x[6] sample */
+ x2 = *px++;
+
+ /* x[5] and x[6] are packed */
+ in1 = (q15_t) x1;
+ in2 = (q15_t) x2;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
+ acc3 = __SMLAD(input1, input2, acc3);
+
+ } while (--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *py--;
+ /* Read x[7] sample */
+ x3 = *px++;
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[srcBLen - 5] */
+ acc0 += ((q15_t) x0 * c0);
+ /* acc1 += x[5] * y[srcBLen - 5] */
+ acc1 += ((q15_t) x1 * c0);
+ /* acc2 += x[6] * y[srcBLen - 5] */
+ acc2 += ((q15_t) x2 * c0);
+ /* acc3 += x[7] * y[srcBLen - 5] */
+ acc3 += ((q15_t) x3 * c0);
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(acc0 >> 7U, 8));
+ *pOut++ = (q7_t) (__SSAT(acc1 >> 7U, 8));
+ *pOut++ = (q7_t) (__SSAT(acc2 >> 7U, 8));
+ *pOut++ = (q7_t) (__SSAT(acc3 >> 7U, 8));
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize2 % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = srcBLen >> 2U;
+
+ while (k > 0U)
+ {
+
+ /* Reading two inputs of SrcA buffer and packing */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* Reading two inputs of SrcB buffer and packing */
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* Perform the multiply-accumulate */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Reading two inputs of SrcA buffer and packing */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* Reading two inputs of SrcB buffer and packing */
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* Perform the multiply-accumulate */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = srcBLen % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q15_t) *px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q15_t) *px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The blockSize3 variable holds the number of MAC operations performed */
+
+ /* Working pointer of inputA */
+ pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1U);
+ py = pSrc2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while (blockSize3 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = blockSize3 >> 2U;
+
+ while (k > 0U)
+ {
+ /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
+ in1 = (q15_t) *py--;
+ in2 = (q15_t) *py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = blockSize3 % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = blockSize3;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum += ((q15_t) *px++ * *py--);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7U, 8));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement loop counter */
+ blockSize3--;
+ }
+
+#else
+/* alternate version for CM0_FAMILY */
+
+ const q7_t *pIn1 = pSrcA; /* InputA pointer */
+ const q7_t *pIn2 = pSrcB; /* InputB pointer */
+ q31_t sum; /* Accumulator */
+ uint32_t i, j; /* Loop counters */
+
+ /* Loop to calculate convolution for output length number of times */
+ for (i = 0U; i < (srcALen + srcBLen - 1U); i++)
+ {
+ /* Initialize sum with zero to carry out MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0U; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if (((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q15_t) pIn1[j] * pIn2[i - j]);
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ pDst[i] = (q7_t) __SSAT((sum >> 7U), 8U);
+ }
+
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Conv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f16.c
new file mode 100644
index 0000000000..d113d4572e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f16.c
@@ -0,0 +1,1159 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_correlate_f16.c
+ * Description: Correlation of floating-point sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @defgroup Corr Correlation
+
+ Correlation is a mathematical operation that is similar to convolution.
+ As with convolution, correlation uses two signals to produce a third signal.
+ The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
+ Correlation is commonly used to measure the similarity between two signals.
+ It has applications in pattern recognition, cryptanalysis, and searching.
+ The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
+ Fast versions of the Q15 and Q31 functions are also provided.
+
+ @par Algorithm
+ Let a[n] and b[n] be sequences of length srcALen and srcBLen samples respectively.
+ The convolution of the two signals is denoted by
+
+ c[n] = a[n] * b[n]
+
+ In correlation, one of the signals is flipped in time
+
+ c[n] = a[n] * b[-n]
+
+ @par
+ and this is mathematically defined as
+ \image html CorrelateEquation.gif
+ @par
+ The pSrcA points to the first input vector of length srcALen and pSrcB points to the second input vector of length srcBLen.
+ The result c[n] is of length 2 * max(srcALen, srcBLen) - 1 and is defined over the interval n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2).
+ The output result is written to pDst and the calling function must allocate 2 * max(srcALen, srcBLen) - 1 words for the result.
+
+ @note
+ The pDst should be initialized to all zeros before being used.
+
+ @par Fixed-Point Behavior
+ Correlation requires summing up a large number of intermediate products.
+ As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
+ Refer to the function specific documentation below for further details of the particular algorithm used.
+
+ @par Fast Versions
+ Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of correlate and the design requires
+ the input signals should be scaled down to avoid intermediate overflows.
+
+ @par Opt Versions
+                   Opt versions are supported for Q15 and Q7. The design uses an internal scratch buffer to achieve good optimisation.
+                   These versions are optimised in cycles and consume more memory (scratch memory) compared to the Q15 and Q7 versions of correlate
+ */
+
+/**
+ @addtogroup Corr
+ @{
+ */
+
+/**
+ @brief Correlation of floating-point sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @return none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_filtering.h"
+
+/*
+ * Dual-accumulator MAC kernel for the growing-overlap (first) stage of the
+ * correlation.  acc0 accumulates 'count' products of pX[] with pY[]; acc1
+ * accumulates 'count' + 1 products with the Y pointer shifted back by one
+ * element (loads from &pSrcY[-1], and one extra lane is unmasked via
+ * vctp16q(k + 1)).  Tail samples are handled with MVE lane predication, so
+ * no scalar cleanup loop is needed.
+ */
+#define MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_F16(acc0, acc1, pX, pY, count)    \
+{                                                                           \
+    float16_t const *pSrcX, *pSrcY;                                         \
+    f16x8_t   acc0Vec, acc1Vec, xVec, yVec;                                 \
+    uint32_t    k;                                                          \
+                                                                            \
+    acc0Vec = vdupq_n_f16(0.0f);                                            \
+    acc1Vec = vdupq_n_f16(0.0f);                                            \
+    pSrcX = (float16_t const *) pX;                                         \
+    pSrcY = (float16_t const *) pY;                                         \
+    k = count >> 3;                                                         \
+    while (k > 0U)                                                          \
+    {                                                                       \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
+        yVec = vldrhq_f16(&pSrcY[-1]);                                      \
+        acc1Vec = vfmaq_f16(acc1Vec, xVec, yVec);                           \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                   \
+        acc0Vec = vfmaq_f16(acc0Vec, xVec, yVec);                           \
+        /* Decrement the loop counter */                                    \
+        k--;                                                                \
+    }                                                                       \
+    k = count % 0x8U;                                                       \
+    /* use predication to finalize MAC sum */                               \
+    /* acc1 requires 1 additional sample */                                 \
+    /* so add 1 to unmask an extra lane in final MAC computation */         \
+    mve_pred16_t p0 = vctp16q(k+1);                                         \
+    xVec = vld1q(pSrcX);  pSrcX += 8;                                       \
+    yVec = vldrhq_f16(&pSrcY[-1]);                                          \
+    acc1Vec = vfmaq_m_f16(acc1Vec, xVec, yVec,p0);                          \
+    /* acc0 requires exact number of sample */                              \
+    /* disable extra lanes in final MAC computation */                      \
+    p0 = vctp16q(k);                                                        \
+    yVec = vld1q(pSrcY);  pSrcY += 8;                                       \
+    acc0Vec = vfmaq_m_f16(acc0Vec, xVec, yVec,p0);                          \
+                                                                            \
+    acc0 = vecAddAcrossF16Mve(acc0Vec);                                     \
+    acc1 = vecAddAcrossF16Mve(acc1Vec);                                     \
+}
+
+/*
+ * Single-accumulator MAC kernel: acc = sum of 'count' products of pX[] with
+ * pY[], both pointers advancing forward.  The main loop consumes 8 f16
+ * lanes per iteration; the remainder (count % 8) is folded in with a single
+ * tail-predicated FMA.
+ */
+#define MVE_INTR_CORR_SINGLE_F16(acc, pX, pY, count)                        \
+{                                                                           \
+    float16_t const *pSrcX, *pSrcY;                                         \
+    f16x8_t   accVec, xVec, yVec;                                           \
+    uint16_t    k;                                                          \
+                                                                            \
+    accVec = vdupq_n_f16(0.0f);                                             \
+    pSrcX = (float16_t const *) pX;                                         \
+    pSrcY = (float16_t const *) pY;                                         \
+    k = count >> 3;                                                         \
+                                                                            \
+    while (k > 0U)                                                          \
+    {                                                                       \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
+        accVec = vfmaq(accVec, xVec, yVec);                                 \
+        /* Decrement the loop counter */                                    \
+        k--;                                                                \
+    }                                                                       \
+    /* Loop with tail predication expected here  */                         \
+    k = count % 0x8U;                                                       \
+    if (k > 0U)                                                             \
+    {                                                                       \
+        mve_pred16_t p0 = vctp16q(k);                                       \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                   \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
+        accVec = vfmaq_m(accVec, xVec, yVec, p0);                           \
+    }                                                                       \
+                                                                            \
+    acc = vecAddAcrossF16Mve(accVec);                                       \
+}
+
+/*
+ * Quad-accumulator MAC kernel for the full-overlap (middle) stage.  All four
+ * accumulators use the same fixed length 'count'; accN correlates against the
+ * X stream offset by N elements (loads from &pSrcX[1], [2], [3] reuse the
+ * same yVec).  Tail samples are folded in with one predicated FMA per
+ * accumulator, all sharing the same vctp16q(k) mask.
+ */
+#define MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_F16(acc0, acc1, acc2, acc3, pX, pY, count) \
+{                                                                           \
+    float16_t const *pSrcX, *pSrcY;                                         \
+    f16x8_t   acc0Vec, acc1Vec, acc2Vec, acc3Vec, xVec, yVec;               \
+    uint32_t    k;                                                          \
+                                                                            \
+    acc0Vec = vdupq_n_f16(0.0f);                                            \
+    acc1Vec = vdupq_n_f16(0.0f);                                            \
+    acc2Vec = vdupq_n_f16(0.0f);                                            \
+    acc3Vec = vdupq_n_f16(0.0f);                                            \
+    pSrcX = (float16_t const *) pX;                                         \
+    pSrcY = (float16_t const *) pY;                                         \
+    k = count >> 3;                                                         \
+                                                                            \
+    while (k > 0U)                                                          \
+    {                                                                       \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                   \
+        xVec = vldrhq_f16(&pSrcX[1]);                                       \
+        acc1Vec = vfmaq_f16(acc1Vec, xVec, yVec);                           \
+        xVec = vldrhq_f16(&pSrcX[2]);                                       \
+        acc2Vec = vfmaq_f16(acc2Vec, xVec, yVec);                           \
+        xVec = vldrhq_f16(&pSrcX[3]);                                       \
+        acc3Vec = vfmaq_f16(acc3Vec, xVec, yVec);                           \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
+        acc0Vec = vfmaq_f16(acc0Vec, xVec, yVec);                           \
+        /* Decrement the loop counter */                                    \
+        k--;                                                                \
+    }                                                                       \
+    /* loop + tail predication expected here  */                            \
+    k = count % 0x8U;                                                       \
+    if (k > 0U)                                                             \
+    {                                                                       \
+        mve_pred16_t p0 = vctp16q(k);                                       \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                   \
+        xVec = vldrhq_f16(&pSrcX[1]);                                       \
+        acc1Vec = vfmaq_m_f16(acc1Vec, xVec, yVec, p0);                     \
+        xVec = vldrhq_f16(&pSrcX[2]);                                       \
+        acc2Vec = vfmaq_m_f16(acc2Vec, xVec, yVec, p0);                     \
+        xVec = vldrhq_f16(&pSrcX[3]);                                       \
+        acc3Vec = vfmaq_m_f16(acc3Vec, xVec, yVec, p0);                     \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
+        acc0Vec = vfmaq_m_f16(acc0Vec, xVec, yVec, p0);                     \
+    }                                                                       \
+                                                                            \
+    acc0 = vecAddAcrossF16Mve(acc0Vec);                                     \
+    acc1 = vecAddAcrossF16Mve(acc1Vec);                                     \
+    acc2 = vecAddAcrossF16Mve(acc2Vec);                                     \
+    acc3 = vecAddAcrossF16Mve(acc3Vec);                                     \
+}
+
+/*
+ * Dual-accumulator MAC kernel, fixed size: both accumulators use the same
+ * length 'count'; acc1 correlates against the X stream offset by one
+ * element (loads from &pSrcX[1] reuse the same yVec).  Tail samples are
+ * folded in with one predicated FMA per accumulator sharing the same
+ * vctp16q(k) mask.
+ *
+ * Fix: removed a stray double semicolon after "pSrcY += 8" in the tail
+ * section (harmless empty statement, but inconsistent with the sibling
+ * macros).
+ */
+#define MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_F16(acc0, acc1, pX, pY, count)  \
+{                                                                           \
+    float16_t const *pSrcX, *pSrcY;                                         \
+    f16x8_t   acc0Vec, acc1Vec, xVec, yVec;                                 \
+    uint32_t    k;                                                          \
+                                                                            \
+    acc0Vec = vdupq_n_f16(0.0f);                                            \
+    acc1Vec = vdupq_n_f16(0.0f);                                            \
+    pSrcX = (float16_t const *) pX;                                         \
+    pSrcY = (float16_t const *) pY;                                         \
+    k = count >> 3;                                                         \
+                                                                            \
+    while (k > 0U)                                                          \
+    {                                                                       \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                   \
+        xVec = vldrhq_f16(&pSrcX[1]);                                       \
+        acc1Vec = vfmaq_f16(acc1Vec, xVec, yVec);                           \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
+        acc0Vec = vfmaq_f16(acc0Vec, xVec, yVec);                           \
+        /* Decrement the loop counter */                                    \
+        k--;                                                                \
+    }                                                                       \
+    /* loop + tail predication expected here  */                            \
+    k = count % 0x8U;                                                       \
+    if (k > 0U)                                                             \
+    {                                                                       \
+        mve_pred16_t p0 = vctp16q(k);                                       \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                   \
+        xVec = vldrhq_f16(&pSrcX[1]);                                       \
+        acc1Vec = vfmaq_m_f16(acc1Vec, xVec, yVec, p0);                     \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
+        acc0Vec = vfmaq_m_f16(acc0Vec, xVec, yVec, p0);                     \
+    }                                                                       \
+                                                                            \
+    acc0 = vecAddAcrossF16Mve(acc0Vec);                                     \
+    acc1 = vecAddAcrossF16Mve(acc1Vec);                                     \
+}
+
+/*
+ * Dual-accumulator MAC kernel for the shrinking-overlap (last) stage.
+ * acc0 accumulates 'count' products of pX[] with pY[]; acc1 accumulates
+ * 'count' - 1 products with the X pointer advanced by one element
+ * (loads from &pSrcX[1]).  The main loop runs (count-1)/8 iterations; the
+ * tail uses vctp16q(k) for acc1 and vctp16q(k + 1) to unmask the one extra
+ * lane acc0 needs.
+ */
+#define MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_F16(acc0, acc1, pX, pY, count)    \
+{                                                                           \
+    float16_t const *pSrcX, *pSrcY;                                         \
+    f16x8_t   acc0Vec, acc1Vec, xVec, yVec;                                 \
+    uint32_t    k;                                                          \
+                                                                            \
+    acc0Vec = vdupq_n_f16(0.0f);                                            \
+    acc1Vec = vdupq_n_f16(0.0f);                                            \
+    pSrcX = (float16_t const *) pX;                                         \
+    pSrcY = (float16_t const *) pY;                                         \
+    k = (count-1) >> 3;                                                     \
+                                                                            \
+    while (k > 0U)                                                          \
+    {                                                                       \
+        yVec = vld1q(pSrcY);  pSrcY += 8;                                   \
+        xVec = vldrhq_f16(&pSrcX[1]);                                       \
+        acc1Vec = vfmaq_f16(acc1Vec, xVec, yVec);                           \
+        xVec = vld1q(pSrcX);  pSrcX += 8;                                   \
+        acc0Vec = vfmaq_f16(acc0Vec, xVec, yVec);                           \
+        /* Decrement the loop counter */                                    \
+        k--;                                                                \
+    }                                                                       \
+    /* use predication to finalize MAC sum */                               \
+    /* acc1 requires exact number of sample (count-1) */                    \
+    /* disable extra lanes in final MAC computation */                      \
+    k = (count-1) % 0x8U;                                                   \
+    mve_pred16_t p0 = vctp16q(k);                                           \
+    yVec = vld1q(pSrcY);  pSrcY += 8;                                       \
+    xVec = vldrhq_f16(&pSrcX[1]);                                           \
+    acc1Vec = vfmaq_m_f16(acc1Vec, xVec, yVec, p0);                         \
+    /* acc0 requires 1 additional sample (count) */                         \
+    /* so add 1 to unmask an extra lane in final MAC computation */         \
+    p0 = vctp16q(k+1);                                                      \
+    xVec = vld1q(pSrcX);  pSrcX += 8;                                       \
+    acc0Vec = vfmaq_m_f16(acc0Vec, xVec, yVec, p0);                         \
+                                                                            \
+    acc0 = vecAddAcrossF16Mve(acc0Vec);                                     \
+    acc1 = vecAddAcrossF16Mve(acc1Vec);                                     \
+}
+
+
+
+/*
+ * Helium (MVE) implementation.  The inputs are first canonicalized so that
+ * pIn1/pA points at the longer sequence; when the inputs are swapped the
+ * output is written back-to-front (incr == -1) because CORR(x,y) is the
+ * reverse of CORR(y,x).  The output is then produced in three regions
+ * driven by the MVE_INTR_CORR_* kernels above.
+ */
+void arm_correlate_f16(
+  const float16_t * pSrcA,
+        uint32_t srcALen,
+  const float16_t * pSrcB,
+        uint32_t srcBLen,
+        float16_t * pDst)
+{
+    float16_t *pIn1 = (float16_t *)pSrcA;                  /* inputA pointer */
+    float16_t *pIn2 = (float16_t *)pSrcB + (srcBLen - 1U); /* inputB pointer */
+    float16_t *pX;
+    float16_t *pY;
+    float16_t *pA;
+    float16_t *pB;
+    int32_t   i = 0U, j = 0;  /* loop counters */
+    int32_t   inv = 2U;       /* Reverse order flag */
+    uint32_t  tot = 0U;       /* Length */
+    int32_t   block1, block2, block3;
+    int32_t   incr;
+
+    /* tot is the index of the last output sample; it is used below to park
+       pDst at the far end of the output when the inputs are swapped. */
+    tot = ((srcALen + srcBLen) - 2U);
+    if (srcALen > srcBLen)
+    {
+        /*
+         * Calculating the number of zeros to be padded to the output
+         */
+        j = srcALen - srcBLen;
+        /*
+         * Initialize the pointer after zero padding
+         */
+        pDst += j;
+    }
+    else if (srcALen < srcBLen)
+    {
+        /*
+         * Initialization to inputB pointer
+         */
+        pIn1 = (float16_t *)pSrcB;
+        /*
+         * Initialization to the end of inputA pointer
+         */
+        pIn2 = (float16_t *)pSrcA + (srcALen - 1U);
+        /*
+         * Initialisation of the pointer after zero padding
+         */
+        pDst = pDst + tot;
+        /*
+         * Swapping the lengths
+         */
+
+        j = srcALen;
+        srcALen = srcBLen;
+        srcBLen = j;
+        /*
+         * Setting the reverse flag
+         */
+        inv = -2;
+
+    }
+
+    /* Output regions: block1 = growing overlap (srcBLen-1 outputs),
+       block2 = full overlap (srcALen-srcBLen+1 outputs),
+       block3 = shrinking overlap (srcBLen-1 outputs). */
+    block1 = srcBLen - 1;
+    block2 = srcALen - srcBLen + 1;
+    block3 = srcBLen - 1;
+
+    pA = pIn1;
+    pB = pIn2;
+    incr = inv / 2;   /* pDst step: +1 natural order, -1 when writing the result in reverse */
+
+    /* Region 1: overlap grows by one sample per output; 2 outputs per
+       iteration, Y pointer stepping back by 2 between iterations. */
+    for (i = 0U; i <= block1 - 2; i += 2)
+    {
+        uint32_t  count = i + 1;
+        _Float16 acc0;
+        _Float16 acc1;
+        /*
+         * compute 2 accumulators per loop
+         * size is incrementing for second accumulator
+         * Y pointer is decrementing for second accumulator
+         */
+        pX = pA;
+        pY = pB;
+        MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_F16(acc0, acc1, pX, pY, count);
+
+        *pDst = acc0;
+        pDst += incr;
+        *pDst = acc1;
+        pDst += incr;
+        pB -= 2;
+    }
+    /* Region 1 leftover: at most one remaining output when block1 is odd. */
+    for (; i < block1; i++)
+    {
+        uint32_t  count = i + 1;
+        _Float16 acc;
+
+        pX = pA;
+        pY = pB;
+        MVE_INTR_CORR_SINGLE_F16(acc, pX, pY, count);
+
+        *pDst = acc;
+        pDst += incr;
+        pB--;
+    }
+
+    /* Region 2: fixed overlap of srcBLen samples; 4 outputs per iteration,
+       X pointer advancing by 4 between iterations. */
+    for (i = 0U; i <= block2 - 4; i += 4)
+    {
+        _Float16 acc0;
+        _Float16 acc1;
+        _Float16 acc2;
+        _Float16 acc3;
+
+        pX = pA;
+        pY = pB;
+        /*
+         * compute 4 accumulators per loop
+         * size is fixed for all accumulators
+         * X pointer is incrementing for successive accumulators
+         */
+        MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_F16(acc0, acc1, acc2, acc3, pX, pY, srcBLen);
+
+        *pDst = acc0;
+        pDst += incr;
+        *pDst = acc1;
+        pDst += incr;
+        *pDst = acc2;
+        pDst += incr;
+        *pDst = acc3;
+        pDst += incr;
+        pA += 4;
+    }
+
+    /* Region 2 leftover: pairs, then a possible single output. */
+    for (; i <= block2 - 2; i += 2)
+    {
+        _Float16 acc0;
+        _Float16 acc1;
+
+        pX = pA;
+        pY = pB;
+        /*
+         * compute 2 accumulators per loop
+         * size is fixed for all accumulators
+         * X pointer is incrementing for second accumulator
+         */
+        MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_F16(acc0, acc1, pX, pY, srcBLen);
+
+        *pDst = acc0;
+        pDst += incr;
+        *pDst = acc1;
+        pDst += incr;
+        pA += 2;
+    }
+
+    if (block2 & 1)
+    {
+        _Float16 acc;
+
+        pX = pA;
+        pY = pB;
+        MVE_INTR_CORR_SINGLE_F16(acc, pX, pY, srcBLen);
+
+        *pDst = acc;
+        pDst += incr;
+        pA++;
+    }
+
+    /* Region 3: overlap shrinks by one sample per output; 2 outputs per
+       iteration, X pointer advancing by 2 between iterations.
+       NOTE(review): with the bound "i >= 0" the final dual iteration can run
+       with count == 1 when block3 is odd (acc1 then sums zero samples) and
+       the trailing single-output loop is never entered — confirm against the
+       upstream CMSIS-DSP version whether the bound should be "i > 0". */
+    for (i = block3 - 1; i >= 0; i -= 2)
+    {
+
+        uint32_t  count = (i + 1);
+        _Float16 acc0;
+        _Float16 acc1;
+
+        pX = pA;
+        pY = pB;
+        /*
+         * compute 2 accumulators per loop
+         * size is decrementing for second accumulator
+         * X pointer is incrementing for second accumulator
+         */
+        MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_F16(acc0, acc1, pX, pY, count);
+
+        *pDst = acc0;
+        pDst += incr;
+        *pDst = acc1;
+        pDst += incr;
+        pA += 2;
+
+    }
+    for (; i >= 0; i--)
+    {
+        uint32_t  count = (i + 1);
+        _Float16 acc;
+
+        pX = pA;
+        pY = pB;
+        MVE_INTR_CORR_SINGLE_F16(acc, pX, pY, count);
+
+        *pDst = acc;
+        pDst += incr;
+        pA++;
+    }
+}
+
+#else
+void arm_correlate_f16(
+ const float16_t * pSrcA,
+ uint32_t srcALen,
+ const float16_t * pSrcB,
+ uint32_t srcBLen,
+ float16_t * pDst)
+{
+
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+ const float16_t *pIn1; /* InputA pointer */
+ const float16_t *pIn2; /* InputB pointer */
+ float16_t *pOut = pDst; /* Output pointer */
+ const float16_t *px; /* Intermediate inputA pointer */
+ const float16_t *py; /* Intermediate inputB pointer */
+ const float16_t *pSrc1;
+ _Float16 sum;
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+ uint32_t outBlockSize; /* Loop counter */
+ int32_t inc = 1; /* Destination address modifier */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ _Float16 acc0, acc1, acc2, acc3,c0; /* Accumulators */
+ _Float16 x0, x1, x2, x3; /* temporary variables for holding input and coefficient values */
+#endif
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+  /* But to improve the performance,
+   * we assume zeroes in the output instead of zero padding either of the inputs */
+  /* If srcALen > srcBLen,
+   * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
+  /* If srcALen < srcBLen,
+   * (srcALen - srcBLen) zeroes have to be included at the end of the output buffer */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2U * srcALen) - 1U;
+
+ /* When srcALen > srcBLen, zero padding has to be done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1U));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2U);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+ }
+
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1U;
+ blockSize2 = srcALen - (srcBLen - 1U);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[srcBlen - 1]
+ * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
+ * ....
+ * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1U;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc1 = pIn2 + (srcBLen - 1U);
+ py = pSrc1;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while (blockSize1 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f16;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* x[0] * y[srcBLen - 4] */
+ sum += *px++ * *py++;
+
+ /* x[1] * y[srcBLen - 3] */
+ sum += *px++ * *py++;
+
+ /* x[2] * y[srcBLen - 2] */
+ sum += *px++ * *py++;
+
+ /* x[3] * y[srcBLen - 1] */
+ sum += *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* x[0] * y[srcBLen - 1] */
+ sum += *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pSrc1 - count;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize2 >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0.0f16;
+ acc1 = 0.0f16;
+ acc2 = 0.0f16;
+ acc3 = 0.0f16;
+
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[0] sample */
+ c0 = *(py++);
+ /* Read x[3] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[0] * y[0] */
+ acc0 += x0 * c0;
+ /* acc1 += x[1] * y[0] */
+ acc1 += x1 * c0;
+ /* acc2 += x[2] * y[0] */
+ acc2 += x2 * c0;
+ /* acc3 += x[3] * y[0] */
+ acc3 += x3 * c0;
+
+ /* Read y[1] sample */
+ c0 = *(py++);
+ /* Read x[4] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[1] * y[1] */
+ acc0 += x1 * c0;
+ /* acc1 += x[2] * y[1] */
+ acc1 += x2 * c0;
+ /* acc2 += x[3] * y[1] */
+ acc2 += x3 * c0;
+ /* acc3 += x[4] * y[1] */
+ acc3 += x0 * c0;
+
+ /* Read y[2] sample */
+ c0 = *(py++);
+ /* Read x[5] sample */
+ x1 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[2] * y[2] */
+ acc0 += x2 * c0;
+ /* acc1 += x[3] * y[2] */
+ acc1 += x3 * c0;
+ /* acc2 += x[4] * y[2] */
+ acc2 += x0 * c0;
+ /* acc3 += x[5] * y[2] */
+ acc3 += x1 * c0;
+
+ /* Read y[3] sample */
+ c0 = *(py++);
+ /* Read x[6] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[3] * y[3] */
+ acc0 += x3 * c0;
+ /* acc1 += x[4] * y[3] */
+ acc1 += x0 * c0;
+ /* acc2 += x[5] * y[3] */
+ acc2 += x1 * c0;
+ /* acc3 += x[6] * y[3] */
+ acc3 += x2 * c0;
+
+ } while (--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Read y[4] sample */
+ c0 = *(py++);
+ /* Read x[7] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[4] * y[4] */
+ acc0 += x0 * c0;
+ /* acc1 += x[5] * y[4] */
+ acc1 += x1 * c0;
+ /* acc2 += x[6] * y[4] */
+ acc2 += x2 * c0;
+ /* acc3 += x[7] * y[4] */
+ acc3 += x3 * c0;
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = acc0;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ *pOut = acc1;
+ pOut += inc;
+
+ *pOut = acc2;
+ pOut += inc;
+
+ *pOut = acc3;
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize2 % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f16;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = srcBLen >> 2U;
+
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py++;
+ sum += *px++ * *py++;
+ sum += *px++ * *py++;
+ sum += *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum;
+
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f16;
+
+ /* Loop over srcBLen */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * ....
+ * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+ * sum += x[srcALen-1] * y[0]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1U;
+
+ /* Working pointer of inputA */
+ pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while (blockSize3 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f16;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* sum += x[srcALen - srcBLen + 4] * y[3] */
+ sum += *px++ * *py++;
+
+ /* sum += x[srcALen - srcBLen + 3] * y[2] */
+ sum += *px++ * *py++;
+
+ /* sum += x[srcALen - srcBLen + 2] * y[1] */
+ sum += *px++ * *py++;
+
+ /* sum += x[srcALen - srcBLen + 1] * y[0] */
+ sum += *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+#else
+/* alternate version for CM0_FAMILY */
+
+ const float16_t *pIn1 = pSrcA; /* inputA pointer */
+ const float16_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
+ _Float16 sum; /* Accumulator */
+ uint32_t i = 0U, j; /* Loop counters */
+ uint32_t inv = 0U; /* Reverse order flag */
+ uint32_t tot = 0U; /* Length */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and a variable, inv, is set to 1 */
+ /* If lengths are not equal then zero pad has to be done to make the two
+ * inputs of same length. But to improve the performance, we assume zeroes
+ * in the output instead of zero padding either of the inputs */
+ /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes have to be included in the
+ * starting of the output buffer */
+ /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes have to be included in the
+ * ending of the output buffer */
+ /* Once the zero padding is done the remainder of the output is calculated
+ * using convolution but with the shorter signal time shifted. */
+
+ /* Calculate the length of the remaining sequence */
+ tot = ((srcALen + srcBLen) - 2U);
+
+ if (srcALen > srcBLen)
+ {
+ /* Calculating the number of zeros to be padded to the output */
+ j = srcALen - srcBLen;
+
+ /* Initialise the pointer after zero padding */
+ pDst += j;
+ }
+
+ else if (srcALen < srcBLen)
+ {
+ /* Initialization to inputB pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization to the end of inputA pointer */
+ pIn2 = pSrcA + (srcALen - 1U);
+
+ /* Initialisation of the pointer after zero padding */
+ pDst = pDst + tot;
+
+ /* Swapping the lengths */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+
+ /* Setting the reverse flag */
+ inv = 1;
+
+ }
+
+ /* Loop to calculate convolution for output length number of times */
+ for (i = 0U; i <= tot; i++)
+ {
+ /* Initialize sum with zero to carry out MAC operations */
+ sum = 0.0f16;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0U; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if ((((i - j) < srcBLen) && (j < srcALen)))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)];
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ if (inv == 1)
+ *pDst-- = sum;
+ else
+ *pDst++ = sum;
+ }
+
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of Corr group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f32.c
new file mode 100644
index 0000000000..30dc4dd535
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_f32.c
@@ -0,0 +1,1095 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_correlate_f32.c
+ * Description: Correlation of floating-point sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @defgroup Corr Correlation
+
+ Correlation is a mathematical operation that is similar to convolution.
+ As with convolution, correlation uses two signals to produce a third signal.
+ The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
+ Correlation is commonly used to measure the similarity between two signals.
+ It has applications in pattern recognition, cryptanalysis, and searching.
+ The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
+ Fast versions of the Q15 and Q31 functions are also provided.
+
+ @par Algorithm
+ Let a[n] and b[n] be sequences of length srcALen and srcBLen samples respectively.
+ The convolution of the two signals is denoted by
+
+ c[n] = a[n] * b[n]
+
+ In correlation, one of the signals is flipped in time
+
+ c[n] = a[n] * b[-n]
+
+ @par
+ and this is mathematically defined as
+ \image html CorrelateEquation.gif
+ @par
+ The pSrcA points to the first input vector of length srcALen and pSrcB points to the second input vector of length srcBLen.
+ The result c[n] is of length 2 * max(srcALen, srcBLen) - 1 and is defined over the interval n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2).
+ The output result is written to pDst and the calling function must allocate 2 * max(srcALen, srcBLen) - 1 words for the result.
+
+ @note
+ The pDst should be initialized to all zeros before being used.
+
+ @par Fixed-Point Behavior
+ Correlation requires summing up a large number of intermediate products.
+ As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
+ Refer to the function specific documentation below for further details of the particular algorithm used.
+
+ @par Fast Versions
+ Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of correlate and the design requires
+ the input signals should be scaled down to avoid intermediate overflows.
+
+ @par Opt Versions
+ Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
+ These versions are optimised in cycles and consume more memory (Scratch memory) compared to Q15 and Q7 versions of correlate
+ */
+
+/**
+ @addtogroup Corr
+ @{
+ */
+
+/**
+ @brief Correlation of floating-point sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_filtering.h"
+
+
+void arm_correlate_f32(
+ const float32_t * pSrcA,
+ uint32_t srcALen,
+ const float32_t * pSrcB,
+ uint32_t srcBLen,
+ float32_t * pDst)
+{
+ const float32_t *pIn1 = pSrcA; /* inputA pointer */
+ const float32_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
+ const float32_t *pX, *pY;
+ const float32_t *pA, *pB;
+ int32_t i = 0U, j = 0; /* loop counters */
+ int32_t inv = 4U; /* Reverse order flag */
+ uint32_t tot = 0U; /* Length */
+ int32_t block1, block2, block3;
+ int32_t incr;
+
+ tot = ((srcALen + srcBLen) - 2U);
+ if (srcALen > srcBLen)
+ {
+ /*
+ * Calculating the number of zeros to be padded to the output
+ */
+ j = srcALen - srcBLen;
+ /*
+ * Initialize the pointer after zero padding
+ */
+ pDst += j;
+ }
+ else if (srcALen < srcBLen)
+ {
+ /*
+ * Initialization to inputB pointer
+ */
+ pIn1 = pSrcB;
+ /*
+ * Initialization to the end of inputA pointer
+ */
+ pIn2 = pSrcA + (srcALen - 1U);
+ /*
+ * Initialisation of the pointer after zero padding
+ */
+ pDst = pDst + tot;
+ /*
+ * Swapping the lengths
+ */
+
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+ /*
+ * Setting the reverse flag
+ */
+ inv = -4;
+ }
+
+ block1 = srcBLen - 1;
+ block2 = srcALen - srcBLen + 1;
+ block3 = srcBLen - 1;
+
+ pA = pIn1;
+ pB = pIn2;
+ incr = inv / 4;
+
+ for (i = 0U; i <= block1 - 2; i += 2)
+ {
+ uint32_t count = i + 1;
+ float32_t acc0;
+ float32_t acc1;
+
+ /*
+ * compute 2 accumulators per loop
+ * size is incrementing for second accumulator
+ * Y pointer is decrementing for second accumulator
+ */
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_F32(acc0, acc1, pX, pY, count);
+
+ *pDst = acc0;
+ pDst += incr;
+ *pDst = acc1;
+ pDst += incr;
+ pB -= 2;
+ }
+ for (; i < block1; i++)
+ {
+ uint32_t count = i + 1;
+ float32_t acc;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_SINGLE_F32(acc, pX, pY, count);
+
+ *pDst = acc;
+ pDst += incr;
+ pB--;
+ }
+
+ for (i = 0U; i <= block2 - 4; i += 4)
+ {
+ float32_t acc0;
+ float32_t acc1;
+ float32_t acc2;
+ float32_t acc3;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 4 accumulators per loop
+ * size is fixed for all accumulators
+ * X pointer is incrementing for successive accumulators
+ */
+ MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_F32(acc0, acc1, acc2, acc3, pX, pY, srcBLen);
+
+ *pDst = acc0;
+ pDst += incr;
+ *pDst = acc1;
+ pDst += incr;
+ *pDst = acc2;
+ pDst += incr;
+ *pDst = acc3;
+ pDst += incr;
+ pA += 4;
+ }
+
+ for (; i <= block2 - 2; i += 2)
+ {
+ float32_t acc0;
+ float32_t acc1;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 2 accumulators per loop
+ * size is fixed for all accumulators
+ * X pointer is incrementing for second accumulator
+ */
+ MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_F32(acc0, acc1, pX, pY, srcBLen);
+
+ *pDst = acc0;
+ pDst += incr;
+ *pDst = acc1;
+ pDst += incr;
+ pA += 2;
+ }
+
+ if (block2 & 1)
+ {
+ float32_t acc;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_SINGLE_F32(acc, pX, pY, srcBLen);
+
+ *pDst = acc;
+ pDst += incr;
+ pA++;
+ }
+
+ for (i = block3 - 1; i >= 0; i -= 2)
+ {
+
+ uint32_t count = (i + 1);
+ float32_t acc0;
+ float32_t acc1;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 2 accumulators per loop
+ * size is decrementing for second accumulator
+ * X pointer is incrementing for second accumulator
+ */
+ MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_F32(acc0, acc1, pX, pY, count);
+
+ *pDst = acc0;
+ pDst += incr;
+ *pDst = acc1;
+ pDst += incr;
+ pA += 2;
+
+ }
+ for (; i >= 0; i--)
+ {
+ uint32_t count = (i + 1);
+ float32_t acc;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_SINGLE_F32(acc, pX, pY, count);
+
+ *pDst = acc;
+ pDst += incr;
+ pA++;
+ }
+}
+
+#else
+void arm_correlate_f32(
+ const float32_t * pSrcA,
+ uint32_t srcALen,
+ const float32_t * pSrcB,
+ uint32_t srcBLen,
+ float32_t * pDst)
+{
+
+#if defined(ARM_MATH_DSP) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+ const float32_t *pIn1; /* InputA pointer */
+ const float32_t *pIn2; /* InputB pointer */
+ float32_t *pOut = pDst; /* Output pointer */
+ const float32_t *px; /* Intermediate inputA pointer */
+ const float32_t *py; /* Intermediate inputB pointer */
+ const float32_t *pSrc1;
+ float32_t sum;
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+ uint32_t outBlockSize; /* Loop counter */
+ int32_t inc = 1; /* Destination address modifier */
+
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+ float32_t acc0, acc1, acc2, acc3,c0; /* Accumulators */
+#if !defined(ARM_MATH_NEON)
+ float32_t x0, x1, x2, x3; /* temporary variables for holding input and coefficient values */
+#endif
+#endif
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we assume zeroes in the output instead of zero padding either of the inputs */
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes have to be included in the starting of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcALen - srcBLen) zeroes have to be included in the ending of the output buffer */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2U * srcALen) - 1U;
+
+ /* When srcALen > srcBLen, zero padding has to be done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1U));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2U);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+ }
+
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1U;
+ blockSize2 = srcALen - (srcBLen - 1U);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[srcBlen - 1]
+ * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
+ * ....
+ * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1U;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc1 = pIn2 + (srcBLen - 1U);
+ py = pSrc1;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while (blockSize1 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+#if defined(ARM_MATH_NEON)
+ float32x4_t x,y;
+ float32x4_t res = vdupq_n_f32(0) ;
+ float32x2_t accum = vdup_n_f32(0);
+
+ while (k > 0U)
+ {
+ x = vld1q_f32(px);
+ y = vld1q_f32(py);
+
+ res = vmlaq_f32(res,x, y);
+
+ px += 4;
+ py += 4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
+ sum += accum[0] + accum[1];
+
+ k = count & 0x3;
+#else
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* x[0] * y[srcBLen - 4] */
+ sum += *px++ * *py++;
+
+ /* x[1] * y[srcBLen - 3] */
+ sum += *px++ * *py++;
+
+ /* x[2] * y[srcBLen - 2] */
+ sum += *px++ * *py++;
+
+ /* x[3] * y[srcBLen - 1] */
+ sum += *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#endif /* #if defined(ARM_MATH_NEON) */
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* x[0] * y[srcBLen - 1] */
+ sum += *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pSrc1 - count;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize2 >> 2U;
+
+#if defined(ARM_MATH_NEON)
+ float32x4_t c;
+ float32x4_t x1v;
+ float32x4_t x2v;
+ float32x4_t x;
+ float32x4_t res = vdupq_n_f32(0) ;
+#endif /* #if defined(ARM_MATH_NEON) */
+
+ while (blkCnt > 0U)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0.0f;
+ acc1 = 0.0f;
+ acc2 = 0.0f;
+ acc3 = 0.0f;
+
+#if defined(ARM_MATH_NEON)
+ /* Compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ res = vdupq_n_f32(0) ;
+
+ x1v = vld1q_f32(px);
+ px += 4;
+ do
+ {
+ x2v = vld1q_f32(px);
+ c = vld1q_f32(py);
+
+ py += 4;
+
+ x = x1v;
+ res = vmlaq_n_f32(res,x,c[0]);
+
+ x = vextq_f32(x1v,x2v,1);
+
+ res = vmlaq_n_f32(res,x,c[1]);
+
+ x = vextq_f32(x1v,x2v,2);
+
+ res = vmlaq_n_f32(res,x,c[2]);
+
+ x = vextq_f32(x1v,x2v,3);
+
+ res = vmlaq_n_f32(res,x,c[3]);
+
+ x1v = x2v;
+ px+=4;
+ } while (--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen & 0x3;
+
+ while (k > 0U)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *(py++);
+
+ res = vmlaq_n_f32(res,x1v,c0);
+
+ /* Reuse the present samples for the next MAC */
+ x1v[0] = x1v[1];
+ x1v[1] = x1v[2];
+ x1v[2] = x1v[3];
+
+ x1v[3] = *(px++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ px-=1;
+
+ acc0 = res[0];
+ acc1 = res[1];
+ acc2 = res[2];
+ acc3 = res[3];
+#else
+ /* read x[0], x[1], x[2] samples */
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[0] sample */
+ c0 = *(py++);
+ /* Read x[3] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[0] * y[0] */
+ acc0 += x0 * c0;
+ /* acc1 += x[1] * y[0] */
+ acc1 += x1 * c0;
+ /* acc2 += x[2] * y[0] */
+ acc2 += x2 * c0;
+ /* acc3 += x[3] * y[0] */
+ acc3 += x3 * c0;
+
+ /* Read y[1] sample */
+ c0 = *(py++);
+ /* Read x[4] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[1] * y[1] */
+ acc0 += x1 * c0;
+ /* acc1 += x[2] * y[1] */
+ acc1 += x2 * c0;
+ /* acc2 += x[3] * y[1] */
+ acc2 += x3 * c0;
+ /* acc3 += x[4] * y[1] */
+ acc3 += x0 * c0;
+
+ /* Read y[2] sample */
+ c0 = *(py++);
+ /* Read x[5] sample */
+ x1 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[2] * y[2] */
+ acc0 += x2 * c0;
+ /* acc1 += x[3] * y[2] */
+ acc1 += x3 * c0;
+ /* acc2 += x[4] * y[2] */
+ acc2 += x0 * c0;
+ /* acc3 += x[5] * y[2] */
+ acc3 += x1 * c0;
+
+ /* Read y[3] sample */
+ c0 = *(py++);
+ /* Read x[6] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[3] * y[3] */
+ acc0 += x3 * c0;
+ /* acc1 += x[4] * y[3] */
+ acc1 += x0 * c0;
+ /* acc2 += x[5] * y[3] */
+ acc2 += x1 * c0;
+ /* acc3 += x[6] * y[3] */
+ acc3 += x2 * c0;
+
+ } while (--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Read y[4] sample */
+ c0 = *(py++);
+ /* Read x[7] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[4] * y[4] */
+ acc0 += x0 * c0;
+ /* acc1 += x[5] * y[4] */
+ acc1 += x1 * c0;
+ /* acc2 += x[6] * y[4] */
+ acc2 += x2 * c0;
+ /* acc3 += x[7] * y[4] */
+ acc3 += x3 * c0;
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+#endif /* #if defined(ARM_MATH_NEON) */
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = acc0;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ *pOut = acc1;
+ pOut += inc;
+
+ *pOut = acc2;
+ pOut += inc;
+
+ *pOut = acc3;
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize2 % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = srcBLen >> 2U;
+
+#if defined(ARM_MATH_NEON)
+ float32x4_t x,y;
+ float32x4_t res = vdupq_n_f32(0) ;
+ float32x2_t accum = vdup_n_f32(0);
+
+ while (k > 0U)
+ {
+ x = vld1q_f32(px);
+ y = vld1q_f32(py);
+
+ res = vmlaq_f32(res,x, y);
+
+ px += 4;
+ py += 4;
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
+ sum += accum[0] + accum[1];
+#else
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py++;
+ sum += *px++ * *py++;
+ sum += *px++ * *py++;
+ sum += *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+#endif /* #if defined(ARM_MATH_NEON) */
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum;
+
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+ /* Loop over srcBLen */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * ....
+ * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+ * sum += x[srcALen-1] * y[0]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1U;
+
+ /* Working pointer of inputA */
+ pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while (blockSize3 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+#if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+#if defined(ARM_MATH_NEON)
+ float32x4_t x,y;
+ float32x4_t res = vdupq_n_f32(0) ;
+ float32x2_t accum = vdup_n_f32(0);
+
+ while (k > 0U)
+ {
+ x = vld1q_f32(px);
+ y = vld1q_f32(py);
+
+ res = vmlaq_f32(res,x, y);
+
+ px += 4;
+ py += 4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ accum = vpadd_f32(vget_low_f32(res), vget_high_f32(res));
+ sum += accum[0] + accum[1];
+#else
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* sum += x[srcALen - srcBLen + 4] * y[3] */
+ sum += *px++ * *py++;
+
+ /* sum += x[srcALen - srcBLen + 3] * y[2] */
+ sum += *px++ * *py++;
+
+ /* sum += x[srcALen - srcBLen + 2] * y[1] */
+ sum += *px++ * *py++;
+
+ /* sum += x[srcALen - srcBLen + 1] * y[0] */
+ sum += *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+#endif /* #if defined (ARM_MATH_NEON) */
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) || defined(ARM_MATH_NEON) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+#else
+/* alternate version for CM0_FAMILY */
+
+ const float32_t *pIn1 = pSrcA; /* inputA pointer */
+ const float32_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
+ float32_t sum; /* Accumulator */
+ uint32_t i = 0U, j; /* Loop counters */
+ uint32_t inv = 0U; /* Reverse order flag */
+ uint32_t tot = 0U; /* Length */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and a variable, inv, is set to 1 */
+ /* If lengths are not equal then zero pad has to be done to make the two
+ * inputs of same length. But to improve the performance, we assume zeroes
+ * in the output instead of zero padding either of the inputs */
+ /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes have to be included in the
+ * starting of the output buffer */
+ /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes have to be included in the
+ * ending of the output buffer */
+ /* Once the zero padding is done the remainder of the output is calculated
+ * using convolution but with the shorter signal time shifted. */
+
+ /* Calculate the length of the remaining sequence */
+ tot = ((srcALen + srcBLen) - 2U);
+
+ if (srcALen > srcBLen)
+ {
+ /* Calculating the number of zeros to be padded to the output */
+ j = srcALen - srcBLen;
+
+ /* Initialise the pointer after zero padding */
+ pDst += j;
+ }
+
+ else if (srcALen < srcBLen)
+ {
+ /* Initialization to inputB pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization to the end of inputA pointer */
+ pIn2 = pSrcA + (srcALen - 1U);
+
+ /* Initialisation of the pointer after zero padding */
+ pDst = pDst + tot;
+
+ /* Swapping the lengths */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+
+ /* Setting the reverse flag */
+ inv = 1;
+
+ }
+
+ /* Loop to calculate convolution for output length number of times */
+ for (i = 0U; i <= tot; i++)
+ {
+ /* Initialize sum with zero to carry out MAC operations */
+ sum = 0.0f;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0U; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if ((((i - j) < srcBLen) && (j < srcALen)))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)];
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ if (inv == 1)
+ *pDst-- = sum;
+ else
+ *pDst++ = sum;
+ }
+
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of Corr group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c
new file mode 100644
index 0000000000..0b37b18741
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c
@@ -0,0 +1,345 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_correlate_fast_opt_q15.c
+ * Description: Fast Q15 Correlation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Corr
+ @{
+ */
+
+/**
+ @brief Correlation of Q15 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence.
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @param[in] pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This fast version uses a 32-bit accumulator with 2.30 format.
+ The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ There is no saturation on intermediate additions.
+ Thus, if the accumulator overflows it wraps around and distorts the result.
+ The input signals should be scaled down to avoid intermediate overflows.
+ Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a
+ maximum of min(srcALen, srcBLen) number of additions is carried internally.
+ The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
+
+ @remark
+ Refer to \ref arm_correlate_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
+ */
+
+void arm_correlate_fast_opt_q15(
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ q15_t * pScratch)
+{
+ const q15_t *pIn1; /* InputA pointer (longer sequence) */
+ const q15_t *pIn2; /* InputB pointer (shorter sequence) */
+ q31_t acc0; /* Accumulator (2.30 format, single guard bit) */
+ q15_t *pOut = pDst; /* Output pointer */
+ q15_t *pScr1 = pScratch; /* Temporary pointer for scratch */
+ const q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, blkCnt, outBlockSize; /* Loop counter */
+ int32_t inc = 1; /* Destination address modifier (+1 forward, -1 reversed) */
+ uint32_t tapCnt; /* Loop count */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc1, acc2, acc3; /* Accumulators */
+ q31_t x1, x2, x3; /* Temporary variables for holding input and coefficient values */
+ q31_t y1, y2; /* State variables */
+#endif
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we include zeroes in the output instead of zero padding either of the inputs */
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2U * srcALen) - 1U;
+
+ /* When srcALen > srcBLen, zero padding is done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1U));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2U);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+ }
+
+ /* Scratch buffer layout: (srcBLen - 1) zeros, then the srcALen input
+ * samples, then (srcBLen - 1) zeros. The sliding dot-product below can
+ * therefore run over the full output length without reading outside the
+ * buffer. */
+ pScr1 = pScratch;
+
+ /* Fill (srcBLen - 1U) zeros in scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1U);
+
+
+ /* Copy (srcALen) samples in scratch buffer */
+ arm_copy_q15(pIn1, pScr1, srcALen);
+
+ /* Update pointers */
+ pScr1 += srcALen;
+
+
+ /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+ /* Update pointer */
+ pScr1 += (srcBLen - 1U);
+
+ /* Save the start of the shorter sequence so pIn2 can be rewound after
+ * each output (or group of outputs) is produced */
+ py = pIn2;
+
+
+ /* Actual correlation process starts here */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = (srcALen + srcBLen - 1U) >> 2;
+
+ while (blkCnt > 0)
+ {
+ /* Initialize temporary scratch pointer as scratch1 */
+ pScr1 = pScratch;
+
+ /* Clear accumulators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* Read next two samples from scratch buffer */
+ x2 = read_q15x2_ia (&pScr1);
+
+ tapCnt = (srcBLen) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read four samples from smaller buffer */
+ y1 = read_q15x2_ia ((q15_t **) &pIn2);
+ y2 = read_q15x2_ia ((q15_t **) &pIn2);
+
+ /* multiply and accumulate */
+ acc0 = __SMLAD(x1, y1, acc0);
+ acc2 = __SMLAD(x2, y1, acc2);
+
+ /* pack input data: odd-offset sample pair for the odd accumulators */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ /* multiply and accumulate (cross-order variant for the packed pair) */
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ /* Read next two samples from scratch buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* multiply and accumulate */
+ acc0 = __SMLAD(x2, y2, acc0);
+ acc2 = __SMLAD(x1, y2, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+ acc1 = __SMLADX(x3, y2, acc1);
+
+ x2 = read_q15x2_ia (&pScr1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y2, acc3);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Rewind scratch pointer: the pre-loads above consumed 4 samples that
+ * the remainder loop must see again */
+ pScr1 -= 4U;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3U;
+
+ while (tapCnt > 0U)
+ {
+ /* accumulate the results: each accumulator reads a consecutive
+ * scratch sample against the same inputB tap */
+ acc0 += (*pScr1++ * *pIn2);
+ acc1 += (*pScr1++ * *pIn2);
+ acc2 += (*pScr1++ * *pIn2);
+ acc3 += (*pScr1++ * *pIn2++);
+
+ pScr1 -= 3U;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the results in the accumulators in the destination buffer.
+ * Convert from 2.30 to 1.15 with saturation. */
+ *pOut = (__SSAT(acc0 >> 15U, 16));
+ pOut += inc;
+ *pOut = (__SSAT(acc1 >> 15U, 16));
+ pOut += inc;
+ *pOut = (__SSAT(acc2 >> 15U, 16));
+ pOut += inc;
+ *pOut = (__SSAT(acc3 >> 15U, 16));
+ pOut += inc;
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ /* Slide the scratch window by the 4 outputs just computed */
+ pScratch += 4U;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = (srcALen + srcBLen - 1U) & 0x3;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (srcALen + srcBLen - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Calculate correlation for remaining samples of Bigger length sequence */
+ while (blkCnt > 0)
+ {
+ /* Initialize temporary scratch pointer as scratch1 */
+ pScr1 = pScratch;
+
+ /* Clear accumulator */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1U;
+
+ while (tapCnt > 0U)
+ {
+
+ /* Read next two samples from scratch buffer */
+ acc0 += (*pScr1++ * *pIn2++);
+ acc0 += (*pScr1++ * *pIn2++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1U;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while (tapCnt > 0U)
+ {
+
+ /* accumulate the results */
+ acc0 += (*pScr1++ * *pIn2++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ ** Then store the output in the destination buffer. */
+ *pOut = (q15_t) (__SSAT((acc0 >> 15), 16));
+ pOut += inc;
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ /* Slide the scratch window by one sample */
+ pScratch += 1U;
+ }
+
+}
+
+/**
+ @} end of Corr group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q15.c
new file mode 100644
index 0000000000..9de7da679f
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q15.c
@@ -0,0 +1,614 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_correlate_fast_q15.c
+ * Description: Fast Q15 Correlation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Corr
+ @{
+ */
+
+/**
+ @brief Correlation of Q15 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This fast version uses a 32-bit accumulator with 2.30 format.
+ The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ There is no saturation on intermediate additions.
+ Thus, if the accumulator overflows it wraps around and distorts the result.
+ The input signals should be scaled down to avoid intermediate overflows.
+ Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a
+ maximum of min(srcALen, srcBLen) number of additions is carried internally.
+ The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
+
+ @remark
+ Refer to \ref arm_correlate_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
+ */
+
+void arm_correlate_fast_q15(
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst)
+{
+ const q15_t *pIn1; /* InputA pointer (longer sequence) */
+ const q15_t *pIn2; /* InputB pointer (shorter sequence) */
+ q15_t *pOut = pDst; /* Output pointer */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators (2.30 format, wrap on overflow) */
+ const q15_t *px; /* Intermediate inputA pointer */
+ const q15_t *py; /* Intermediate inputB pointer */
+ const q15_t *pSrc1; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary variables for holding input and coefficient values */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+ uint32_t outBlockSize;
+ int32_t inc = 1; /* Destination address modifier (+1 forward, -1 reversed) */
+
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we include zeroes in the output instead of zero padding either of the inputs */
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcBLen - srcALen) zeroes have to be included at the end of the output buffer */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2U * srcALen) - 1U;
+
+ /* When srcALen > srcBLen, zero padding is done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1U));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2U);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+
+ }
+
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1U;
+ blockSize2 = srcALen - (srcBLen - 1U);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[srcBlen - 1]
+ * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
+ * ....
+ * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1U;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB: start at the last tap and slide backwards */
+ pSrc1 = pIn2 + (srcBLen - 1U);
+ py = pSrc1;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first loop starts here */
+ while (blockSize1 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
+ sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
+ /* x[2] * y[srcBLen - 2] , x[3] * y[srcBLen - 1] */
+ sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ No loop unrolling is used. */
+ k = count % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ /* x[0] * y[srcBLen - 1] */
+ sum = __SMLAD(*px++, *py++, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer.
+ * 2.30 -> 1.15 by truncating shift (fast version: no saturation here) */
+ *pOut = (q15_t) (sum >> 15);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pSrc1 - count;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* count is the index by which the pointer pIn1 to be incremented */
+ count = 0U;
+
+ /* --------------------
+ * Stage2 process
+ * -------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1] samples */
+ x0 = read_q15x2 ((q15_t *) px);
+ /* read x[1], x[2] samples */
+ x1 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the first two inputB samples using SIMD:
+ * y[0] and y[1] */
+ c0 = read_q15x2_ia ((q15_t **) &py);
+
+ /* acc0 += x[0] * y[0] + x[1] * y[1] */
+ acc0 = __SMLAD(x0, c0, acc0);
+
+ /* acc1 += x[1] * y[0] + x[2] * y[1] */
+ acc1 = __SMLAD(x1, c0, acc1);
+
+ /* Read x[2], x[3] */
+ x2 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[3], x[4] */
+ x3 = read_q15x2 ((q15_t *) px + 1);
+
+ /* acc2 += x[2] * y[0] + x[3] * y[1] */
+ acc2 = __SMLAD(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[0] + x[4] * y[1] */
+ acc3 = __SMLAD(x3, c0, acc3);
+
+ /* Read y[2] and y[3] */
+ c0 = read_q15x2_ia ((q15_t **) &py);
+
+ /* acc0 += x[2] * y[2] + x[3] * y[3] */
+ acc0 = __SMLAD(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[2] + x[4] * y[3] */
+ acc1 = __SMLAD(x3, c0, acc1);
+
+ /* Read x[4], x[5] */
+ x0 = read_q15x2 ((q15_t *) px + 2);
+
+ /* Read x[5], x[6] */
+ x1 = read_q15x2 ((q15_t *) px + 3);
+ px += 4U;
+
+ /* acc2 += x[4] * y[2] + x[5] * y[3] */
+ acc2 = __SMLAD(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[2] + x[6] * y[3] */
+ acc3 = __SMLAD(x1, c0, acc3);
+
+ } while (--k);
+
+ /* For the next MAC operations, SIMD is not used
+ * So, the 16 bit pointer of inputB, py is updated */
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ if (k == 1U)
+ {
+ /* Read y[4] */
+ c0 = *py;
+
+ /* Keep only the single remaining tap in the packed word */
+#ifdef ARM_MATH_BIG_ENDIAN
+ c0 = c0 << 16U;
+#else
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
+ px++;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD (x0, c0, acc0);
+ acc1 = __SMLAD (x1, c0, acc1);
+ acc2 = __SMLADX(x1, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ if (k == 2U)
+ {
+ /* Read y[4], y[5] */
+ c0 = read_q15x2 ((q15_t *) py);
+
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[8], x[9] */
+ x2 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc1 = __SMLAD(x1, c0, acc1);
+ acc2 = __SMLAD(x3, c0, acc2);
+ acc3 = __SMLAD(x2, c0, acc3);
+ }
+
+ if (k == 3U)
+ {
+ /* Read y[4], y[5] */
+ c0 = read_q15x2_ia ((q15_t **) &py);
+
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[8], x[9] */
+ x2 = read_q15x2 ((q15_t *) px + 1);
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc1 = __SMLAD(x1, c0, acc1);
+ acc2 = __SMLAD(x3, c0, acc2);
+ acc3 = __SMLAD(x2, c0, acc3);
+
+ /* Read y[6] and keep only that tap in the packed word */
+ c0 = (*py);
+#ifdef ARM_MATH_BIG_ENDIAN
+ c0 = c0 << 16U;
+#else
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[9], x[10] */
+ x3 = read_q15x2 ((q15_t *) px + 2);
+ px += 3U;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x1, c0, acc0);
+ acc1 = __SMLAD (x2, c0, acc1);
+ acc2 = __SMLADX(x2, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ /* Store the result in the accumulator in the destination buffer.
+ * 2.30 -> 1.15 by truncating shift (fast version: no saturation here) */
+ *pOut = (q15_t) (acc0 >> 15);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ *pOut = (q15_t) (acc1 >> 15);
+ pOut += inc;
+
+ *pOut = (q15_t) (acc2 >> 15);
+ pOut += inc;
+
+ *pOut = (q15_t) (acc3 >> 15);
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4U;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) *px++ * *py++);
+ sum += ((q31_t) *px++ * *py++);
+ sum += ((q31_t) *px++ * *py++);
+ sum += ((q31_t) *px++ * *py++);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py++);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (sum >> 15);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q31_t) *px++ * *py++);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (sum >> 15);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * ....
+ * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+ * sum += x[srcALen-1] * y[0]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1U;
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while (blockSize3 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen - srcBLen + 1] * y[0] , sum += x[srcALen - srcBLen + 2] * y[1] */
+ sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
+ /* sum += x[srcALen - srcBLen + 3] * y[2] , sum += x[srcALen - srcBLen + 4] * y[3] */
+ sum = __SMLAD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(*px++, *py++, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (sum >> 15);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+}
+
+/**
+ @} end of Corr group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q31.c
new file mode 100644
index 0000000000..e3d2bbf8c3
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_fast_q31.c
@@ -0,0 +1,601 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_correlate_fast_q31.c
+ * Description: Fast Q31 Correlation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Corr
+ @{
+ */
+
+/**
+ @brief Correlation of Q31 sequences (fast version).
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This function is optimized for speed at the expense of fixed-point precision and overflow protection.
+ The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
+ These intermediate results are accumulated in a 32-bit register in 2.30 format.
+ Finally, the accumulator is saturated and converted to a 1.31 result.
+ @par
+ The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
+                   In order to avoid intermediate overflows completely, the input signals must be scaled down.
+                   Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflows since a
+                   maximum of min(srcALen, srcBLen) number of additions is carried out internally.
+
+ @remark
+ Refer to \ref arm_correlate_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
+ */
+
+void arm_correlate_fast_q31(
+  const q31_t * pSrcA,
+        uint32_t srcALen,
+  const q31_t * pSrcB,
+        uint32_t srcBLen,
+        q31_t * pDst)
+{
+  const q31_t *pIn1;                                   /* InputA pointer */
+  const q31_t *pIn2;                                   /* InputB pointer */
+        q31_t *pOut = pDst;                            /* Output pointer */
+  const q31_t *px;                                     /* Intermediate inputA pointer */
+  const q31_t *py;                                     /* Intermediate inputB pointer */
+  const q31_t *pSrc1;                                  /* Intermediate pointers */
+        q31_t sum, acc0, acc1, acc2, acc3;             /* Accumulators */
+        q31_t x0, x1, x2, x3, c0;                      /* Temporary variables for holding input and coefficient values */
+        uint32_t blockSize1, blockSize2, blockSize3;   /* Loop counters */
+        uint32_t j, k, count, blkCnt;                  /* Loop counters */
+        uint32_t outBlockSize;
+        int32_t inc = 1;                               /* Destination address modifier */
+
+  /* The algorithm implementation is based on the lengths of the inputs. */
+  /* srcB is always made to slide across srcA. */
+  /* So srcBLen is always considered as shorter or equal to srcALen */
+  if (srcALen >= srcBLen)
+  {
+    /* Initialization of inputA pointer */
+    pIn1 = pSrcA;
+
+    /* Initialization of inputB pointer */
+    pIn2 = pSrcB;
+
+    /* Number of output samples is calculated */
+    outBlockSize = (2U * srcALen) - 1U;
+
+    /* When srcALen > srcBLen, zero padding is done to srcB
+     * to make their lengths equal.
+     * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+     * number of output samples are made zero */
+    j = outBlockSize - (srcALen + (srcBLen - 1U));
+
+    /* Updating the pointer position to non zero value */
+    pOut += j;
+
+  }
+  else
+  {
+    /* Initialization of inputA pointer */
+    pIn1 = pSrcB;
+
+    /* Initialization of inputB pointer */
+    pIn2 = pSrcA;
+
+    /* srcBLen is always considered as shorter or equal to srcALen */
+    j = srcBLen;
+    srcBLen = srcALen;
+    srcALen = j;
+
+    /* CORR(x, y) = Reverse order(CORR(y, x)) */
+    /* Hence set the destination pointer to point to the last output sample */
+    pOut = pDst + ((srcALen + srcBLen) - 2U);
+
+    /* Destination address modifier is set to -1 */
+    inc = -1;
+
+  }
+
+  /* The function is internally
+   * divided into three stages according to the number of multiplications that have to
+   * take place between inputA samples and inputB samples. In the first stage of the
+   * algorithm, the number of multiplications increases by one for every iteration.
+   * In the second stage of the algorithm, srcBLen number of multiplications are done.
+   * In the third stage of the algorithm, the number of multiplications decreases by one
+   * for every iteration. */
+
+  /* The algorithm is implemented in three stages.
+     The loop counters of each stage is initiated here. */
+  blockSize1 = srcBLen - 1U;
+  blockSize2 = srcALen - (srcBLen - 1U);
+  blockSize3 = blockSize1;
+
+  /* --------------------------
+   * Initializations of stage1
+   * -------------------------*/
+
+  /* sum = x[0] * y[srcBLen - 1]
+   * sum = x[0] * y[srcBLen - 2] + x[1] * y[srcBLen - 1]
+   * ....
+   * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+   */
+
+  /* In this stage the MAC operations are increased by 1 for every iteration.
+     The count variable holds the number of MAC operations performed */
+  count = 1U;
+
+  /* Working pointer of inputA */
+  px = pIn1;
+
+  /* Working pointer of inputB */
+  pSrc1 = pIn2 + (srcBLen - 1U);
+  py = pSrc1;
+
+  /* ------------------------
+   * Stage1 process
+   * ----------------------*/
+
+  /* The first stage starts here */
+  while (blockSize1 > 0U)
+  {
+    /* Accumulator is made zero for every iteration */
+    sum = 0;
+
+    /* Apply loop unrolling and compute 4 MACs simultaneously. */
+    k = count >> 2U;
+
+    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
+     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+    while (k > 0U)
+    {
+      /* x[0] * y[srcBLen - 4] */
+      sum = (q31_t) ((((q63_t) sum << 32) +
+                      ((q63_t) *px++ * (*py++))) >> 32);
+
+      /* x[1] * y[srcBLen - 3] */
+      sum = (q31_t) ((((q63_t) sum << 32) +
+                      ((q63_t) *px++ * (*py++))) >> 32);
+
+      /* x[2] * y[srcBLen - 2] */
+      sum = (q31_t) ((((q63_t) sum << 32) +
+                      ((q63_t) *px++ * (*py++))) >> 32);
+
+      /* x[3] * y[srcBLen - 1] */
+      sum = (q31_t) ((((q63_t) sum << 32) +
+                      ((q63_t) *px++ * (*py++))) >> 32);
+
+      /* Decrement loop counter */
+      k--;
+    }
+
+    /* If the count is not a multiple of 4, compute any remaining MACs here.
+     ** No loop unrolling is used. */
+    k = count % 0x4U;
+
+    while (k > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      /* x[0] * y[srcBLen - 1] */
+      sum = (q31_t) ((((q63_t) sum << 32) +
+                      ((q63_t) *px++ * (*py++))) >> 32);
+
+      /* Decrement loop counter */
+      k--;
+    }
+
+    /* The result is in 2.30 format. Left shift by 1 to convert to 1.31 and store in the destination buffer. */
+    *pOut = sum << 1;
+    /* Destination pointer is updated according to the address modifier, inc */
+    pOut += inc;
+
+    /* Update the inputA and inputB pointers for next MAC calculation */
+    py = pSrc1 - count;
+    px = pIn1;
+
+    /* Increment MAC count */
+    count++;
+
+    /* Decrement loop counter */
+    blockSize1--;
+  }
+
+  /* --------------------------
+   * Initializations of stage2
+   * ------------------------*/
+
+  /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+   * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+   * ....
+   * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+   */
+
+  /* Working pointer of inputA */
+  px = pIn1;
+
+  /* Working pointer of inputB */
+  py = pIn2;
+
+  /* count is index by which the pointer pIn1 to be incremented */
+  count = 0U;
+
+  /* -------------------
+   * Stage2 process
+   * ------------------*/
+
+  /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+   * So, to loop unroll over blockSize2,
+   * srcBLen should be greater than or equal to 4 */
+  if (srcBLen >= 4U)
+  {
+    /* Loop unroll over blockSize2, by 4 */
+    blkCnt = blockSize2 >> 2U;
+
+    while (blkCnt > 0U)
+    {
+      /* Set all accumulators to zero */
+      acc0 = 0;
+      acc1 = 0;
+      acc2 = 0;
+      acc3 = 0;
+
+      /* read x[0], x[1], x[2] samples */
+      x0 = *px++;
+      x1 = *px++;
+      x2 = *px++;
+
+      /* Apply loop unrolling and compute 4 MACs simultaneously. */
+      k = srcBLen >> 2U;
+
+      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
+       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+      do
+      {
+        /* Read y[0] sample */
+        c0 = *py++;
+        /* Read x[3] sample */
+        x3 = *px++;
+
+        /* Perform the multiply-accumulate */
+        /* acc0 +=  x[0] * y[0] */
+        acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+        /* acc1 +=  x[1] * y[0] */
+        acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+        /* acc2 +=  x[2] * y[0] */
+        acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+        /* acc3 +=  x[3] * y[0] */
+        acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+
+        /* Read y[1] sample */
+        c0 = *py++;
+        /* Read x[4] sample */
+        x0 = *px++;
+
+        /* Perform the multiply-accumulate */
+        /* acc0 +=  x[1] * y[1] */
+        acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
+        /* acc1 +=  x[2] * y[1] */
+        acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
+        /* acc2 +=  x[3] * y[1] */
+        acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
+        /* acc3 +=  x[4] * y[1] */
+        acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+
+        /* Read y[2] sample */
+        c0 = *py++;
+        /* Read x[5] sample */
+        x1 = *px++;
+
+        /* Perform the multiply-accumulates */
+        /* acc0 +=  x[2] * y[2] */
+        acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
+        /* acc1 +=  x[3] * y[2] */
+        acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
+        /* acc2 +=  x[4] * y[2] */
+        acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
+        /* acc3 +=  x[5] * y[2] */
+        acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
+
+
+        /* Read y[3] sample */
+        c0 = *py++;
+        /* Read x[6] sample */
+        x2 = *px++;
+
+        /* Perform the multiply-accumulates */
+        /* acc0 +=  x[3] * y[3] */
+        acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
+        /* acc1 +=  x[4] * y[3] */
+        acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
+        /* acc2 +=  x[5] * y[3] */
+        acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
+        /* acc3 +=  x[6] * y[3] */
+        acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
+
+
+      } while (--k);
+
+      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+       ** No loop unrolling is used. */
+      k = srcBLen % 0x4U;
+
+      while (k > 0U)
+      {
+        /* Read y[4] sample */
+        c0 = *py++;
+        /* Read x[7] sample */
+        x3 = *px++;
+
+        /* Perform the multiply-accumulates */
+        /* acc0 +=  x[4] * y[4] */
+        acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+        /* acc1 +=  x[5] * y[4] */
+        acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+        /* acc2 +=  x[6] * y[4] */
+        acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+        /* acc3 +=  x[7] * y[4] */
+        acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+        /* Reuse the present samples for the next MAC */
+        x0 = x1;
+        x1 = x2;
+        x2 = x3;
+
+        /* Decrement loop counter */
+        k--;
+      }
+
+      /* The results are in 2.30 format. Left shift by 1 to convert to 1.31 and store in the destination buffer. */
+      *pOut = (q31_t) (acc0 << 1);
+      /* Destination pointer is updated according to the address modifier, inc */
+      pOut += inc;
+
+      *pOut = (q31_t) (acc1 << 1);
+      pOut += inc;
+
+      *pOut = (q31_t) (acc2 << 1);
+      pOut += inc;
+
+      *pOut = (q31_t) (acc3 << 1);
+      pOut += inc;
+
+      /* Increment the pointer pIn1 index, count by 4 */
+      count += 4U;
+
+      /* Update the inputA and inputB pointers for next MAC calculation */
+      px = pIn1 + count;
+      py = pIn2;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+    /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+     ** No loop unrolling is used. */
+    blkCnt = blockSize2 % 0x4U;
+
+    while (blkCnt > 0U)
+    {
+      /* Accumulator is made zero for every iteration */
+      sum = 0;
+
+      /* Apply loop unrolling and compute 4 MACs simultaneously. */
+      k = srcBLen >> 2U;
+
+      /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
+       ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+      while (k > 0U)
+      {
+        /* Perform the multiply-accumulates */
+        sum = (q31_t) ((((q63_t) sum << 32) +
+                        ((q63_t) *px++ * (*py++))) >> 32);
+        sum = (q31_t) ((((q63_t) sum << 32) +
+                        ((q63_t) *px++ * (*py++))) >> 32);
+        sum = (q31_t) ((((q63_t) sum << 32) +
+                        ((q63_t) *px++ * (*py++))) >> 32);
+        sum = (q31_t) ((((q63_t) sum << 32) +
+                        ((q63_t) *px++ * (*py++))) >> 32);
+
+        /* Decrement loop counter */
+        k--;
+      }
+
+      /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+       ** No loop unrolling is used. */
+      k = srcBLen % 0x4U;
+
+      while (k > 0U)
+      {
+        /* Perform the multiply-accumulate */
+        sum = (q31_t) ((((q63_t) sum << 32) +
+                        ((q63_t) *px++ * (*py++))) >> 32);
+
+        /* Decrement loop counter */
+        k--;
+      }
+
+      /* The result is in 2.30 format. Left shift by 1 to convert to 1.31 and store in the destination buffer. */
+      *pOut = sum << 1;
+      /* Destination pointer is updated according to the address modifier, inc */
+      pOut += inc;
+
+      /* Increment MAC count */
+      count++;
+
+      /* Update the inputA and inputB pointers for next MAC calculation */
+      px = pIn1 + count;
+      py = pIn2;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+  else
+  {
+    /* If the srcBLen is not a multiple of 4,
+     * the blockSize2 loop cannot be unrolled by 4 */
+    blkCnt = blockSize2;
+
+    while (blkCnt > 0U)
+    {
+      /* Accumulator is made zero for every iteration */
+      sum = 0;
+
+      /* srcBLen number of MACS should be performed */
+      k = srcBLen;
+
+      while (k > 0U)
+      {
+        /* Perform the multiply-accumulate */
+        sum = (q31_t) ((((q63_t) sum << 32) +
+                        ((q63_t) *px++ * (*py++))) >> 32);
+
+        /* Decrement loop counter */
+        k--;
+      }
+
+      /* The result is in 2.30 format. Left shift by 1 to convert to 1.31 and store in the destination buffer. */
+      *pOut = sum << 1;
+      /* Destination pointer is updated according to the address modifier, inc */
+      pOut += inc;
+
+      /* Increment MAC count */
+      count++;
+
+      /* Update the inputA and inputB pointers for next MAC calculation */
+      px = pIn1 + count;
+      py = pIn2;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+  }
+
+
+  /* --------------------------
+   * Initializations of stage3
+   * -------------------------*/
+
+  /* sum = x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+   * sum = x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+   * ....
+   * sum = x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+   * sum = x[srcALen-1] * y[0]
+   */
+
+  /* In this stage the MAC operations are decreased by 1 for every iteration.
+     The count variable holds the number of MAC operations performed */
+  count = srcBLen - 1U;
+
+  /* Working pointer of inputA */
+  pSrc1 = ((pIn1 + srcALen) - srcBLen) + 1U;
+  px = pSrc1;
+
+  /* Working pointer of inputB */
+  py = pIn2;
+
+  /* -------------------
+   * Stage3 process
+   * ------------------*/
+
+  while (blockSize3 > 0U)
+  {
+    /* Accumulator is made zero for every iteration */
+    sum = 0;
+
+    /* Apply loop unrolling and compute 4 MACs simultaneously. */
+    k = count >> 2U;
+
+    /* First part of the processing with loop unrolling.  Compute 4 MACs at a time.
+     ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+    while (k > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      /* sum += x[srcALen - srcBLen + 4] * y[3] */
+      sum = (q31_t) ((((q63_t) sum << 32) +
+                      ((q63_t) *px++ * (*py++))) >> 32);
+
+      /* sum += x[srcALen - srcBLen + 3] * y[2] */
+      sum = (q31_t) ((((q63_t) sum << 32) +
+                      ((q63_t) *px++ * (*py++))) >> 32);
+
+      /* sum += x[srcALen - srcBLen + 2] * y[1] */
+      sum = (q31_t) ((((q63_t) sum << 32) +
+                      ((q63_t) *px++ * (*py++))) >> 32);
+
+      /* sum += x[srcALen - srcBLen + 1] * y[0] */
+      sum = (q31_t) ((((q63_t) sum << 32) +
+                      ((q63_t) *px++ * (*py++))) >> 32);
+
+      /* Decrement loop counter */
+      k--;
+    }
+
+    /* If the count is not a multiple of 4, compute any remaining MACs here.
+     ** No loop unrolling is used. */
+    k = count % 0x4U;
+
+    while (k > 0U)
+    {
+      /* Perform the multiply-accumulate */
+      sum = (q31_t) ((((q63_t) sum << 32) +
+                      ((q63_t) *px++ * (*py++))) >> 32);
+
+      /* Decrement loop counter */
+      k--;
+    }
+
+    /* The result is in 2.30 format. Left shift by 1 to convert to 1.31 and store in the destination buffer. */
+    *pOut = sum << 1;
+    /* Destination pointer is updated according to the address modifier, inc */
+    pOut += inc;
+
+    /* Update the inputA and inputB pointers for next MAC calculation */
+    px = ++pSrc1;
+    py = pIn2;
+
+    /* Decrement MAC count */
+    count--;
+
+    /* Decrement loop counter */
+    blockSize3--;
+  }
+
+}
+
+/**
+ @} end of Corr group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q15.c
new file mode 100644
index 0000000000..00bb08897a
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q15.c
@@ -0,0 +1,341 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_correlate_opt_q15.c
+ * Description: Correlation of Q15 sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Corr
+ @{
+ */
+
+/**
+ @brief Correlation of Q15 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @param[in] pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both inputs are in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ This approach provides 33 guard bits and there is no risk of overflow.
+ The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
+
+ @remark
+ Refer to \ref arm_correlate_fast_q15() for a faster but less precise version of this function.
+ */
+
+void arm_correlate_opt_q15(
+  const q15_t * pSrcA,
+        uint32_t srcALen,
+  const q15_t * pSrcB,
+        uint32_t srcBLen,
+        q15_t * pDst,
+        q15_t * pScratch)
+{
+        q63_t acc0;                                    /* Accumulators */
+        q15_t *pOut = pDst;                            /* Output pointer */
+        q15_t *pScr1;                                  /* Temporary pointer for scratch1 */
+  const q15_t *pIn1;                                   /* InputA pointer */
+  const q15_t *pIn2;                                   /* InputB pointer */
+  const q15_t *py;                                     /* Intermediate inputB pointer */
+        uint32_t j, blkCnt, outBlockSize;              /* Loop counter */
+        int32_t inc = 1;                               /* Output pointer increment */
+        uint32_t tapCnt;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+        q63_t acc1, acc2, acc3;                        /* Accumulators */
+        q31_t x1, x2, x3;                              /* Temporary variables for holding input1 and input2 values */
+        q31_t y1, y2;                                  /* State variables */
+#endif
+
+  /* The algorithm implementation is based on the lengths of the inputs. */
+  /* srcB is always made to slide across srcA. */
+  /* So srcBLen is always considered as shorter or equal to srcALen */
+  /* But CORR(x, y) is reverse of CORR(y, x) */
+  /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+  /* and the destination pointer modifier, inc is set to -1 */
+  /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+  /* But to improve the performance,
+   * we include zeroes in the output instead of zero padding either of the the inputs*/
+  /* If srcALen > srcBLen,
+   * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
+  /* If srcALen < srcBLen,
+   * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
+  if (srcALen >= srcBLen)
+  {
+    /* Initialization of inputA pointer */
+    pIn1 = pSrcA;
+
+    /* Initialization of inputB pointer */
+    pIn2 = pSrcB;
+
+    /* Number of output samples is calculated */
+    outBlockSize = (srcALen * 2U) - 1U;
+
+    /* When srcALen > srcBLen, zero padding is done to srcB
+     * to make their lengths equal.
+     * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+     * number of output samples are made zero */
+    j = outBlockSize - (srcALen + (srcBLen - 1U));
+
+    /* Updating the pointer position to non zero value */
+    pOut += j;
+  }
+  else
+  {
+    /* Initialization of inputA pointer */
+    pIn1 = pSrcB;
+
+    /* Initialization of inputB pointer */
+    pIn2 = pSrcA;
+
+    /* srcBLen is always considered as shorter or equal to srcALen */
+    j = srcBLen;
+    srcBLen = srcALen;
+    srcALen = j;
+
+    /* CORR(x, y) = Reverse order(CORR(y, x)) */
+    /* Hence set the destination pointer to point to the last output sample */
+    pOut = pDst + ((srcALen + srcBLen) - 2U);
+
+    /* Destination address modifier is set to -1 */
+    inc = -1;
+  }
+
+  pScr1 = pScratch;
+
+  /* Fill (srcBLen - 1U) zeros in scratch buffer */
+  arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+  /* Update temporary scratch pointer */
+  pScr1 += (srcBLen - 1U);
+
+  /* Copy (srcALen) samples in scratch buffer */
+  arm_copy_q15(pIn1, pScr1, srcALen);
+
+  /* Update pointers */
+  pScr1 += srcALen;
+
+
+  /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
+  arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+  /* Update pointer */
+  pScr1 += (srcBLen - 1U);
+
+  /* Save the start of inputB; used to reset pIn2 after each output (or set of outputs) */
+  py = pIn2;
+
+
+  /* Actual correlation process starts here */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = (srcALen + srcBLen - 1U) >> 2;
+
+  while (blkCnt > 0)
+  {
+    /* Initialize temporary scratch pointer as scratch1 */
+    pScr1 = pScratch;
+
+    /* Clear accumulators */
+    acc0 = 0;
+    acc1 = 0;
+    acc2 = 0;
+    acc3 = 0;
+
+    /* Read two samples from scratch1 buffer */
+    x1 = read_q15x2_ia (&pScr1);
+
+    /* Read next two samples from scratch1 buffer */
+    x2 = read_q15x2_ia (&pScr1);
+
+    tapCnt = (srcBLen) >> 2U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read four samples from smaller buffer */
+      y1 = read_q15x2_ia ((q15_t **) &pIn2);
+      y2 = read_q15x2_ia ((q15_t **) &pIn2);
+
+      /* multiply and accumulate */
+      acc0 = __SMLALD(x1, y1, acc0);
+      acc2 = __SMLALD(x2, y1, acc2);
+
+      /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+      x3 = __PKHBT(x2, x1, 0);
+#else
+      x3 = __PKHBT(x1, x2, 0);
+#endif
+
+      /* multiply and accumulate */
+      acc1 = __SMLALDX(x3, y1, acc1);
+
+      /* Read next two samples from scratch1 buffer */
+      x1 = read_q15x2_ia (&pScr1);
+
+      /* multiply and accumulate */
+      acc0 = __SMLALD(x2, y2, acc0);
+      acc2 = __SMLALD(x1, y2, acc2);
+
+      /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+      x3 = __PKHBT(x1, x2, 0);
+#else
+      x3 = __PKHBT(x2, x1, 0);
+#endif
+
+      acc3 = __SMLALDX(x3, y1, acc3);
+      acc1 = __SMLALDX(x3, y2, acc1);
+
+      x2 = read_q15x2_ia (&pScr1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+      x3 = __PKHBT(x2, x1, 0);
+#else
+      x3 = __PKHBT(x1, x2, 0);
+#endif
+
+      acc3 = __SMLALDX(x3, y2, acc3);
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Update scratch pointer for remaining samples of smaller length sequence */
+    pScr1 -= 4U;
+
+    /* apply same above for remaining samples of smaller length sequence */
+    tapCnt = (srcBLen) & 3U;
+
+    while (tapCnt > 0U)
+    {
+      /* accumulate the results */
+      acc0 += (*pScr1++ * *pIn2);
+      acc1 += (*pScr1++ * *pIn2);
+      acc2 += (*pScr1++ * *pIn2);
+      acc3 += (*pScr1++ * *pIn2++);
+
+      pScr1 -= 3U;
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    blkCnt--;
+
+
+    /* Store the results in the accumulators in the destination buffer. */
+    *pOut = (__SSAT(acc0 >> 15U, 16));
+    pOut += inc;
+    *pOut = (__SSAT(acc1 >> 15U, 16));
+    pOut += inc;
+    *pOut = (__SSAT(acc2 >> 15U, 16));
+    pOut += inc;
+    *pOut = (__SSAT(acc3 >> 15U, 16));
+    pOut += inc;
+
+    /* Initialization of inputB pointer */
+    pIn2 = py;
+
+    pScratch += 4U;
+  }
+
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = (srcALen + srcBLen - 1U) & 0x3;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = (srcALen + srcBLen - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* Calculate correlation for remaining samples of Bigger length sequence */
+  while (blkCnt > 0)
+  {
+    /* Initialize temporary scratch pointer as scratch1 */
+    pScr1 = pScratch;
+
+    /* Clear accumulator */
+    acc0 = 0;
+
+    tapCnt = (srcBLen) >> 1U;
+
+    while (tapCnt > 0U)
+    {
+
+      /* Read next two samples from scratch1 buffer */
+      acc0 += (*pScr1++ * *pIn2++);
+      acc0 += (*pScr1++ * *pIn2++);
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    tapCnt = (srcBLen) & 1U;
+
+    /* apply same above for remaining samples of smaller length sequence */
+    while (tapCnt > 0U)
+    {
+      /* accumulate the results */
+      acc0 += (*pScr1++ * *pIn2++);
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    blkCnt--;
+
+    /* The result is in 34.30 format. Convert to 1.15 with saturation.
+       Then store the output in the destination buffer. */
+    *pOut = (q15_t) (__SSAT((acc0 >> 15), 16));
+    pOut += inc;
+
+    /* Initialization of inputB pointer */
+    pIn2 = py;
+
+    pScratch += 1U;
+  }
+
+}
+
+/**
+ @} end of Corr group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q7.c
new file mode 100644
index 0000000000..145ba2ae8f
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_opt_q7.c
@@ -0,0 +1,388 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_correlate_opt_q7.c
+ * Description: Correlation of Q7 sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Corr
+ @{
+ */
+
+/**
+ @brief Correlation of Q7 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @param[in] pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ @param[in] pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
+ The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
+ The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
+ */
+
+void arm_correlate_opt_q7(
+ const q7_t * pSrcA,
+ uint32_t srcALen,
+ const q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
+{
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch */
+ q15_t x4; /* Temporary input variable */
+ q15_t *py; /* Temporary input2 pointer */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ const q7_t *pIn1, *pIn2; /* InputA and inputB pointer */
+ uint32_t j, k, blkCnt, tapCnt; /* Loop counter */
+ int32_t inc = 1; /* Output pointer increment */
+ uint32_t outBlockSize; /* Loop counter */
+ q31_t x1, x2, x3, y1; /* Temporary input variables */
+ q7_t *pOut = pDst; /* Output pointer */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we include zeroes in the output instead of zero padding either of the inputs */
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcALen - srcBLen) zeroes have to be included at the end of the output buffer */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+
+ /* Number of output samples is calculated */
+ outBlockSize = (srcALen * 2U) - 1U;
+
+ /* When srcALen > srcBLen, zero padding is done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1U));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2U);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+ }
+
+
+ /* Copy (srcBLen) samples in scratch buffer */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ a second loop below copies for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* copy second buffer into the scratch2 buffer */
+ x4 = (q15_t) *pIn2++;
+ *pScr2++ = x4;
+ x4 = (q15_t) *pIn2++;
+ *pScr2++ = x4;
+ x4 = (q15_t) *pIn2++;
+ *pScr2++ = x4;
+ x4 = (q15_t) *pIn2++;
+ *pScr2++ = x4;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* copy remaining samples of second buffer into the scratch2 buffer */
+ x4 = (q15_t) *pIn2++;
+ *pScr2++ = x4;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Fill (srcBLen - 1U) zeros in scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1U);
+
+ /* Copy (srcALen) samples in scratch buffer */
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcALen >> 2U;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ a second loop below copies for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* copy first buffer into the scratch1 buffer */
+ x4 = (q15_t) *pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) *pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) *pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) *pIn1++;
+ *pScr1++ = x4;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ No loop unrolling is used. */
+ k = srcALen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* copy remaining samples of first buffer into the scratch1 buffer */
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Fill (srcBLen - 1U) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1U));
+
+ /* Update pointer */
+ pScr1 += (srcBLen - 1U);
+
+ /* Temporary pointer for scratch2 */
+ py = pScratch2;
+
+ /* Initialization of pScr2 pointer */
+ pScr2 = pScratch2;
+
+ /* Actual correlation process starts here */
+ blkCnt = (srcALen + srcBLen - 1U) >> 2;
+
+ while (blkCnt > 0)
+ {
+ /* Initialize temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumulators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* Read next two samples from scratch1 buffer */
+ x2 = read_q15x2_ia (&pScr1);
+
+ tapCnt = (srcBLen) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read four samples from smaller buffer */
+ y1 = read_q15x2_ia (&pScr2);
+
+ /* multiply and accumulate */
+ acc0 = __SMLAD(x1, y1, acc0);
+ acc2 = __SMLAD(x2, y1, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ /* multiply and accumulate */
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = read_q15x2_ia (&pScr1);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+
+ /* Read four samples from smaller buffer */
+ y1 = read_q15x2_ia (&pScr2);
+
+ acc0 = __SMLAD(x2, y1, acc0);
+
+ acc2 = __SMLAD(x1, y1, acc2);
+
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ x2 = read_q15x2_ia (&pScr1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr1 -= 4U;
+
+ /* Apply the same as above for the remaining samples of the smaller length sequence */
+ tapCnt = (srcBLen) & 3U;
+
+ while (tapCnt > 0U)
+ {
+ /* accumulate the results */
+ acc0 += (*pScr1++ * *pScr2);
+ acc1 += (*pScr1++ * *pScr2);
+ acc2 += (*pScr1++ * *pScr2);
+ acc3 += (*pScr1++ * *pScr2++);
+
+ pScr1 -= 3U;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8));
+ pOut += inc;
+ *pOut = (q7_t) (__SSAT(acc1 >> 7U, 8));
+ pOut += inc;
+ *pOut = (q7_t) (__SSAT(acc2 >> 7U, 8));
+ pOut += inc;
+ *pOut = (q7_t) (__SSAT(acc3 >> 7U, 8));
+ pOut += inc;
+
+ /* Initialization of inputB pointer */
+ pScr2 = py;
+
+ pScratch1 += 4U;
+ }
+
+ blkCnt = (srcALen + srcBLen - 1U) & 0x3;
+
+ /* Calculate correlation for remaining samples of Bigger length sequence */
+ while (blkCnt > 0)
+ {
+ /* Initialize temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumulators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1U;
+
+ while (tapCnt > 0U)
+ {
+ acc0 += (*pScr1++ * *pScr2++);
+ acc0 += (*pScr1++ * *pScr2++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1U;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while (tapCnt > 0U)
+ {
+ /* accumulate the results */
+ acc0 += (*pScr1++ * *pScr2++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q7_t) (__SSAT(acc0 >> 7U, 8));
+ pOut += inc;
+
+ /* Initialization of inputB pointer */
+ pScr2 = py;
+
+ pScratch1 += 1U;
+ }
+
+}
+
+/**
+ @} end of Corr group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q15.c
new file mode 100644
index 0000000000..2dd16d72cf
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q15.c
@@ -0,0 +1,903 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_correlate_q15.c
+ * Description: Correlation of Q15 sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Corr
+ @{
+ */
+
+/**
+ @brief Correlation of Q15 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both inputs are in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ This approach provides 33 guard bits and there is no risk of overflow.
+ The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
+
+ @remark
+ Refer to \ref arm_correlate_fast_q15() for a faster but less precise version of this function.
+ @remark
+ Refer to \ref arm_correlate_opt_q15() for a faster implementation of this function using scratch buffers.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include "arm_helium_utils.h"
+#include "arm_vec_filtering.h"
+
+void arm_correlate_q15(
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst)
+{
+ const q15_t *pIn1 = pSrcA; /* inputA pointer */
+ const q15_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
+ /*
+ * Loop to perform MAC operations according to correlation equation
+ */
+ const q15_t *pX;
+ const q15_t *pY;
+ const q15_t *pA;
+ const q15_t *pB;
+ int32_t i = 0U, j = 0; /* loop counters */
+ int32_t inv = 2U; /* Reverse order flag */
+ uint32_t tot = 0U; /* Length */
+ int32_t block1, block2, block3;
+ int32_t incr;
+
+ tot = ((srcALen + srcBLen) - 2U);
+ if (srcALen > srcBLen)
+ {
+ /*
+ * Calculating the number of zeros to be padded to the output
+ */
+ j = srcALen - srcBLen;
+ /*
+ * Initialize the pointer after zero padding
+ */
+ pDst += j;
+ }
+ else if (srcALen < srcBLen)
+ {
+ /*
+ * Initialization to inputB pointer
+ */
+ pIn1 = pSrcB;
+ /*
+ * Initialization to the end of inputA pointer
+ */
+ pIn2 = pSrcA + (srcALen - 1U);
+ /*
+ * Initialisation of the pointer after zero padding
+ */
+ pDst = pDst + tot;
+ /*
+ * Swapping the lengths
+ */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+ /*
+ * Setting the reverse flag
+ */
+ inv = -2;
+ }
+
+ block1 = srcBLen - 1;
+ block2 = srcALen - srcBLen + 1;
+ block3 = srcBLen - 1;
+
+ pA = pIn1;
+ pB = pIn2;
+ incr = inv / 2;
+
+ for (i = 0U; i <= block1 - 2; i += 2)
+ {
+ uint32_t count = i + 1;
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+
+ /*
+ * compute 2 accumulators per loop
+ * size is incrementing for second accumulator
+ * Y pointer is decrementing for second accumulator
+ */
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q15(acc0, acc1, pX, pY, count);
+
+ *pDst = (q15_t) acc0;
+ pDst += incr;
+ *pDst = (q15_t) acc1;
+ pDst += incr;
+ pB -= 2;
+ }
+ for (; i < block1; i++)
+ {
+ uint32_t count = i + 1;
+ int64_t acc = 0LL;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count);
+
+ *pDst = (q15_t) acc;
+ pDst += incr;
+ pB--;
+ }
+
+ for (i = 0U; i <= block2 - 4; i += 4)
+ {
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+ int64_t acc2 = 0LL;
+ int64_t acc3 = 0LL;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 4 accumulators per loop
+ * size is fixed for all accumulators
+ * X pointer is incrementing for successive accumulators
+ */
+ MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q15(acc0, acc1, acc2, acc3, pX, pY, srcBLen);
+
+ *pDst = (q15_t) acc0;
+ pDst += incr;
+ *pDst = (q15_t) acc1;
+ pDst += incr;
+ *pDst = (q15_t) acc2;
+ pDst += incr;
+ *pDst = (q15_t) acc3;
+ pDst += incr;
+ pA += 4;
+ }
+
+ for (; i <= block2 - 2; i += 2)
+ {
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 2 accumulators per loop
+ * size is fixed for all accumulators
+ * X pointer is incrementing for second accumulator
+ */
+ MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q15(acc0, acc1, pX, pY, srcBLen);
+
+ *pDst = (q15_t) acc0;
+ pDst += incr;
+ *pDst = (q15_t) acc1;
+ pDst += incr;
+ pA += 2;
+ }
+
+ if (block2 & 1)
+ {
+ int64_t acc = 0LL;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, srcBLen);
+
+ *pDst = (q15_t) acc;
+ pDst += incr;
+ pA++;
+ }
+
+ for (i = block3 - 1; i >= 0; i -= 2)
+ {
+
+ uint32_t count = (i + 1);
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 2 accumulators per loop
+ * size is decrementing for second accumulator
+ * X pointer is incrementing for second accumulator
+ */
+ MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q15(acc0, acc1, pX, pY, count);
+
+ *pDst = (q15_t) acc0;
+ pDst += incr;
+ *pDst = (q15_t) acc1;
+ pDst += incr;
+ pA += 2;
+
+ }
+ for (; i >= 0; i--)
+ {
+ uint32_t count = (i + 1);
+ int64_t acc = 0LL;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_SINGLE_Q15(acc, pX, pY, count);
+
+ *pDst = (q15_t) acc;
+ pDst += incr;
+ pA++;
+ }
+}
+
+#else
+void arm_correlate_q15(
+ const q15_t * pSrcA,
+ uint32_t srcALen,
+ const q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst)
+{
+
+#if defined (ARM_MATH_DSP)
+
+ const q15_t *pIn1; /* InputA pointer */
+ const q15_t *pIn2; /* InputB pointer */
+ q15_t *pOut = pDst; /* Output pointer */
+ q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ const q15_t *px; /* Intermediate inputA pointer */
+ const q15_t *py; /* Intermediate inputB pointer */
+ const q15_t *pSrc1; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary input variables for holding input and coefficient values */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+ uint32_t outBlockSize;
+ int32_t inc = 1; /* Destination address modifier */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we include zeroes in the output instead of zero padding either of the inputs */
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes have to be included at the start of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcALen - srcBLen) zeroes have to be included at the end of the output buffer */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+
+ /* Number of output samples is calculated */
+ outBlockSize = (srcALen * 2U) - 1U;
+
+ /* When srcALen > srcBLen, zero padding is done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1U));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2U);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+ }
+
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1U;
+ blockSize2 = srcALen - (srcBLen - 1U);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[srcBlen - 1]
+ * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
+ * ....
+ * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1U;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc1 = pIn2 + (srcBLen - 1U);
+ py = pSrc1;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first loop starts here */
+ while (blockSize1 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
+ sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
+ /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
+ sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* x[0] * y[srcBLen - 1] */
+ sum = __SMLALD(*px++, *py++, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (__SSAT((sum >> 15), 16));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pSrc1 - count;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* count is the index by which the pointer pIn1 to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize2 >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1] samples */
+ x0 = read_q15x2 ((q15_t *) px);
+
+ /* read x[1], x[2] samples */
+ x1 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the first two inputB samples using SIMD:
+ * y[0] and y[1] */
+ c0 = read_q15x2_ia ((q15_t **) &py);
+
+ /* acc0 += x[0] * y[0] + x[1] * y[1] */
+ acc0 = __SMLALD(x0, c0, acc0);
+
+ /* acc1 += x[1] * y[0] + x[2] * y[1] */
+ acc1 = __SMLALD(x1, c0, acc1);
+
+ /* Read x[2], x[3] */
+ x2 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[3], x[4] */
+ x3 = read_q15x2 ((q15_t *) px + 1);
+
+ /* acc2 += x[2] * y[0] + x[3] * y[1] */
+ acc2 = __SMLALD(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[0] + x[4] * y[1] */
+ acc3 = __SMLALD(x3, c0, acc3);
+
+ /* Read y[2] and y[3] */
+ c0 = read_q15x2_ia ((q15_t **) &py);
+
+ /* acc0 += x[2] * y[2] + x[3] * y[3] */
+ acc0 = __SMLALD(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[2] + x[4] * y[3] */
+ acc1 = __SMLALD(x3, c0, acc1);
+
+ /* Read x[4], x[5] */
+ x0 = read_q15x2 ((q15_t *) px + 2);
+
+ /* Read x[5], x[6] */
+ x1 = read_q15x2 ((q15_t *) px + 3);
+ px += 4U;
+
+ /* acc2 += x[4] * y[2] + x[5] * y[3] */
+ acc2 = __SMLALD(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[2] + x[6] * y[3] */
+ acc3 = __SMLALD(x1, c0, acc3);
+
+ } while (--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ if (k == 1U)
+ {
+ /* Read y[4] */
+ c0 = *py;
+#ifdef ARM_MATH_BIG_ENDIAN
+ c0 = c0 << 16U;
+#else
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7] */
+ x3 = read_q15x2 ((q15_t *) px);
+ px++;
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALD (x0, c0, acc0);
+ acc1 = __SMLALD (x1, c0, acc1);
+ acc2 = __SMLALDX(x1, c0, acc2);
+ acc3 = __SMLALDX(x3, c0, acc3);
+ }
+
+ if (k == 2U)
+ {
+ /* Read y[4], y[5] */
+ c0 = read_q15x2 ((q15_t *) py);
+
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[9] */
+ x2 = read_q15x2 ((q15_t *) px + 1);
+ px += 2U;
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALD(x0, c0, acc0);
+ acc1 = __SMLALD(x1, c0, acc1);
+ acc2 = __SMLALD(x3, c0, acc2);
+ acc3 = __SMLALD(x2, c0, acc3);
+ }
+
+ if (k == 3U)
+ {
+ /* Read y[4], y[5] */
+ c0 = read_q15x2_ia ((q15_t **) &py);
+
+ /* Read x[7], x[8] */
+ x3 = read_q15x2 ((q15_t *) px);
+
+ /* Read x[9] */
+ x2 = read_q15x2 ((q15_t *) px + 1);
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALD(x0, c0, acc0);
+ acc1 = __SMLALD(x1, c0, acc1);
+ acc2 = __SMLALD(x3, c0, acc2);
+ acc3 = __SMLALD(x2, c0, acc3);
+
+ c0 = (*py);
+
+ /* Read y[6] */
+#ifdef ARM_MATH_BIG_ENDIAN
+ c0 = c0 << 16U;
+#else
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[10] */
+ x3 = read_q15x2 ((q15_t *) px + 2);
+ px += 3U;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALDX(x1, c0, acc0);
+ acc1 = __SMLALD (x2, c0, acc1);
+ acc2 = __SMLALDX(x2, c0, acc2);
+ acc3 = __SMLALDX(x3, c0, acc3);
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (__SSAT(acc0 >> 15, 16));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ *pOut = (q15_t) (__SSAT(acc1 >> 15, 16));
+ pOut += inc;
+
+ *pOut = (q15_t) (__SSAT(acc2 >> 15, 16));
+ pOut += inc;
+
+ *pOut = (q15_t) (__SSAT(acc3 >> 15, 16));
+ pOut += inc;
+
+ /* Increment the count by 4 as 4 output values are computed */
+ count += 4U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4U;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q63_t) *px++ * *py++);
+ sum += ((q63_t) *px++ * *py++);
+ sum += ((q63_t) *px++ * *py++);
+ sum += ((q63_t) *px++ * *py++);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q63_t) *px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (__SSAT(sum >> 15, 16));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment count by 1, as one output value is computed */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q63_t) *px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (__SSAT(sum >> 15, 16));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * ....
+ * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+ * sum += x[srcALen-1] * y[0]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1U;
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1U);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while (blockSize3 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
+ sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
+ /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
+ sum = __SMLALD(read_q15x2_ia ((q15_t **) &px), read_q15x2_ia ((q15_t **) &py), sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum = __SMLALD(*px++, *py++, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (__SSAT((sum >> 15), 16));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement MAC count */
+ count--;
+
+ /* Decrement loop counter */
+ blockSize3--;
+ }
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+ const q15_t *pIn1 = pSrcA; /* InputA pointer */
+ const q15_t *pIn2 = pSrcB + (srcBLen - 1U); /* InputB pointer */
+ q63_t sum; /* Accumulators */
+ uint32_t i = 0U, j; /* Loop counters */
+ uint32_t inv = 0U; /* Reverse order flag */
+ uint32_t tot = 0U; /* Length */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ * and a variable, inv, is set to 1 */
+ /* If lengths are not equal then zero pad has to be done to make the two
+ * inputs of same length. But to improve the performance, we include zeroes
+ * in the output instead of zero padding either of the inputs */
+ /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
+ * starting of the output buffer */
+ /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
+ * ending of the output buffer */
+ /* Once the zero padding is done the remainder of the output is calculated
+ * using convolution but with the shorter signal time shifted. */
+
+ /* Calculate the length of the remaining sequence */
+ tot = ((srcALen + srcBLen) - 2U);
+
+ if (srcALen > srcBLen)
+ {
+ /* Calculating the number of zeros to be padded to the output */
+ j = srcALen - srcBLen;
+
+ /* Initialise the pointer after zero padding */
+ pDst += j;
+ }
+
+ else if (srcALen < srcBLen)
+ {
+ /* Initialization to inputB pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization to the end of inputA pointer */
+ pIn2 = pSrcA + (srcALen - 1U);
+
+ /* Initialisation of the pointer after zero padding */
+ pDst = pDst + tot;
+
+ /* Swapping the lengths */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+
+ /* Setting the reverse flag */
+ inv = 1;
+ }
+
+ /* Loop to calculate convolution for output length number of values */
+ for (i = 0U; i <= tot; i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0U; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if (((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)]);
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ if (inv == 1)
+ *pDst-- = (q15_t) __SSAT((sum >> 15U), 16U);
+ else
+ *pDst++ = (q15_t) __SSAT((sum >> 15U), 16U);
+ }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Corr group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q31.c
new file mode 100644
index 0000000000..a5f15fec7b
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q31.c
@@ -0,0 +1,879 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_correlate_q31.c
+ * Description: Correlation of Q31 sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Corr
+ @{
+ */
+
+/**
+ @brief Correlation of Q31 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ There is no saturation on intermediate additions.
+ Thus, if the accumulator overflows it wraps around and distorts the result.
+ The input signals should be scaled down to avoid intermediate overflows.
+ Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflows since a
+ maximum of min(srcALen, srcBLen) number of additions is carried internally.
+ The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+
+ @remark
+ Refer to \ref arm_correlate_fast_q31() for a faster but less precise implementation of this function.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include "arm_helium_utils.h"
+#include "arm_vec_filtering.h"
+void arm_correlate_q31(
+ const q31_t * pSrcA,
+ uint32_t srcALen,
+ const q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst)
+{
+ const q31_t *pIn1 = pSrcA; /* inputA pointer */
+ const q31_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
+ /*
+ * Working pointers into the two input sequences
+ */
+ const q31_t *pX;
+ const q31_t *pY;
+ const q31_t *pA;
+ const q31_t *pB;
+ int32_t i = 0U, j = 0; /* loop counters */
+ int32_t inv = 4; /* Destination step in bytes: +4 forward, -4 when output is reversed */
+ uint32_t tot = 0U; /* Length */
+ int32_t block1, block2, block3;
+ int32_t incr;
+
+ tot = ((srcALen + srcBLen) - 2U);
+ if (srcALen > srcBLen)
+ {
+ /*
+ * Calculating the number of zeros to be padded to the output
+ */
+ j = srcALen - srcBLen;
+ /*
+ * Initialize the pointer after zero padding
+ */
+ pDst += j;
+ }
+ else if (srcALen < srcBLen)
+ {
+ /*
+ * Initialization to inputB pointer
+ */
+ pIn1 = pSrcB;
+ /*
+ * Initialization to the end of inputA pointer
+ */
+ pIn2 = pSrcA + (srcALen - 1U);
+ /*
+ * Initialization of the pointer after zero padding
+ */
+ pDst = pDst + tot;
+ /*
+ * Swapping the lengths
+ */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+ /*
+ * Setting the reverse flag
+ */
+ inv = -4;
+
+ }
+
+ block1 = srcBLen - 1;
+ block2 = srcALen - srcBLen + 1;
+ block3 = srcBLen - 1;
+ pA = pIn1;
+ pB = pIn2;
+ incr = inv / 4; /* element-wise destination step: +1 or -1 */
+
+ for (i = 0U; i <= block1 - 2; i += 2)
+ {
+ uint32_t count = i + 1;
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+
+ /* compute 2 accumulators per loop */
+ /* size is incrementing for second accumulator */
+ /* Y pointer is decrementing for second accumulator */
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q31(acc0, acc1, pX, pY, count);
+
+ *pDst = (q31_t) acc0;
+ pDst += incr;
+ *pDst = (q31_t) acc1;
+ pDst += incr;
+ pB -= 2;
+ }
+ for (; i < block1; i++)
+ {
+ uint32_t count = i + 1;
+ int64_t acc = 0LL;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count);
+
+ *pDst = (q31_t) acc;
+ pDst += incr;
+ pB--;
+ }
+
+ for (i = 0U; i <= block2 - 4; i += 4)
+ {
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+ int64_t acc2 = 0LL;
+ int64_t acc3 = 0LL;
+
+ pX = pA;
+ pY = pB;
+ /* compute 4 accumulators per loop */
+ /* size is fixed for all accumulators */
+ /* X pointer is incrementing for successive accumulators */
+ MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q31(acc0, acc1, acc2, acc3, pX, pY, srcBLen);
+
+ *pDst = (q31_t) acc0;
+ pDst += incr;
+ *pDst = (q31_t) acc1;
+ pDst += incr;
+ *pDst = (q31_t) acc2;
+ pDst += incr;
+ *pDst = (q31_t) acc3;
+ pDst += incr;
+ pA += 4;
+ }
+
+ for (; i <= block2 - 2; i += 2)
+ {
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+
+ pX = pA;
+ pY = pB;
+ /* compute 2 accumulators per loop */
+ /* size is fixed for all accumulators */
+ /* X pointer is incrementing for second accumulator */
+ MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q31(acc0, acc1, pX, pY, srcBLen);
+
+ *pDst = (q31_t) acc0;
+ pDst += incr;
+ *pDst = (q31_t) acc1;
+ pDst += incr;
+ pA += 2;
+ }
+
+ if (block2 & 1)
+ {
+ int64_t acc = 0LL;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, srcBLen);
+
+ *pDst = (q31_t) acc;
+ pDst += incr;
+ pA++;
+ }
+
+ for (i = block3 - 1; i >= 0; i -= 2)
+ {
+
+ uint32_t count = (i + 1);
+ int64_t acc0 = 0LL;
+ int64_t acc1 = 0LL;
+
+ pX = pA;
+ pY = pB;
+ /* compute 2 accumulators per loop */
+ /* size is decrementing for second accumulator */
+ /* X pointer is incrementing for second accumulator */
+ MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q31(acc0, acc1, pX, pY, count);
+
+ *pDst = (q31_t) acc0;
+ pDst += incr;
+ *pDst = (q31_t) acc1;
+ pDst += incr;
+ pA += 2;
+
+ }
+ for (; i >= 0; i--)
+ {
+ uint32_t count = (i + 1);
+ int64_t acc = 0LL;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_SINGLE_Q31(acc, pX, pY, count);
+
+ *pDst = (q31_t) acc;
+ pDst += incr;
+ pA++;
+ }
+}
+#else
+void arm_correlate_q31(
+ const q31_t * pSrcA,
+ uint32_t srcALen,
+ const q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst)
+{
+
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
+
+ const q31_t *pIn1; /* InputA pointer */
+ const q31_t *pIn2; /* InputB pointer */
+ q31_t *pOut = pDst; /* Output pointer */
+ const q31_t *px; /* Intermediate inputA pointer */
+ const q31_t *py; /* Intermediate inputB pointer */
+ const q31_t *pSrc1; /* Intermediate pointers */
+ q63_t sum; /* Accumulators */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+ uint32_t outBlockSize;
+ int32_t inc = 1; /* Destination address modifier */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc0, acc1, acc2; /* Accumulators */
+ q31_t x0, x1, x2, c0; /* Temporary variables for holding input and coefficient values */
+#endif
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we include zeroes in the output instead of zero padding either of the inputs*/
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes have to be included in the starting of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcALen - srcBLen) zeroes have to be included in the ending of the output buffer */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2U * srcALen) - 1U;
+
+ /* When srcALen > srcBLen, zero padding is done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1U));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2U);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+ }
+
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that have to
+ * take place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage are initialized here. */
+ blockSize1 = srcBLen - 1U;
+ blockSize2 = srcALen - (srcBLen - 1U);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[srcBlen - 1]
+ * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
+ * ....
+ * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1U;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc1 = pIn2 + (srcBLen - 1U);
+ py = pSrc1;
+
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while (blockSize1 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+ while (k > 0U)
+ {
+ /* x[0] * y[srcBLen - 4] */
+ sum += (q63_t) *px++ * (*py++);
+
+ /* x[1] * y[srcBLen - 3] */
+ sum += (q63_t) *px++ * (*py++);
+
+ /* x[2] * y[srcBLen - 2] */
+ sum += (q63_t) *px++ * (*py++);
+
+ /* x[3] * y[srcBLen - 1] */
+ sum += (q63_t) *px++ * (*py++);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* x[0] * y[srcBLen - 1] */
+ sum += (q63_t) *px++ * (*py++);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q31_t) (sum >> 31);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pSrc1 - count;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* count is the index by which the pointer pIn1 is to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unroll by 3 */
+ blkCnt = blockSize2 / 3;
+
+ while (blkCnt > 0U)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+
+ /* read x[0], x[1] samples */
+ x0 = *px++;
+ x1 = *px++;
+
+ /* Apply loop unrolling and compute 3 MACs simultaneously. */
+ k = srcBLen / 3;
+
+ /* First part of the processing with loop unrolling. Compute 3 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 2 samples. */
+ do
+ {
+ /* Read y[0] sample */
+ c0 = *(py);
+ /* Read x[2] sample */
+ x2 = *(px);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[0] * y[0] */
+ acc0 += ((q63_t) x0 * c0);
+ /* acc1 += x[1] * y[0] */
+ acc1 += ((q63_t) x1 * c0);
+ /* acc2 += x[2] * y[0] */
+ acc2 += ((q63_t) x2 * c0);
+
+ /* Read y[1] sample */
+ c0 = *(py + 1U);
+ /* Read x[3] sample */
+ x0 = *(px + 1U);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[1] * y[1] */
+ acc0 += ((q63_t) x1 * c0);
+ /* acc1 += x[2] * y[1] */
+ acc1 += ((q63_t) x2 * c0);
+ /* acc2 += x[3] * y[1] */
+ acc2 += ((q63_t) x0 * c0);
+
+ /* Read y[2] sample */
+ c0 = *(py + 2U);
+ /* Read x[4] sample */
+ x1 = *(px + 2U);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[2] * y[2] */
+ acc0 += ((q63_t) x2 * c0);
+ /* acc1 += x[3] * y[2] */
+ acc1 += ((q63_t) x0 * c0);
+ /* acc2 += x[4] * y[2] */
+ acc2 += ((q63_t) x1 * c0);
+
+ /* update scratch pointers */
+ px += 3U;
+ py += 3U;
+
+ } while (--k);
+
+ /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen - (3 * (srcBLen / 3));
+
+ while (k > 0U)
+ {
+ /* Read y[4] sample */
+ c0 = *(py++);
+
+ /* Read x[7] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[4] */
+ acc0 += ((q63_t) x0 * c0);
+ /* acc1 += x[5] * y[4] */
+ acc1 += ((q63_t) x1 * c0);
+ /* acc2 += x[6] * y[4] */
+ acc2 += ((q63_t) x2 * c0);
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q31_t) (acc0 >> 31);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ *pOut = (q31_t) (acc1 >> 31);
+ pOut += inc;
+
+ *pOut = (q31_t) (acc2 >> 31);
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 3 */
+ count += 3U;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize2 - 3 * (blockSize2 / 3);
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = srcBLen >> 2U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) *px++ * *py++;
+ sum += (q63_t) *px++ * *py++;
+ sum += (q63_t) *px++ * *py++;
+ sum += (q63_t) *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = srcBLen % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) *px++ * *py++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q31_t) (sum >> 31);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) *px++ * *py++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q31_t) (sum >> 31);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * ....
+ * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+ * sum += x[srcALen-1] * y[0]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1U;
+
+ /* Working pointer of inputA */
+ pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while (blockSize3 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* sum += x[srcALen - srcBLen + 4] * y[3] */
+ sum += (q63_t) *px++ * *py++;
+
+ /* sum += x[srcALen - srcBLen + 3] * y[2] */
+ sum += (q63_t) *px++ * *py++;
+
+ /* sum += x[srcALen - srcBLen + 2] * y[1] */
+ sum += (q63_t) *px++ * *py++;
+
+ /* sum += x[srcALen - srcBLen + 1] * y[0] */
+ sum += (q63_t) *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) *px++ * *py++;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q31_t) (sum >> 31);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement MAC count */
+ count--;
+
+ /* Decrement loop counter */
+ blockSize3--;
+ }
+
+#else
+/* alternate version for CM0_FAMILY */
+
+ const q31_t *pIn1 = pSrcA; /* InputA pointer */
+ const q31_t *pIn2 = pSrcB + (srcBLen - 1U); /* InputB pointer */
+ q63_t sum; /* Accumulators */
+ uint32_t i = 0U, j; /* Loop counters */
+ uint32_t inv = 0U; /* Reverse order flag */
+ uint32_t tot = 0U; /* Length */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and a variable, inv is set to 1 */
+ /* If lengths are not equal then zero pad has to be done to make the two
+ * inputs of same length. But to improve the performance, we include zeroes
+ * in the output instead of zero padding either of the inputs*/
+ /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes have to be included in the
+ * starting of the output buffer */
+ /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes have to be included in the
+ * ending of the output buffer */
+ /* Once the zero padding is done the remaining of the output is calculated
+ * using correlation but with the shorter signal time shifted. */
+
+ /* Calculate the length of the remaining sequence */
+ tot = ((srcALen + srcBLen) - 2U);
+
+ if (srcALen > srcBLen)
+ {
+ /* Calculating the number of zeros to be padded to the output */
+ j = srcALen - srcBLen;
+
+ /* Initialise the pointer after zero padding */
+ pDst += j;
+ }
+
+ else if (srcALen < srcBLen)
+ {
+ /* Initialization to inputB pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization to the end of inputA pointer */
+ pIn2 = pSrcA + (srcALen - 1U);
+
+ /* Initialisation of the pointer after zero padding */
+ pDst = pDst + tot;
+
+ /* Swapping the lengths */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+
+ /* Setting the reverse flag */
+ inv = 1;
+ }
+
+ /* Loop to calculate correlation for output length number of times */
+ for (i = 0U; i <= tot; i++)
+ {
+ /* Initialize sum with zero to carry out MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to correlation equation */
+ for (j = 0U; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if (((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q63_t) pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)]);
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ if (inv == 1)
+ *pDst-- = (q31_t) (sum >> 31U);
+ else
+ *pDst++ = (q31_t) (sum >> 31U);
+ }
+
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Corr group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q7.c
new file mode 100644
index 0000000000..53a33ce83e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_correlate_q7.c
@@ -0,0 +1,1002 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_correlate_q7.c
+ * Description: Correlation of Q7 sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup Corr
+ @{
+ */
+
+/**
+ @brief Correlation of Q7 sequences.
+ @param[in] pSrcA points to the first input sequence
+ @param[in] srcALen length of the first input sequence
+ @param[in] pSrcB points to the second input sequence
+ @param[in] srcBLen length of the second input sequence
+ @param[out] pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
+ The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
+ The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
+
+ @remark
+ Refer to \ref arm_correlate_opt_q7() for a faster implementation of this function.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include "arm_helium_utils.h"
+
+#include "arm_vec_filtering.h"
+void arm_correlate_q7(
+ const q7_t * pSrcA,
+ uint32_t srcALen,
+ const q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst)
+{
+ const q7_t *pIn1 = pSrcA; /* inputA pointer */
+ const q7_t *pIn2 = pSrcB + (srcBLen - 1U); /* inputB pointer */
+ const q7_t *pX, *pY;
+ const q7_t *pA, *pB;
+ int32_t i = 0U, j = 0; /* loop counters */
+ int32_t inv = 1U; /* Destination step: +1 forward, -1 when output is reversed */
+ uint32_t tot = 0U; /* Length */
+ int32_t block1, block2, block3;
+ int32_t incr;
+
+ tot = ((srcALen + srcBLen) - 2U);
+ if (srcALen > srcBLen)
+ {
+ /*
+ * Calculating the number of zeros to be padded to the output
+ */
+ j = srcALen - srcBLen;
+ /*
+ * Initialize the pointer after zero padding
+ */
+ pDst += j;
+ }
+ else if (srcALen < srcBLen)
+ {
+ /*
+ * Initialization to inputB pointer
+ */
+ pIn1 = pSrcB;
+ /*
+ * Initialization to the end of inputA pointer
+ */
+ pIn2 = pSrcA + (srcALen - 1U);
+ /*
+ * Initialisation of the pointer after zero padding
+ */
+ pDst = pDst + tot;
+ /*
+ * Swapping the lengths
+ */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+ /*
+ * Setting the reverse flag
+ */
+ inv = -1;
+ }
+
+ block1 = srcBLen - 1;
+ block2 = srcALen - srcBLen + 1;
+ block3 = srcBLen - 1;
+
+ pA = pIn1;
+ pB = pIn2;
+ incr = inv;
+
+ for (i = 0U; i <= block1 - 2; i += 2)
+ {
+ uint32_t count = i + 1;
+ int32_t acc0 = 0;
+ int32_t acc1 = 0;
+
+ /*
+ * compute 2 accumulators per loop
+ * size is incrementing for second accumulator
+ * Y pointer is decrementing for second accumulator
+ */
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_DUAL_DEC_Y_INC_SIZE_Q7(acc0, acc1, pX, pY, count);
+
+ *pDst = (q7_t) acc0;
+ pDst += incr;
+ *pDst = (q7_t) acc1;
+ pDst += incr;
+ pB -= 2;
+ }
+ for (; i < block1; i++)
+ {
+ uint32_t count = i + 1;
+ int32_t acc = 0;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count);
+
+ *pDst = (q7_t) acc;
+ pDst += incr;
+ pB--;
+ }
+
+ for (i = 0U; i <= block2 - 4; i += 4)
+ {
+ int32_t acc0 = 0;
+ int32_t acc1 = 0;
+ int32_t acc2 = 0;
+ int32_t acc3 = 0;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 4 accumulators per loop
+ * size is fixed for all accumulators
+ * X pointer is incrementing for successive accumulators
+ */
+ MVE_INTR_CORR_QUAD_INC_X_FIXED_SIZE_Q7(acc0, acc1, acc2, acc3, pX, pY, srcBLen);
+
+ *pDst = (q7_t) acc0;
+ pDst += incr;
+ *pDst = (q7_t) acc1;
+ pDst += incr;
+ *pDst = (q7_t) acc2;
+ pDst += incr;
+ *pDst = (q7_t) acc3;
+ pDst += incr;
+
+ pA += 4;
+ }
+
+ for (; i <= block2 - 2; i += 2)
+ {
+ int32_t acc0 = 0LL;
+ int32_t acc1 = 0LL;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 2 accumulators per loop
+ * size is fixed for all accumulators
+ * X pointer is incrementing for second accumulator
+ */
+ MVE_INTR_CORR_DUAL_INC_X_FIXED_SIZE_Q7(acc0, acc1, pX, pY, srcBLen);
+
+ *pDst = (q7_t) acc0;
+ pDst += incr;
+ *pDst = (q7_t) acc1;
+ pDst += incr;
+ pA += 2;
+ }
+
+ if (block2 & 1)
+ {
+ int32_t acc = 0LL;
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, srcBLen);
+
+ *pDst = (q7_t) acc;
+ pDst += incr;
+ pA++;
+ }
+
+ for (i = block3 - 1; i >= 0; i -= 2)
+ {
+ uint32_t count = (i + 1);
+ int32_t acc0 = 0LL;
+ int32_t acc1 = 0LL;
+
+ pX = pA;
+ pY = pB;
+ /*
+ * compute 2 accumulators per loop
+ * size is decrementing for second accumulator
+ * X pointer is incrementing for second accumulator
+ */
+ MVE_INTR_CORR_DUAL_INC_X_DEC_SIZE_Q7(acc0, acc1, pX, pY, count);
+
+ *pDst = (q7_t) acc0;
+ pDst += incr;
+ *pDst = (q7_t) acc1;
+ pDst += incr;
+ pA += 2;
+
+ }
+ for (; i >= 0; i--)
+ {
+ uint32_t count = (i + 1);
+ int64_t acc = 0LL; /* NOTE(review): 64-bit here vs 32-bit accumulators elsewhere in this function — confirm intentional */
+
+ pX = pA;
+ pY = pB;
+ MVE_INTR_CORR_SINGLE_Q7(acc, pX, pY, count);
+
+ *pDst = (q7_t) acc;
+ pDst += incr;
+ pA++;
+ }
+}
+
+#else
+void arm_correlate_q7(
+ const q7_t * pSrcA,
+ uint32_t srcALen,
+ const q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst)
+{
+
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
+
+ const q7_t *pIn1; /* InputA pointer */
+ const q7_t *pIn2; /* InputB pointer */
+ q7_t *pOut = pDst; /* Output pointer */
+ const q7_t *px; /* Intermediate inputA pointer */
+ const q7_t *py; /* Intermediate inputB pointer */
+ const q7_t *pSrc1; /* Intermediate pointers */
+ q31_t sum; /* Accumulators */
+ uint32_t blockSize1, blockSize2, blockSize3; /* Loop counters */
+ uint32_t j, k, count, blkCnt; /* Loop counters */
+ uint32_t outBlockSize;
+ int32_t inc = 1;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ q31_t input1, input2; /* Temporary input variables */
+ q15_t in1, in2; /* Temporary input variables */
+ q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables for holding input and coefficient values */
+#endif
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we include zeroes in the output instead of zero padding either of the the inputs*/
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
+ if (srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2U * srcALen) - 1U;
+
+ /* When srcALen > srcBLen, zero padding is done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1U));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2U);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+ }
+
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1U;
+ blockSize2 = srcALen - (srcBLen - 1U);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[srcBlen - 1]
+ * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
+ * ....
+ * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1U;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc1 = pIn2 + (srcBLen - 1U);
+ py = pSrc1;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while (blockSize1 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+ while (k > 0U)
+ {
+ /* x[0] , x[1] */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[srcBLen - 4] , y[srcBLen - 3] */
+ in1 = (q15_t) *py++;
+ in2 = (q15_t) *py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* x[0] * y[srcBLen - 4] */
+ /* x[1] * y[srcBLen - 3] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* x[2] , x[3] */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[srcBLen - 2] , y[srcBLen - 1] */
+ in1 = (q15_t) *py++;
+ in2 = (q15_t) *py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* x[2] * y[srcBLen - 2] */
+ /* x[3] * y[srcBLen - 1] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize k with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* x[0] * y[srcBLen - 1] */
+ sum += (q31_t) ((q15_t) *px++ * *py++);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pSrc1 - count;
+ px = pIn1;
+
+ /* Increment MAC count */
+ count++;
+
+ /* Decrement loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0U;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if (srcBLen >= 4U)
+ {
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize2 >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[0] sample */
+ c0 = *py++;
+ /* Read y[1] sample */
+ c1 = *py++;
+
+ /* Read x[3] sample */
+ x3 = *px++;
+
+ /* x[0] and x[1] are packed */
+ in1 = (q15_t) x0;
+ in2 = (q15_t) x1;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* y[0] and y[1] are packed */
+ in1 = (q15_t) c0;
+ in2 = (q15_t) c1;
+
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc0 += x[0] * y[0] + x[1] * y[1] */
+ acc0 = __SMLAD(input1, input2, acc0);
+
+ /* x[1] and x[2] are packed */
+ in1 = (q15_t) x1;
+ in2 = (q15_t) x2;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc1 += x[1] * y[0] + x[2] * y[1] */
+ acc1 = __SMLAD(input1, input2, acc1);
+
+ /* x[2] and x[3] are packed */
+ in1 = (q15_t) x2;
+ in2 = (q15_t) x3;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc2 += x[2] * y[0] + x[3] * y[1] */
+ acc2 = __SMLAD(input1, input2, acc2);
+
+ /* Read x[4] sample */
+ x0 = *px++;
+
+ /* x[3] and x[4] are packed */
+ in1 = (q15_t) x3;
+ in2 = (q15_t) x0;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc3 += x[3] * y[0] + x[4] * y[1] */
+ acc3 = __SMLAD(input1, input2, acc3);
+
+ /* Read y[2] sample */
+ c0 = *py++;
+ /* Read y[3] sample */
+ c1 = *py++;
+
+ /* Read x[5] sample */
+ x1 = *px++;
+
+ /* x[2] and x[3] are packed */
+ in1 = (q15_t) x2;
+ in2 = (q15_t) x3;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* y[2] and y[3] are packed */
+ in1 = (q15_t) c0;
+ in2 = (q15_t) c1;
+
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc0 += x[2] * y[2] + x[3] * y[3] */
+ acc0 = __SMLAD(input1, input2, acc0);
+
+ /* x[3] and x[4] are packed */
+ in1 = (q15_t) x3;
+ in2 = (q15_t) x0;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc1 += x[3] * y[2] + x[4] * y[3] */
+ acc1 = __SMLAD(input1, input2, acc1);
+
+ /* x[4] and x[5] are packed */
+ in1 = (q15_t) x0;
+ in2 = (q15_t) x1;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc2 += x[4] * y[2] + x[5] * y[3] */
+ acc2 = __SMLAD(input1, input2, acc2);
+
+ /* Read x[6] sample */
+ x2 = *px++;
+
+ /* x[5] and x[6] are packed */
+ in1 = (q15_t) x1;
+ in2 = (q15_t) x2;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* acc3 += x[5] * y[2] + x[6] * y[3] */
+ acc3 = __SMLAD(input1, input2, acc3);
+
+ } while (--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4U;
+
+ while (k > 0U)
+ {
+ /* Read y[4] sample */
+ c0 = *py++;
+ /* Read x[7] sample */
+ x3 = *px++;
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[4] */
+ acc0 += ((q15_t) x0 * c0);
+ /* acc1 += x[5] * y[4] */
+ acc1 += ((q15_t) x1 * c0);
+ /* acc2 += x[6] * y[4] */
+ acc2 += ((q15_t) x2 * c0);
+ /* acc3 += x[7] * y[4] */
+ acc3 += ((q15_t) x3 * c0);
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q7_t) (__SSAT(acc0 >> 7, 8));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ *pOut = (q7_t) (__SSAT(acc1 >> 7, 8));
+ pOut += inc;
+
+ *pOut = (q7_t) (__SSAT(acc2 >> 7, 8));
+ pOut += inc;
+
+ *pOut = (q7_t) (__SSAT(acc3 >> 7, 8));
+ pOut += inc;
+
+ count += 4U;
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize2 % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize2;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = srcBLen >> 2U;
+
+ while (k > 0U)
+ {
+
+ /* Reading two inputs of SrcA buffer and packing */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* Reading two inputs of SrcB buffer and packing */
+ in1 = (q15_t) *py++;
+ in2 = (q15_t) *py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* Perform the multiply-accumulate */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Reading two inputs of SrcA buffer and packing */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* Reading two inputs of SrcB buffer and packing */
+ in1 = (q15_t) *py++;
+ in2 = (q15_t) *py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* Perform the multiply-accumulate */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = srcBLen % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = srcBLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q15_t) *px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while (blkCnt > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q15_t) *px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * ....
+ * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+ * sum += x[srcALen-1] * y[0]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1U;
+
+ /* Working pointer of inputA */
+ pSrc1 = pIn1 + (srcALen - (srcBLen - 1U));
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while (blockSize3 > 0U)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ k = count >> 2U;
+
+ while (k > 0U)
+ {
+ /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2] */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* y[0] , y[1] */
+ in1 = (q15_t) *py++;
+ in2 = (q15_t) *py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* sum += x[srcALen - srcBLen + 1] * y[0] */
+ /* sum += x[srcALen - srcBLen + 2] * y[1] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */
+ in1 = (q15_t) *px++;
+ in2 = (q15_t) *px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* y[2] , y[3] */
+ in1 = (q15_t) *py++;
+ in2 = (q15_t) *py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16U);
+
+ /* sum += x[srcALen - srcBLen + 3] * y[2] */
+ /* sum += x[srcALen - srcBLen + 4] * y[3] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ k = count % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ k = count;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (k > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q15_t) *px++ * *py++);
+
+ /* Decrement loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q7_t) (__SSAT(sum >> 7U, 8));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement MAC count */
+ count--;
+
+ /* Decrement loop counter */
+ blockSize3--;
+ }
+
+#else
+/* alternate version for CM0_FAMILY */
+
+ const q7_t *pIn1 = pSrcA; /* InputA pointer */
+ const q7_t *pIn2 = pSrcB + (srcBLen - 1U); /* InputB pointer */
+ q31_t sum; /* Accumulator */
+ uint32_t i = 0U, j; /* Loop counters */
+ uint32_t inv = 0U; /* Reverse order flag */
+ uint32_t tot = 0U; /* Length */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and a variable, inv is set to 1 */
+ /* If lengths are not equal then zero pad has to be done to make the two
+ * inputs of same length. But to improve the performance, we include zeroes
+ * in the output instead of zero padding either of the inputs*/
+ /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
+ * starting of the output buffer */
+ /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
+ * ending of the output buffer */
+ /* Once the zero padding is done the remainder of the output is calculated
+ * using convolution but with the shorter signal time shifted. */
+
+ /* Calculate the length of the remaining sequence */
+ tot = ((srcALen + srcBLen) - 2U);
+
+ if (srcALen > srcBLen)
+ {
+ /* Calculating the number of zeros to be padded to the output */
+ j = srcALen - srcBLen;
+
+ /* Initialise the pointer after zero padding */
+ pDst += j;
+ }
+
+ else if (srcALen < srcBLen)
+ {
+ /* Initialization to inputB pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization to the end of inputA pointer */
+ pIn2 = pSrcA + (srcALen - 1U);
+
+ /* Initialisation of the pointer after zero padding */
+ pDst = pDst + tot;
+
+ /* Swapping the lengths */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+
+ /* Setting the reverse flag */
+ inv = 1;
+ }
+
+ /* Loop to calculate convolution for output length number of times */
+ for (i = 0U; i <= tot; i++)
+ {
+ /* Initialize sum with zero to carry out MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0U; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if (((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - (int32_t) j)]);
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ if (inv == 1)
+ *pDst-- = (q7_t) __SSAT((sum >> 7U), 8U);
+ else
+ *pDst++ = (q7_t) __SSAT((sum >> 7U), 8U);
+ }
+
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Corr group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_f32.c
new file mode 100644
index 0000000000..f323eaa1b6
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_f32.c
@@ -0,0 +1,951 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_decimate_f32.c
+ * Description: FIR decimation for floating-point sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @defgroup FIR_decimate Finite Impulse Response (FIR) Decimator
+
+ These functions combine an FIR filter together with a decimator.
+ They are used in multirate systems for reducing the sample rate of a signal without introducing aliasing distortion.
+ Conceptually, the functions are equivalent to the block diagram below:
+ \image html FIRDecimator.gif "Components included in the FIR Decimator functions"
+ When decimating by a factor of M, the signal should be prefiltered by a lowpass filter with a normalized
+ cutoff frequency of 1/M in order to prevent aliasing distortion.
+ The user of the function is responsible for providing the filter coefficients.
+
+ The FIR decimator functions provided in the CMSIS DSP Library combine the FIR filter and the decimator in an efficient manner.
+ Instead of calculating all of the FIR filter outputs and discarding M-1 out of every M, only the
+ samples output by the decimator are computed.
+ The functions operate on blocks of input and output data.
+ pSrc points to an array of blockSize input values and
+ pDst points to an array of blockSize/M output values.
+ In order to have an integer number of output samples blockSize
+ must always be a multiple of the decimation factor M.
+
+ The library provides separate functions for Q15, Q31 and floating-point data types.
+
+ @par Algorithm:
+ The FIR portion of the algorithm uses the standard form filter:
+
+ y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
+
+ where, b[n] are the filter coefficients.
+ @par
+ The pCoeffs points to a coefficient array of size numTaps.
+ Coefficients are stored in time reversed order.
+ @par
+
+ {b[numTaps-1], b[numTaps-2], ..., b[1], b[0]}
+
+ @par
+ pState points to a state array of size numTaps + blockSize - 1.
+ Samples in the state buffer are stored in the order:
+ @par
+
+ {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
+
+ The state variables are updated after each block of data is processed, the coefficients are untouched.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient arrays may be shared among several instances while state variable array should be allocated separately.
+ There are separate instance structure declarations for each of the 3 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ - Checks to make sure that the size of the input is a multiple of the decimation factor.
+ To do this manually without calling the init function, assign the follow subfields of the instance structure:
+ numTaps, pCoeffs, M (decimation factor), pState. Also set all of the values in pState to zero.
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ The code below statically initializes each of the 3 different data type filter instance structures
+
+ arm_fir_decimate_instance_f32 S = {M, numTaps, pCoeffs, pState};
+ arm_fir_decimate_instance_q31 S = {M, numTaps, pCoeffs, pState};
+ arm_fir_decimate_instance_q15 S = {M, numTaps, pCoeffs, pState};
+
+ where M is the decimation factor; numTaps is the number of filter coefficients in the filter;
+ pCoeffs is the address of the coefficient buffer;
+ pState is the address of the state buffer.
+ Be sure to set the values in the state buffer to zeros when doing static initialization.
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the fixed-point versions of the FIR decimate filter functions.
+ In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ @addtogroup FIR_decimate
+ @{
+ */
+
+/**
+ @brief Processing function for floating-point FIR decimator.
+ @param[in] S points to an instance of the floating-point FIR decimator structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_fir_decimate_f32(
+ const arm_fir_decimate_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ const float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+ uint32_t blkCntN4; /* Residual output samples left after the groups of 4 */
+ const float32_t *px0, *px1, *px2, *px3; /* State pointers for the 4 interleaved output phases */
+ f32x4_t accv, acc0v, acc1v, acc2v, acc3v; /* Vector accumulators (one per output phase) */
+ f32x4_t x0v, x1v, x2v, x3v; /* State sample vectors */
+ f32x4_t c0v; /* Coefficient vector */
+
+ /*
+ * S->pState buffer contains previous frame (numTaps - 1) samples
+ * pStateCurnt points to the location where the new input data should be written
+ */
+ pStateCurnt = S->pState + (numTaps - 1U);
+ /*
+ * Total number of output samples to be computed
+ */
+ blkCnt = outBlockSize / 4;
+ blkCntN4 = outBlockSize - (4 * blkCnt);
+
+ while (blkCnt > 0U)
+ {
+ /*
+ * Copy 4 * decimation factor number of new input samples into the state buffer
+ */
+ i = (4 * S->M) >> 2; /* copies 4*M samples, 4 per iteration */
+ do
+ {
+ vst1q(pStateCurnt, vld1q((const float32_t *)pSrc));
+ pSrc += 4;
+ pStateCurnt += 4;
+ i--;
+ }
+ while (i > 0U);
+
+ /*
+ * Set accumulators to zero
+ */
+ acc0v = vdupq_n_f32(0.0f);
+ acc1v = vdupq_n_f32(0.0f);
+ acc2v = vdupq_n_f32(0.0f);
+ acc3v = vdupq_n_f32(0.0f);
+
+ /*
+ * Initialize state pointer for all the samples
+ * (outputs k..k+3 start M samples apart in the state buffer)
+ */
+ px0 = pState;
+ px1 = pState + S->M;
+ px2 = pState + 2 * S->M;
+ px3 = pState + 3 * S->M;
+ /*
+ * Initialize coeff pointer
+ */
+ pb = pCoeffs;
+ /*
+ * Loop unrolling. Process 4 taps at a time.
+ */
+ tapCnt = numTaps >> 2;
+ /*
+ * Loop over the number of taps. Unroll by a factor of 4.
+ * Repeat until we've computed numTaps-4 coefficients.
+ */
+ while (tapCnt > 0U)
+ {
+ /*
+ * Read the b[numTaps-1] coefficient
+ */
+ c0v = vld1q((const float32_t *)pb);
+ pb += 4;
+ /*
+ * Read x[n-numTaps-1] sample for acc0
+ */
+ x0v = vld1q(px0);
+ x1v = vld1q(px1);
+ x2v = vld1q(px2);
+ x3v = vld1q(px3);
+ px0 += 4;
+ px1 += 4;
+ px2 += 4;
+ px3 += 4;
+
+ acc0v = vfmaq(acc0v, x0v, c0v);
+ acc1v = vfmaq(acc1v, x1v, c0v);
+ acc2v = vfmaq(acc2v, x2v, c0v);
+ acc3v = vfmaq(acc3v, x3v, c0v);
+ /*
+ * Decrement the loop counter
+ */
+ tapCnt--;
+ }
+
+ /*
+ * If the filter length is not a multiple of 4, compute the remaining filter taps
+ * should be tail predicated (vctp32q zeroes the inactive coefficient lanes,
+ * so the full-width fma leaves the accumulators unchanged in those lanes)
+ */
+ tapCnt = numTaps % 0x4U;
+ if (tapCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(tapCnt);
+ /*
+ * Read the b[numTaps-1] coefficient
+ */
+ c0v = vldrwq_z_f32(pb, p0);
+ pb += 4;
+ /*
+ * Read x[n-numTaps-1] sample for acc0
+ */
+ x0v = vld1q(px0);
+ x1v = vld1q(px1);
+ x2v = vld1q(px2);
+ x3v = vld1q(px3);
+ px0 += 4;
+ px1 += 4;
+ px2 += 4;
+ px3 += 4;
+
+ acc0v = vfmaq_f32(acc0v, x0v, c0v);
+ acc1v = vfmaq_f32(acc1v, x1v, c0v);
+ acc2v = vfmaq_f32(acc2v, x2v, c0v);
+ acc3v = vfmaq_f32(acc3v, x3v, c0v);
+ }
+
+ /* reduction: horizontal add of each vector accumulator into one output lane */
+ accv[0] = vecAddAcrossF32Mve(acc0v);
+ accv[1] = vecAddAcrossF32Mve(acc1v);
+ accv[2] = vecAddAcrossF32Mve(acc2v);
+ accv[3] = vecAddAcrossF32Mve(acc3v);
+
+ /*
+ * Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples
+ */
+ pState = pState + 4 * S->M;
+ /*
+ * The result is in the accumulator, store in the destination buffer.
+ */
+ vst1q(pDst, accv);
+ pDst += 4;
+
+ /*
+ * Decrement the loop counter
+ */
+ blkCnt--;
+ }
+
+ /* Residual outputs: one decimated output sample per iteration */
+ while (blkCntN4 > 0U)
+ {
+ /*
+ * Copy decimation factor number of new input samples into the state buffer
+ */
+ i = S->M;
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+ }
+ while (--i);
+ /*
+ * Set accumulator to zero
+ */
+ acc0v = vdupq_n_f32(0.0f);
+ /*
+ * Initialize state pointer
+ */
+ px = pState;
+ /*
+ * Initialize coeff pointer
+ */
+ pb = pCoeffs;
+ /*
+ * Loop unrolling. Process 4 taps at a time.
+ */
+ tapCnt = numTaps >> 2;
+ /*
+ * Loop over the number of taps. Unroll by a factor of 4.
+ * Repeat until we've computed numTaps-4 coefficients.
+ */
+ while (tapCnt > 0U)
+ {
+ c0v = vldrwq_f32(pb);
+ x0v = vldrwq_f32(px);
+ pb += 4;
+ px += 4;
+ acc0v = vfmaq_f32(acc0v, x0v, c0v);
+ /*
+ * Decrement the loop counter
+ */
+ tapCnt--;
+ }
+ tapCnt = numTaps % 0x4U;
+ if (tapCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(tapCnt);
+ c0v = vldrwq_z_f32(pb, p0);
+ x0v = vldrwq_f32(px);
+ acc0v = vfmaq_f32(acc0v, x0v, c0v);
+ }
+ accv[0] = vecAddAcrossF32Mve(acc0v);
+
+ /*
+ * Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples
+ */
+ pState = pState + S->M;
+ /*
+ * The result is in the accumulator, store in the destination buffer.
+ */
+ *pDst++ = accv[0];
+ /*
+ * Decrement the loop counter
+ */
+ blkCntN4--;
+ }
+
+ /*
+ * Processing is complete.
+ * Now copy the last numTaps - 1 samples to the start of the state buffer.
+ * This prepares the state buffer for the next function call.
+ */
+
+ pStateCurnt = S->pState;
+ blkCnt =(numTaps - 1) >> 2;
+ while (blkCnt > 0U)
+ {
+ vst1q(pStateCurnt, vldrwq_f32(pState));
+ pState += 4;
+ pStateCurnt += 4;
+ blkCnt--;
+ }
+ blkCnt = (numTaps - 1) & 3; /* residual (numTaps - 1) % 4 samples, tail predicated */
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ vstrwq_p_f32(pStateCurnt, vldrwq_f32(pState), p0);
+ }
+}
+#else
+#if defined(ARM_MATH_NEON)
+void arm_fir_decimate_f32(
+ const arm_fir_decimate_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *px; /* Temporary pointer for state buffer */
+ const float32_t *pb; /* Temporary pointer for coefficient buffer */
+ float32_t sum0; /* Accumulator */
+ float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+
+ uint32_t blkCntN4;
+ float32_t *px0, *px1, *px2, *px3;
+ float32_t x1, x2, x3;
+
+ float32x4_t accv,acc0v,acc1v,acc2v,acc3v;
+ float32x4_t x0v, x1v, x2v, x3v;
+ float32x4_t c0v;
+ float32x2_t temp;
+ float32x4_t sum0v;
+
+ /* S->pState buffer contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + (numTaps - 1U);
+
+ /* Total number of output samples to be computed */
+ blkCnt = outBlockSize / 4;
+ blkCntN4 = outBlockSize - (4 * blkCnt);
+
+ while (blkCnt > 0U)
+ {
+ /* Copy 4 * decimation factor number of new input samples into the state buffer */
+ i = 4 * S->M;
+
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+
+ } while (--i);
+
+ /* Set accumulators to zero */
+ acc0v = vdupq_n_f32(0.0);
+ acc1v = vdupq_n_f32(0.0);
+ acc2v = vdupq_n_f32(0.0);
+ acc3v = vdupq_n_f32(0.0);
+
+ /* Initialize state pointer for all the samples */
+ px0 = pState;
+ px1 = pState + S->M;
+ px2 = pState + 2 * S->M;
+ px3 = pState + 3 * S->M;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+ /* Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Loop over the number of taps.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+
+ while (tapCnt > 0U)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0v = vld1q_f32(pb);
+ pb += 4;
+
+ /* Read x[n-numTaps-1] sample for acc0 */
+ x0v = vld1q_f32(px0);
+ x1v = vld1q_f32(px1);
+ x2v = vld1q_f32(px2);
+ x3v = vld1q_f32(px3);
+
+ px0 += 4;
+ px1 += 4;
+ px2 += 4;
+ px3 += 4;
+
+ acc0v = vmlaq_f32(acc0v, x0v, c0v);
+ acc1v = vmlaq_f32(acc1v, x1v, c0v);
+ acc2v = vmlaq_f32(acc2v, x2v, c0v);
+ acc3v = vmlaq_f32(acc3v, x3v, c0v);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ temp = vpadd_f32(vget_low_f32(acc0v),vget_high_f32(acc0v));
+ accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,0);
+
+ temp = vpadd_f32(vget_low_f32(acc1v),vget_high_f32(acc1v));
+ accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,1);
+
+ temp = vpadd_f32(vget_low_f32(acc2v),vget_high_f32(acc2v));
+ accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,2);
+
+ temp = vpadd_f32(vget_low_f32(acc3v),vget_high_f32(acc3v));
+ accv = vsetq_lane_f32(vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1),accv,3);
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch state variables for acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
+
+ /* Perform the multiply-accumulate */
+ accv = vsetq_lane_f32(vgetq_lane_f32(accv, 0) + x0 * c0,accv,0);
+ accv = vsetq_lane_f32(vgetq_lane_f32(accv, 1) + x1 * c0,accv,1);
+ accv = vsetq_lane_f32(vgetq_lane_f32(accv, 2) + x2 * c0,accv,2);
+ accv = vsetq_lane_f32(vgetq_lane_f32(accv, 3) + x3 * c0,accv,3);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + 4 * S->M;
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ vst1q_f32(pDst,accv);
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ while (blkCntN4 > 0U)
+ {
+ /* Copy decimation factor number of new input samples into the state buffer */
+ i = S->M;
+
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+
+ } while (--i);
+
+ /* Set accumulator to zero */
+ sum0v = vdupq_n_f32(0.0);
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+ /* Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Loop over the number of taps.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+ while (tapCnt > 0U)
+ {
+ c0v = vld1q_f32(pb);
+ pb += 4;
+
+ x0v = vld1q_f32(px);
+ px += 4;
+
+ sum0v = vmlaq_f32(sum0v, x0v, c0v);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ temp = vpadd_f32(vget_low_f32(sum0v),vget_high_f32(sum0v));
+ sum0 = vget_lane_f32(temp, 0) + vget_lane_f32(temp, 1);
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch 1 state variable */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M;
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = sum0;
+
+ /* Decrement the loop counter */
+ blkCntN4--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the start of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ i = (numTaps - 1U) >> 2;
+
+ /* Copy data */
+ while (i > 0U)
+ {
+ sum0v = vld1q_f32(pState);
+ vst1q_f32(pStateCurnt,sum0v);
+ pState += 4;
+ pStateCurnt += 4;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ i = (numTaps - 1U) % 0x04U;
+
+ /* Copy data */
+ while (i > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+}
+#else
+void arm_fir_decimate_f32(
+  const arm_fir_decimate_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+  float32_t *pState = S->pState;                 /* State pointer */
+  const float32_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
+  float32_t *pStateCur;                          /* Points to the current sample of the state */
+  float32_t *px0;                                /* Temporary pointer for state buffer */
+  const float32_t *pb;                           /* Temporary pointer for coefficient buffer */
+  float32_t x0, c0;                              /* Temporary variables to hold state and coefficient values */
+  float32_t acc0;                                /* Accumulator */
+  uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+  uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  float32_t *px1, *px2, *px3;
+  float32_t x1, x2, x3;
+  float32_t acc1, acc2, acc3;
+#endif
+
+  /* S->pState buffer contains previous frame (numTaps - 1) samples */
+  /* pStateCur points to the location where the new input data should be written */
+  pStateCur = S->pState + (numTaps - 1U);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 samples at a time */
+  blkCnt = outBlockSize >> 2U;
+
+  /* Samples loop unrolled by 4 */
+  while (blkCnt > 0U)
+  {
+    /* Copy 4 * decimation factor number of new input samples into the state buffer */
+    i = S->M * 4;
+
+    do
+    {
+      *pStateCur++ = *pSrc++;
+
+    } while (--i);
+
+    /* Set accumulators to zero */
+    acc0 = 0.0f;
+    acc1 = 0.0f;
+    acc2 = 0.0f;
+    acc3 = 0.0f;
+
+    /* Initialize state pointer for all the samples */
+    px0 = pState;
+    px1 = pState + S->M;
+    px2 = pState + 2 * S->M;
+    px3 = pState + 3 * S->M;
+
+    /* Initialize coeff pointer */
+    pb = pCoeffs;
+
+    /* Loop unrolling: Compute 4 taps at a time */
+    tapCnt = numTaps >> 2U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read the b[numTaps-1] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-1] sample for acc0 */
+      x0 = *(px0++);
+      /* Read x[n-numTaps-1] sample for acc1 */
+      x1 = *(px1++);
+      /* Read x[n-numTaps-1] sample for acc2 */
+      x2 = *(px2++);
+      /* Read x[n-numTaps-1] sample for acc3 */
+      x3 = *(px3++);
+
+      /* Perform the multiply-accumulate */
+      acc0 += x0 * c0;
+      acc1 += x1 * c0;
+      acc2 += x2 * c0;
+      acc3 += x3 * c0;
+
+      /* Read the b[numTaps-2] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-2] sample for acc0, acc1, acc2, acc3 */
+      x0 = *(px0++);
+      x1 = *(px1++);
+      x2 = *(px2++);
+      x3 = *(px3++);
+
+      /* Perform the multiply-accumulate */
+      acc0 += x0 * c0;
+      acc1 += x1 * c0;
+      acc2 += x2 * c0;
+      acc3 += x3 * c0;
+
+      /* Read the b[numTaps-3] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-3] sample acc0, acc1, acc2, acc3 */
+      x0 = *(px0++);
+      x1 = *(px1++);
+      x2 = *(px2++);
+      x3 = *(px3++);
+
+      /* Perform the multiply-accumulate */
+      acc0 += x0 * c0;
+      acc1 += x1 * c0;
+      acc2 += x2 * c0;
+      acc3 += x3 * c0;
+
+      /* Read the b[numTaps-4] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-4] sample acc0, acc1, acc2, acc3 */
+      x0 = *(px0++);
+      x1 = *(px1++);
+      x2 = *(px2++);
+      x3 = *(px3++);
+
+      /* Perform the multiply-accumulate */
+      acc0 += x0 * c0;
+      acc1 += x1 * c0;
+      acc2 += x2 * c0;
+      acc3 += x3 * c0;
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining taps */
+    tapCnt = numTaps % 0x4U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read coefficients */
+      c0 = *(pb++);
+
+      /* Fetch state variables for acc0, acc1, acc2, acc3 */
+      x0 = *(px0++);
+      x1 = *(px1++);
+      x2 = *(px2++);
+      x3 = *(px3++);
+
+      /* Perform the multiply-accumulate */
+      acc0 += x0 * c0;
+      acc1 += x1 * c0;
+      acc2 += x2 * c0;
+      acc3 += x3 * c0;
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Advance the state pointer by the decimation factor
+     * to process the next group of decimation factor number samples */
+    pState = pState + S->M * 4;
+
+    /* The result is in the accumulator, store in the destination buffer. */
+    *pDst++ = acc0;
+    *pDst++ = acc1;
+    *pDst++ = acc2;
+    *pDst++ = acc3;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining samples */
+  blkCnt = outBlockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = outBlockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* Copy decimation factor number of new input samples into the state buffer */
+    i = S->M;
+
+    do
+    {
+      *pStateCur++ = *pSrc++;
+
+    } while (--i);
+
+    /* Set accumulator to zero */
+    acc0 = 0.0f;
+
+    /* Initialize state pointer */
+    px0 = pState;
+
+    /* Initialize coeff pointer */
+    pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 4 taps at a time */
+    tapCnt = numTaps >> 2U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read the b[numTaps-1] coefficient */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-1] sample */
+      x0 = *px0++;
+
+      /* Perform the multiply-accumulate */
+      acc0 += x0 * c0;
+
+      /* Read the b[numTaps-2] coefficient */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-2] sample */
+      x0 = *px0++;
+
+      /* Perform the multiply-accumulate */
+      acc0 += x0 * c0;
+
+      /* Read the b[numTaps-3] coefficient */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-3] sample */
+      x0 = *px0++;
+
+      /* Perform the multiply-accumulate */
+      acc0 += x0 * c0;
+
+      /* Read the b[numTaps-4] coefficient */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-4] sample */
+      x0 = *px0++;
+
+      /* Perform the multiply-accumulate */
+      acc0 += x0 * c0;
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining taps */
+    tapCnt = numTaps % 0x4U;
+
+#else
+
+    /* Initialize tapCnt with number of taps */
+    tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (tapCnt > 0U)
+    {
+      /* Read coefficients */
+      c0 = *pb++;
+
+      /* Fetch 1 state variable */
+      x0 = *px0++;
+
+      /* Perform the multiply-accumulate */
+      acc0 += x0 * c0;
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Advance the state pointer by the decimation factor
+     * to process the next group of decimation factor number samples */
+    pState = pState + S->M;
+
+    /* The result is in the accumulator, store in the destination buffer. */
+    *pDst++ = acc0;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.
+     Now copy the last numTaps - 1 samples to the start of the state buffer.
+     This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCur = S->pState;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 taps at a time */
+  tapCnt = (numTaps - 1U) >> 2U;
+
+  /* Copy data */
+  while (tapCnt > 0U)
+  {
+    *pStateCur++ = *pState++;
+    *pStateCur++ = *pState++;
+    *pStateCur++ = *pState++;
+    *pStateCur++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining taps */
+  tapCnt = (numTaps - 1U) % 0x04U;
+
+#else
+
+  /* Initialize tapCnt with number of taps */
+  tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* Copy data */
+  while (tapCnt > 0U)
+  {
+    *pStateCur++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+
+#endif /*defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+ @} end of FIR_decimate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c
new file mode 100644
index 0000000000..15bbb78a57
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c
@@ -0,0 +1,595 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_decimate_fast_q15.c
+ * Description: Fast Q15 FIR Decimator
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_decimate
+ @{
+ */
+
+/**
+ @brief Processing function for the Q15 FIR decimator (fast variant).
+ @param[in] S points to an instance of the Q15 FIR decimator structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of input samples to process per call
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This fast version uses a 32-bit accumulator with 2.30 format.
+ The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around and distorts the result.
+ In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2).
+ The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
+ @remark
+ Refer to \ref arm_fir_decimate_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
+ Both the slow and the fast versions use the same instance structure.
+ Use function \ref arm_fir_decimate_init_q15() to initialize the filter structure.
+ */
+
+#if defined (ARM_MATH_DSP)
+
+void arm_fir_decimate_fast_q15(
+  const arm_fir_decimate_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+  q15_t *pState = S->pState;                     /* State pointer */
+  const q15_t *pCoeffs = S->pCoeffs;             /* Coefficient pointer */
+  q15_t *pStateCur;                              /* Points to the current sample of the state */
+  q15_t *px;                                     /* Temporary pointer for state buffer */
+  const q15_t *pb;                               /* Temporary pointer for coefficient buffer */
+  q31_t x0, x1, c0;                              /* Temporary variables to hold state and coefficient values */
+  q31_t sum0;                                    /* Accumulators */
+  q31_t acc0, acc1;
+  q15_t *px0, *px1;
+  uint32_t blkCntN3;
+  uint32_t numTaps = S->numTaps;                 /* Number of taps */
+  uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  q31_t c1;                                      /* Temporary variables to hold state and coefficient values */
+#endif
+
+  /* S->pState buffer contains previous frame (numTaps - 1) samples */
+  /* pStateCur points to the location where the new input data should be written */
+  pStateCur = S->pState + (numTaps - 1U);
+
+  /* Total number of output samples to be computed */
+  blkCnt = outBlockSize / 2;
+  blkCntN3 = outBlockSize - (2 * blkCnt);
+
+  while (blkCnt > 0U)
+  {
+    /* Copy 2 * decimation factor number of new input samples into the state buffer */
+    i = S->M * 2;
+
+    do
+    {
+      *pStateCur++ = *pSrc++;
+
+    } while (--i);
+
+    /* Set accumulator to zero */
+    acc0 = 0;
+    acc1 = 0;
+
+    /* Initialize state pointer for all the samples */
+    px0 = pState;
+    px1 = pState + S->M;
+
+    /* Initialize coeff pointer */
+    pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 4 taps at a time */
+    tapCnt = numTaps >> 2U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
+      c0 = read_q15x2_ia ((q15_t **) &pb);
+
+      /* Read x[n-numTaps-1] and x[n-numTaps-2] sample */
+      x0 = read_q15x2_ia (&px0);
+      x1 = read_q15x2_ia (&px1);
+
+      /* Perform the multiply-accumulate */
+      acc0 = __SMLAD(x0, c0, acc0);
+      acc1 = __SMLAD(x1, c0, acc1);
+
+      /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
+      c0 = read_q15x2_ia ((q15_t **) &pb);
+
+      /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
+      x0 = read_q15x2_ia (&px0);
+      x1 = read_q15x2_ia (&px1);
+
+      /* Perform the multiply-accumulate */
+      acc0 = __SMLAD(x0, c0, acc0);
+      acc1 = __SMLAD(x1, c0, acc1);
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining taps */
+    tapCnt = numTaps % 0x4U;
+
+#else
+
+    /* Initialize tapCnt with number of taps */
+    tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (tapCnt > 0U)
+    {
+      /* Read coefficients */
+      c0 = *pb++;
+
+      /* Fetch state variables for acc0, acc1 */
+      x0 = *px0++;
+      x1 = *px1++;
+
+      /* Perform the multiply-accumulate */
+      acc0 = __SMLAD(x0, c0, acc0);
+      acc1 = __SMLAD(x1, c0, acc1);
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Advance the state pointer by the decimation factor
+     * to process the next group of decimation factor number samples */
+    pState = pState + S->M * 2;
+
+    /* Store filter output, smlad returns the values in 2.14 format */
+    /* so downscale by 15 to get output in 1.15 */
+    *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+    *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  while (blkCntN3 > 0U)
+  {
+    /* Copy decimation factor number of new input samples into the state buffer */
+    i = S->M;
+
+    do
+    {
+      *pStateCur++ = *pSrc++;
+
+    } while (--i);
+
+    /* Set accumulator to zero */
+    sum0 = 0;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* Initialize coeff pointer */
+    pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 4 taps at a time */
+    tapCnt = numTaps >> 2U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
+      c0 = read_q15x2_ia ((q15_t **) &pb);
+
+      /* Read x[n-numTaps-1] and x[n-numTaps-2] sample */
+      x0 = read_q15x2_ia (&px);
+
+      /* Read the b[numTaps-3] and b[numTaps-4] coefficients */
+      c1 = read_q15x2_ia ((q15_t **) &pb);
+
+      /* Perform the multiply-accumulate */
+      sum0 = __SMLAD(x0, c0, sum0);
+
+      /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
+      x0 = read_q15x2_ia (&px);
+
+      /* Perform the multiply-accumulate */
+      sum0 = __SMLAD(x0, c1, sum0);
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining taps */
+    tapCnt = numTaps % 0x4U;
+
+#else
+
+    /* Initialize tapCnt with number of taps */
+    tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (tapCnt > 0U)
+    {
+      /* Read coefficients */
+      c0 = *pb++;
+
+      /* Fetch 1 state variable */
+      x0 = *px++;
+
+      /* Perform the multiply-accumulate */
+      sum0 = __SMLAD(x0, c0, sum0);
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Advance the state pointer by the decimation factor
+     * to process the next group of decimation factor number samples */
+    pState = pState + S->M;
+
+    /* Store filter output, smlad returns the values in 2.14 format */
+    /* so downscale by 15 to get output in 1.15 */
+    *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
+
+    /* Decrement loop counter */
+    blkCntN3--;
+  }
+
+  /* Processing is complete.
+   ** Now copy the last numTaps - 1 samples to the start of the state buffer.
+   ** This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCur = S->pState;
+
+  i = (numTaps - 1U) >> 2U;
+
+  /* copy data */
+  while (i > 0U)
+  {
+    write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
+    write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
+
+    /* Decrement loop counter */
+    i--;
+  }
+
+  i = (numTaps - 1U) % 0x04U;
+
+  /* Copy data */
+  while (i > 0U)
+  {
+    *pStateCur++ = *pState++;
+
+    /* Decrement loop counter */
+    i--;
+  }
+
+}
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+void arm_fir_decimate_fast_q15(
+  const arm_fir_decimate_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+  q15_t *pState = S->pState;                     /* State pointer */
+  const q15_t *pCoeffs = S->pCoeffs;             /* Coefficient pointer */
+  q15_t *pStateCur;                              /* Points to the current sample of the state */
+  q15_t *px;                                     /* Temporary pointer for state buffer */
+  const q15_t *pb;                               /* Temporary pointer for coefficient buffer */
+  q15_t x0, x1, c0;                              /* Temporary variables to hold state and coefficient values */
+  q31_t sum0;                                    /* Accumulators */
+  q31_t acc0, acc1;
+  q15_t *px0, *px1;
+  uint32_t blkCntN3;
+  uint32_t numTaps = S->numTaps;                 /* Number of taps */
+  uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
+
+
+  /* S->pState buffer contains previous frame (numTaps - 1) samples */
+  /* pStateCur points to the location where the new input data should be written */
+  pStateCur = S->pState + (numTaps - 1U);
+
+  /* Total number of output samples to be computed */
+  blkCnt = outBlockSize / 2;
+  blkCntN3 = outBlockSize - (2 * blkCnt);
+
+  while (blkCnt > 0U)
+  {
+    /* Copy 2 * decimation factor number of new input samples into the state buffer */
+    i = S->M * 2;
+
+    do
+    {
+      *pStateCur++ = *pSrc++;
+
+    } while (--i);
+
+    /* Set accumulator to zero */
+    acc0 = 0;
+    acc1 = 0;
+
+    /* Initialize state pointer */
+    px0 = pState;
+    px1 = pState + S->M;
+
+    /* Initialize coeff pointer */
+    pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 4 taps at a time */
+    tapCnt = numTaps >> 2U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read the b[numTaps-1] coefficient */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-1] for sample 0 and for sample 1 */
+      x0 = *px0++;
+      x1 = *px1++;
+
+      /* Perform the multiply-accumulate */
+      acc0 += x0 * c0;
+      acc1 += x1 * c0;
+
+      /* Read the b[numTaps-2] coefficient */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-2] for sample 0 and sample 1 */
+      x0 = *px0++;
+      x1 = *px1++;
+
+      /* Perform the multiply-accumulate */
+      acc0 += x0 * c0;
+      acc1 += x1 * c0;
+
+      /* Read the b[numTaps-3] coefficients */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-3] for sample 0 and sample 1 */
+      x0 = *px0++;
+      x1 = *px1++;
+
+      /* Perform the multiply-accumulate */
+      acc0 += x0 * c0;
+      acc1 += x1 * c0;
+
+      /* Read the b[numTaps-4] coefficient */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-4] for sample 0 and sample 1 */
+      x0 = *px0++;
+      x1 = *px1++;
+
+      /* Perform the multiply-accumulate */
+      acc0 += x0 * c0;
+      acc1 += x1 * c0;
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining taps */
+    tapCnt = numTaps % 0x4U;
+
+#else
+
+    /* Initialize tapCnt with number of taps */
+    tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (tapCnt > 0U)
+    {
+      /* Read coefficients */
+      c0 = *pb++;
+
+      /* Fetch state variables for acc0, acc1 */
+      x0 = *px0++;
+      x1 = *px1++;
+
+      /* Perform the multiply-accumulate */
+      acc0 += x0 * c0;
+      acc1 += x1 * c0;
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+
+    /* Advance the state pointer by the decimation factor
+     * to process the next group of decimation factor number samples */
+    pState = pState + S->M * 2;
+
+    /* Store filter output, smlad returns the values in 2.14 format */
+    /* so downscale by 15 to get output in 1.15 */
+
+    *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+    *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  while (blkCntN3 > 0U)
+  {
+    /* Copy decimation factor number of new input samples into the state buffer */
+    i = S->M;
+
+    do
+    {
+      *pStateCur++ = *pSrc++;
+
+    } while (--i);
+
+    /* Set accumulator to zero */
+    sum0 = 0;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* Initialize coeff pointer */
+    pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 4 taps at a time */
+    tapCnt = numTaps >> 2U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read the b[numTaps-1] coefficient */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-1] sample */
+      x0 = *px++;
+
+      /* Perform the multiply-accumulate */
+      sum0 += x0 * c0;
+
+      /* Read the b[numTaps-2] coefficient */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-2] sample */
+      x0 = *px++;
+
+      /* Perform the multiply-accumulate */
+      sum0 += x0 * c0;
+
+      /* Read the b[numTaps-3] coefficient */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-3] sample */
+      x0 = *px++;
+
+      /* Perform the multiply-accumulate */
+      sum0 += x0 * c0;
+
+      /* Read the b[numTaps-4] coefficient */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-4] sample */
+      x0 = *px++;
+
+      /* Perform the multiply-accumulate */
+      sum0 += x0 * c0;
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining taps */
+    tapCnt = numTaps % 0x4U;
+
+#else
+
+    /* Initialize tapCnt with number of taps */
+    tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (tapCnt > 0U)
+    {
+      /* Read coefficients */
+      c0 = *pb++;
+
+      /* Fetch 1 state variable */
+      x0 = *px++;
+
+      /* Perform the multiply-accumulate */
+      sum0 += x0 * c0;
+
+      /* Decrement the loop counter */
+      tapCnt--;
+    }
+
+    /* Advance the state pointer by the decimation factor
+     * to process the next group of decimation factor number samples */
+    pState = pState + S->M;
+
+    /* Store filter output, smlad returns the values in 2.14 format */
+    /* so downscale by 15 to get output in 1.15 */
+    *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
+
+    /* Decrement loop counter */
+    blkCntN3--;
+  }
+
+  /* Processing is complete.
+   ** Now copy the last numTaps - 1 samples to the start of the state buffer.
+   ** This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCur = S->pState;
+
+  i = (numTaps - 1U) >> 2U;
+
+  /* copy data */
+  while (i > 0U)
+  {
+    *pStateCur++ = *pState++;
+    *pStateCur++ = *pState++;
+    *pStateCur++ = *pState++;
+    *pStateCur++ = *pState++;
+
+    /* Decrement loop counter */
+    i--;
+  }
+
+  i = (numTaps - 1U) % 0x04U;
+
+  /* copy data */
+  while (i > 0U)
+  {
+    *pStateCur++ = *pState++;
+
+    /* Decrement loop counter */
+    i--;
+  }
+}
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+/**
+ @} end of FIR_decimate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c
new file mode 100644
index 0000000000..993f7ac73e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c
@@ -0,0 +1,390 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_decimate_fast_q31.c
+ * Description: Fast Q31 FIR Decimator
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_decimate
+ @{
+ */
+
+/**
+ @brief Processing function for the Q31 FIR decimator (fast variant).
+ @param[in] S points to an instance of the Q31 FIR decimator structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This function is optimized for speed at the expense of fixed-point precision and overflow protection.
+ The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
+ These intermediate results are added to a 2.30 accumulator.
+ Finally, the accumulator is saturated and converted to a 1.31 result.
+ The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
+ In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2).
+
+ @remark
+ Refer to \ref arm_fir_decimate_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.
+ Both the slow and the fast versions use the same instance structure.
+ Use function \ref arm_fir_decimate_init_q31() to initialize the filter structure.
+ */
+
+void arm_fir_decimate_fast_q31(
+  const arm_fir_decimate_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+  q31_t *pState = S->pState;                     /* State pointer */
+  const q31_t *pCoeffs = S->pCoeffs;             /* Coefficient pointer */
+  q31_t *pStateCur;                              /* Points to the current sample of the state */
+  q31_t *px0;                                    /* Temporary pointer for state buffer */
+  const q31_t *pb;                               /* Temporary pointer for coefficient buffer */
+  q31_t x0, c0;                                  /* Temporary variables to hold state and coefficient values */
+  q63_t acc0;                                    /* Accumulator */
+  uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
+  uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M;  /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  q31_t *px1, *px2, *px3;
+  q31_t x1, x2, x3;
+  q63_t acc1, acc2, acc3;
+#endif
+
+  /* S->pState buffer contains previous frame (numTaps - 1) samples */
+  /* pStateCur points to the location where the new input data should be written */
+  pStateCur = S->pState + (numTaps - 1U);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 samples at a time */
+  blkCnt = outBlockSize >> 2U;
+
+  /* Samples loop unrolled by 4 */
+  while (blkCnt > 0U)
+  {
+    /* Copy 4 * decimation factor number of new input samples into the state buffer */
+    i = S->M * 4;
+
+    do
+    {
+      *pStateCur++ = *pSrc++;
+
+    } while (--i);
+
+    /* Set accumulators to zero */
+    acc0 = 0;
+    acc1 = 0;
+    acc2 = 0;
+    acc3 = 0;
+
+    /* Initialize state pointer for all the samples */
+    px0 = pState;
+    px1 = pState + S->M;
+    px2 = pState + 2 * S->M;
+    px3 = pState + 3 * S->M;
+
+    /* Initialize coeff pointer */
+    pb = pCoeffs;
+
+    /* Loop unrolling: Compute 4 taps at a time */
+    tapCnt = numTaps >> 2U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read the b[numTaps-1] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-1] sample for acc0 */
+      x0 = *(px0++);
+      /* Read x[n-numTaps-1] sample for acc1 */
+      x1 = *(px1++);
+      /* Read x[n-numTaps-1] sample for acc2 */
+      x2 = *(px2++);
+      /* Read x[n-numTaps-1] sample for acc3 */
+      x3 = *(px3++);
+
+      /* Perform the multiply-accumulate */
+      acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+      acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+      acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+      acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+      /* Read the b[numTaps-2] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-2] sample for acc0, acc1, acc2, acc3 */
+      x0 = *(px0++);
+      x1 = *(px1++);
+      x2 = *(px2++);
+      x3 = *(px3++);
+
+      /* Perform the multiply-accumulate */
+      acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+      acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+      acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+      acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+      /* Read the b[numTaps-3] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-3] sample acc0, acc1, acc2, acc3 */
+      x0 = *(px0++);
+      x1 = *(px1++);
+      x2 = *(px2++);
+      x3 = *(px3++);
+
+      /* Perform the multiply-accumulate */
+      acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+      acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+      acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+      acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+      /* Read the b[numTaps-4] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-4] sample acc0, acc1, acc2, acc3 */
+      x0 = *(px0++);
+      x1 = *(px1++);
+      x2 = *(px2++);
+      x3 = *(px3++);
+
+      /* Perform the multiply-accumulate */
+      acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+      acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+      acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+      acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining taps */
+    tapCnt = numTaps % 0x4U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read coefficients */
+      c0 = *(pb++);
+
+      /* Fetch state variables for acc0, acc1, acc2, acc3 */
+      x0 = *(px0++);
+      x1 = *(px1++);
+      x2 = *(px2++);
+      x3 = *(px3++);
+
+      /* Perform the multiply-accumulate */
+      acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+      acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+      acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+      acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Advance the state pointer by the decimation factor
+     * to process the next group of decimation factor number samples */
+    pState = pState + S->M * 4;
+
+    /* The result is in the accumulator, store in the destination buffer. */
+    *pDst++ = (q31_t) (acc0 << 1);
+    *pDst++ = (q31_t) (acc1 << 1);
+    *pDst++ = (q31_t) (acc2 << 1);
+    *pDst++ = (q31_t) (acc3 << 1);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining samples */
+  blkCnt = outBlockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = outBlockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* Copy decimation factor number of new input samples into the state buffer */
+    i = S->M;
+
+    do
+    {
+      *pStateCur++ = *pSrc++;
+
+    } while (--i);
+
+    /* Set accumulator to zero */
+    acc0 = 0;
+
+    /* Initialize state pointer */
+    px0 = pState;
+
+    /* Initialize coeff pointer */
+    pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 4 taps at a time */
+    tapCnt = numTaps >> 2U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read the b[numTaps-1] coefficient */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-1] sample */
+      x0 = *px0++;
+
+      /* Perform the multiply-accumulate */
+      acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+      /* Read the b[numTaps-2] coefficient */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-2] sample */
+      x0 = *px0++;
+
+      /* Perform the multiply-accumulate */
+      acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+      /* Read the b[numTaps-3] coefficient */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-3] sample */
+      x0 = *px0++;
+
+      /* Perform the multiply-accumulate */
+      acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+      /* Read the b[numTaps-4] coefficient */
+      c0 = *pb++;
+
+      /* Read x[n-numTaps-4] sample */
+      x0 = *px0++;
+
+      /* Perform the multiply-accumulate */
+      acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining taps */
+    tapCnt = numTaps % 0x4U;
+
+#else
+
+    /* Initialize tapCnt with number of taps */
+    tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (tapCnt > 0U)
+    {
+      /* Read coefficients */
+      c0 = *pb++;
+
+      /* Fetch 1 state variable */
+      x0 = *px0++;
+
+      /* Perform the multiply-accumulate */
+      acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Advance the state pointer by the decimation factor
+     * to process the next group of decimation factor number samples */
+    pState = pState + S->M;
+
+    /* The result is in the accumulator, store in the destination buffer. */
+    *pDst++ = (q31_t) (acc0 << 1);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.
+     Now copy the last numTaps - 1 samples to the start of the state buffer.
+     This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCur = S->pState;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 taps at a time */
+  tapCnt = (numTaps - 1U) >> 2U;
+
+  /* Copy data */
+  while (tapCnt > 0U)
+  {
+    *pStateCur++ = *pState++;
+    *pStateCur++ = *pState++;
+    *pStateCur++ = *pState++;
+    *pStateCur++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining taps */
+  tapCnt = (numTaps - 1U) % 0x04U;
+
+#else
+
+  /* Initialize tapCnt with number of taps */
+  tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* Copy data */
+  while (tapCnt > 0U)
+  {
+    *pStateCur++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+}
+
+/**
+ @} end of FIR_decimate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_f32.c
new file mode 100644
index 0000000000..de31b9f415
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_f32.c
@@ -0,0 +1,105 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_decimate_init_f32.c
+ * Description: Floating-point FIR Decimator initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_decimate
+ @{
+ */
+
+/**
+ @brief Initialization function for the floating-point FIR decimator.
+ @param[in,out] S points to an instance of the floating-point FIR decimator structure
+ @param[in] numTaps number of coefficients in the filter
+ @param[in] M decimation factor
+ @param[in] pCoeffs points to the filter coefficients
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of input samples to process per call
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_LENGTH_ERROR : blockSize is not a multiple of M
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+
+ @par
+ pState points to the array of state variables.
+ pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples passed to arm_fir_decimate_f32().
+ M is the decimation factor.
+ */
+
+arm_status arm_fir_decimate_init_f32(
+ arm_fir_decimate_instance_f32 * S,
+ uint16_t numTaps,
+ uint8_t M,
+ const float32_t * pCoeffs,
+ float32_t * pState,
+ uint32_t blockSize)
+{
+ arm_status status;
+
+ /* The size of the input block must be a multiple of the decimation factor */
+ if ((blockSize % M) != 0U)
+ {
+ /* Set status as ARM_MATH_LENGTH_ERROR */
+ status = ARM_MATH_LENGTH_ERROR;
+ }
+ else
+ {
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */
+ memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ /* Assign Decimation Factor */
+ S->M = M;
+
+ status = ARM_MATH_SUCCESS;
+ }
+
+ return (status);
+
+}
+
+/**
+ @} end of FIR_decimate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q15.c
new file mode 100644
index 0000000000..b43df2a06c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q15.c
@@ -0,0 +1,106 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_decimate_init_q15.c
+ * Description: Initialization function for the Q15 FIR Decimator
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_decimate
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q15 FIR decimator.
+ @param[in,out] S points to an instance of the Q15 FIR decimator structure
+ @param[in] numTaps number of coefficients in the filter
+ @param[in] M decimation factor
+ @param[in] pCoeffs points to the filter coefficients
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of input samples to process
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_LENGTH_ERROR : blockSize is not a multiple of M
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+
+ @par
+ pState points to the array of state variables.
+ pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples
+ to the call arm_fir_decimate_q15().
+ M is the decimation factor.
+ */
+
+arm_status arm_fir_decimate_init_q15(
+ arm_fir_decimate_instance_q15 * S,
+ uint16_t numTaps,
+ uint8_t M,
+ const q15_t * pCoeffs,
+ q15_t * pState,
+ uint32_t blockSize)
+{
+ arm_status status;
+
+ /* The size of the input block must be a multiple of the decimation factor */
+ if ((blockSize % M) != 0U)
+ {
+ /* Set status as ARM_MATH_LENGTH_ERROR */
+ status = ARM_MATH_LENGTH_ERROR;
+ }
+ else
+ {
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */
+ memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q15_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ /* Assign Decimation Factor */
+ S->M = M;
+
+ status = ARM_MATH_SUCCESS;
+ }
+
+ return (status);
+
+}
+
+/**
+ @} end of FIR_decimate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q31.c
new file mode 100644
index 0000000000..7a9490a3a6
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_init_q31.c
@@ -0,0 +1,105 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_decimate_init_q31.c
+ * Description: Initialization function for Q31 FIR Decimation filter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_decimate
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q31 FIR decimator.
+ @param[in,out] S points to an instance of the Q31 FIR decimator structure
+ @param[in] numTaps number of coefficients in the filter
+ @param[in] M decimation factor
+ @param[in] pCoeffs points to the filter coefficients
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of input samples to process
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_LENGTH_ERROR : blockSize is not a multiple of M
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+
+ @par
+ pState points to the array of state variables.
+ pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples passed to arm_fir_decimate_q31().
+ M is the decimation factor.
+ */
+
+arm_status arm_fir_decimate_init_q31(
+ arm_fir_decimate_instance_q31 * S,
+ uint16_t numTaps,
+ uint8_t M,
+ const q31_t * pCoeffs,
+ q31_t * pState,
+ uint32_t blockSize)
+{
+ arm_status status;
+
+ /* The size of the input block must be a multiple of the decimation factor */
+ if ((blockSize % M) != 0U)
+ {
+ /* Set status as ARM_MATH_LENGTH_ERROR */
+ status = ARM_MATH_LENGTH_ERROR;
+ }
+ else
+ {
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */
+ memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q31_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ /* Assign Decimation Factor */
+ S->M = M;
+
+ status = ARM_MATH_SUCCESS;
+ }
+
+ return (status);
+
+}
+
+/**
+ @} end of FIR_decimate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q15.c
new file mode 100644
index 0000000000..39e9871121
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q15.c
@@ -0,0 +1,851 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_decimate_q15.c
+ * Description: Q15 FIR Decimator
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_decimate
+ @{
+ */
+
+/**
+ @brief Processing function for the Q15 FIR decimator.
+ @param[in] S points to an instance of the Q15 FIR decimator structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of input samples to process per call
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+ Lastly, the accumulator is saturated to yield a result in 1.15 format.
+
+ @remark
+ Refer to \ref arm_fir_decimate_fast_q15() for a faster but less precise implementation of this function.
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_fir_decimate_q15(
+ const arm_fir_decimate_instance_q15 * S,
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ const q15_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+ uint32_t blkCntN4;
+ const q15_t *px0, *px1, *px2, *px3;
+ q63_t acc0v, acc1v, acc2v, acc3v;
+ q15x8_t x0v, x1v, x2v, x3v;
+ q15x8_t c0v;
+
+ /*
+ * S->pState buffer contains previous frame (numTaps - 1) samples
+ * pStateCurnt points to the location where the new input data should be written
+ */
+ pStateCurnt = S->pState + (numTaps - 1U);
+ /*
+ * Total number of output samples to be computed
+ */
+ blkCnt = outBlockSize / 4;
+ blkCntN4 = outBlockSize - (4 * blkCnt);
+
+ while (blkCnt > 0U)
+ {
+ /*
+ * Need extra temp variables as 4 * S->M is not necessarily a multiple of 8
+ * and cause final tail predicated post incremented pointers to jump ahead
+ */
+ const q15_t *pSrcTmp = pSrc;
+ q15_t *pStateCurntTmp = pStateCurnt;
+
+ /*
+ * Copy 4 * decimation factor number of new input samples into the state buffer
+ */
+ i = (4 * S->M) >> 3;
+ while (i > 0U)
+ {
+ vstrhq_s16(pStateCurntTmp, vldrhq_s16(pSrcTmp));
+ pSrcTmp += 8;
+ pStateCurntTmp += 8;
+ i--;
+ }
+ i = (4 * S->M) & 7;
+ if (i > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(i);
+ vstrhq_p_s16(pStateCurntTmp, vldrhq_s16(pSrcTmp), p0);
+ }
+
+ pSrc += (4 * S->M);
+ pStateCurnt += (4 * S->M);
+
+ /*
+ * Clear all accumulators
+ */
+ acc0v = 0LL;
+ acc1v = 0LL;
+ acc2v = 0LL;
+ acc3v = 0LL;
+ /*
+ * Initialize state pointer for all the samples
+ */
+ px0 = pState;
+ px1 = pState + S->M;
+ px2 = pState + 2 * S->M;
+ px3 = pState + 3 * S->M;
+ /*
+ * Initialize coeff. pointer
+ */
+ pb = pCoeffs;
+
+ tapCnt = numTaps >> 3;
+ /*
+ * Loop over the number of taps. Unroll by a factor of 4.
+ * Repeat until we've computed numTaps-4 coefficients.
+ */
+ while (tapCnt > 0U)
+ {
+ /*
+ * Read the b[numTaps-1] coefficient
+ */
+ c0v = vldrhq_s16(pb);
+ pb += 8;
+ /*
+ * Read x[n-numTaps-1] sample for acc0
+ */
+ x0v = vld1q(px0);
+ x1v = vld1q(px1);
+ x2v = vld1q(px2);
+ x3v = vld1q(px3);
+ px0 += 8;
+ px1 += 8;
+ px2 += 8;
+ px3 += 8;
+
+ acc0v = vmlaldavaq(acc0v, x0v, c0v);
+ acc1v = vmlaldavaq(acc1v, x1v, c0v);
+ acc2v = vmlaldavaq(acc2v, x2v, c0v);
+ acc3v = vmlaldavaq(acc3v, x3v, c0v);
+ /*
+ * Decrement the loop counter
+ */
+ tapCnt--;
+ }
+
+ /*
+ * If the filter length is not a multiple of 4, compute the remaining filter taps
+ * should be tail predicated
+ */
+ tapCnt = numTaps & 7;
+ if (tapCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(tapCnt);
+ /*
+ * Read the b[numTaps-1] coefficient
+ */
+ c0v = vldrhq_z_s16(pb, p0);
+ pb += 8;
+ /*
+ * Read x[n-numTaps-1] sample for acc0
+ */
+ x0v = vld1q(px0);
+ x1v = vld1q(px1);
+ x2v = vld1q(px2);
+ x3v = vld1q(px3);
+ px0 += 8;
+ px1 += 8;
+ px2 += 8;
+ px3 += 8;
+
+ acc0v = vmlaldavaq(acc0v, x0v, c0v);
+ acc1v = vmlaldavaq(acc1v, x1v, c0v);
+ acc2v = vmlaldavaq(acc2v, x2v, c0v);
+ acc3v = vmlaldavaq(acc3v, x3v, c0v);
+ }
+
+ acc0v = asrl(acc0v, 15);
+ acc1v = asrl(acc1v, 15);
+ acc2v = asrl(acc2v, 15);
+ acc3v = asrl(acc3v, 15);
+ /*
+ * store in the destination buffer.
+ */
+ *pDst++ = (q15_t) __SSAT((q31_t) acc0v, 16);
+ *pDst++ = (q15_t) __SSAT((q31_t) acc1v, 16);;
+ *pDst++ = (q15_t) __SSAT((q31_t) acc2v, 16);;
+ *pDst++ = (q15_t) __SSAT((q31_t) acc3v, 16);;
+
+ /*
+ * Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples
+ */
+ pState = pState + 4 * S->M;
+ /*
+ * Decrement the loop counter
+ */
+ blkCnt--;
+ }
+
+ while (blkCntN4 > 0U)
+ {
+ /*
+ * Copy decimation factor number of new input samples into the state buffer
+ */
+ i = S->M;
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+ }
+ while (--i);
+ /*
+ * Set accumulator to zero
+ */
+ acc0v = 0LL;
+ /*
+ * Initialize state pointer
+ */
+ px = pState;
+ /*
+ * Initialize coeff. pointer
+ */
+ pb = pCoeffs;
+
+ tapCnt = numTaps >> 3;
+ while (tapCnt > 0U)
+ {
+ c0v = vldrhq_s16(pb);
+ x0v = vldrhq_s16(px);
+ pb += 8;
+ px += 8;
+ acc0v = vmlaldavaq(acc0v, x0v, c0v);
+ /*
+ * Decrement the loop counter
+ */
+ tapCnt--;
+ }
+
+ tapCnt = numTaps & 7;
+ if (tapCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(tapCnt);
+ c0v = vldrhq_z_s16(pb, p0);
+ x0v = vldrhq_z_s16(px, p0);
+ acc0v = vmlaldavaq_p(acc0v, x0v, c0v, p0);
+ }
+
+ acc0v = asrl(acc0v, 15);
+
+ /*
+ * Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples
+ */
+ pState = pState + S->M;
+ /*
+ * The result is in the accumulator, store in the destination buffer.
+ */
+ *pDst++ = (q15_t) __SSAT((q31_t) acc0v, 16);
+ /*
+ * Decrement the loop counter
+ */
+ blkCntN4--;
+ }
+
+ /*
+ * Processing is complete.
+ * Now copy the last numTaps - 1 samples to the start of the state buffer.
+ * This prepares the state buffer for the next function call.
+ */
+
+ pStateCurnt = S->pState;
+ blkCnt = (numTaps - 1) >> 3;
+ while (blkCnt > 0U)
+ {
+ vstrhq_s16(pStateCurnt, vldrhq_s16(pState));
+ pState += 8;
+ pStateCurnt += 8;
+ blkCnt--;
+ }
+ blkCnt = (numTaps - 1) & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ vstrhq_p_s16(pStateCurnt, vldrhq_s16(pState), p0);
+ }
+}
+#else
+#if defined (ARM_MATH_DSP)
+
+void arm_fir_decimate_q15(
+ const arm_fir_decimate_instance_q15 * S,
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCur; /* Points to the current sample of the state */
+ q15_t *px; /* Temporary pointer for state buffer */
+ const q15_t *pb; /* Temporary pointer for coefficient buffer */
+ q31_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */
+ q63_t sum0; /* Accumulators */
+ q63_t acc0, acc1;
+ q15_t *px0, *px1;
+ uint32_t blkCntN3;
+ uint32_t numTaps = S->numTaps; /* Number of taps */
+ uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t c1; /* Temporary variables to hold state and coefficient values */
+#endif
+
+ /* S->pState buffer contains previous frame (numTaps - 1) samples */
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (numTaps - 1U);
+
+ /* Total number of output samples to be computed */
+ blkCnt = outBlockSize / 2;
+ blkCntN3 = outBlockSize - (2 * blkCnt);
+
+ while (blkCnt > 0U)
+ {
+ /* Copy 2 * decimation factor number of new input samples into the state buffer */
+ i = S->M * 2;
+
+ do
+ {
+ *pStateCur++ = *pSrc++;
+
+ } while (--i);
+
+ /* Set accumulator to zero */
+ acc0 = 0;
+ acc1 = 0;
+
+ /* Initialize state pointer for all the samples */
+ px0 = pState;
+ px1 = pState + S->M;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
+ c0 = read_q15x2_ia ((q15_t **) &pb);
+
+ /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
+ x0 = read_q15x2_ia (&px0);
+ x1 = read_q15x2_ia (&px1);
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALD(x0, c0, acc0);
+ acc1 = __SMLALD(x1, c0, acc1);
+
+ /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
+ c0 = read_q15x2_ia ((q15_t **) &pb);
+
+ /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
+ x0 = read_q15x2_ia (&px0);
+ x1 = read_q15x2_ia (&px1);
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALD(x0, c0, acc0);
+ acc1 = __SMLALD(x1, c0, acc1);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Read coefficients */
+ c0 = *pb++;
+
+ /* Fetch state variables for acc0, acc1 */
+ x0 = *px0++;
+ x1 = *px1++;
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLALD(x0, c0, acc0);
+ acc1 = __SMLALD(x1, c0, acc1);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M * 2;
+
+ /* Store filter output, smlad returns the values in 2.14 format */
+    /* so downscale by 15 to get output in 1.15 */
+ *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+ *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ while (blkCntN3 > 0U)
+ {
+ /* Copy decimation factor number of new input samples into the state buffer */
+ i = S->M;
+
+ do
+ {
+ *pStateCur++ = *pSrc++;
+
+ } while (--i);
+
+ /* Set accumulator to zero */
+ sum0 = 0;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the b[numTaps-1] and b[numTaps-2] coefficients */
+ c0 = read_q15x2_ia ((q15_t **) &pb);
+
+ /* Read x[n-numTaps-1] and x[n-numTaps-2] sample */
+ x0 = read_q15x2_ia (&px);
+
+ /* Read the b[numTaps-3] and b[numTaps-4] coefficients */
+ c1 = read_q15x2_ia ((q15_t **) &pb);
+
+ /* Perform the multiply-accumulate */
+ sum0 = __SMLALD(x0, c0, sum0);
+
+ /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
+ x0 = read_q15x2_ia (&px);
+
+ /* Perform the multiply-accumulate */
+ sum0 = __SMLALD(x0, c1, sum0);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Read coefficients */
+ c0 = *pb++;
+
+ /* Fetch 1 state variable */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ sum0 = __SMLALD(x0, c0, sum0);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M;
+
+ /* Store filter output, smlad returns the values in 2.14 format */
+    /* so downscale by 15 to get output in 1.15 */
+ *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
+
+ /* Decrement loop counter */
+ blkCntN3--;
+ }
+
+ /* Processing is complete.
+     Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCur = S->pState;
+ i = (numTaps - 1U) >> 2U;
+
+ /* copy data */
+ while (i > 0U)
+ {
+ write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
+ write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
+
+ /* Decrement loop counter */
+ i--;
+ }
+
+ i = (numTaps - 1U) % 0x04U;
+
+ /* Copy data */
+ while (i > 0U)
+ {
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ i--;
+ }
+
+}
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+void arm_fir_decimate_q15(
+ const arm_fir_decimate_instance_q15 * S,
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCur; /* Points to the current sample of the state */
+ q15_t *px; /* Temporary pointer for state buffer */
+ const q15_t *pb; /* Temporary pointer for coefficient buffer */
+ q15_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */
+ q63_t sum0; /* Accumulators */
+ q63_t acc0, acc1;
+ q15_t *px0, *px1;
+ uint32_t blkCntN3;
+ uint32_t numTaps = S->numTaps; /* Number of taps */
+ uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+
+
+ /* S->pState buffer contains previous frame (numTaps - 1) samples */
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (numTaps - 1U);
+
+ /* Total number of output samples to be computed */
+ blkCnt = outBlockSize / 2;
+ blkCntN3 = outBlockSize - (2 * blkCnt);
+
+ while (blkCnt > 0U)
+ {
+ /* Copy 2 * decimation factor number of new input samples into the state buffer */
+ i = S->M * 2;
+
+ do
+ {
+ *pStateCur++ = *pSrc++;
+
+ } while (--i);
+
+ /* Set accumulator to zero */
+ acc0 = 0;
+ acc1 = 0;
+
+ /* Initialize state pointer */
+ px0 = pState;
+ px1 = pState + S->M;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the Read b[numTaps-1] coefficients */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-1] for sample 0 and for sample 1 */
+ x0 = *px0++;
+ x1 = *px1++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-2] for sample 0 and sample 1 */
+ x0 = *px0++;
+ x1 = *px1++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+
+ /* Read the b[numTaps-3] coefficients */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-3] for sample 0 and sample 1 */
+ x0 = *px0++;
+ x1 = *px1++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-4] for sample 0 and sample 1 */
+ x0 = *px0++;
+ x1 = *px1++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Read coefficients */
+ c0 = *pb++;
+
+ /* Fetch 1 state variable */
+ x0 = *px0++;
+ x1 = *px1++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M * 2;
+
+ /* Store filter output, smlad returns the values in 2.14 format */
+    /* so downscale by 15 to get output in 1.15 */
+
+ *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+ *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ while (blkCntN3 > 0U)
+ {
+ /* Copy decimation factor number of new input samples into the state buffer */
+ i = S->M;
+
+ do
+ {
+ *pStateCur++ = *pSrc++;
+
+ } while (--i);
+
+ /* Set accumulator to zero */
+ sum0 = 0;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-1] sample */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-2] sample */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the b[numTaps-3] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-3] sample */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-4] sample */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Read coefficients */
+ c0 = *pb++;
+
+ /* Fetch 1 state variable */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M;
+
+ /* Store filter output, smlad returns the values in 2.14 format */
+    /* so downscale by 15 to get output in 1.15 */
+ *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
+
+ /* Decrement loop counter */
+ blkCntN3--;
+ }
+
+ /* Processing is complete.
+  ** Now copy the last numTaps - 1 samples to the start of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCur = S->pState;
+
+ i = (numTaps - 1U) >> 2U;
+
+ /* copy data */
+ while (i > 0U)
+ {
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ i--;
+ }
+
+ i = (numTaps - 1U) % 0x04U;
+
+ /* copy data */
+ while (i > 0U)
+ {
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ i--;
+ }
+}
+
+#endif /* #if defined (ARM_MATH_DSP) */
+#endif /* defined(ARM_MATH_MVEI) */
+/**
+ @} end of FIR_decimate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c
new file mode 100644
index 0000000000..77bfc1686f
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_decimate_q31.c
@@ -0,0 +1,634 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_decimate_q31.c
+ * Description: Q31 FIR Decimator
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_decimate
+ @{
+ */
+
+/**
+ @brief Processing function for the Q31 FIR decimator.
+ @param[in] S points to an instance of the Q31 FIR decimator structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around rather than clip.
+ In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2).
+ After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format.
+
+ @remark
+ Refer to \ref arm_fir_decimate_fast_q31() for a faster but less precise implementation of this function.
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/* Helium/MVE implementation: each iteration of the main loop consumes
+ * 4 * M new input samples and produces 4 decimated outputs in parallel
+ * (one 64-bit accumulator per output); a one-output-per-iteration tail
+ * loop handles the remaining outBlockSize % 4 outputs. */
+void arm_fir_decimate_q31(
+ const arm_fir_decimate_instance_q31 * S,
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCurnt; /* Points to the current sample of the state */
+ const q31_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+ uint32_t blkCntN4;
+ const q31_t *px0, *px1, *px2, *px3;
+ q63_t acc0v, acc1v, acc2v, acc3v;
+ q31x4_t x0v, x1v, x2v, x3v;
+ q31x4_t c0v;
+
+ /*
+ * S->pState buffer contains previous frame (numTaps - 1) samples
+ * pStateCurnt points to the location where the new input data should be written
+ */
+ pStateCurnt = S->pState + (numTaps - 1U);
+ /*
+ * Total number of output samples to be computed
+ */
+ blkCnt = outBlockSize / 4;
+ blkCntN4 = outBlockSize - (4 * blkCnt);
+
+ /* Main loop: 4 decimated outputs per iteration */
+ while (blkCnt > 0U)
+ {
+ /*
+ * Copy 4 * decimation factor number of new input samples into the state buffer
+ */
+ i = (4 * S->M) >> 2;
+ do
+ {
+ vst1q(pStateCurnt, vldrwq_s32(pSrc));
+ pSrc += 4;
+ pStateCurnt += 4;
+ i--;
+ }
+ while (i > 0U);
+
+ /*
+ * Clear all accumulators
+ */
+ acc0v = 0LL;
+ acc1v = 0LL;
+ acc2v = 0LL;
+ acc3v = 0LL;
+ /*
+ * Initialize state pointer for all the samples
+ * (each output's window starts S->M samples after the previous one)
+ */
+ px0 = pState;
+ px1 = pState + S->M;
+ px2 = pState + 2 * S->M;
+ px3 = pState + 3 * S->M;
+ /*
+ * Initialize coeff. pointer
+ */
+ pb = pCoeffs;
+ /*
+ * Loop unrolling. Process 4 taps at a time.
+ */
+ tapCnt = numTaps >> 2;
+ /*
+ * Loop over the number of taps. Unroll by a factor of 4.
+ * Repeat until we've computed numTaps-4 coefficients.
+ */
+ while (tapCnt > 0U)
+ {
+ /*
+ * Read the b[numTaps-1] coefficient
+ */
+ c0v = vldrwq_s32(pb);
+ pb += 4;
+ /*
+ * Read x[n-numTaps-1] sample for acc0
+ */
+ x0v = vld1q(px0);
+ x1v = vld1q(px1);
+ x2v = vld1q(px2);
+ x3v = vld1q(px3);
+ px0 += 4;
+ px1 += 4;
+ px2 += 4;
+ px3 += 4;
+
+ acc0v = vrmlaldavhaq(acc0v, x0v, c0v);
+ acc1v = vrmlaldavhaq(acc1v, x1v, c0v);
+ acc2v = vrmlaldavhaq(acc2v, x2v, c0v);
+ acc3v = vrmlaldavhaq(acc3v, x3v, c0v);
+ /*
+ * Decrement the loop counter
+ */
+ tapCnt--;
+ }
+
+ /*
+ * If the filter length is not a multiple of 4, compute the remaining filter taps
+ * should be tail predicated
+ */
+ tapCnt = numTaps % 0x4U;
+ if (tapCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(tapCnt);
+ /*
+ * Only the coefficient load is predicated: lanes beyond tapCnt are
+ * zero, so the unpredicated x loads below contribute 0 to the MACs
+ */
+ /*
+ * Read the b[numTaps-1] coefficient
+ */
+ c0v = vldrwq_z_s32(pb, p0);
+ pb += 4;
+ /*
+ * Read x[n-numTaps-1] sample for acc0
+ */
+ x0v = vld1q(px0);
+ x1v = vld1q(px1);
+ x2v = vld1q(px2);
+ x3v = vld1q(px3);
+ px0 += 4;
+ px1 += 4;
+ px2 += 4;
+ px3 += 4;
+
+ acc0v = vrmlaldavhaq(acc0v, x0v, c0v);
+ acc1v = vrmlaldavhaq(acc1v, x1v, c0v);
+ acc2v = vrmlaldavhaq(acc2v, x2v, c0v);
+ acc3v = vrmlaldavhaq(acc3v, x3v, c0v);
+ }
+
+ /*
+ * vrmlaldavhaq pre-scales the 2.62 products down by 8 bits with
+ * rounding, so shifting by 23 (= 31 - 8) restores a 1.31 result
+ */
+ acc0v = asrl(acc0v, 31 - 8);
+ acc1v = asrl(acc1v, 31 - 8);
+ acc2v = asrl(acc2v, 31 - 8);
+ acc3v = asrl(acc3v, 31 - 8);
+ /*
+ * store in the destination buffer.
+ */
+ *pDst++ = (q31_t) acc0v;
+ *pDst++ = (q31_t) acc1v;
+ *pDst++ = (q31_t) acc2v;
+ *pDst++ = (q31_t) acc3v;
+
+ /*
+ * Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples
+ */
+ pState = pState + 4 * S->M;
+ /*
+ * Decrement the loop counter
+ */
+ blkCnt--;
+ }
+
+ /* Tail loop: one decimated output at a time */
+ while (blkCntN4 > 0U)
+ {
+ /*
+ * Copy decimation factor number of new input samples into the state buffer
+ */
+ i = S->M;
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+ }
+ while (--i);
+ /*
+ * Set accumulator to zero
+ */
+ acc0v = 0LL;
+ /*
+ * Initialize state pointer
+ */
+ px = pState;
+ /*
+ * Initialize coeff. pointer
+ */
+ pb = pCoeffs;
+ /*
+ * Loop unrolling. Process 4 taps at a time.
+ */
+ tapCnt = numTaps >> 2;
+ /*
+ * Loop over the number of taps. Unroll by a factor of 4.
+ * Repeat until we've computed numTaps-4 coefficients.
+ */
+ while (tapCnt > 0U)
+ {
+ c0v = vldrwq_s32(pb);
+ x0v = vldrwq_s32(px);
+ pb += 4;
+ px += 4;
+ acc0v = vrmlaldavhaq(acc0v, x0v, c0v);
+ /*
+ * Decrement the loop counter
+ */
+ tapCnt--;
+ }
+ /* Remaining taps: fully predicated loads and MAC this time */
+ tapCnt = numTaps % 0x4U;
+ if (tapCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(tapCnt);
+ c0v = vldrwq_z_s32(pb, p0);
+ x0v = vldrwq_z_s32(px, p0);
+ acc0v = vrmlaldavhaq_p(acc0v, x0v, c0v, p0);
+ }
+ /* Same 2.62 -> 1.31 conversion as in the main loop */
+ acc0v = asrl(acc0v, 31 - 8);
+
+ /*
+ * Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples
+ */
+ pState = pState + S->M;
+ /*
+ * The result is in the accumulator, store in the destination buffer.
+ */
+ *pDst++ = (q31_t) acc0v;
+ /*
+ * Decrement the loop counter
+ */
+ blkCntN4--;
+ }
+
+ /*
+ * Processing is complete.
+ * Now copy the last numTaps - 1 samples to the start of the state buffer.
+ * This prepares the state buffer for the next function call.
+ */
+ pStateCurnt = S->pState;
+ blkCnt = (numTaps - 1) >> 2;
+ while (blkCnt > 0U)
+ {
+ vst1q(pStateCurnt, vldrwq_s32(pState));
+ pState += 4;
+ pStateCurnt += 4;
+ blkCnt--;
+ }
+ blkCnt = (numTaps - 1) & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ vstrwq_p_s32(pStateCurnt, vldrwq_s32(pState), p0);
+ }
+}
+#else
+/* Scalar implementation (with optional ARM_MATH_LOOPUNROLL unrolling):
+ * one 64-bit 2.62 accumulator per decimated output, truncated down to
+ * 1.31 on store (plain shift -- no saturation; see the scaling note in
+ * the function documentation above). */
+void arm_fir_decimate_q31(
+ const arm_fir_decimate_instance_q31 * S,
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCur; /* Points to the current sample of the state */
+ q31_t *px0; /* Temporary pointer for state buffer */
+ const q31_t *pb; /* Temporary pointer for coefficient buffer */
+ q31_t x0, c0; /* Temporary variables to hold state and coefficient values */
+ q63_t acc0; /* Accumulator */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t *px1, *px2, *px3;
+ q31_t x1, x2, x3;
+ q63_t acc1, acc2, acc3;
+#endif
+
+ /* S->pState buffer contains previous frame (numTaps - 1) samples */
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (numTaps - 1U);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 samples at a time */
+ blkCnt = outBlockSize >> 2U;
+
+ /* Samples loop unrolled by 4 */
+ while (blkCnt > 0U)
+ {
+ /* Copy 4 * decimation factor number of new input samples into the state buffer */
+ i = S->M * 4;
+
+ do
+ {
+ *pStateCur++ = *pSrc++;
+
+ } while (--i);
+
+ /* Set accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Initialize state pointer for all the samples */
+ /* (each output's window starts S->M samples after the previous one) */
+ px0 = pState;
+ px1 = pState + S->M;
+ px2 = pState + 2 * S->M;
+ px3 = pState + 3 * S->M;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-1] sample for acc0 */
+ x0 = *(px0++);
+ /* Read x[n-numTaps-1] sample for acc1 */
+ x1 = *(px1++);
+ /* Read x[n-numTaps-1] sample for acc2 */
+ x2 = *(px2++);
+ /* Read x[n-numTaps-1] sample for acc3 */
+ x3 = *(px3++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-2] sample for acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
+
+ /* Read the b[numTaps-3] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-3] sample acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-4] sample acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch state variables for acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M * 4;
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ /* 2.62 -> 1.31 by arithmetic shift; truncation only, no saturation */
+ *pDst++ = (q31_t) (acc0 >> 31);
+ *pDst++ = (q31_t) (acc1 >> 31);
+ *pDst++ = (q31_t) (acc2 >> 31);
+ *pDst++ = (q31_t) (acc3 >> 31);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining samples */
+ blkCnt = outBlockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = outBlockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Copy decimation factor number of new input samples into the state buffer */
+ i = S->M;
+
+ do
+ {
+ *pStateCur++ = *pSrc++;
+
+ } while (--i);
+
+ /* Set accumulator to zero */
+ acc0 = 0;
+
+ /* Initialize state pointer */
+ px0 = pState;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = numTaps >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-1] sample */
+ x0 = *px0++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-2] sample */
+ x0 = *px0++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+
+ /* Read the b[numTaps-3] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-3] sample */
+ x0 = *px0++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-4] sample */
+ x0 = *px0++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Read coefficients */
+ c0 = *pb++;
+
+ /* Fetch 1 state variable */
+ x0 = *px0++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M;
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ /* 2.62 -> 1.31 by arithmetic shift; truncation only, no saturation */
+ *pDst++ = (q31_t) (acc0 >> 31);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCur = S->pState;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) % 0x04U;
+
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+/**
+ @} end of FIR_decimate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f16.c
new file mode 100644
index 0000000000..dbbe9597c9
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f16.c
@@ -0,0 +1,940 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_f16.c
+ * Description: Floating-point FIR filter processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+/**
+ @ingroup groupFilters
+ */
+
+
+/**
+ @addtogroup FIR
+ @{
+ */
+
+/**
+ @brief Processing function for floating-point FIR filter.
+ @param[in] S points to an instance of the floating-point FIR filter structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/* Number of coefficients processed per block in the generic arm_fir_f16
+ * below. NOTE(review): name says F32 in an f16 file -- presumably
+ * inherited from the f32 variant; confirm before renaming. */
+#define FIR_F32_MAX_COEF_BLK 8
+
+/* Dot-product kernel for the short-filter specializations: computes 8
+ * consecutive outputs at once by accumulating NB_TAPS coefficient taps
+ * against a sliding window starting at pSamples[i].
+ * Relies on vecAcc0 / vecIn0 being declared in the enclosing scope. */
+#define FIR_F16_CORE(pSamples, c, NB_TAPS) \
+ vecAcc0 = vdupq_n_f16(0.0f16); \
+ for (int i = 0; i < NB_TAPS; i++) { \
+ vecIn0 = vld1q(&pSamples[i]); \
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c[i]); \
+ }
+
+/* Specialized Helium FIR for filters with 1..4 taps: the coefficients
+ * are cached in a local array and 8 outputs are produced per iteration
+ * via FIR_F16_CORE. */
+#define NB_TAPS 4
+__STATIC_INLINE void arm_fir_f16_1_4_mve(const arm_fir_instance_f16 * S,
+ const float16_t * __restrict pSrc,
+ float16_t * __restrict pDst, uint32_t blockSize)
+{
+ float16_t *pState = S->pState; /* State pointer */
+ const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float16_t *pStateCur; /* Points to the current sample of the state */
+ const float16_t *pSamples; /* Temporary pointer to the sample buffer */
+ float16_t *pOutput; /* Temporary pointer to the output buffer */
+ const float16_t *pTempSrc; /* Temporary pointer to the source data */
+ float16_t *pTempDest; /* Temporary pointer to the destination buffer */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ int32_t blkCnt;
+ float16x8_t vecIn0;
+ float16x8_t vecAcc0;
+ float16_t c[NB_TAPS];
+
+
+ /*
+ * pState points to state array which contains previous frame (numTaps - 1) samples
+ * pStateCur points to the location where the new input data should be written
+ */
+ pStateCur = &(pState[(numTaps - 1u)]);
+ /*
+ * Copy new data into state so that we obtain a continuous sample buffer
+ * containing both the tail end of the old data and the new data.
+ */
+ pSamples = pState;
+ pTempSrc = pSrc;
+ pOutput = pDst;
+
+ for (int i = 0; i < NB_TAPS; i++)
+ c[i] = pCoeffs[i];
+
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0) {
+ /*
+ * Save 8 input samples in the history buffer
+ */
+ vst1q(pStateCur, vld1q(pTempSrc));
+ pStateCur += 8;
+ pTempSrc += 8;
+
+ FIR_F16_CORE(pSamples, c, NB_TAPS);
+
+ vst1q(pOutput, vecAcc0);
+
+ pOutput += 8;
+ pSamples += 8;
+
+ blkCnt--;
+ }
+
+ /* Residual 1..7 samples: only the output store is predicated */
+ blkCnt = blockSize & 7;
+ if (blkCnt)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+
+ /* NOTE(review): this history store writes a full 8 samples even
+ when fewer remain; presumably the state buffer has headroom for
+ it -- confirm against the f16 FIR init / buffer sizing. */
+ vst1q(pStateCur, vld1q(pTempSrc));
+ pStateCur += 8;
+ pTempSrc += 8;
+
+ FIR_F16_CORE(pSamples, c, NB_TAPS);
+
+ vstrhq_p_f16(pOutput, vecAcc0, p0);
+ }
+
+ /*
+ * Copy the samples back into the history buffer start
+ */
+ /* NOTE(review): copies numTaps samples (not numTaps - 1) -- one more
+ than the persisted history described above; verify the state buffer
+ is sized to allow this. */
+ pTempSrc = &pState[blockSize];
+ pTempDest = pState;
+
+ blkCnt = numTaps >> 3;
+ while (blkCnt > 0) {
+ vst1q(pTempDest, vld1q(pTempSrc));
+ pTempSrc += 8;
+ pTempDest += 8;
+ blkCnt--;
+ }
+ blkCnt = numTaps & 7;
+ if (blkCnt > 0) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
+ }
+
+}
+#undef NB_TAPS
+
+/* Specialized Helium FIR for filters with 5..8 taps: identical in
+ * structure to the 1..4 tap variant above, with NB_TAPS == 8. */
+#define NB_TAPS 8
+__STATIC_INLINE void arm_fir_f16_5_8_mve(const arm_fir_instance_f16 * S,
+ const float16_t * __restrict pSrc,
+ float16_t * __restrict pDst, uint32_t blockSize)
+{
+ float16_t *pState = S->pState; /* State pointer */
+ const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float16_t *pStateCur; /* Points to the current sample of the state */
+ const float16_t *pSamples; /* Temporary pointer to the sample buffer */
+ float16_t *pOutput; /* Temporary pointer to the output buffer */
+ const float16_t *pTempSrc; /* Temporary pointer to the source data */
+ float16_t *pTempDest; /* Temporary pointer to the destination buffer */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ int32_t blkCnt;
+ float16x8_t vecIn0;
+ float16x8_t vecAcc0;
+ float16_t c[NB_TAPS];
+
+
+ /*
+ * pState points to state array which contains previous frame (numTaps - 1) samples
+ * pStateCur points to the location where the new input data should be written
+ */
+ pStateCur = &(pState[(numTaps - 1u)]);
+ /*
+ * Copy new data into state so that we obtain a continuous sample buffer
+ * containing both the tail end of the old data and the new data.
+ */
+ pSamples = pState;
+ pTempSrc = pSrc;
+ pOutput = pDst;
+
+ for (int i = 0; i < NB_TAPS; i++)
+ c[i] = pCoeffs[i];
+
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0) {
+ /*
+ * Save 8 input samples in the history buffer
+ */
+ vst1q(pStateCur, vld1q(pTempSrc));
+ pStateCur += 8;
+ pTempSrc += 8;
+
+ FIR_F16_CORE(pSamples, c, NB_TAPS);
+
+ vst1q(pOutput, vecAcc0);
+
+ pOutput += 8;
+ pSamples += 8;
+
+ blkCnt--;
+ }
+
+ /* Residual 1..7 samples: only the output store is predicated
+ (history store writes a full 8 -- see note in the 1..4 variant) */
+ blkCnt = blockSize & 7;
+ if (blkCnt)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+
+ vst1q(pStateCur, vld1q(pTempSrc));
+ pStateCur += 8;
+ pTempSrc += 8;
+
+ FIR_F16_CORE(pSamples, c, NB_TAPS);
+
+ vstrhq_p_f16(pOutput, vecAcc0, p0);
+ }
+
+ /*
+ * Copy the samples back into the history buffer start
+ */
+ pTempSrc = &pState[blockSize];
+ pTempDest = pState;
+
+ blkCnt = numTaps >> 3;
+ while (blkCnt > 0) {
+ vst1q(pTempDest, vld1q(pTempSrc));
+ pTempSrc += 8;
+ pTempDest += 8;
+ blkCnt--;
+ }
+ blkCnt = numTaps & 7;
+ if (blkCnt > 0) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
+ }
+
+}
+#undef NB_TAPS
+
+/* Generic Helium FIR (numTaps > 8). Strategy: process the coefficients
+ * in blocks of FIR_F32_MAX_COEF_BLK (8), accumulating per-output partial
+ * sums in a scratch area at the start of S->pState; the final coefficient
+ * block adds the partials and writes pDst. The FIR history itself lives
+ * at offset ROUND_UP(blockSize, 8) inside S->pState (pRefStatePtr).
+ * NOTE(review): the final block reads a full 8 (or 4) coefficients
+ * regardless of how many taps actually remain -- this assumes pCoeffs
+ * is zero-padded per the CMSIS Helium FIR init requirements; confirm. */
+void arm_fir_f16(const arm_fir_instance_f16 * S,
+ const float16_t * pSrc,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ /* Scratch partial accumulators: S->pState[0 .. ROUND_UP(blockSize,8));
+ history samples start right after, at pRefStatePtr */
+ float16_t *pRefStatePtr = S->pState + ROUND_UP(blockSize, 8);
+ float16_t *pState = pRefStatePtr ; /* State pointer */
+ const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ const float16_t *pSamples; /* Temporary pointer to the sample buffer */
+ float16_t *pOutput; /* Temporary pointer to the output buffer */
+ const float16_t *pTempSrc; /* Temporary pointer to the source data */
+ float16_t *pTempDest; /* Temporary pointer to the destination buffer */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t blkCnt;
+ float16_t c0, c1, c2, c3;
+ float16_t c4, c5, c6, c7;
+
+ /*
+ * [1 to 8 taps] specialized routines
+ */
+ if (numTaps <= 4) {
+ arm_fir_f16_1_4_mve(S, pSrc, pDst, blockSize);
+ return;
+ } else if (numTaps <= 8) {
+ arm_fir_f16_5_8_mve(S, pSrc, pDst, blockSize);
+ return;
+ }
+
+ /* Append the new input samples after the retained history */
+ pTempSrc = pSrc;
+ pTempDest = &(pState[(numTaps - 1u)]);
+ int cnt = blockSize;
+ do {
+ mve_pred16_t p0 = vctp16q(cnt);
+ vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
+ pTempDest += 8;
+ pTempSrc += 8;
+ cnt -= 8;
+ } while (cnt > 0);
+
+ float16_t *partial_accu_ptr = S->pState;
+
+ /* First 8 coefficients: initialize the partial accumulators */
+ pSamples = pState;
+ c0 = *pCoeffs++;
+ c1 = *pCoeffs++;
+ c2 = *pCoeffs++;
+ c3 = *pCoeffs++;
+ c4 = *pCoeffs++;
+ c5 = *pCoeffs++;
+ c6 = *pCoeffs++;
+ c7 = *pCoeffs++;
+
+ cnt = blockSize >> 3;
+ while (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
+
+ vecIn0 = vld1q(pSamples);
+ vecAcc0 = vmulq(vecIn0, c0);
+ vecIn0 = vld1q(&pSamples[1]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+ vecIn0 = vld1q(&pSamples[2]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+ vecIn0 = vld1q(&pSamples[3]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+ vecIn0 = vld1q(&pSamples[4]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+ vecIn0 = vld1q(&pSamples[5]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+ vecIn0 = vld1q(&pSamples[6]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+ vecIn0 = vld1q(&pSamples[7]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+ pSamples += 8;
+ vst1q(partial_accu_ptr, vecAcc0);
+ cnt--;
+ partial_accu_ptr += 8;
+ }
+
+ cnt = blockSize & 7;
+ if (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
+
+ mve_pred16_t p0 = vctp16q(cnt);
+
+
+ vecIn0 = vld1q(pSamples);
+ vecAcc0 = vmulq(vecIn0, c0);
+ vecIn0 = vld1q(&pSamples[1]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+ vecIn0 = vld1q(&pSamples[2]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+ vecIn0 = vld1q(&pSamples[3]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+ vecIn0 = vld1q(&pSamples[4]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+ vecIn0 = vld1q(&pSamples[5]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+ vecIn0 = vld1q(&pSamples[6]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+ vecIn0 = vld1q(&pSamples[7]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+ vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
+ }
+
+ /* Middle 8-coefficient blocks: accumulate onto the stored partials */
+ int localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
+ int sample_offset = FIR_F32_MAX_COEF_BLK;
+ while (localTaps > FIR_F32_MAX_COEF_BLK) {
+ c0 = *pCoeffs++;
+ c1 = *pCoeffs++;
+ c2 = *pCoeffs++;
+ c3 = *pCoeffs++;
+ c4 = *pCoeffs++;
+ c5 = *pCoeffs++;
+ c6 = *pCoeffs++;
+ c7 = *pCoeffs++;
+
+ partial_accu_ptr = S->pState;
+ pSamples = pState + sample_offset;
+ int cnt = blockSize >> 3;
+ while (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
+
+
+ vecIn0 = vld1q(pSamples);
+ vecAcc0 = vmulq(vecIn0, c0);
+ vecIn0 = vld1q(&pSamples[1]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+ vecIn0 = vld1q(&pSamples[2]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+ vecIn0 = vld1q(&pSamples[3]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+ vecIn0 = vld1q(&pSamples[4]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+ vecIn0 = vld1q(&pSamples[5]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+ vecIn0 = vld1q(&pSamples[6]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+ vecIn0 = vld1q(&pSamples[7]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+ pSamples += 8;
+ vecAcc0 += vld1q_f16(partial_accu_ptr);
+ vst1q(partial_accu_ptr, vecAcc0);
+ cnt--;
+ partial_accu_ptr += 8;
+ }
+
+ cnt = blockSize & 7;
+ if (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
+
+ mve_pred16_t p0 = vctp16q(cnt);
+
+ vecIn0 = vld1q(pSamples);
+ vecAcc0 = vmulq(vecIn0, c0);
+ vecIn0 = vld1q(&pSamples[1]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+ vecIn0 = vld1q(&pSamples[2]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+ vecIn0 = vld1q(&pSamples[3]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+ vecIn0 = vld1q(&pSamples[4]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+ vecIn0 = vld1q(&pSamples[5]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+ vecIn0 = vld1q(&pSamples[6]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+ vecIn0 = vld1q(&pSamples[7]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+ vecAcc0 += vld1q_f16(partial_accu_ptr);
+ vstrhq_p_f16(partial_accu_ptr, vecAcc0,p0);
+ }
+
+ localTaps -= FIR_F32_MAX_COEF_BLK;
+ sample_offset += FIR_F32_MAX_COEF_BLK;
+ }
+
+ pSamples = pState + sample_offset;
+
+ /* Final coefficient block (1..8 remaining taps): add the partials and
+ write the actual output (see the padding note in the header) */
+ if (localTaps > 4) {
+ c0 = *pCoeffs++;
+ c1 = *pCoeffs++;
+ c2 = *pCoeffs++;
+ c3 = *pCoeffs++;
+ c4 = *pCoeffs++;
+ c5 = *pCoeffs++;
+ c6 = *pCoeffs++;
+ c7 = *pCoeffs++;
+ pOutput = pDst;
+
+ partial_accu_ptr = S->pState;
+ cnt = blockSize >> 3;
+ while (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
+
+ vecIn0 = vld1q(pSamples);
+ vecAcc0 = vmulq(vecIn0, c0);
+ vecIn0 = vld1q(&pSamples[1]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+ vecIn0 = vld1q(&pSamples[2]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+ vecIn0 = vld1q(&pSamples[3]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+ vecIn0 = vld1q(&pSamples[4]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+ vecIn0 = vld1q(&pSamples[5]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+ vecIn0 = vld1q(&pSamples[6]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+ vecIn0 = vld1q(&pSamples[7]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+ pSamples += 8;
+ float16x8_t pap = vld1q_f16(partial_accu_ptr);
+ vst1q(pOutput, vecAcc0 + pap);
+ cnt--;
+ partial_accu_ptr += 8;
+ pOutput += 8;
+ }
+
+ cnt = blockSize & 7;
+ if (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
+
+ mve_pred16_t p0 = vctp16q(cnt);
+
+ vecIn0 = vld1q(pSamples);
+ vecAcc0 = vmulq(vecIn0, c0);
+ vecIn0 = vld1q(&pSamples[1]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+ vecIn0 = vld1q(&pSamples[2]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+ vecIn0 = vld1q(&pSamples[3]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+ vecIn0 = vld1q(&pSamples[4]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+ vecIn0 = vld1q(&pSamples[5]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+ vecIn0 = vld1q(&pSamples[6]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+ vecIn0 = vld1q(&pSamples[7]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+ float16x8_t pap = vld1q_f16(partial_accu_ptr);
+ vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
+ pOutput += cnt;
+ }
+
+ } else {
+ c0 = *pCoeffs++;
+ c1 = *pCoeffs++;
+ c2 = *pCoeffs++;
+ c3 = *pCoeffs++;
+ pOutput = pDst;
+
+ partial_accu_ptr = S->pState;
+ cnt = blockSize >> 3;
+ while (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
+
+ vecIn0 = vld1q(pSamples);
+ vecAcc0 = vmulq(vecIn0, c0);
+ vecIn0 = vld1q(&pSamples[1]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+ vecIn0 = vld1q(&pSamples[2]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+ vecIn0 = vld1q(&pSamples[3]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+ pSamples += 8;
+ float16x8_t pap = vld1q_f16(partial_accu_ptr);
+ vst1q(pOutput, vecAcc0 + pap);
+ cnt--;
+ partial_accu_ptr += 8;
+ pOutput += 8;
+ }
+
+ cnt = blockSize & 7;
+ if (cnt > 0) {
+ float16x8_t vecAcc0;
+ float16x8_t vecIn0;
+
+ mve_pred16_t p0 = vctp16q(cnt);
+
+ vecIn0 = vld1q(pSamples);
+ vecAcc0 = vmulq(vecIn0, c0);
+ vecIn0 = vld1q(&pSamples[1]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+ vecIn0 = vld1q(&pSamples[2]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+ vecIn0 = vld1q(&pSamples[3]);
+ vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+ float16x8_t pap = vld1q_f16(partial_accu_ptr);
+ vstrhq_p_f16(pOutput, vecAcc0 + pap, p0);
+ pOutput += cnt;
+ }
+ }
+
+ /*
+ * Copy the samples back into the history buffer start
+ */
+ pTempSrc = &pState[blockSize];
+ pTempDest = pState;
+
+ blkCnt = numTaps >> 3;
+ while (blkCnt > 0U) {
+ vst1q(pTempDest, vld1q(pTempSrc));
+ pTempSrc += 8;
+ pTempDest += 8;
+ blkCnt--;
+ }
+ blkCnt = numTaps & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ vstrhq_p_f16(pTempDest, vld1q(pTempSrc), p0);
+ }
+}
+
+#else
+
+void arm_fir_f16(
+  const arm_fir_instance_f16 * S,
+  const float16_t * pSrc,
+  float16_t * pDst,
+  uint32_t blockSize)
+{
+  float16_t *pState = S->pState;         /* State pointer */
+  const float16_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+  float16_t *pStateCurnt;                /* Points to the current sample of the state */
+  float16_t *px;                         /* Temporary pointer for state buffer */
+  const float16_t *pb;                   /* Temporary pointer for coefficient buffer */
+  _Float16 acc0;                         /* Accumulator */
+  uint32_t numTaps = S->numTaps;         /* Number of filter coefficients in the filter */
+  uint32_t i, tapCnt, blkCnt;            /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  _Float16 acc1, acc2, acc3, acc4, acc5, acc6, acc7;   /* Accumulators */
+  _Float16 x0, x1, x2, x3, x4, x5, x6, x7;             /* Temporary variables to hold state values */
+  _Float16 c0;                                         /* Temporary variable to hold coefficient value */
+#endif
+
+  /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+  /* pStateCurnt points to the location where the new input data should be written */
+  pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 8 output values simultaneously.
+   * The variables acc0 ... acc7 hold output values that are being computed:
+   *
+   *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+   *    acc1 =  b[numTaps-1] * x[n-numTaps]   + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+   *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps]   + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+   *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
+   */
+
+  blkCnt = blockSize >> 3U;
+
+  while (blkCnt > 0U)
+  {
+    /* Copy 4 new input samples into the state buffer. */
+    *pStateCurnt++ = *pSrc++;
+    *pStateCurnt++ = *pSrc++;
+    *pStateCurnt++ = *pSrc++;
+    *pStateCurnt++ = *pSrc++;
+
+    /* Set all accumulators to zero */
+    acc0 = 0.0f;
+    acc1 = 0.0f;
+    acc2 = 0.0f;
+    acc3 = 0.0f;
+    acc4 = 0.0f;
+    acc5 = 0.0f;
+    acc6 = 0.0f;
+    acc7 = 0.0f;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* Initialize coefficient pointer */
+    pb = pCoeffs;
+
+    /* Copy 4 more input samples; kept separate from the copies above to avoid
+     * a call to __aeabi_memmove which would be slower
+     */
+    *pStateCurnt++ = *pSrc++;
+    *pStateCurnt++ = *pSrc++;
+    *pStateCurnt++ = *pSrc++;
+    *pStateCurnt++ = *pSrc++;
+
+    /* Read the first 7 samples from the state buffer into x0 .. x6 */
+    x0 = *px++;
+    x1 = *px++;
+    x2 = *px++;
+    x3 = *px++;
+    x4 = *px++;
+    x5 = *px++;
+    x6 = *px++;
+
+    /* Loop unrolling: process 8 taps at a time. */
+    tapCnt = numTaps >> 3U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read the b[numTaps-1] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-3] sample */
+      x7 = *(px++);
+
+      /* acc0 +=  b[numTaps-1] * x[n-numTaps] */
+      acc0 += x0 * c0;
+
+      /* acc1 +=  b[numTaps-1] * x[n-numTaps-1] */
+      acc1 += x1 * c0;
+
+      /* acc2 +=  b[numTaps-1] * x[n-numTaps-2] */
+      acc2 += x2 * c0;
+
+      /* acc3 +=  b[numTaps-1] * x[n-numTaps-3] */
+      acc3 += x3 * c0;
+
+      /* acc4 +=  b[numTaps-1] * x[n-numTaps-4] */
+      acc4 += x4 * c0;
+
+      /* acc5 +=  b[numTaps-1] * x[n-numTaps-5] */
+      acc5 += x5 * c0;
+
+      /* acc6 +=  b[numTaps-1] * x[n-numTaps-6] */
+      acc6 += x6 * c0;
+
+      /* acc7 +=  b[numTaps-1] * x[n-numTaps-7] */
+      acc7 += x7 * c0;
+
+      /* Read the b[numTaps-2] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-4] sample */
+      x0 = *(px++);
+
+      /* Perform the multiply-accumulate */
+      acc0 += x1 * c0;
+      acc1 += x2 * c0;
+      acc2 += x3 * c0;
+      acc3 += x4 * c0;
+      acc4 += x5 * c0;
+      acc5 += x6 * c0;
+      acc6 += x7 * c0;
+      acc7 += x0 * c0;
+
+      /* Read the b[numTaps-3] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-5] sample */
+      x1 = *(px++);
+
+      /* Perform the multiply-accumulates */
+      acc0 += x2 * c0;
+      acc1 += x3 * c0;
+      acc2 += x4 * c0;
+      acc3 += x5 * c0;
+      acc4 += x6 * c0;
+      acc5 += x7 * c0;
+      acc6 += x0 * c0;
+      acc7 += x1 * c0;
+
+      /* Read the b[numTaps-4] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-6] sample */
+      x2 = *(px++);
+
+      /* Perform the multiply-accumulates */
+      acc0 += x3 * c0;
+      acc1 += x4 * c0;
+      acc2 += x5 * c0;
+      acc3 += x6 * c0;
+      acc4 += x7 * c0;
+      acc5 += x0 * c0;
+      acc6 += x1 * c0;
+      acc7 += x2 * c0;
+
+      /* Read the b[numTaps-5] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-7] sample */
+      x3 = *(px++);
+      /* Perform the multiply-accumulates */
+      acc0 += x4 * c0;
+      acc1 += x5 * c0;
+      acc2 += x6 * c0;
+      acc3 += x7 * c0;
+      acc4 += x0 * c0;
+      acc5 += x1 * c0;
+      acc6 += x2 * c0;
+      acc7 += x3 * c0;
+
+      /* Read the b[numTaps-6] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-8] sample */
+      x4 = *(px++);
+
+      /* Perform the multiply-accumulates */
+      acc0 += x5 * c0;
+      acc1 += x6 * c0;
+      acc2 += x7 * c0;
+      acc3 += x0 * c0;
+      acc4 += x1 * c0;
+      acc5 += x2 * c0;
+      acc6 += x3 * c0;
+      acc7 += x4 * c0;
+
+      /* Read the b[numTaps-7] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-9] sample */
+      x5 = *(px++);
+
+      /* Perform the multiply-accumulates */
+      acc0 += x6 * c0;
+      acc1 += x7 * c0;
+      acc2 += x0 * c0;
+      acc3 += x1 * c0;
+      acc4 += x2 * c0;
+      acc5 += x3 * c0;
+      acc6 += x4 * c0;
+      acc7 += x5 * c0;
+
+      /* Read the b[numTaps-8] coefficient */
+      c0 = *(pb++);
+
+      /* Read x[n-numTaps-10] sample */
+      x6 = *(px++);
+
+      /* Perform the multiply-accumulates */
+      acc0 += x7 * c0;
+      acc1 += x0 * c0;
+      acc2 += x1 * c0;
+      acc3 += x2 * c0;
+      acc4 += x3 * c0;
+      acc5 += x4 * c0;
+      acc6 += x5 * c0;
+      acc7 += x6 * c0;
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Process the remaining (numTaps mod 8) taps */
+    tapCnt = numTaps % 0x8U;
+
+    while (tapCnt > 0U)
+    {
+      /* Read coefficients */
+      c0 = *(pb++);
+
+      /* Fetch 1 state variable */
+      x7 = *(px++);
+
+      /* Perform the multiply-accumulates */
+      acc0 += x0 * c0;
+      acc1 += x1 * c0;
+      acc2 += x2 * c0;
+      acc3 += x3 * c0;
+      acc4 += x4 * c0;
+      acc5 += x5 * c0;
+      acc6 += x6 * c0;
+      acc7 += x7 * c0;
+
+      /* Reuse the present sample states for next sample */
+      x0 = x1;
+      x1 = x2;
+      x2 = x3;
+      x3 = x4;
+      x4 = x5;
+      x5 = x6;
+      x6 = x7;
+
+      /* Decrement loop counter */
+      tapCnt--;
+    }
+
+    /* Advance the state pointer by 8 to process the next group of 8 samples */
+    pState = pState + 8;
+
+    /* The results in the 8 accumulators, store in the destination buffer. */
+    *pDst++ = acc0;
+    *pDst++ = acc1;
+    *pDst++ = acc2;
+    *pDst++ = acc3;
+    *pDst++ = acc4;
+    *pDst++ = acc5;
+    *pDst++ = acc6;
+    *pDst++ = acc7;
+
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining output samples */
+  blkCnt = blockSize % 0x8U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* Copy one sample at a time into state buffer */
+    *pStateCurnt++ = *pSrc++;
+
+    /* Set the accumulator to zero */
+    acc0 = 0.0f;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* Initialize Coefficient pointer */
+    pb = pCoeffs;
+
+    i = numTaps;
+
+    /* Perform the multiply-accumulates */
+    while (i > 0U)
+    {
+      /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
+      acc0 += *px++ * *pb++;
+
+      i--;
+    }
+
+    /* Store result in destination buffer. */
+    *pDst++ = acc0;
+
+    /* Advance state pointer by 1 for the next sample */
+    pState = pState + 1U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.
+     Now copy the last numTaps - 1 samples to the start of the state buffer.
+     This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCurnt = S->pState;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Copy 4 taps at a time */
+  tapCnt = (numTaps - 1U) >> 2U;
+
+  /* Copy data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+    *pStateCurnt++ = *pState++;
+    *pStateCurnt++ = *pState++;
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+  /* Calculate remaining number of copies */
+  tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+  /* Initialize tapCnt with number of taps */
+  tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* Copy remaining data */
+  while (tapCnt > 0U)
+  {
+    *pStateCurnt++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+}
+
+#endif /* #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+* @} end of FIR group
+*/
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f32.c
new file mode 100644
index 0000000000..b96e4034b9
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_f32.c
@@ -0,0 +1,1285 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_f32.c
+ * Description: Floating-point FIR filter processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @defgroup FIR Finite Impulse Response (FIR) Filters
+
+ This set of functions implements Finite Impulse Response (FIR) filters
+ for Q7, Q15, Q31, and floating-point data types. Fast versions of Q15 and Q31 are also provided.
+ The functions operate on blocks of input and output data and each call to the function processes
+ blockSize samples through the filter. pSrc and
+ pDst points to input and output arrays containing blockSize values.
+
+ @par Algorithm
+ The FIR filter algorithm is based upon a sequence of multiply-accumulate (MAC) operations.
+ Each filter coefficient b[n] is multiplied by a state variable which equals a previous input sample x[n].
+
+ y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
+
+ @par
+ \image html FIR.GIF "Finite Impulse Response filter"
+ @par
+ pCoeffs points to a coefficient array of size numTaps.
+ Coefficients are stored in time reversed order.
+ @par
+
+ {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
+
+ @par
+ pState points to a state array of size numTaps + blockSize - 1.
+ Samples in the state buffer are stored in the following order.
+ @par
+
+ {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[n](==pSrc[0]), x[n+1](==pSrc[1]), ..., x[n+blockSize-1](==pSrc[blockSize-1])}
+
+ @par
+ Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1.
+ The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters,
+ to be avoided and yields a significant speed improvement.
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ There are separate instance structure declarations for each of the 4 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ To do this manually without calling the init function, assign the follow subfields of the instance structure:
+ numTaps, pCoeffs, pState. Also set all of the values in pState to zero.
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Set the values in the state buffer to zeros before static initialization.
+ The code below statically initializes each of the 4 different data type filter instance structures
+
+ arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};
+ arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};
+ arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};
+ arm_fir_instance_q7 S = {numTaps, pState, pCoeffs};
+
+ where numTaps is the number of filter coefficients in the filter; pState is the address of the state buffer;
+ pCoeffs is the address of the coefficient buffer.
+ @par Initialization of Helium version
+ For Helium version the array of coefficients must be padded with zero to contain
+ a full number of lanes.
+
+ The array length L must be a multiple of x. L = x * a :
+ - x is 4 for f32
+ - x is 4 for q31
+ - x is 4 for f16 (so managed like the f32 version and not like the q15 one)
+ - x is 8 for q15
+ - x is 16 for q7
+
+ The additional coefficients
+ (x * a - numTaps) must be set to 0.
+ numTaps is still set to its right value in the init function. It means that
+ the implementation may require to read more coefficients due to the vectorization and
+ to avoid having to manage too many different cases in the code.
+
+
+ @par Helium state buffer
+ The state buffer must contain some additional temporary data
+ used during the computation but which is not the state of the FIR.
+ The first A samples are temporary data.
+ The remaining samples are the state of the FIR filter.
+ @par
+ So the state buffer has size numTaps + A + blockSize - 1 :
+ - A is blockSize for f32
+ - A is 8*ceil(blockSize/8) for f16
+ - A is 8*ceil(blockSize/4) for q31
+ - A is 0 for other datatypes (q15 and q7)
+
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the fixed-point versions of the FIR filter functions.
+ In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ Refer to the function specific documentation below for usage guidelines.
+
+ */
+
+/**
+ @addtogroup FIR
+ @{
+ */
+
+/**
+ @brief Processing function for floating-point FIR filter.
+ @param[in] S points to an instance of the floating-point FIR filter structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define FIR_F32_MAX_COEF_BLK        8   /* Coefficients are consumed in groups of at most 8 taps */
+
+/* Core MAC kernel: accumulate NB_TAPS scalar coefficients c[] against a sliding window of samples */
+#define FIR_F32_CORE(pSamples, c, NB_TAPS)              \
+        vecAcc0 = vdupq_n_f32(0.0f);                    \
+        for (int i = 0; i < NB_TAPS; i++) {             \
+            vecIn0 = vld1q(&pSamples[i]);               \
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c[i]);     \
+        }
+
+
+#define NB_TAPS 4   /* Specialized Helium kernel for filters with 1 to 4 taps */
+__STATIC_INLINE void arm_fir_f32_1_4_mve(const arm_fir_instance_f32 * S,
+    const float32_t * __restrict pSrc,
+    float32_t * __restrict pDst, uint32_t blockSize)
+{
+    float32_t *pRefStatePtr = S->pState + blockSize;
+    float32_t *pState = pRefStatePtr;       /* State pointer */
+    const float32_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */
+    float32_t *pStateCur;                   /* Points to the current sample of the state */
+    const float32_t *pSamples;              /* Temporary pointer to the sample buffer */
+    float32_t *pOutput;                     /* Temporary pointer to the output buffer */
+    const float32_t *pTempSrc;              /* Temporary pointer to the source data */
+    float32_t *pTempDest;                   /* Temporary pointer to the destination buffer */
+    uint32_t numTaps = S->numTaps;          /* Number of filter coefficients in the filter */
+    int32_t blkCnt;
+    float32x4_t vecIn0;
+    float32x4_t vecAcc0;
+    float32_t c[NB_TAPS];
+    const float32_t *pCoeffsCur = pCoeffs;
+
+    /*
+     * pState points to state array which contains previous frame (numTaps - 1) samples
+     * pStateCur points to the location where the new input data should be written
+     */
+    pStateCur = &(pState[(numTaps - 1u)]);
+    pTempSrc = pSrc;
+
+    pSamples = pState;
+    pOutput = pDst;
+
+    /* Cache the (zero padded) coefficients in scalars */
+    for (int i = 0; i < NB_TAPS; i++)
+        c[i] = *pCoeffsCur++;
+
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0) {
+        /*
+         * Save 4 input samples in the history buffer
+         */
+        vst1q(pStateCur, vld1q(pTempSrc));
+        pStateCur += 4;
+        pTempSrc += 4;
+
+        FIR_F32_CORE(pSamples, c, NB_TAPS);
+
+        vst1q(pOutput, vecAcc0);
+
+        pOutput += 4;
+        pSamples += 4;
+
+        blkCnt--;
+    }
+
+    /* Handle the 1..3 remaining outputs with a predicated (tail) store */
+    blkCnt = blockSize & 3;
+    if (blkCnt)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+
+        vst1q(pStateCur, vld1q(pTempSrc));
+        pStateCur += 4;
+        pTempSrc += 4;
+
+        FIR_F32_CORE(pSamples, c, NB_TAPS);
+
+        vstrwq_p_f32(pOutput, vecAcc0, p0);
+    }
+
+    /*
+     * Copy the samples back into the history buffer start
+     */
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
+
+    blkCnt = numTaps - 1;
+    do {
+        mve_pred16_t p = vctp32q(blkCnt);
+
+        vstrwq_p_f32(pTempDest, vldrwq_z_f32(pTempSrc, p), p);
+        pTempSrc += 4;
+        pTempDest += 4;
+        blkCnt -= 4;
+    }
+    while (blkCnt > 0);
+}
+#undef NB_TAPS
+
+/* Specialized Helium kernel for filters with 5 to 8 taps */
+__STATIC_INLINE void arm_fir_f32_5_8_mve(const arm_fir_instance_f32 * S,
+    const float32_t * __restrict pSrc,
+    float32_t * __restrict pDst, uint32_t blockSize)
+{
+    float32_t *pRefStatePtr = S->pState + blockSize;
+    float32_t *pState = pRefStatePtr;       /* State pointer */
+    const float32_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */
+    const float32_t *pSamples;              /* Temporary pointer to the sample buffer */
+    const float32_t *pTempSrc;              /* Temporary pointer to the source data */
+    float32_t *pTempDest;                   /* Temporary pointer to the destination buffer */
+    uint32_t numTaps = S->numTaps;          /* Number of filter coefficients in the filter */
+    int32_t blkCnt;
+    float32_t c0, c1, c2, c3;
+    float32_t c4, c5, c6, c7;
+
+    /* Copy blockSize new input samples into the history buffer (predicated tail) */
+    pTempSrc = pSrc;
+    pTempDest = &(pState[(numTaps - 1u)]);
+    int cnt = blockSize;
+    do {
+        mve_pred16_t p0 = vctp32q(cnt);
+        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
+        pTempDest += 4;
+        pTempSrc += 4;
+        cnt -= 4;
+    } while(cnt > 0);
+
+    /* Cache the 8 (zero padded) coefficients in scalars */
+
+    pSamples = pState;
+    c0 = *pCoeffs++;
+    c1 = *pCoeffs++;
+    c2 = *pCoeffs++;
+    c3 = *pCoeffs++;
+    c4 = *pCoeffs++;
+    c5 = *pCoeffs++;
+    c6 = *pCoeffs++;
+    c7 = *pCoeffs++;
+
+    cnt = blockSize >> 2;
+    while(cnt > 0)
+    {
+        float32x4_t vecAcc0;
+        float32x4_t vecIn0;
+
+        vecIn0 = vld1q(pSamples);
+        vecAcc0 = vmulq(vecIn0, c0);
+        vecIn0 = vld1q(&pSamples[1]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+        vecIn0 = vld1q(&pSamples[2]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+        vecIn0 = vld1q(&pSamples[3]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+        vecIn0 = vld1q(&pSamples[4]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+        vecIn0 = vld1q(&pSamples[5]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+        vecIn0 = vld1q(&pSamples[6]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+        vecIn0 = vld1q(&pSamples[7]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+        pSamples += 4;
+        vst1q(pDst, vecAcc0);
+        cnt--;
+        pDst += 4;
+    }
+
+    /* Handle the 1..3 remaining outputs with a predicated store */
+    cnt = blockSize & 3;
+    if (cnt > 0)
+    {
+        float32x4_t vecAcc0;
+        float32x4_t vecIn0;
+
+        mve_pred16_t p0 = vctp32q(cnt);
+
+        vecIn0 = vld1q(pSamples);
+        vecAcc0 = vmulq(vecIn0, c0);
+        vecIn0 = vld1q(&pSamples[1]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+        vecIn0 = vld1q(&pSamples[2]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+        vecIn0 = vld1q(&pSamples[3]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+        vecIn0 = vld1q(&pSamples[4]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+        vecIn0 = vld1q(&pSamples[5]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+        vecIn0 = vld1q(&pSamples[6]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+        vecIn0 = vld1q(&pSamples[7]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+        vstrwq_p_f32(pDst, vecAcc0,p0);
+    }
+
+
+    /*
+     * Copy the samples back into the history buffer start
+     */
+    pTempSrc = &pState[blockSize];
+    pTempDest = pState;
+    blkCnt = numTaps;
+    while (blkCnt > 0)
+    {
+        *pTempDest++ = *pTempSrc++;
+        blkCnt--;
+    }
+}
+
+
+
+void arm_fir_f32(
+const arm_fir_instance_f32 * S,
+const float32_t * pSrc,
+float32_t * pDst,
+uint32_t blockSize)
+{
+    /*
+       S->pState is the arm_fir_partial_accu
+       S->pState + blockSize is the FIR state
+    */
+    float32_t *pRefStatePtr = S->pState + blockSize;
+    float32_t *pState = pRefStatePtr ;      /* State pointer */
+    const float32_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */
+    const float32_t *pSamples;              /* Temporary pointer to the sample buffer */
+    float32_t *pOutput;                     /* Temporary pointer to the output buffer */
+    const float32_t *pTempSrc;              /* Temporary pointer to the source data */
+    float32_t *pTempDest;                   /* Temporary pointer to the destination buffer */
+    uint32_t numTaps = S->numTaps;          /* Number of filter coefficients in the filter */
+    uint32_t blkCnt;
+    float32_t c0, c1, c2, c3;
+    float32_t c4, c5, c6, c7;
+
+    /*
+     * [1 to 8 taps] specialized routines
+     */
+    if (numTaps <= 4)
+    {
+        arm_fir_f32_1_4_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+    else if (numTaps <= 8)
+    {
+        arm_fir_f32_5_8_mve(S, pSrc, pDst, blockSize);
+        return;
+    }
+
+    /* Copy blockSize new input samples into the FIR history (predicated tail) */
+    pTempSrc = pSrc;
+    pTempDest = &(pState[(numTaps - 1u)]);
+    int cnt = blockSize;
+    do {
+        mve_pred16_t p0 = vctp32q(cnt);
+        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
+        pTempDest += 4;
+        pTempSrc += 4;
+        cnt -= 4;
+    } while(cnt > 0);
+
+    float32_t *partial_accu_ptr = S->pState;
+
+    /* First group of 8 coefficients: compute partial sums into the scratch area */
+    pSamples = pState;
+    c0 = *pCoeffs++;
+    c1 = *pCoeffs++;
+    c2 = *pCoeffs++;
+    c3 = *pCoeffs++;
+    c4 = *pCoeffs++;
+    c5 = *pCoeffs++;
+    c6 = *pCoeffs++;
+    c7 = *pCoeffs++;
+
+    cnt = blockSize >> 2;
+    while(cnt > 0) {
+        float32x4_t vecAcc0;
+        float32x4_t vecIn0;
+
+        vecIn0 = vld1q(pSamples);
+        vecAcc0 = vmulq(vecIn0, c0);
+        vecIn0 = vld1q(&pSamples[1]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+        vecIn0 = vld1q(&pSamples[2]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+        vecIn0 = vld1q(&pSamples[3]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+        vecIn0 = vld1q(&pSamples[4]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+        vecIn0 = vld1q(&pSamples[5]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+        vecIn0 = vld1q(&pSamples[6]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+        vecIn0 = vld1q(&pSamples[7]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+        pSamples += 4;
+        vst1q(partial_accu_ptr, vecAcc0);
+        cnt--;
+        partial_accu_ptr += 4;
+    }
+
+    cnt = blockSize & 3;
+    if (cnt > 0)
+    {
+        float32x4_t vecAcc0;
+        float32x4_t vecIn0;
+
+        mve_pred16_t p0 = vctp32q(cnt);
+
+        vecIn0 = vld1q(pSamples);
+        vecAcc0 = vmulq(vecIn0, c0);
+        vecIn0 = vld1q(&pSamples[1]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+        vecIn0 = vld1q(&pSamples[2]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+        vecIn0 = vld1q(&pSamples[3]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+        vecIn0 = vld1q(&pSamples[4]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+        vecIn0 = vld1q(&pSamples[5]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+        vecIn0 = vld1q(&pSamples[6]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+        vecIn0 = vld1q(&pSamples[7]);
+        vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+        vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
+    }
+
+    /* Middle groups of 8 coefficients: accumulate onto the partial sums */
+    int localTaps = numTaps - FIR_F32_MAX_COEF_BLK;
+    int sample_offset = FIR_F32_MAX_COEF_BLK;
+    while (localTaps > FIR_F32_MAX_COEF_BLK) {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        c4 = *pCoeffs++;
+        c5 = *pCoeffs++;
+        c6 = *pCoeffs++;
+        c7 = *pCoeffs++;
+
+        partial_accu_ptr = S->pState;
+        pSamples = pState + sample_offset;
+        int cnt = blockSize >> 2;
+        while(cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
+
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            pSamples += 4;
+            vecAcc0 += vld1q_f32(partial_accu_ptr);
+            vst1q(partial_accu_ptr, vecAcc0);
+            cnt--;
+            partial_accu_ptr += 4;
+        }
+
+        cnt = blockSize & 3;
+        if (cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
+
+            mve_pred16_t p0 = vctp32q(cnt);
+
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            vecAcc0 += vld1q_f32(partial_accu_ptr);
+            vstrwq_p_f32(partial_accu_ptr, vecAcc0,p0);
+        }
+
+        localTaps -= FIR_F32_MAX_COEF_BLK;
+        sample_offset += FIR_F32_MAX_COEF_BLK;
+    }
+
+    pSamples = pState + sample_offset;
+
+    /* Final group (5..8 or 1..4 remaining coefficients): add partials, write output */
+    if (localTaps > 4) {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        c4 = *pCoeffs++;
+        c5 = *pCoeffs++;
+        c6 = *pCoeffs++;
+        c7 = *pCoeffs++;
+        pOutput = pDst;
+
+        partial_accu_ptr = S->pState;
+        cnt = blockSize >> 2;
+        while(cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
+
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            pSamples += 4;
+            float32x4_t pap = vld1q_f32(partial_accu_ptr);
+            vst1q(pOutput, vecAcc0+pap);
+            cnt--;
+            partial_accu_ptr += 4;
+            pOutput += 4;
+        }
+
+        cnt = blockSize & 3;
+        if (cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
+
+            mve_pred16_t p0 = vctp32q(cnt);
+
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            vecIn0 = vld1q(&pSamples[4]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c4);
+            vecIn0 = vld1q(&pSamples[5]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c5);
+            vecIn0 = vld1q(&pSamples[6]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c6);
+            vecIn0 = vld1q(&pSamples[7]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c7);
+            float32x4_t pap = vld1q_f32(partial_accu_ptr);
+            vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
+            pOutput += cnt;
+        }
+    }
+    else {
+        c0 = *pCoeffs++;
+        c1 = *pCoeffs++;
+        c2 = *pCoeffs++;
+        c3 = *pCoeffs++;
+        pOutput = pDst;
+
+        partial_accu_ptr = S->pState;
+        cnt = blockSize >> 2;
+        while(cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
+
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            pSamples += 4;
+            float32x4_t pap = vld1q_f32(partial_accu_ptr);
+            vst1q(pOutput, vecAcc0+pap);
+            cnt--;
+            partial_accu_ptr += 4;
+            pOutput += 4;
+        }
+
+        cnt = blockSize & 3;
+        if (cnt > 0) {
+            float32x4_t vecAcc0;
+            float32x4_t vecIn0;
+
+            mve_pred16_t p0 = vctp32q(cnt);
+
+            vecIn0 = vld1q(pSamples);
+            vecAcc0 = vmulq(vecIn0, c0);
+            vecIn0 = vld1q(&pSamples[1]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c1);
+            vecIn0 = vld1q(&pSamples[2]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c2);
+            vecIn0 = vld1q(&pSamples[3]);
+            vecAcc0 = vfmaq(vecAcc0, vecIn0, c3);
+            float32x4_t pap = vld1q_f32(partial_accu_ptr);
+            vstrwq_p_f32(pOutput, vecAcc0+pap,p0);
+            pOutput += cnt;
+        }
+    }
+
+    /*
+     * Copy the samples back into the history buffer start
+     */
+    pTempSrc = &pRefStatePtr[blockSize];
+    pTempDest = pRefStatePtr;
+
+    blkCnt = numTaps >> 2;
+    while (blkCnt > 0)
+    {
+        vst1q(pTempDest, vld1q(pTempSrc));
+        pTempSrc += 4;
+        pTempDest += 4;
+        blkCnt--;
+    }
+    blkCnt = numTaps & 3;
+    if (blkCnt > 0)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vstrwq_p_f32(pTempDest, vld1q(pTempSrc), p0);
+    }
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+
+void arm_fir_f32(
+const arm_fir_instance_f32 * S,
+const float32_t * pSrc,
+float32_t * pDst,
+uint32_t blockSize)
+{
+    float32_t *pState = S->pState;          /* State pointer */
+    const float32_t *pCoeffs = S->pCoeffs;  /* Coefficient pointer */
+    float32_t *pStateCurnt;                 /* Points to the current sample of the state */
+    float32_t *px;                          /* Temporary pointers for state buffer */
+    const float32_t *pb;                    /* Temporary pointers for coefficient buffer */
+    uint32_t numTaps = S->numTaps;          /* Number of filter coefficients in the filter */
+    uint32_t i, tapCnt, blkCnt;             /* Loop counters */
+
+    float32x4_t accv0,accv1,samples0,samples1,x0,x1,x2,xa,xb,b;
+    float32_t acc;
+
+    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+    /* pStateCurnt points to the location where the new input data should be written */
+    pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+    /* Loop unrolling */
+    blkCnt = blockSize >> 3;
+
+    while (blkCnt > 0U)
+    {
+        /* Copy 8 samples at a time into state buffers */
+        samples0 = vld1q_f32(pSrc);
+        vst1q_f32(pStateCurnt,samples0);
+
+        pStateCurnt += 4;
+        pSrc += 4 ;
+
+        samples1 = vld1q_f32(pSrc);
+        vst1q_f32(pStateCurnt,samples1);
+
+        pStateCurnt += 4;
+        pSrc += 4 ;
+
+        /* Set the accumulators to zero */
+        accv0 = vdupq_n_f32(0);
+        accv1 = vdupq_n_f32(0);
+
+        /* Initialize state pointer */
+        px = pState;
+
+        /* Initialize coefficient pointer */
+        pb = pCoeffs;
+
+        /* Loop unrolling */
+        i = numTaps >> 2;
+
+        /* Perform the multiply-accumulates */
+        x0 = vld1q_f32(px);
+        x1 = vld1q_f32(px + 4);
+
+        while(i > 0)
+        {
+            /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
+            x2 = vld1q_f32(px + 8);
+            b = vld1q_f32(pb);
+            xa = x0;
+            xb = x1;
+            accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 0));
+            accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 0));
+
+            xa = vextq_f32(x0,x1,1);
+            xb = vextq_f32(x1,x2,1);
+
+            accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 1));
+            accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 1));
+
+            xa = vextq_f32(x0,x1,2);
+            xb = vextq_f32(x1,x2,2);
+
+            accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 2));
+            accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 2));
+
+            xa = vextq_f32(x0,x1,3);
+            xb = vextq_f32(x1,x2,3);
+
+            accv0 = vmlaq_n_f32(accv0,xa,vgetq_lane_f32(b, 3));
+            accv1 = vmlaq_n_f32(accv1,xb,vgetq_lane_f32(b, 3));
+
+            pb += 4;
+            x0 = x1;
+            x1 = x2;
+            px += 4;
+            i--;
+
+        }
+
+        /* Tail: process the remaining (numTaps mod 4) taps */
+        i = numTaps & 3;
+        x2 = vld1q_f32(px + 8);
+
+        /* Perform the multiply-accumulates */
+        switch(i)
+        {
+            case 3:
+            {
+                accv0 = vmlaq_n_f32(accv0,x0,*pb);
+                accv1 = vmlaq_n_f32(accv1,x1,*pb);
+
+                pb++;
+
+                xa = vextq_f32(x0,x1,1);
+                xb = vextq_f32(x1,x2,1);
+
+                accv0 = vmlaq_n_f32(accv0,xa,*pb);
+                accv1 = vmlaq_n_f32(accv1,xb,*pb);
+
+                pb++;
+
+                xa = vextq_f32(x0,x1,2);
+                xb = vextq_f32(x1,x2,2);
+
+                accv0 = vmlaq_n_f32(accv0,xa,*pb);
+                accv1 = vmlaq_n_f32(accv1,xb,*pb);
+
+            }
+            break;
+            case 2:
+            {
+                accv0 = vmlaq_n_f32(accv0,x0,*pb);
+                accv1 = vmlaq_n_f32(accv1,x1,*pb);
+
+                pb++;
+
+                xa = vextq_f32(x0,x1,1);
+                xb = vextq_f32(x1,x2,1);
+
+                accv0 = vmlaq_n_f32(accv0,xa,*pb);
+                accv1 = vmlaq_n_f32(accv1,xb,*pb);
+
+            }
+            break;
+            case 1:
+            {
+
+                accv0 = vmlaq_n_f32(accv0,x0,*pb);
+                accv1 = vmlaq_n_f32(accv1,x1,*pb);
+
+            }
+            break;
+            default:
+            break;
+        }
+
+        /* The result is stored in the destination buffer. */
+        vst1q_f32(pDst,accv0);
+        pDst += 4;
+        vst1q_f32(pDst,accv1);
+        pDst += 4;
+
+        /* Advance state pointer by 8 for the next 8 samples */
+        pState = pState + 8;
+
+        blkCnt--;
+    }
+
+    /* Tail: process the remaining (blockSize mod 8) samples one at a time */
+    blkCnt = blockSize & 0x7;
+
+    while (blkCnt > 0U)
+    {
+        /* Copy one sample at a time into state buffer */
+        *pStateCurnt++ = *pSrc++;
+
+        /* Set the accumulator to zero */
+        acc = 0.0f;
+
+        /* Initialize state pointer */
+        px = pState;
+
+        /* Initialize Coefficient pointer */
+        pb = pCoeffs;
+
+        i = numTaps;
+
+        /* Perform the multiply-accumulates */
+        do
+        {
+            /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
+            acc += *px++ * *pb++;
+            i--;
+
+        } while (i > 0U);
+
+        /* The result is stored in the destination buffer. */
+        *pDst++ = acc;
+
+        /* Advance state pointer by 1 for the next sample */
+        pState = pState + 1;
+
+        blkCnt--;
+    }
+
+    /* Processing is complete.
+    ** Now copy the last numTaps - 1 samples to the starting of the state buffer.
+    ** This prepares the state buffer for the next function call. */
+
+    /* Points to the start of the state buffer */
+    pStateCurnt = S->pState;
+
+    /* Copy numTaps - 1 number of values */
+    tapCnt = numTaps - 1U;
+
+    /* Copy data */
+    while (tapCnt > 0U)
+    {
+        *pStateCurnt++ = *pState++;
+
+        /* Decrement the loop counter */
+        tapCnt--;
+    }
+
+}
+#else
+void arm_fir_f32(
+ const arm_fir_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *px; /* Temporary pointer for state buffer */
+ const float32_t *pb; /* Temporary pointer for coefficient buffer */
+ float32_t acc0; /* Accumulator */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt; /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ float32_t acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */
+ float32_t x0, x1, x2, x3, x4, x5, x6, x7; /* Temporary variables to hold state values */
+ float32_t c0; /* Temporary variable to hold coefficient value */
+#endif
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 8 output values simultaneously.
+ * The variables acc0 ... acc7 hold output values that are being computed:
+ *
+ * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
+ */
+
+ blkCnt = blockSize >> 3U;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy 4 new input samples into the state buffer. */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set all accumulators to zero */
+ acc0 = 0.0f;
+ acc1 = 0.0f;
+ acc2 = 0.0f;
+ acc3 = 0.0f;
+ acc4 = 0.0f;
+ acc5 = 0.0f;
+ acc6 = 0.0f;
+ acc7 = 0.0f;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* This is separated from the others to avoid
+ * a call to __aeabi_memmove which would be slower
+ */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+ /* Read the first 7 samples from the state buffer: x[n-numTaps] through x[n-numTaps-6] */
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
+ x3 = *px++;
+ x4 = *px++;
+ x5 = *px++;
+ x6 = *px++;
+
+ /* Loop unrolling: process 8 taps at a time. */
+ tapCnt = numTaps >> 3U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-7] sample */
+ x7 = *(px++);
+
+ /* acc0 += b[numTaps-1] * x[n-numTaps] */
+ acc0 += x0 * c0;
+
+ /* acc1 += b[numTaps-1] * x[n-numTaps-1] */
+ acc1 += x1 * c0;
+
+ /* acc2 += b[numTaps-1] * x[n-numTaps-2] */
+ acc2 += x2 * c0;
+
+ /* acc3 += b[numTaps-1] * x[n-numTaps-3] */
+ acc3 += x3 * c0;
+
+ /* acc4 += b[numTaps-1] * x[n-numTaps-4] */
+ acc4 += x4 * c0;
+
+ /* acc5 += b[numTaps-1] * x[n-numTaps-5] */
+ acc5 += x5 * c0;
+
+ /* acc6 += b[numTaps-1] * x[n-numTaps-6] */
+ acc6 += x6 * c0;
+
+ /* acc7 += b[numTaps-1] * x[n-numTaps-7] */
+ acc7 += x7 * c0;
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-8] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x1 * c0;
+ acc1 += x2 * c0;
+ acc2 += x3 * c0;
+ acc3 += x4 * c0;
+ acc4 += x5 * c0;
+ acc5 += x6 * c0;
+ acc6 += x7 * c0;
+ acc7 += x0 * c0;
+
+ /* Read the b[numTaps-3] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-9] sample */
+ x1 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x2 * c0;
+ acc1 += x3 * c0;
+ acc2 += x4 * c0;
+ acc3 += x5 * c0;
+ acc4 += x6 * c0;
+ acc5 += x7 * c0;
+ acc6 += x0 * c0;
+ acc7 += x1 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-10] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x3 * c0;
+ acc1 += x4 * c0;
+ acc2 += x5 * c0;
+ acc3 += x6 * c0;
+ acc4 += x7 * c0;
+ acc5 += x0 * c0;
+ acc6 += x1 * c0;
+ acc7 += x2 * c0;
+
+ /* Read the b[numTaps-5] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-11] sample */
+ x3 = *(px++);
+ /* Perform the multiply-accumulates */
+ acc0 += x4 * c0;
+ acc1 += x5 * c0;
+ acc2 += x6 * c0;
+ acc3 += x7 * c0;
+ acc4 += x0 * c0;
+ acc5 += x1 * c0;
+ acc6 += x2 * c0;
+ acc7 += x3 * c0;
+
+ /* Read the b[numTaps-6] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-12] sample */
+ x4 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x5 * c0;
+ acc1 += x6 * c0;
+ acc2 += x7 * c0;
+ acc3 += x0 * c0;
+ acc4 += x1 * c0;
+ acc5 += x2 * c0;
+ acc6 += x3 * c0;
+ acc7 += x4 * c0;
+
+ /* Read the b[numTaps-7] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-13] sample */
+ x5 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x6 * c0;
+ acc1 += x7 * c0;
+ acc2 += x0 * c0;
+ acc3 += x1 * c0;
+ acc4 += x2 * c0;
+ acc5 += x3 * c0;
+ acc6 += x4 * c0;
+ acc7 += x5 * c0;
+
+ /* Read the b[numTaps-8] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-14] sample */
+ x6 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x7 * c0;
+ acc1 += x0 * c0;
+ acc2 += x1 * c0;
+ acc3 += x2 * c0;
+ acc4 += x3 * c0;
+ acc5 += x4 * c0;
+ acc6 += x5 * c0;
+ acc7 += x6 * c0;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x8U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch 1 state variable */
+ x7 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+ acc2 += x2 * c0;
+ acc3 += x3 * c0;
+ acc4 += x4 * c0;
+ acc5 += x5 * c0;
+ acc6 += x6 * c0;
+ acc7 += x7 * c0;
+
+ /* Reuse the present sample states for next sample */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+ x3 = x4;
+ x4 = x5;
+ x5 = x6;
+ x6 = x7;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by 8 to process the next group of 8 samples */
+ pState = pState + 8;
+
+ /* The results in the 8 accumulators, store in the destination buffer. */
+ *pDst++ = acc0;
+ *pDst++ = acc1;
+ *pDst++ = acc2;
+ *pDst++ = acc3;
+ *pDst++ = acc4;
+ *pDst++ = acc5;
+ *pDst++ = acc6;
+ *pDst++ = acc7;
+
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining output samples */
+ blkCnt = blockSize % 0x8U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc0 = 0.0f;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize Coefficient pointer */
+ pb = pCoeffs;
+
+ i = numTaps;
+
+ /* Perform the multiply-accumulates */
+ while (i > 0U)
+ {
+ /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
+ acc0 += *px++ * *pb++;
+
+ i--;
+ }
+
+ /* Store result in destination buffer. */
+ *pDst++ = acc0;
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Copy remaining data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+}
+
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+* @} end of FIR group
+*/
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q15.c
new file mode 100644
index 0000000000..87d8a82e47
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q15.c
@@ -0,0 +1,332 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_fast_q15.c
+ * Description: Q15 Fast FIR filter processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR
+ @{
+ */
+
+/**
+ @brief Processing function for the Q15 FIR filter (fast version).
+ @param[in] S points to an instance of the Q15 FIR filter structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This fast version uses a 32-bit accumulator with 2.30 format.
+ The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around and distorts the result.
+ In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
+ The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
+
+ @remark
+ Refer to \ref arm_fir_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure.
+ Use function \ref arm_fir_init_q15() to initialize the filter structure.
+ */
+
+void arm_fir_fast_q15(
+ const arm_fir_instance_q15 * S,
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q15_t *px; /* Temporary pointer for state buffer */
+ const q15_t *pb; /* Temporary pointer for coefficient buffer */
+ q31_t acc0; /* Accumulators */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc1, acc2, acc3; /* Accumulators */
+ q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */
+#endif
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 output values simultaneously.
+ * The variables acc0 ... acc3 hold output values that are being computed:
+ *
+ * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
+ */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy 4 new input samples into the state buffer. */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Initialize state pointer (state samples are read in pairs as packed q31_t) */
+ px = pState;
+
+ /* Initialize coefficient pointer (coefficients are read in pairs as packed q31_t) */
+ pb = pCoeffs;
+
+ /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */
+ x0 = read_q15x2_ia (&px);
+
+ /* Read the third and fourth samples from the state buffer: x[n-N-2], x[n-N-3] */
+ x2 = read_q15x2_ia (&px);
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ Repeat until we've computed numTaps-(numTaps%4) coefficients. */
+ tapCnt = numTaps >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */
+ c0 = read_q15x2_ia ((q15_t **) &pb);
+
+ /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
+ acc0 = __SMLAD(x0, c0, acc0);
+
+ /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
+ acc2 = __SMLAD(x2, c0, acc2);
+
+ /* pack x[n-N-1] and x[n-N-2] */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x2, x0, 0);
+#else
+ x1 = __PKHBT(x0, x2, 0);
+#endif
+
+ /* Read state x[n-N-4], x[n-N-5] */
+ x0 = read_q15x2_ia (&px);
+
+ /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
+ acc1 = __SMLADX(x1, c0, acc1);
+
+ /* pack x[n-N-3] and x[n-N-4] */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x0, x2, 0);
+#else
+ x1 = __PKHBT(x2, x0, 0);
+#endif
+
+ /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
+ acc3 = __SMLADX(x1, c0, acc3);
+
+ /* Read coefficients b[N-2], b[N-3] */
+ c0 = read_q15x2_ia ((q15_t **) &pb);
+
+ /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
+ acc0 = __SMLAD(x2, c0, acc0);
+
+ /* Read state x[n-N-6], x[n-N-7] with offset */
+ x2 = read_q15x2_ia (&px);
+
+ /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
+ acc2 = __SMLAD(x0, c0, acc2);
+
+ /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
+ acc1 = __SMLADX(x1, c0, acc1);
+
+ /* pack x[n-N-5] and x[n-N-6] */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x2, x0, 0);
+#else
+ x1 = __PKHBT(x0, x2, 0);
+#endif
+
+ /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
+ acc3 = __SMLADX(x1, c0, acc3);
+
+ /* Decrement tap count */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps.
+ This will always be 2 taps since the filter length is even. */
+ if ((numTaps & 0x3U) != 0U)
+ {
+ /* Read last two coefficients */
+ c0 = read_q15x2_ia ((q15_t **) &pb);
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc2 = __SMLAD(x2, c0, acc2);
+
+ /* pack state variables */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x2, x0, 0);
+#else
+ x1 = __PKHBT(x0, x2, 0);
+#endif
+
+ /* Read last state variables */
+ x0 = read_q15x2 (px);
+
+ /* Perform the multiply-accumulates */
+ acc1 = __SMLADX(x1, c0, acc1);
+
+ /* pack state variables */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x0, x2, 0);
+#else
+ x1 = __PKHBT(x2, x0, 0);
+#endif
+
+ /* Perform the multiply-accumulates */
+ acc3 = __SMLADX(x1, c0, acc3);
+ }
+
+ /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
+ Then store the 4 outputs in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
+#else
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 4U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining output samples */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc0 = 0;
+
+ /* Use SIMD to hold states and coefficients */
+ px = pState;
+ pb = pCoeffs;
+
+ tapCnt = numTaps >> 1U;
+
+ do
+ {
+ acc0 += (q31_t) *px++ * *pb++;
+ acc0 += (q31_t) *px++ * *pb++;
+
+ tapCnt--;
+ }
+ while (tapCnt > 0U);
+
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ Then store the output in the destination buffer. */
+ *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Copy remaining data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+}
+
+/**
+ @} end of FIR group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q31.c
new file mode 100644
index 0000000000..78b6bd8cd9
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_fast_q31.c
@@ -0,0 +1,324 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_fast_q31.c
+ * Description: Processing function for the Q31 Fast FIR filter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR
+ @{
+ */
+
+/**
+ @brief Processing function for the Q31 FIR filter (fast version).
+ @param[in] S points to an instance of the Q31 structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ This function is optimized for speed at the expense of fixed-point precision and overflow protection.
+ The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
+ These intermediate results are added to a 2.30 accumulator.
+ Finally, the accumulator is saturated and converted to a 1.31 result.
+ The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
+ In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
+
+ @remark
+ Refer to \ref arm_fir_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision. Both the slow and the fast versions use the same instance structure.
+ Use function \ref arm_fir_init_q31() to initialize the filter structure.
+ */
+
+IAR_ONLY_LOW_OPTIMIZATION_ENTER
+void arm_fir_fast_q31(
+ const arm_fir_instance_q31 * S,
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCurnt; /* Points to the current sample of the state */
+ q31_t *px; /* Temporary pointer for state buffer */
+ const q31_t *pb; /* Temporary pointer for coefficient buffer */
+ q31_t acc0; /* Accumulators */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt; /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t acc1, acc2, acc3; /* Accumulators */
+ q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
+#endif
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 output values simultaneously.
+ * The variables acc0 ... acc3 hold output values that are being computed:
+ *
+ * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
+ */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy 4 new input samples into the state buffer. */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Read the first 3 samples from the state buffer:
+ * x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ Repeat until we've computed numTaps-(numTaps%4) coefficients. */
+ while (tapCnt > 0U)
+ {
+ /* Read the b[numTaps] coefficient */
+ c0 = *pb;
+
+ /* Read x[n-numTaps-3] sample */
+ x3 = *px;
+
+ /* acc0 += b[numTaps] * x[n-numTaps] */
+ multAcc_32x32_keep32_R(acc0, x0, c0);
+
+ /* acc1 += b[numTaps] * x[n-numTaps-1] */
+ multAcc_32x32_keep32_R(acc1, x1, c0);
+
+ /* acc2 += b[numTaps] * x[n-numTaps-2] */
+ multAcc_32x32_keep32_R(acc2, x2, c0);
+
+ /* acc3 += b[numTaps] * x[n-numTaps-3] */
+ multAcc_32x32_keep32_R(acc3, x3, c0);
+
+ /* Read the b[numTaps-1] coefficient */
+ c0 = *(pb + 1U);
+
+ /* Read x[n-numTaps-4] sample */
+ x0 = *(px + 1U);
+
+ /* Perform the multiply-accumulates */
+ multAcc_32x32_keep32_R(acc0, x1, c0);
+ multAcc_32x32_keep32_R(acc1, x2, c0);
+ multAcc_32x32_keep32_R(acc2, x3, c0);
+ multAcc_32x32_keep32_R(acc3, x0, c0);
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *(pb + 2U);
+
+ /* Read x[n-numTaps-5] sample */
+ x1 = *(px + 2U);
+
+ /* Perform the multiply-accumulates */
+ multAcc_32x32_keep32_R(acc0, x2, c0);
+ multAcc_32x32_keep32_R(acc1, x3, c0);
+ multAcc_32x32_keep32_R(acc2, x0, c0);
+ multAcc_32x32_keep32_R(acc3, x1, c0);
+
+ /* Read the b[numTaps-3] coefficients */
+ c0 = *(pb + 3U);
+
+ /* Read x[n-numTaps-6] sample */
+ x2 = *(px + 3U);
+
+ /* Perform the multiply-accumulates */
+ multAcc_32x32_keep32_R(acc0, x3, c0);
+ multAcc_32x32_keep32_R(acc1, x0, c0);
+ multAcc_32x32_keep32_R(acc2, x1, c0);
+ multAcc_32x32_keep32_R(acc3, x2, c0);
+
+ /* update coefficient pointer */
+ pb += 4U;
+ px += 4U;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch 1 state variable */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ multAcc_32x32_keep32_R(acc0, x0, c0);
+ multAcc_32x32_keep32_R(acc1, x1, c0);
+ multAcc_32x32_keep32_R(acc2, x2, c0);
+ multAcc_32x32_keep32_R(acc3, x3, c0);
+
+ /* Reuse the present sample states for next sample */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* The results in the 4 accumulators are in 2.30 format. Convert to 1.31
+ Then store the 4 outputs in the destination buffer. */
+ *pDst++ = (q31_t) (acc0 << 1);
+ *pDst++ = (q31_t) (acc1 << 1);
+ *pDst++ = (q31_t) (acc2 << 1);
+ *pDst++ = (q31_t) (acc3 << 1);
+
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 4U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining output samples */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc0 = 0;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize Coefficient pointer */
+ pb = pCoeffs;
+
+ i = numTaps;
+
+ /* Perform the multiply-accumulates */
+ do
+ {
+ multAcc_32x32_keep32_R(acc0, (*px++), (*pb++));
+ i--;
+ } while (i > 0U);
+
+ /* The result is in 2.30 format. Convert to 1.31
+ Then store the output in the destination buffer. */
+ *pDst++ = (q31_t) (acc0 << 1);
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Copy remaining data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+}
+IAR_ONLY_LOW_OPTIMIZATION_EXIT
+/**
+ @} end of FIR group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f16.c
new file mode 100644
index 0000000000..e1ac1bbe11
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f16.c
@@ -0,0 +1,105 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_init_f16.c
+ * Description: Floating-point FIR filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR
+ @{
+ */
+
+/**
+ @brief Initialization function for the floating-point FIR filter.
+ @param[in,out] S points to an instance of the floating-point FIR filter structure
+ @param[in] numTaps number of filter coefficients in the filter
+ @param[in] pCoeffs points to the filter coefficients buffer
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of samples processed per call
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], ..., b[1], b[0]}
+
+ @par
+ pState points to the array of state variables.
+ pState is of length numTaps+blockSize-1 samples (except for Helium - see below), where blockSize is the number of input samples processed by each call to arm_fir_f16().
+ @par Initialization of Helium version
+ For Helium version the array of coefficients must be a multiple of 4 (4a) even if less
+ then 4a coefficients are defined in the FIR. The additional coefficients
+ (4a - numTaps) must be set to 0.
+ numTaps is still set to its right value in the init function. It means that
+ the implementation may require to read more coefficients due to the vectorization and
+ to avoid having to manage too many different cases in the code.
+
+
+ @par Helium state buffer
+ The state buffer must contain some additional temporary data
+ used during the computation but which is not the state of the FIR.
+ The first 8*ceil(blockSize/8) samples are temporary data.
+ The remaining samples are the state of the FIR filter.
+ So the state buffer has size numTaps + 8*ceil(blockSize/8) + blockSize - 1
+
+ */
+
+void arm_fir_init_f16(
+ arm_fir_instance_f16 * S,
+ uint16_t numTaps,
+ const float16_t * pCoeffs,
+ float16_t * pState,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer. The size is (numTaps + blockSize - 1), plus the Helium working area when MVE float16 is enabled */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+ memset(pState, 0, (numTaps + (blockSize - 1U) + ROUND_UP(blockSize, 8)) * sizeof(float16_t));
+#else
+ memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float16_t));
+#endif
+
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ @} end of FIR group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f32.c
new file mode 100644
index 0000000000..4d4702837f
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_f32.c
@@ -0,0 +1,99 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_init_f32.c
+ * Description: Floating-point FIR filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR
+ @{
+ */
+
+/**
+ @brief Initialization function for the floating-point FIR filter.
+ @param[in,out] S points to an instance of the floating-point FIR filter structure
+ @param[in] numTaps number of filter coefficients in the filter
+ @param[in] pCoeffs points to the filter coefficients buffer
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of samples processed per call
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+
+ @par
+ pState points to the array of state variables and some working memory for the Helium version.
+ pState is of length numTaps+blockSize-1 samples (except for Helium - see below), where blockSize is the number of input samples processed by each call to arm_fir_f32().
+ @par Initialization of Helium version
+ For Helium version the array of coefficients must be a multiple of 4 (4a) even if less
+ than 4a coefficients are defined in the FIR. The additional coefficients
+ (4a - numTaps) must be set to 0.
+ numTaps is still set to its right value in the init function. It means that
+ the implementation may require to read more coefficients due to the vectorization and
+ to avoid having to manage too many different cases in the code.
+
+ @par Helium state buffer
+ The state buffer must contain some additional temporary data
+ used during the computation but which is not the state of the FIR.
+ The first blockSize samples are temporary data.
+ The remaining samples are the state of the FIR filter.
+ So the state buffer has size numTaps + 2 * blockSize - 1
+
+ */
+
+void arm_fir_init_f32(
+ arm_fir_instance_f32 * S,
+ uint16_t numTaps,
+ const float32_t * pCoeffs,
+ float32_t * pState,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+ memset(pState, 0, (numTaps + (blockSize - 1U) + blockSize) * sizeof(float32_t));
+#else
+ memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float32_t));
+#endif
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ @} end of FIR group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q15.c
new file mode 100644
index 0000000000..d50c6364ad
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q15.c
@@ -0,0 +1,145 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_init_q15.c
+ * Description: Q15 FIR filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q15 FIR filter.
+ @param[in,out] S points to an instance of the Q15 FIR filter structure.
+ @param[in] numTaps number of filter coefficients in the filter. Must be even and greater than or equal to 4.
+ @param[in] pCoeffs points to the filter coefficients buffer.
+ @param[in] pState points to the state buffer.
+ @param[in] blockSize number of samples processed per call.
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : numTaps is not greater than or equal to 4 and even
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+
+ Note that numTaps must be even and greater than or equal to 4.
+ To implement an odd length filter simply increase numTaps by 1 and set the last coefficient to zero.
+ For example, to implement a filter with numTaps=3 and coefficients
+
+ {0.3, -0.8, 0.3}
+
+ set numTaps=4 and use the coefficients:
+
+ {0.3, -0.8, 0.3, 0}.
+
+ Similarly, to implement a two point filter
+
+ {0.3, -0.3}
+
+ set numTaps=4 and use the coefficients:
+
+ {0.3, -0.3, 0, 0}.
+
+ pState points to the array of state variables.
+ pState is of length numTaps+blockSize, when running on Cortex-M4 and Cortex-M3 and is of length numTaps+blockSize-1, when running on Cortex-M0 where blockSize is the number of input samples processed by each call to arm_fir_q15().
+
+ @par Initialization of Helium version
+ For Helium version the array of coefficients must be a multiple of 8 (8a) even if less
+ than 8a coefficients are defined in the FIR. The additional coefficients
+ (8a - numTaps) must be set to 0.
+ numTaps is still set to its right value in the init function. It means that
+ the implementation may require to read more coefficients due to the vectorization and
+ to avoid having to manage too many different cases in the code.
+ */
+
+arm_status arm_fir_init_q15(
+ arm_fir_instance_q15 * S,
+ uint16_t numTaps,
+ const q15_t * pCoeffs,
+ q15_t * pState,
+ uint32_t blockSize)
+{
+ arm_status status;
+
+#if defined (ARM_MATH_DSP)
+
+ /* The Number of filter coefficients in the filter must be even and at least 4 */
+ if (numTaps & 0x1U)
+ {
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear the state buffer. The size is always (blockSize + numTaps ) */
+ memset(pState, 0, (numTaps + (blockSize)) * sizeof(q15_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ status = ARM_MATH_SUCCESS;
+ }
+
+ return (status);
+
+#else
+
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
+ memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q15_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ status = ARM_MATH_SUCCESS;
+
+ return (status);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
+
+/**
+ @} end of FIR group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q31.c
new file mode 100644
index 0000000000..07123f9b84
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q31.c
@@ -0,0 +1,100 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_init_q31.c
+ * Description: Q31 FIR filter initialization function.
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q31 FIR filter.
+ @param[in,out] S points to an instance of the Q31 FIR filter structure
+ @param[in] numTaps number of filter coefficients in the filter
+ @param[in] pCoeffs points to the filter coefficients buffer
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of samples processed
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+
+ pState points to the array of state variables.
+ pState is of length numTaps+blockSize-1 samples (except for Helium - see below), where blockSize is the number of input samples processed by each call to arm_fir_q31().
+
+ @par Initialization of Helium version
+ For Helium version the array of coefficients must be a multiple of 4 (4a) even if less
+ than 4a coefficients are defined in the FIR. The additional coefficients
+ (4a - numTaps) must be set to 0.
+ numTaps is still set to its right value in the init function. It means that
+ the implementation may require to read more coefficients due to the vectorization and
+ to avoid having to manage too many different cases in the code.
+
+ @par Helium state buffer
+ The state buffer must contain some additional temporary data
+ used during the computation but which is not the state of the FIR.
+ The first 2*4*ceil(blockSize/4) samples are temporary data.
+ The remaining samples are the state of the FIR filter.
+ So the state buffer has size numTaps + 8*ceil(blockSize/4) + blockSize - 1
+
+ */
+
+void arm_fir_init_q31(
+ arm_fir_instance_q31 * S,
+ uint16_t numTaps,
+ const q31_t * pCoeffs,
+ q31_t * pState,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ memset(pState, 0, (numTaps + (blockSize - 1U) + 2*ROUND_UP(blockSize, 4)) * sizeof(q31_t));
+ #else
+ memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q31_t));
+ #endif
+
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ @} end of FIR group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q7.c
new file mode 100644
index 0000000000..056a1061e1
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_init_q7.c
@@ -0,0 +1,90 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_init_q7.c
+ * Description: Q7 FIR filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q7 FIR filter.
+ @param[in,out] S points to an instance of the Q7 FIR filter structure
+ @param[in] numTaps number of filter coefficients in the filter
+ @param[in] pCoeffs points to the filter coefficients buffer
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of samples processed
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+
+ @par
+ pState points to the array of state variables.
+ pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_q7().
+
+ @par Initialization of Helium version
+ For Helium version the array of coefficients must be a multiple of 16 (16a) even if less
+ than 16a coefficients are defined in the FIR. The additional coefficients
+ (16a - numTaps) must be set to 0.
+ numTaps is still set to its right value in the init function. It means that
+ the implementation may require to read more coefficients due to the vectorization and
+ to avoid having to manage too many different cases in the code.
+
+ */
+
+void arm_fir_init_q7(
+ arm_fir_instance_q7 * S,
+ uint16_t numTaps,
+ const q7_t * pCoeffs,
+ q7_t * pState,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer. The size is always (blockSize + numTaps - 1) */
+ memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q7_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ @} end of FIR group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_f32.c
new file mode 100644
index 0000000000..05112c8ea1
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_f32.c
@@ -0,0 +1,1252 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_interpolate_f32.c
+ * Description: Floating-point FIR interpolation sequences
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @defgroup FIR_Interpolate Finite Impulse Response (FIR) Interpolator
+
+ These functions combine an upsampler (zero stuffer) and an FIR filter.
+ They are used in multirate systems for increasing the sample rate of a signal without introducing high frequency images.
+ Conceptually, the functions are equivalent to the block diagram below:
+ \image html FIRInterpolator.gif "Components included in the FIR Interpolator functions"
+ After upsampling by a factor of L, the signal should be filtered by a lowpass filter with a normalized
+ cutoff frequency of 1/L in order to eliminate high frequency copies of the spectrum.
+ The user of the function is responsible for providing the filter coefficients.
+
+ The FIR interpolator functions provided in the CMSIS DSP Library combine the upsampler and FIR filter in an efficient manner.
+ The upsampler inserts L-1 zeros between each sample.
+ Instead of multiplying by these zero values, the FIR filter is designed to skip them.
+ This leads to an efficient implementation without any wasted effort.
+ The functions operate on blocks of input and output data.
+ pSrc points to an array of blockSize input values and
+ pDst points to an array of blockSize*L output values.
+
+ The library provides separate functions for Q15, Q31, and floating-point data types.
+
+ @par Algorithm
+ The functions use a polyphase filter structure:
+
+ y[n] = b[0] * x[n] + b[L] * x[n-1] + ... + b[L*(phaseLength-1)] * x[n-phaseLength+1]
+ y[n+1] = b[1] * x[n] + b[L+1] * x[n-1] + ... + b[L*(phaseLength-1)+1] * x[n-phaseLength+1]
+ ...
+ y[n+(L-1)] = b[L-1] * x[n] + b[2*L-1] * x[n-1] + ....+ b[L*(phaseLength-1)+(L-1)] * x[n-phaseLength+1]
+
+ This approach is more efficient than straightforward upsample-then-filter algorithms.
+ With this method the computation is reduced by a factor of 1/L when compared to using a standard FIR filter.
+ @par
+ pCoeffs points to a coefficient array of size numTaps.
+ numTaps must be a multiple of the interpolation factor L and this is checked by the
+ initialization functions.
+ Internally, the function divides the FIR filter's impulse response into shorter filters of length
+ phaseLength=numTaps/L.
+ Coefficients are stored in time reversed order.
+
+ {b[numTaps-1], b[numTaps-2], b[N-2], ..., b[1], b[0]}
+
+ @par
+ pState points to a state array of size blockSize + phaseLength - 1.
+ Samples in the state buffer are stored in the order:
+
+ {x[n-phaseLength+1], x[n-phaseLength], x[n-phaseLength-1], x[n-phaseLength-2]....x[0], x[1], ..., x[blockSize-1]}
+
+ @par
+ The state variables are updated after each block of data is processed, the coefficients are untouched.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient arrays may be shared among several instances while state variable array should be allocated separately.
+ There are separate instance structure declarations for each of the 3 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ - Checks to make sure that the length of the filter is a multiple of the interpolation factor.
+ To do this manually without calling the init function, assign the following subfields of the instance structure:
+ L (interpolation factor), pCoeffs, phaseLength (numTaps / L), pState. Also set all of the values in pState to zero.
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ The code below statically initializes each of the 3 different data type filter instance structures
+
+ arm_fir_interpolate_instance_f32 S = {L, phaseLength, pCoeffs, pState};
+ arm_fir_interpolate_instance_q31 S = {L, phaseLength, pCoeffs, pState};
+ arm_fir_interpolate_instance_q15 S = {L, phaseLength, pCoeffs, pState};
+
+ @par
+ where L is the interpolation factor; phaseLength=numTaps/L is the
+ length of each of the shorter FIR filters used internally,
+ pCoeffs is the address of the coefficient buffer;
+ pState is the address of the state buffer.
+ Be sure to set the values in the state buffer to zeros when doing static initialization.
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the fixed-point versions of the FIR interpolate filter functions.
+ In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ @addtogroup FIR_Interpolate
+ @{
+ */
+
+/**
+ @brief Processing function for floating-point FIR interpolator.
+ @param[in] S points to an instance of the floating-point FIR interpolator structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+static void arm_fir_interpolate2_f32_mve(
+ const arm_fir_interpolate_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ const float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
+ uint32_t tapCnt;
+ uint32_t blkCnt; /* Loop counters */
+ uint16_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
+ uint32_t strides[4] = { 0, 1 * 2, 2 * 2, 3 * 2 };
+ uint32x4_t vec_strides0 = vld1q_u32(strides);
+ uint32x4_t vec_strides1 = vec_strides0 + 1;
+ f32x4_t acc0, acc1;
+
+ /*
+ * S->pState buffer contains previous frame (phaseLen - 1) samples
+ * pStateCurnt points to the location where the new input data should be written
+ */
+ pStateCurnt = S->pState + (phaseLen - 1U);
+ /*
+ * Total number of input samples
+ */
+ blkCnt = blockSize;
+ /*
+ * Loop over the blockSize.
+ */
+ while (blkCnt > 0U)
+ {
+ /*
+ * Copy new input sample into the state buffer
+ */
+ *pStateCurnt++ = *pSrc++;
+ /*
+ * Initialize state pointer
+ */
+ ptr1 = pState;
+
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ /*
+ * Initialize coefficient pointer
+ */
+ ptr2 = pCoeffs;
+
+ tapCnt = phaseLen >> 2;
+ while (tapCnt > 0U)
+ {
+ f32x4_t vecCoef, vecState;
+
+ vecState = vldrwq_f32(ptr1);
+
+ vecCoef = vldrwq_gather_shifted_offset_f32(ptr2, vec_strides1);
+ acc1 = vfmaq_f32(acc1, vecState, vecCoef);
+
+ vecCoef = vldrwq_gather_shifted_offset_f32(ptr2, vec_strides0);
+ acc0 = vfmaq_f32(acc0, vecState, vecCoef);
+
+ ptr2 += 4 * 2;
+ ptr1 += 4;
+ /*
+ * Decrement the loop counter
+ */
+ tapCnt--;
+ }
+
+ tapCnt = phaseLen & 3;
+ if (tapCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(tapCnt);
+ f32x4_t vecCoef, vecState;
+
+ vecState = vldrwq_z_f32(ptr1, p0);
+
+ vecCoef = vldrwq_gather_shifted_offset_z_f32(ptr2, vec_strides1, p0);
+ acc1 = vfmaq_f32(acc1, vecState, vecCoef);
+ vecCoef = vldrwq_gather_shifted_offset_z_f32(ptr2, vec_strides0, p0);
+ acc0 = vfmaq_f32(acc0, vecState, vecCoef);
+
+ }
+ *pDst++ = vecAddAcrossF32Mve(acc1);
+ *pDst++ = vecAddAcrossF32Mve(acc0);
+
+ /*
+ * Advance the state pointer by 1
+ * * to process the next group of interpolation factor number samples
+ */
+ pState = pState + 1;
+ /*
+ * Decrement the loop counter
+ */
+ blkCnt--;
+ }
+
+ /*
+ * Processing is complete.
+ * ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
+ * ** This prepares the state buffer for the next function call.
+ */
+
+ /*
+ * Points to the start of the state buffer
+ */
+ pStateCurnt = S->pState;
+ blkCnt = (phaseLen - 1U) >> 2;
+ while (blkCnt > 0U)
+ {
+ vst1q(pStateCurnt, vldrwq_f32(pState));
+ pState += 4;
+ pStateCurnt += 4;
+ blkCnt--;
+ }
+ blkCnt = (phaseLen - 1U) & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ vstrwq_p_f32(pStateCurnt, vldrwq_f32(pState), p0);
+ }
+}
+
+void arm_fir_interpolate_f32(
+ const arm_fir_interpolate_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ const float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
+ uint32_t tapCnt;
+ uint32_t i, blkCnt; /* Loop counters */
+ uint16_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
+ uint32_t strides[4] = { 0, 1 * S->L, 2 * S->L, 3 * S->L };
+ uint32_t stridesM[4] = { 4, 3, 2, 1 };
+ uint32x4_t vec_stridesM = vld1q_u32(stridesM);
+ uint32x4_t vec_strides = vld1q_u32(strides);
+ f32x4_t acc;
+
+
+ if ( S->L == 2 ) {
+ arm_fir_interpolate2_f32_mve(S, pSrc, pDst, blockSize);
+ return;
+ }
+
+ /*
+ * S->pState buffer contains previous frame (phaseLen - 1) samples
+ */
+ /*
+ * pStateCurnt points to the location where the new input data should be written
+ */
+ pStateCurnt = S->pState + (phaseLen - 1U);
+ /*
+ * Total number of input samples
+ */
+ blkCnt = blockSize;
+ /*
+ * Loop over the blockSize.
+ */
+ while (blkCnt > 0U)
+ {
+ /*
+ * Copy new input sample into the state buffer
+ */
+ *pStateCurnt++ = *pSrc++;
+ /*
+ * Loop over the Interpolation factor.
+ */
+ i = S->L;
+ while (i > 0U)
+ {
+ /*
+ * Initialize state pointer
+ */
+ ptr1 = pState;
+ if (i >= 4)
+ {
+ float32_t state0, state1, state2, state3;
+ acc = vdupq_n_f32(0.0f);
+ /*
+ * Initialize coefficient pointer
+ */
+ ptr2 = pCoeffs + (i - 1U) - 4;
+ tapCnt = phaseLen >> 2;
+ while (tapCnt > 0U)
+ {
+ f32x4_t vecCoef;
+ const float32_t *pCoef = ptr2;
+
+ state0 = ptr1[0];
+ state1 = ptr1[1];
+ state2 = ptr1[2];
+ state3 = ptr1[3];
+ ptr1 += 4;
+
+ vecCoef = vldrwq_gather_shifted_offset_f32(pCoef, vec_stridesM);
+ pCoef += S->L;
+ acc = vfmaq_n_f32(acc, vecCoef, state0);
+
+ vecCoef = vldrwq_gather_shifted_offset_f32(pCoef, vec_stridesM);
+ pCoef += S->L;
+ acc = vfmaq_n_f32(acc, vecCoef, state1);
+
+ vecCoef = vldrwq_gather_shifted_offset_f32(pCoef, vec_stridesM);
+ pCoef += S->L;
+ acc = vfmaq_n_f32(acc, vecCoef, state2);
+
+ vecCoef = vldrwq_gather_shifted_offset_f32(pCoef, vec_stridesM);
+ pCoef += S->L;
+ acc = vfmaq_n_f32(acc, vecCoef, state3);
+
+ ptr2 = ptr2 + 4 * S->L;
+ /*
+ * Decrement the loop counter
+ */
+ tapCnt--;
+ }
+
+ tapCnt = phaseLen & 3;
+ if (tapCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(tapCnt);
+ f32x4_t vecCoef;
+ const float32_t *pCoef = ptr2;
+
+ state0 = ptr1[0];
+ state1 = ptr1[1];
+ state2 = ptr1[2];
+ state3 = ptr1[3];
+
+ vecCoef = vldrwq_gather_shifted_offset_z_f32(pCoef, vec_stridesM, p0);
+ pCoef += S->L;
+ acc = vfmaq_n_f32(acc, vecCoef, state0);
+
+ vecCoef = vldrwq_gather_shifted_offset_z_f32(pCoef, vec_stridesM, p0);
+ pCoef += S->L;
+ acc = vfmaq_n_f32(acc, vecCoef, state1);
+
+ vecCoef = vldrwq_gather_shifted_offset_z_f32(pCoef, vec_stridesM, p0);
+ pCoef += S->L;
+ acc = vfmaq_n_f32(acc, vecCoef, state2);
+
+ vecCoef = vldrwq_gather_shifted_offset_z_f32(pCoef, vec_stridesM, p0);
+ pCoef += S->L;
+ acc = vfmaq_n_f32(acc, vecCoef, state3);
+ }
+
+ vst1q(pDst, acc);
+ pDst += 4;
+ i -= 4;
+ }
+ else
+ {
+ acc = vdupq_n_f32(0.0f);
+ /*
+ * Initialize coefficient pointer
+ */
+ ptr2 = pCoeffs + (i - 1U);
+
+ tapCnt = phaseLen >> 2;
+ while (tapCnt > 0U)
+ {
+ f32x4_t vecCoef, vecState;
+
+ vecState = vldrwq_f32(ptr1);
+ ptr1 += 4;
+
+ vecCoef = vldrwq_gather_shifted_offset_f32(ptr2, vec_strides);
+ ptr2 += 4 * S->L;
+ acc = vfmaq_f32(acc, vecState, vecCoef);
+ /*
+ * Decrement the loop counter
+ */
+ tapCnt--;
+ }
+
+ tapCnt = phaseLen & 3;
+ if (tapCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(tapCnt);
+ f32x4_t vecCoef, vecState;
+
+ vecState = vldrwq_z_f32(ptr1, p0);
+
+ vecCoef = vldrwq_gather_shifted_offset_z_f32(ptr2, vec_strides, p0);
+ acc = vfmaq_f32(acc, vecState, vecCoef);
+ }
+ *pDst++ = vecAddAcrossF32Mve(acc);
+ /*
+ * Decrement the loop counter
+ */
+ i--;
+ }
+ }
+
+ /*
+ * Advance the state pointer by 1
+ * * to process the next group of interpolation factor number samples
+ */
+ pState = pState + 1;
+ /*
+ * Decrement the loop counter
+ */
+ blkCnt--;
+ }
+
+ /*
+ * Processing is complete.
+ * ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
+ * ** This prepares the state buffer for the next function call.
+ */
+
+ /*
+ * Points to the start of the state buffer
+ */
+ pStateCurnt = S->pState;
+ blkCnt = (phaseLen - 1U) >> 2;
+ while (blkCnt > 0U)
+ {
+ vst1q(pStateCurnt, vldrwq_f32(pState));
+ pState += 4;
+ pStateCurnt += 4;
+ blkCnt--;
+ }
+ blkCnt = (phaseLen - 1U) & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ vstrwq_p_f32(pStateCurnt, vldrwq_f32(pState), p0);
+ }
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+void arm_fir_interpolate_f32(
+ const arm_fir_interpolate_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *ptr1; /* Temporary pointers for state buffer */
+ const float32_t *ptr2; /* Temporary pointers for coefficient buffer */
+ float32_t sum0; /* Accumulators */
+ float32_t c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t i, blkCnt, j; /* Loop counters */
+ uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */
+ uint32_t blkCntN4;
+ float32_t c1, c2, c3;
+
+ float32x4_t sum0v;
+ float32x4_t accV0,accV1;
+ float32x4_t x0v,x1v,x2v,xa,xb;
+ float32x2_t tempV;
+
+ /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + (phaseLen - 1U);
+
+ /* Initialise blkCnt */
+ blkCnt = blockSize >> 3;
+ blkCntN4 = blockSize & 7;
+
+ /* Loop unrolling */
+ while (blkCnt > 0U)
+ {
+ /* Copy new input samples into the state buffer */
+ sum0v = vld1q_f32(pSrc);
+ vst1q_f32(pStateCurnt,sum0v);
+ pSrc += 4;
+ pStateCurnt += 4;
+
+ sum0v = vld1q_f32(pSrc);
+ vst1q_f32(pStateCurnt,sum0v);
+ pSrc += 4;
+ pStateCurnt += 4;
+
+ /* Address modifier index of coefficient buffer */
+ j = 1U;
+
+ /* Loop over the Interpolation factor. */
+ i = (S->L);
+
+ while (i > 0U)
+ {
+ /* Set accumulator to zero */
+ accV0 = vdupq_n_f32(0.0);
+ accV1 = vdupq_n_f32(0.0);
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (S->L - j);
+
+ /* Loop over the polyPhase length. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ tapCnt = phaseLen >> 2U;
+
+ x0v = vld1q_f32(ptr1);
+ x1v = vld1q_f32(ptr1 + 4);
+
+ while (tapCnt > 0U)
+ {
+ /* Read the input samples */
+ x2v = vld1q_f32(ptr1 + 8);
+
+ /* Read the coefficients */
+ c0 = *(ptr2);
+
+ /* Perform the multiply-accumulate */
+ accV0 = vmlaq_n_f32(accV0,x0v,c0);
+ accV1 = vmlaq_n_f32(accV1,x1v,c0);
+
+ /* Read the coefficients, inputs and perform multiply-accumulate */
+ c1 = *(ptr2 + S->L);
+
+ xa = vextq_f32(x0v,x1v,1);
+ xb = vextq_f32(x1v,x2v,1);
+
+ accV0 = vmlaq_n_f32(accV0,xa,c1);
+ accV1 = vmlaq_n_f32(accV1,xb,c1);
+
+ /* Read the coefficients, inputs and perform multiply-accumulate */
+ c2 = *(ptr2 + S->L * 2);
+
+ xa = vextq_f32(x0v,x1v,2);
+ xb = vextq_f32(x1v,x2v,2);
+
+ accV0 = vmlaq_n_f32(accV0,xa,c2);
+ accV1 = vmlaq_n_f32(accV1,xb,c2);
+
+ /* Read the coefficients, inputs and perform multiply-accumulate */
+ c3 = *(ptr2 + S->L * 3);
+
+ xa = vextq_f32(x0v,x1v,3);
+ xb = vextq_f32(x1v,x2v,3);
+
+ accV0 = vmlaq_n_f32(accV0,xa,c3);
+ accV1 = vmlaq_n_f32(accV1,xb,c3);
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += 4 * S->L;
+ ptr1 += 4;
+ x0v = x1v;
+ x1v = x2v;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = phaseLen % 0x4U;
+
+ x2v = vld1q_f32(ptr1 + 8);
+
+ switch (tapCnt)
+ {
+ case 3:
+ c0 = *(ptr2);
+ accV0 = vmlaq_n_f32(accV0,x0v,c0);
+ accV1 = vmlaq_n_f32(accV1,x1v,c0);
+ ptr2 += S->L;
+
+ c0 = *(ptr2);
+
+ xa = vextq_f32(x0v,x1v,1);
+ xb = vextq_f32(x1v,x2v,1);
+
+ accV0 = vmlaq_n_f32(accV0,xa,c0);
+ accV1 = vmlaq_n_f32(accV1,xb,c0);
+ ptr2 += S->L;
+
+ c0 = *(ptr2);
+
+ xa = vextq_f32(x0v,x1v,2);
+ xb = vextq_f32(x1v,x2v,2);
+
+ accV0 = vmlaq_n_f32(accV0,xa,c0);
+ accV1 = vmlaq_n_f32(accV1,xb,c0);
+ ptr2 += S->L;
+
+ break;
+
+ case 2:
+ c0 = *(ptr2);
+ accV0 = vmlaq_n_f32(accV0,x0v,c0);
+ accV1 = vmlaq_n_f32(accV1,x1v,c0);
+ ptr2 += S->L;
+
+ c0 = *(ptr2);
+
+ xa = vextq_f32(x0v,x1v,1);
+ xb = vextq_f32(x1v,x2v,1);
+
+ accV0 = vmlaq_n_f32(accV0,xa,c0);
+ accV1 = vmlaq_n_f32(accV1,xb,c0);
+ ptr2 += S->L;
+
+ break;
+
+ case 1:
+ c0 = *(ptr2);
+ accV0 = vmlaq_n_f32(accV0,x0v,c0);
+ accV1 = vmlaq_n_f32(accV1,x1v,c0);
+ ptr2 += S->L;
+
+ break;
+
+ default:
+ break;
+
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst = vgetq_lane_f32(accV0, 0);
+ *(pDst + S->L) = vgetq_lane_f32(accV0, 1);
+ *(pDst + 2 * S->L) = vgetq_lane_f32(accV0, 2);
+ *(pDst + 3 * S->L) = vgetq_lane_f32(accV0, 3);
+
+ *(pDst + 4 * S->L) = vgetq_lane_f32(accV1, 0);
+ *(pDst + 5 * S->L) = vgetq_lane_f32(accV1, 1);
+ *(pDst + 6 * S->L) = vgetq_lane_f32(accV1, 2);
+ *(pDst + 7 * S->L) = vgetq_lane_f32(accV1, 3);
+
+ pDst++;
+
+ /* Increment the address modifier index of coefficient buffer */
+ j++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 8;
+
+ pDst += S->L * 7;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+
+ while (blkCntN4 > 0U)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Address modifier index of coefficient buffer */
+ j = 1U;
+
+ /* Loop over the Interpolation factor. */
+ i = S->L;
+
+ while (i > 0U)
+ {
+ /* Set accumulator to zero */
+ sum0v = vdupq_n_f32(0.0);
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (S->L - j);
+
+ /* Loop over the polyPhase length. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ tapCnt = phaseLen >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the coefficient */
+ x1v = vsetq_lane_f32(*(ptr2),x1v,0);
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0v = vld1q_f32(ptr1);
+ ptr1 += 4;
+
+ /* Read the coefficient */
+ x1v = vsetq_lane_f32(*(ptr2),x1v,1);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the coefficient */
+ x1v = vsetq_lane_f32(*(ptr2),x1v,2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the coefficient */
+ x1v = vsetq_lane_f32(*(ptr2),x1v,3);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ sum0v = vmlaq_f32(sum0v,x0v,x1v);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tempV = vpadd_f32(vget_low_f32(sum0v),vget_high_f32(sum0v));
+ sum0 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1);
+
+ /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = phaseLen % 0x4U;
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum0 += *(ptr1++) * (*ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = sum0;
+
+ /* Increment the address modifier index of coefficient buffer */
+ j++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCntN4--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ tapCnt = (phaseLen - 1U) >> 2U;
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ sum0v = vld1q_f32(pState);
+ vst1q_f32(pStateCurnt,sum0v);
+ pState += 4;
+ pStateCurnt += 4;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (phaseLen - 1U) % 0x04U;
+
+ /* copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+}
+#else
+
+void arm_fir_interpolate_f32(
+ const arm_fir_interpolate_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
+
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCur; /* Points to the current sample of the state */
+ float32_t *ptr1; /* Temporary pointer for state buffer */
+ const float32_t *ptr2; /* Temporary pointer for coefficient buffer */
+ float32_t sum0; /* Accumulators */
+ uint32_t i, blkCnt, tapCnt; /* Loop counters */
+ uint32_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
+ uint32_t j;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ float32_t acc0, acc1, acc2, acc3;
+ float32_t x0, x1, x2, x3;
+ float32_t c0, c1, c2, c3;
+#endif
+
+ /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (phaseLen - 1U);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCur++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
+
+ /* Address modifier index of coefficient buffer */
+ j = 1U;
+
+ /* Loop over the Interpolation factor. */
+ i = (S->L);
+
+ while (i > 0U)
+ {
+ /* Set accumulator to zero */
+ acc0 = 0.0f;
+ acc1 = 0.0f;
+ acc2 = 0.0f;
+ acc3 = 0.0f;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (S->L - j);
+
+ /* Loop over the polyPhase length. Unroll by a factor of 4.
+ Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ tapCnt = phaseLen >> 2U;
+
+ x0 = *(ptr1++);
+ x1 = *(ptr1++);
+ x2 = *(ptr1++);
+
+ while (tapCnt > 0U)
+ {
+ /* Read the input sample */
+ x3 = *(ptr1++);
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+ acc2 += x2 * c0;
+ acc3 += x3 * c0;
+
+ /* Read the coefficient */
+ c1 = *(ptr2 + S->L);
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x1 * c1;
+ acc1 += x2 * c1;
+ acc2 += x3 * c1;
+ acc3 += x0 * c1;
+
+ /* Read the coefficient */
+ c2 = *(ptr2 + S->L * 2);
+
+ /* Read the input sample */
+ x1 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x2 * c2;
+ acc1 += x3 * c2;
+ acc2 += x0 * c2;
+ acc3 += x1 * c2;
+
+ /* Read the coefficient */
+ c3 = *(ptr2 + S->L * 3);
+
+ /* Read the input sample */
+ x2 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x3 * c3;
+ acc1 += x0 * c3;
+ acc2 += x1 * c3;
+ acc3 += x2 * c3;
+
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += 4 * S->L;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = phaseLen % 0x4U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the input sample */
+ x3 = *(ptr1++);
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+ acc2 += x2 * c0;
+ acc3 += x3 * c0;
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* update states for next sample processing */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *(pDst ) = acc0;
+ *(pDst + S->L) = acc1;
+ *(pDst + 2 * S->L) = acc2;
+ *(pDst + 3 * S->L) = acc3;
+
+ pDst++;
+
+ /* Increment the address modifier index of coefficient buffer */
+ j++;
+
+ /* Decrement loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 4;
+
+ pDst += S->L * 3;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCur++ = *pSrc++;
+
+ /* Address modifier index of coefficient buffer */
+ j = 1U;
+
+ /* Loop over the Interpolation factor. */
+ i = S->L;
+
+ while (i > 0U)
+ {
+ /* Set accumulator to zero */
+ sum0 = 0.0f;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (S->L - j);
+
+ /* Loop over the polyPhase length.
+ Repeat until we've computed numTaps-(4*S->L) coefficients. */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ tapCnt = phaseLen >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum0 += *ptr1++ * *ptr2;
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ sum0 += *ptr1++ * *ptr2;
+ ptr2 += S->L;
+
+ sum0 += *ptr1++ * *ptr2;
+ ptr2 += S->L;
+
+ sum0 += *ptr1++ * *ptr2;
+ ptr2 += S->L;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ tapCnt = phaseLen % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = phaseLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum0 += *ptr1++ * *ptr2;
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = sum0;
+
+ /* Increment the address modifier index of coefficient buffer */
+ j++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCur = S->pState;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ tapCnt = (phaseLen - 1U) >> 2U;
+
+ /* copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ tapCnt = (phaseLen - 1U) % 0x04U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = (phaseLen - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+#else
+/* alternate version for CM0_FAMILY */
+
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCur; /* Points to the current sample of the state */
+ float32_t *ptr1; /* Temporary pointer for state buffer */
+ const float32_t *ptr2; /* Temporary pointer for coefficient buffer */
+ float32_t sum0; /* Accumulators */
+ uint32_t i, blkCnt, tapCnt; /* Loop counters */
+ uint32_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
+
+ /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (phaseLen - 1U);
+
+ /* Total number of intput samples */
+ blkCnt = blockSize;
+
+ /* Loop over the blockSize. */
+ while (blkCnt > 0U)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCur++ = *pSrc++;
+
+ /* Loop over the Interpolation factor. */
+ i = S->L;
+
+ while (i > 0U)
+ {
+ /* Set accumulator to zero */
+ sum0 = 0.0f;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (i - 1U);
+
+ /* Loop over the polyPhase length */
+ tapCnt = phaseLen;
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum0 += *ptr1++ * *ptr2;
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = sum0;
+
+ /* Decrement loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 1;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCur = S->pState;
+
+ tapCnt = phaseLen - 1U;
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
+}
+
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+ @} end of FIR_Interpolate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c
new file mode 100644
index 0000000000..67bdff9c0a
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c
@@ -0,0 +1,106 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_interpolate_init_f32.c
+ * Description: Floating-point FIR interpolator initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Interpolate
+ @{
+ */
+
+/**
+ @brief Initialization function for the floating-point FIR interpolator.
+ @param[in,out] S points to an instance of the floating-point FIR interpolator structure
+ @param[in] L upsample factor
+ @param[in] numTaps number of filter coefficients in the filter
+ @param[in] pCoeffs points to the filter coefficient buffer
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of input samples to process per call
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_LENGTH_ERROR : filter length <code>numTaps</code> is not a multiple of the interpolation factor <code>L</code>
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
+
+ @par
+ The length of the filter numTaps must be a multiple of the interpolation factor L.
+ @par
+ pState points to the array of state variables.
+ pState is of length (numTaps/L)+blockSize-1 words
+ where blockSize is the number of input samples processed by each call to arm_fir_interpolate_f32().
+ */
+
+arm_status arm_fir_interpolate_init_f32(
+ arm_fir_interpolate_instance_f32 * S,
+ uint8_t L,
+ uint16_t numTaps,
+ const float32_t * pCoeffs,
+ float32_t * pState,
+ uint32_t blockSize)
+{
+ arm_status status;
+
+ /* The filter length must be a multiple of the interpolation factor */
+ if ((numTaps % L) != 0U)
+ {
+ /* Set status as ARM_MATH_LENGTH_ERROR */
+ status = ARM_MATH_LENGTH_ERROR;
+ }
+ else
+ {
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign Interpolation factor */
+ S->L = L;
+
+ /* Assign polyPhaseLength */
+ S->phaseLength = numTaps / L;
+
+ /* Clear state buffer and size of buffer is always phaseLength + blockSize - 1 */
+ memset(pState, 0, (blockSize + ((uint32_t) S->phaseLength - 1U)) * sizeof(float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ status = ARM_MATH_SUCCESS;
+ }
+
+ return (status);
+}
+
+/**
+ @} end of FIR_Interpolate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c
new file mode 100644
index 0000000000..7436aa434b
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c
@@ -0,0 +1,106 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_interpolate_init_q15.c
+ * Description: Q15 FIR interpolator initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Interpolate
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q15 FIR interpolator.
+ @param[in,out] S points to an instance of the Q15 FIR interpolator structure
+ @param[in] L upsample factor
+ @param[in] numTaps number of filter coefficients in the filter
+ @param[in] pCoeffs points to the filter coefficient buffer
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of input samples to process per call
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_LENGTH_ERROR : filter length <code>numTaps</code> is not a multiple of the interpolation factor <code>L</code>
+
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
+
+ The length of the filter numTaps must be a multiple of the interpolation factor L.
+ @par
+ pState points to the array of state variables.
+ pState is of length (numTaps/L)+blockSize-1 words
+ where blockSize is the number of input samples processed by each call to arm_fir_interpolate_q15().
+ */
+
+arm_status arm_fir_interpolate_init_q15(
+ arm_fir_interpolate_instance_q15 * S,
+ uint8_t L,
+ uint16_t numTaps,
+ const q15_t * pCoeffs,
+ q15_t * pState,
+ uint32_t blockSize)
+{
+ arm_status status;
+
+ /* The filter length must be a multiple of the interpolation factor */
+ if ((numTaps % L) != 0U)
+ {
+ /* Set status as ARM_MATH_LENGTH_ERROR */
+ status = ARM_MATH_LENGTH_ERROR;
+ }
+ else
+ {
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign Interpolation factor */
+ S->L = L;
+
+ /* Assign polyPhaseLength */
+ S->phaseLength = numTaps / L;
+
+ /* Clear state buffer and size of buffer is always phaseLength + blockSize - 1 */
+ memset(pState, 0, (blockSize + ((uint32_t) S->phaseLength - 1U)) * sizeof(q15_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ status = ARM_MATH_SUCCESS;
+ }
+
+ return (status);
+}
+
+/**
+ @} end of FIR_Interpolate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c
new file mode 100644
index 0000000000..a08221605e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c
@@ -0,0 +1,105 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_interpolate_init_q31.c
+ * Description: Q31 FIR interpolator initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Interpolate
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q31 FIR interpolator.
+ @param[in,out] S points to an instance of the Q31 FIR interpolator structure
+ @param[in] L upsample factor
+ @param[in] numTaps number of filter coefficients in the filter
+ @param[in] pCoeffs points to the filter coefficient buffer
+ @param[in] pState points to the state buffer
+ @param[in] blockSize number of input samples to process per call
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_LENGTH_ERROR : filter length <code>numTaps</code> is not a multiple of the interpolation factor <code>L</code>
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], b[numTaps-3], ..., b[1], b[0]}
+
+ The length of the filter numTaps must be a multiple of the interpolation factor L.
+ @par
+ pState points to the array of state variables.
+ pState is of length (numTaps/L)+blockSize-1 words
+ where blockSize is the number of input samples processed by each call to arm_fir_interpolate_q31().
+ */
+
+arm_status arm_fir_interpolate_init_q31(
+ arm_fir_interpolate_instance_q31 * S,
+ uint8_t L,
+ uint16_t numTaps,
+ const q31_t * pCoeffs,
+ q31_t * pState,
+ uint32_t blockSize)
+{
+ arm_status status;
+
+ /* The filter length must be a multiple of the interpolation factor */
+ if ((numTaps % L) != 0U)
+ {
+ /* Set status as ARM_MATH_LENGTH_ERROR */
+ status = ARM_MATH_LENGTH_ERROR;
+ }
+ else
+ {
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign Interpolation factor */
+ S->L = L;
+
+ /* Assign polyPhaseLength */
+ S->phaseLength = numTaps / L;
+
+ /* Clear state buffer and size of buffer is always phaseLength + blockSize - 1 */
+ memset(pState, 0, (blockSize + ((uint32_t) S->phaseLength - 1U)) * sizeof(q31_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ status = ARM_MATH_SUCCESS;
+ }
+
+ return (status);
+}
+
+/**
+ @} end of FIR_Interpolate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q15.c
new file mode 100644
index 0000000000..523e1557fe
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q15.c
@@ -0,0 +1,774 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_interpolate_q15.c
+ * Description: Q15 FIR interpolation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Interpolate
+ @{
+ */
+
+/**
+ @brief Processing function for the Q15 FIR interpolator.
+ @param[in] S points to an instance of the Q15 FIR interpolator structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+ Lastly, the accumulator is saturated to yield a result in 1.15 format.
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+void arm_fir_interpolate_q15(
+  const arm_fir_interpolate_instance_q15 * S,
+  const q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+  q15_t *pState = S->pState;                 /* State pointer */
+  const q15_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
+  q15_t *pStateCurnt;                        /* Points to the current sample of the state */
+  const q15_t *ptr1, *ptr2;                  /* Temporary pointers for state and coefficient buffers */
+
+  uint32_t i, blkCnt;                        /* Loop counters */
+  uint16_t phaseLen = S->phaseLength;        /* Length of each polyphase filter component */
+  uint16_t strides[8] = {                    /* gather offsets: k * L selects one polyphase branch */
+    0, 1 * S->L, 2 * S->L, 3 * S->L,
+    4 * S->L, 5 * S->L, 6 * S->L, 7 * S->L
+  };
+  uint16x8_t vec_strides0 = *(uint16x8_t *) strides;
+  uint16x8_t vec_strides1 = vec_strides0 + 1;
+  uint16x8_t vec_strides2 = vec_strides0 + 2;
+  uint16x8_t vec_strides3 = vec_strides0 + 3;
+  q15x8_t vecState, vecCoef;
+
+  /*
+   * S->pState buffer contains previous frame (phaseLen - 1) samples
+   * pStateCurnt points to the location where the new input data should be written
+   */
+  pStateCurnt = S->pState + ((q15_t) phaseLen - 1);
+  /*
+   * Total number of input samples
+   */
+  blkCnt = blockSize;
+  /*
+   * Loop over the blockSize.
+   */
+  while (blkCnt > 0U)
+  {
+    /*
+     * Copy new input sample into the state buffer
+     */
+    *pStateCurnt++ = *pSrc++;
+    /*
+     * Loop over the Interpolation factor.
+     */
+    i = S->L;
+    while (i > 0U)
+    {
+      /*
+       * Initialize state pointer
+       */
+      ptr1 = pState;
+      if (i >= 4)
+      {
+        /*
+         * Initialize coefficient pointer
+         */
+        ptr2 = pCoeffs + (i - 1 - 3U);
+
+        q63_t acc0 = 0LL;
+        q63_t acc1 = 0LL;
+        q63_t acc2 = 0LL;
+        q63_t acc3 = 0LL;
+
+        uint32_t tapCnt = phaseLen >> 3;
+        while (tapCnt > 0U)
+        {
+          vecState = vldrhq_s16(ptr1);
+
+          vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides3);
+          acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+
+          vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides2);
+          acc1 = vmlaldavaq(acc1, vecState, vecCoef);
+
+          vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides1);
+          acc2 = vmlaldavaq(acc2, vecState, vecCoef);
+
+          vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
+          acc3 = vmlaldavaq(acc3, vecState, vecCoef);
+
+          ptr1 += 8;
+          ptr2 = ptr2 + S->L * 8;
+          tapCnt--;
+        }
+        tapCnt = phaseLen & 7;  /* residual taps handled with tail predication */
+        if (tapCnt > 0U)
+        {
+          mve_pred16_t p0 = vctp16q(tapCnt);
+
+          vecState = vldrhq_z_s16(ptr1, p0);
+
+          vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides3, p0);
+          acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+
+          vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides2, p0);
+          acc1 = vmlaldavaq(acc1, vecState, vecCoef);
+
+          vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides1, p0);
+          acc2 = vmlaldavaq(acc2, vecState, vecCoef);
+
+          vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
+          acc3 = vmlaldavaq(acc3, vecState, vecCoef);
+        }
+
+        acc0 = asrl(acc0, 15);  /* truncate 34.30 accumulator to 34.15 */
+        acc1 = asrl(acc1, 15);
+        acc2 = asrl(acc2, 15);
+        acc3 = asrl(acc3, 15);
+
+        *pDst++ = (q15_t) __SSAT(acc0, 16);
+        *pDst++ = (q15_t) __SSAT(acc1, 16);
+        *pDst++ = (q15_t) __SSAT(acc2, 16);
+        *pDst++ = (q15_t) __SSAT(acc3, 16);
+        i -= 4;
+      }
+      else if (i >= 3)
+      {
+        /*
+         * Initialize coefficient pointer
+         */
+        ptr2 = pCoeffs + (i - 1U - 2);
+
+        q63_t acc0 = 0LL;
+        q63_t acc1 = 0LL;
+        q63_t acc2 = 0LL;
+
+        uint32_t tapCnt = phaseLen >> 3;
+        while (tapCnt > 0U)
+        {
+          vecState = vldrhq_s16(ptr1);
+
+          vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides2);
+          acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+
+          vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides1);
+          acc1 = vmlaldavaq(acc1, vecState, vecCoef);
+
+          vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
+          acc2 = vmlaldavaq(acc2, vecState, vecCoef);
+
+          ptr1 += 8;
+          ptr2 = ptr2 + S->L * 8;
+          tapCnt--;
+        }
+        tapCnt = phaseLen & 7;
+        if (tapCnt > 0U)
+        {
+          mve_pred16_t p0 = vctp16q(tapCnt);
+
+          vecState = vldrhq_z_s16(ptr1, p0);
+
+          vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides2, p0);
+          acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+
+          vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides1, p0);
+          acc1 = vmlaldavaq(acc1, vecState, vecCoef);
+
+          vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
+          acc2 = vmlaldavaq(acc2, vecState, vecCoef);
+        }
+
+        acc0 = asrl(acc0, 15);
+        acc1 = asrl(acc1, 15);
+        acc2 = asrl(acc2, 15);
+
+        *pDst++ = (q15_t) __SSAT(acc0, 16);
+        *pDst++ = (q15_t) __SSAT(acc1, 16);
+        *pDst++ = (q15_t) __SSAT(acc2, 16);
+        i -= 3;
+      }
+      else if (i >= 2)
+      {
+        /*
+         * Initialize coefficient pointer
+         */
+        ptr2 = pCoeffs + (i - 1U - 1);
+
+        q63_t acc0 = 0LL;
+        q63_t acc1 = 0LL;
+
+        uint32_t tapCnt = phaseLen >> 3;
+        while (tapCnt > 0U)
+        {
+          vecState = vldrhq_s16(ptr1);
+
+          vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides1);
+          acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+
+          vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
+          acc1 = vmlaldavaq(acc1, vecState, vecCoef);
+
+          ptr1 += 8;
+          ptr2 = ptr2 + S->L * 8;
+          tapCnt--;
+        }
+        tapCnt = phaseLen & 7;
+        if (tapCnt > 0U)
+        {
+          mve_pred16_t p0 = vctp16q(tapCnt);
+
+          vecState = vldrhq_z_s16(ptr1, p0);
+
+          vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides1, p0);
+          acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+
+          vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
+          acc1 = vmlaldavaq(acc1, vecState, vecCoef);
+        }
+
+        acc0 = asrl(acc0, 15);
+        acc1 = asrl(acc1, 15);
+
+        *pDst++ = (q15_t) __SSAT(acc0, 16);
+        *pDst++ = (q15_t) __SSAT(acc1, 16);
+        i -= 2;
+      }
+      else
+      {
+        /*
+         * Initialize coefficient pointer
+         */
+        ptr2 = pCoeffs + (i - 1U);
+
+        q63_t acc0 = 0LL;
+
+        uint32_t tapCnt = phaseLen >> 3;
+        while (tapCnt > 0U)
+        {
+          vecState = vldrhq_s16(ptr1);
+          vecCoef = vldrhq_gather_shifted_offset_s16(ptr2, vec_strides0);
+
+          acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+
+          ptr1 += 8;
+          ptr2 = ptr2 + S->L * 8;
+          tapCnt--;
+        }
+        tapCnt = phaseLen & 7;
+        if (tapCnt > 0U)
+        {
+          mve_pred16_t p0 = vctp16q(tapCnt);
+
+          vecState = vldrhq_z_s16(ptr1, p0);
+          vecCoef = vldrhq_gather_shifted_offset_z_s16(ptr2, vec_strides0, p0);
+          acc0 = vmlaldavaq(acc0, vecState, vecCoef);
+        }
+
+        acc0 = asrl(acc0, 15);
+        *pDst++ = (q15_t) __SSAT(acc0, 16);
+        /*
+         * Decrement the loop counter
+         */
+        i--;
+      }
+    }
+    /*
+     * Advance the state pointer by 1
+     * to process the next group of interpolation factor number samples
+     */
+    pState = pState + 1;
+    /*
+     * Decrement the loop counter
+     */
+    blkCnt--;
+  }
+
+  /*
+   * Processing is complete.
+   * Now copy the last phaseLen - 1 samples to the start of the state buffer.
+   * This prepares the state buffer for the next function call.
+   */
+
+  /*
+   * Points to the start of the state buffer
+   */
+  pStateCurnt = S->pState;
+  blkCnt = (phaseLen - 1U) >> 3;
+  while (blkCnt > 0U)
+  {
+    vstrhq_s16(pStateCurnt, vldrhq_s16(pState));
+    pState += 8;
+    pStateCurnt += 8;
+    blkCnt--;
+  }
+  blkCnt = (phaseLen - 1U) & 7;
+  if (blkCnt > 0U)
+  {
+    mve_pred16_t p0 = vctp16q(blkCnt);
+    vstrhq_p_s16(pStateCurnt, vldrhq_s16(pState), p0);
+  }
+}
+#else
+void arm_fir_interpolate_q15(
+  const arm_fir_interpolate_instance_q15 * S,
+  const q15_t * pSrc,
+  q15_t * pDst,
+  uint32_t blockSize)
+{
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
+
+  q15_t *pState = S->pState;                 /* State pointer */
+  const q15_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
+  q15_t *pStateCur;                          /* Points to the current sample of the state */
+  q15_t *ptr1;                               /* Temporary pointer for state buffer */
+  const q15_t *ptr2;                         /* Temporary pointer for coefficient buffer */
+  q63_t sum0;                                /* Accumulators */
+  uint32_t i, blkCnt, tapCnt;                /* Loop counters */
+  uint32_t phaseLen = S->phaseLength;        /* Length of each polyphase filter component */
+  uint32_t j;                                /* Polyphase branch index (coefficient offset modifier) */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  q63_t acc0, acc1, acc2, acc3;
+  q15_t x0, x1, x2, x3;
+  q15_t c0, c1, c2, c3;
+#endif
+
+  /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+  /* pStateCur points to the location where the new input data should be written */
+  pStateCur = S->pState + (phaseLen - 1U);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Copy new input sample into the state buffer */
+    *pStateCur++ = *pSrc++;
+    *pStateCur++ = *pSrc++;
+    *pStateCur++ = *pSrc++;
+    *pStateCur++ = *pSrc++;
+
+    /* Address modifier index of coefficient buffer */
+    j = 1U;
+
+    /* Loop over the Interpolation factor. */
+    i = (S->L);
+
+    while (i > 0U)
+    {
+      /* Set accumulator to zero */
+      acc0 = 0;
+      acc1 = 0;
+      acc2 = 0;
+      acc3 = 0;
+
+      /* Initialize state pointer */
+      ptr1 = pState;
+
+      /* Initialize coefficient pointer */
+      ptr2 = pCoeffs + (S->L - j);
+
+      /* Loop over the polyPhase length. Unroll by a factor of 4.
+         Repeat until we've computed numTaps-(4*S->L) coefficients. */
+      tapCnt = phaseLen >> 2U;
+
+      x0 = *(ptr1++);
+      x1 = *(ptr1++);
+      x2 = *(ptr1++);
+
+      while (tapCnt > 0U)
+      {
+        /* Read the input sample */
+        x3 = *(ptr1++);
+
+        /* Read the coefficient */
+        c0 = *(ptr2);
+
+        /* Perform the multiply-accumulate */
+        acc0 += (q63_t) x0 * c0;
+        acc1 += (q63_t) x1 * c0;
+        acc2 += (q63_t) x2 * c0;
+        acc3 += (q63_t) x3 * c0;
+
+        /* Read the coefficient */
+        c1 = *(ptr2 + S->L);
+
+        /* Read the input sample */
+        x0 = *(ptr1++);
+
+        /* Perform the multiply-accumulate */
+        acc0 += (q63_t) x1 * c1;
+        acc1 += (q63_t) x2 * c1;
+        acc2 += (q63_t) x3 * c1;
+        acc3 += (q63_t) x0 * c1;
+
+        /* Read the coefficient */
+        c2 = *(ptr2 + S->L * 2);
+
+        /* Read the input sample */
+        x1 = *(ptr1++);
+
+        /* Perform the multiply-accumulate */
+        acc0 += (q63_t) x2 * c2;
+        acc1 += (q63_t) x3 * c2;
+        acc2 += (q63_t) x0 * c2;
+        acc3 += (q63_t) x1 * c2;
+
+        /* Read the coefficient */
+        c3 = *(ptr2 + S->L * 3);
+
+        /* Read the input sample */
+        x2 = *(ptr1++);
+
+        /* Perform the multiply-accumulate */
+        acc0 += (q63_t) x3 * c3;
+        acc1 += (q63_t) x0 * c3;
+        acc2 += (q63_t) x1 * c3;
+        acc3 += (q63_t) x2 * c3;
+
+
+        /* Upsampling is done by stuffing L-1 zeros between each sample.
+         * So instead of multiplying zeros with coefficients,
+         * Increment the coefficient pointer by interpolation factor times. */
+        ptr2 += 4 * S->L;
+
+        /* Decrement loop counter */
+        tapCnt--;
+      }
+
+      /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+      tapCnt = phaseLen % 0x4U;
+
+      while (tapCnt > 0U)
+      {
+        /* Read the input sample */
+        x3 = *(ptr1++);
+
+        /* Read the coefficient */
+        c0 = *(ptr2);
+
+        /* Perform the multiply-accumulate */
+        acc0 += (q63_t) x0 * c0;
+        acc1 += (q63_t) x1 * c0;
+        acc2 += (q63_t) x2 * c0;
+        acc3 += (q63_t) x3 * c0;
+
+        /* Increment the coefficient pointer by interpolation factor times. */
+        ptr2 += S->L;
+
+        /* update states for next sample processing */
+        x0 = x1;
+        x1 = x2;
+        x2 = x3;
+
+        /* Decrement loop counter */
+        tapCnt--;
+      }
+
+      /* The result is in the accumulator, store in the destination buffer. */
+      *(pDst          ) = (q15_t) (__SSAT((acc0 >> 15), 16));
+      *(pDst +     S->L) = (q15_t) (__SSAT((acc1 >> 15), 16));
+      *(pDst + 2 * S->L) = (q15_t) (__SSAT((acc2 >> 15), 16));
+      *(pDst + 3 * S->L) = (q15_t) (__SSAT((acc3 >> 15), 16));
+
+      pDst++;
+
+      /* Increment the address modifier index of coefficient buffer */
+      j++;
+
+      /* Decrement loop counter */
+      i--;
+    }
+
+    /* Advance the state pointer by 1
+     * to process the next group of interpolation factor number samples */
+    pState = pState + 4;
+
+    pDst += S->L * 3;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* Copy new input sample into the state buffer */
+    *pStateCur++ = *pSrc++;
+
+    /* Address modifier index of coefficient buffer */
+    j = 1U;
+
+    /* Loop over the Interpolation factor. */
+    i = S->L;
+    while (i > 0U)
+    {
+      /* Set accumulator to zero */
+      sum0 = 0;
+
+      /* Initialize state pointer */
+      ptr1 = pState;
+
+      /* Initialize coefficient pointer */
+      ptr2 = pCoeffs + (S->L - j);
+
+      /* Loop over the polyPhase length.
+         Repeat until we've computed numTaps-(4*S->L) coefficients. */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+      tapCnt = phaseLen >> 2U;
+
+      while (tapCnt > 0U)
+      {
+        /* Perform the multiply-accumulate */
+        sum0 += (q63_t) *ptr1++ * *ptr2;
+
+        /* Upsampling is done by stuffing L-1 zeros between each sample.
+         * So instead of multiplying zeros with coefficients,
+         * Increment the coefficient pointer by interpolation factor times. */
+        ptr2 += S->L;
+
+        sum0 += (q63_t) *ptr1++ * *ptr2;
+        ptr2 += S->L;
+
+        sum0 += (q63_t) *ptr1++ * *ptr2;
+        ptr2 += S->L;
+
+        sum0 += (q63_t) *ptr1++ * *ptr2;
+        ptr2 += S->L;
+
+        /* Decrement loop counter */
+        tapCnt--;
+      }
+
+      /* Loop unrolling: Compute remaining outputs */
+      tapCnt = phaseLen % 0x4U;
+
+#else
+
+      /* Initialize tapCnt with number of samples */
+      tapCnt = phaseLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+      while (tapCnt > 0U)
+      {
+        /* Perform the multiply-accumulate */
+        sum0 += (q63_t) *ptr1++ * *ptr2;
+
+        /* Upsampling is done by stuffing L-1 zeros between each sample.
+         * So instead of multiplying zeros with coefficients,
+         * Increment the coefficient pointer by interpolation factor times. */
+        ptr2 += S->L;
+
+        /* Decrement loop counter */
+        tapCnt--;
+      }
+
+      /* The result is in the accumulator, store in the destination buffer. */
+      *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
+
+      /* Increment the address modifier index of coefficient buffer */
+      j++;
+
+      /* Decrement the loop counter */
+      i--;
+    }
+
+    /* Advance the state pointer by 1
+     * to process the next group of interpolation factor number samples */
+    pState = pState + 1;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.
+     Now copy the last phaseLen - 1 samples to the start of the state buffer.
+     This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCur = S->pState;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  tapCnt = (phaseLen - 1U) >> 2U;
+
+  /* copy data */
+  while (tapCnt > 0U)
+  {
+    write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
+    write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  tapCnt = (phaseLen - 1U) % 0x04U;
+
+#else
+
+  /* Initialize tapCnt with number of samples */
+  tapCnt = (phaseLen - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  /* Copy data */
+  while (tapCnt > 0U)
+  {
+    *pStateCur++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+#else
+/* alternate version for CM0_FAMILY */
+
+  q15_t *pState = S->pState;                 /* State pointer */
+  const q15_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
+  q15_t *pStateCur;                          /* Points to the current sample of the state */
+  q15_t *ptr1;                               /* Temporary pointer for state buffer */
+  const q15_t *ptr2;                         /* Temporary pointer for coefficient buffer */
+  q63_t sum0;                                /* Accumulators */
+  uint32_t i, blkCnt, tapCnt;                /* Loop counters */
+  uint32_t phaseLen = S->phaseLength;        /* Length of each polyphase filter component */
+
+  /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+  /* pStateCur points to the location where the new input data should be written */
+  pStateCur = S->pState + (phaseLen - 1U);
+
+  /* Total number of input samples */
+  blkCnt = blockSize;
+
+  /* Loop over the blockSize. */
+  while (blkCnt > 0U)
+  {
+    /* Copy new input sample into the state buffer */
+    *pStateCur++ = *pSrc++;
+
+    /* Loop over the Interpolation factor. */
+    i = S->L;
+
+    while (i > 0U)
+    {
+      /* Set accumulator to zero */
+      sum0 = 0;
+
+      /* Initialize state pointer */
+      ptr1 = pState;
+
+      /* Initialize coefficient pointer */
+      ptr2 = pCoeffs + (i - 1U);
+
+      /* Loop over the polyPhase length */
+      tapCnt = phaseLen;
+
+      while (tapCnt > 0U)
+      {
+        /* Perform the multiply-accumulate */
+        sum0 += ((q63_t) *ptr1++ * *ptr2);
+
+        /* Increment the coefficient pointer by interpolation factor times. */
+        ptr2 += S->L;
+
+        /* Decrement the loop counter */
+        tapCnt--;
+      }
+
+      /* Store the result after converting to 1.15 format in the destination buffer. */
+      *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
+
+      /* Decrement loop counter */
+      i--;
+    }
+
+    /* Advance the state pointer by 1
+     * to process the next group of interpolation factor number samples */
+    pState = pState + 1;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Processing is complete.
+  ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
+  ** This prepares the state buffer for the next function call. */
+
+  /* Points to the start of the state buffer */
+  pStateCur = S->pState;
+
+  tapCnt = phaseLen - 1U;
+
+  /* Copy data */
+  while (tapCnt > 0U)
+  {
+    *pStateCur++ = *pState++;
+
+    /* Decrement loop counter */
+    tapCnt--;
+  }
+
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
+}
+#endif /* defined(ARM_MATH_MVEI)*/
+/**
+ @} end of FIR_Interpolate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q31.c
new file mode 100644
index 0000000000..ea217603ec
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_interpolate_q31.c
@@ -0,0 +1,773 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_interpolate_q31.c
+ * Description: Q31 FIR interpolation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Interpolate
+ @{
+ */
+
+/**
+ @brief Processing function for the Q31 FIR interpolator.
+ @param[in] S points to an instance of the Q31 FIR interpolator structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around rather than clip.
+ In order to avoid overflows completely the input signal must be scaled down by 1/(numTaps/L).
+ since numTaps/L additions occur per output sample.
+ After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format.
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+void arm_fir_interpolate_q31(
+  const arm_fir_interpolate_instance_q31 * S,
+  const q31_t * pSrc,
+  q31_t * pDst,
+  uint32_t blockSize)
+{
+  q31_t *pState = S->pState;                 /* State pointer */
+  const q31_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
+  q31_t *pStateCurnt;                        /* Points to the current sample of the state */
+  const q31_t *ptr1, *ptr2;                  /* Temporary pointers for state and coefficient buffers */
+
+  uint32_t i, blkCnt;                        /* Loop counters */
+  uint16_t phaseLen = S->phaseLength;        /* Length of each polyphase filter component */
+  uint32_t strides[4] = { 0, 1 * S->L, 2 * S->L, 3 * S->L };  /* gather offsets: k * L selects one polyphase branch */
+  uint32x4_t vec_strides0 = vld1q_u32(strides);
+  uint32x4_t vec_strides1 = vec_strides0 + 1;
+  uint32x4_t vec_strides2 = vec_strides0 + 2;
+  uint32x4_t vec_strides3 = vec_strides0 + 3;
+  q31x4_t vecState, vecCoef;
+
+  /*
+   * S->pState buffer contains previous frame (phaseLen - 1) samples
+   * pStateCurnt points to the location where the new input data should be written
+   */
+  pStateCurnt = S->pState + ((q31_t) phaseLen - 1);
+  /*
+   * Total number of input samples
+   */
+  blkCnt = blockSize;
+  /*
+   * Loop over the blockSize.
+   */
+  while (blkCnt > 0U)
+  {
+    /*
+     * Copy new input sample into the state buffer
+     */
+    *pStateCurnt++ = *pSrc++;
+    /*
+     * Loop over the Interpolation factor.
+     */
+    i = S->L;
+    while (i > 0U)
+    {
+      /*
+       * Initialize state pointer
+       */
+      ptr1 = pState;
+      if (i >= 4)
+      {
+        /*
+         * Initialize coefficient pointer
+         */
+        ptr2 = pCoeffs + (i - 1 - 3U);
+
+        q63_t acc0 = 0LL;
+        q63_t acc1 = 0LL;
+        q63_t acc2 = 0LL;
+        q63_t acc3 = 0LL;
+
+        uint32_t tapCnt = phaseLen >> 2;
+        while (tapCnt > 0U)
+        {
+          vecState = vldrwq_s32(ptr1);
+
+          vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides3);
+          acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
+
+          vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides2);
+          acc1 = vrmlaldavhaq(acc1, vecState, vecCoef);
+
+          vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides1);
+          acc2 = vrmlaldavhaq(acc2, vecState, vecCoef);
+
+          vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides0);
+          acc3 = vrmlaldavhaq(acc3, vecState, vecCoef);
+
+          ptr1 += 4;
+          ptr2 = ptr2 + S->L * 4;
+          tapCnt--;
+        }
+        tapCnt = phaseLen & 3;  /* residual taps handled with tail predication */
+        if (tapCnt > 0U)
+        {
+          mve_pred16_t p0 = vctp32q(tapCnt);
+
+          vecState = vldrwq_z_s32(ptr1, p0);
+
+          vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides3, p0);
+          acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
+
+          vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides2, p0);
+          acc1 = vrmlaldavhaq(acc1, vecState, vecCoef);
+
+          vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides1, p0);
+          acc2 = vrmlaldavhaq(acc2, vecState, vecCoef);
+
+          vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides0, p0);
+          acc3 = vrmlaldavhaq(acc3, vecState, vecCoef);
+        }
+
+        acc0 = asrl(acc0, 31 - 8);  /* NOTE(review): 31-bit scaling minus the 8 LSBs vrmlaldavhaq already discards -- confirm against Helium intrinsics docs */
+        acc1 = asrl(acc1, 31 - 8);
+        acc2 = asrl(acc2, 31 - 8);
+        acc3 = asrl(acc3, 31 - 8);
+
+        *pDst++ = (q31_t) acc0;
+        *pDst++ = (q31_t) acc1;
+        *pDst++ = (q31_t) acc2;
+        *pDst++ = (q31_t) acc3;
+        i -= 4;
+      }
+      else if (i >= 3)
+      {
+        /*
+         * Initialize coefficient pointer
+         */
+        ptr2 = pCoeffs + (i - 1U - 2);
+
+        q63_t acc0 = 0LL;
+        q63_t acc1 = 0LL;
+        q63_t acc2 = 0LL;
+
+        uint32_t tapCnt = phaseLen >> 2;
+        while (tapCnt > 0U)
+        {
+          vecState = vldrwq_s32(ptr1);
+
+          vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides2);
+          acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
+
+          vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides1);
+          acc1 = vrmlaldavhaq(acc1, vecState, vecCoef);
+
+          vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides0);
+          acc2 = vrmlaldavhaq(acc2, vecState, vecCoef);
+
+          ptr1 += 4;
+          ptr2 = ptr2 + S->L * 4;
+          tapCnt--;
+        }
+        tapCnt = phaseLen & 3;
+        if (tapCnt > 0U)
+        {
+          mve_pred16_t p0 = vctp32q(tapCnt);
+
+          vecState = vldrwq_z_s32(ptr1, p0);
+
+          vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides2, p0);
+          acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
+
+          vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides1, p0);
+          acc1 = vrmlaldavhaq(acc1, vecState, vecCoef);
+
+          vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides0, p0);
+          acc2 = vrmlaldavhaq(acc2, vecState, vecCoef);
+        }
+
+        acc0 = asrl(acc0, 31 - 8);
+        acc1 = asrl(acc1, 31 - 8);
+        acc2 = asrl(acc2, 31 - 8);
+
+        *pDst++ = (q31_t) acc0;
+        *pDst++ = (q31_t) acc1;
+        *pDst++ = (q31_t) acc2;
+        i -= 3;
+      }
+      else if (i >= 2)
+      {
+        /*
+         * Initialize coefficient pointer
+         */
+        ptr2 = pCoeffs + (i - 1U - 1);
+
+        q63_t acc0 = 0LL;
+        q63_t acc1 = 0LL;
+
+        uint32_t tapCnt = phaseLen >> 2;
+        while (tapCnt > 0U)
+        {
+          vecState = vldrwq_s32(ptr1);
+
+          vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides1);
+          acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
+
+          vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides0);
+          acc1 = vrmlaldavhaq(acc1, vecState, vecCoef);
+
+          ptr1 += 4;
+          ptr2 = ptr2 + S->L * 4;
+          tapCnt--;
+        }
+        tapCnt = phaseLen & 3;
+        if (tapCnt > 0U)
+        {
+          mve_pred16_t p0 = vctp32q(tapCnt);
+
+          vecState = vldrwq_z_s32(ptr1, p0);
+
+          vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides1, p0);
+          acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
+
+          vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides0, p0);
+          acc1 = vrmlaldavhaq(acc1, vecState, vecCoef);
+        }
+
+        acc0 = asrl(acc0, 31 - 8);
+        acc1 = asrl(acc1, 31 - 8);
+
+        *pDst++ = (q31_t) acc0;
+        *pDst++ = (q31_t) acc1;
+        i -= 2;
+      }
+      else
+      {
+        /*
+         * Initialize coefficient pointer
+         */
+        ptr2 = pCoeffs + (i - 1U);
+
+        q63_t acc0 = 0LL;
+
+        uint32_t tapCnt = phaseLen >> 2;
+        while (tapCnt > 0U)
+        {
+          vecState = vldrwq_s32(ptr1);
+          vecCoef = vldrwq_gather_shifted_offset_s32(ptr2, vec_strides0);
+
+          acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
+
+          ptr1 += 4;
+          ptr2 = ptr2 + S->L * 4;
+          tapCnt--;
+        }
+        tapCnt = phaseLen & 3;
+        if (tapCnt > 0U)
+        {
+          mve_pred16_t p0 = vctp32q(tapCnt);
+
+          vecState = vldrwq_z_s32(ptr1, p0);
+          vecCoef = vldrwq_gather_shifted_offset_z_s32(ptr2, vec_strides0, p0);
+          acc0 = vrmlaldavhaq(acc0, vecState, vecCoef);
+        }
+
+        acc0 = asrl(acc0, 31 - 8);
+        *pDst++ = (q31_t) acc0;
+        /*
+         * Decrement the loop counter
+         */
+        i--;
+      }
+    }
+    /*
+     * Advance the state pointer by 1
+     * to process the next group of interpolation factor number samples
+     */
+    pState = pState + 1;
+    /*
+     * Decrement the loop counter
+     */
+    blkCnt--;
+  }
+
+  /*
+   * Processing is complete.
+   * Now copy the last phaseLen - 1 samples to the start of the state buffer.
+   * This prepares the state buffer for the next function call.
+   */
+
+  /*
+   * Points to the start of the state buffer
+   */
+  pStateCurnt = S->pState;
+  blkCnt = (phaseLen - 1U) >> 2;
+  while (blkCnt > 0U)
+  {
+    vst1q(pStateCurnt, vldrwq_s32(pState));
+    pState += 4;
+    pStateCurnt += 4;
+    blkCnt--;
+  }
+  blkCnt = (phaseLen - 1U) & 3;
+  if (blkCnt > 0U)
+  {
+    mve_pred16_t p0 = vctp32q(blkCnt);
+    vstrwq_p_s32(pStateCurnt, vldrwq_s32(pState), p0);
+  }
+}
+#else
+void arm_fir_interpolate_q31(
+ const arm_fir_interpolate_instance_q31 * S,
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
+
+ q31_t *pState = S->pState; /* State pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCur; /* Points to the current sample of the state */
+ q31_t *ptr1; /* Temporary pointer for state buffer */
+ const q31_t *ptr2; /* Temporary pointer for coefficient buffer */
+ q63_t sum0; /* Accumulators */
+ uint32_t i, blkCnt, tapCnt; /* Loop counters */
+ uint32_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
+ uint32_t j;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc0, acc1, acc2, acc3;
+ q31_t x0, x1, x2, x3;
+ q31_t c0, c1, c2, c3;
+#endif
+
+ /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (phaseLen - 1U);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCur++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
+ *pStateCur++ = *pSrc++;
+
+ /* Address modifier index of coefficient buffer */
+ j = 1U;
+
+ /* Loop over the Interpolation factor. */
+ i = (S->L);
+
+ while (i > 0U)
+ {
+ /* Set accumulator to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (S->L - j);
+
+ /* Loop over the polyPhase length. Unroll by a factor of 4.
+ Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ tapCnt = phaseLen >> 2U;
+
+ x0 = *(ptr1++);
+ x1 = *(ptr1++);
+ x2 = *(ptr1++);
+
+ while (tapCnt > 0U)
+ {
+ /* Read the input sample */
+ x3 = *(ptr1++);
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
+
+ /* Read the coefficient */
+ c1 = *(ptr2 + S->L);
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x1 * c1;
+ acc1 += (q63_t) x2 * c1;
+ acc2 += (q63_t) x3 * c1;
+ acc3 += (q63_t) x0 * c1;
+
+ /* Read the coefficient */
+ c2 = *(ptr2 + S->L * 2);
+
+ /* Read the input sample */
+ x1 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x2 * c2;
+ acc1 += (q63_t) x3 * c2;
+ acc2 += (q63_t) x0 * c2;
+ acc3 += (q63_t) x1 * c2;
+
+ /* Read the coefficient */
+ c3 = *(ptr2 + S->L * 3);
+
+ /* Read the input sample */
+ x2 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x3 * c3;
+ acc1 += (q63_t) x0 * c3;
+ acc2 += (q63_t) x1 * c3;
+ acc3 += (q63_t) x2 * c3;
+
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += 4 * S->L;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = phaseLen % 0x4U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the input sample */
+ x3 = *(ptr1++);
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 * c0;
+ acc1 += (q63_t) x1 * c0;
+ acc2 += (q63_t) x2 * c0;
+ acc3 += (q63_t) x3 * c0;
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* update states for next sample processing */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *(pDst ) = (q31_t) (acc0 >> 31);
+ *(pDst + S->L) = (q31_t) (acc1 >> 31);
+ *(pDst + 2 * S->L) = (q31_t) (acc2 >> 31);
+ *(pDst + 3 * S->L) = (q31_t) (acc3 >> 31);
+
+ pDst++;
+
+ /* Increment the address modifier index of coefficient buffer */
+ j++;
+
+ /* Decrement loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 4;
+
+ pDst += S->L * 3;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCur++ = *pSrc++;
+
+ /* Address modifier index of coefficient buffer */
+ j = 1U;
+
+ /* Loop over the Interpolation factor. */
+ i = S->L;
+ while (i > 0U)
+ {
+ /* Set accumulator to zero */
+ sum0 = 0;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (S->L - j);
+
+ /* Loop over the polyPhase length.
+ Repeat until we've computed numTaps-(4*S->L) coefficients. */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ tapCnt = phaseLen >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) *ptr1++ * *ptr2;
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ sum0 += (q63_t) *ptr1++ * *ptr2;
+ ptr2 += S->L;
+
+ sum0 += (q63_t) *ptr1++ * *ptr2;
+ ptr2 += S->L;
+
+ sum0 += (q63_t) *ptr1++ * *ptr2;
+ ptr2 += S->L;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ tapCnt = phaseLen % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = phaseLen;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) *ptr1++ * *ptr2;
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = (q31_t) (sum0 >> 31);
+
+ /* Increment the address modifier index of coefficient buffer */
+ j++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+  /* Processing is complete.
+     Now copy the last phaseLen - 1 samples to the start of the state buffer.
+     This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCur = S->pState;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ tapCnt = (phaseLen - 1U) >> 2U;
+
+ /* copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ tapCnt = (phaseLen - 1U) % 0x04U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = (phaseLen - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+#else
+/* alternate version for CM0_FAMILY */
+
+ q31_t *pState = S->pState; /* State pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCur; /* Points to the current sample of the state */
+ q31_t *ptr1; /* Temporary pointer for state buffer */
+ const q31_t *ptr2; /* Temporary pointer for coefficient buffer */
+ q63_t sum0; /* Accumulators */
+ uint32_t i, blkCnt, tapCnt; /* Loop counters */
+ uint32_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
+
+ /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+ /* pStateCur points to the location where the new input data should be written */
+ pStateCur = S->pState + (phaseLen - 1U);
+
+  /* Total number of input samples */
+ blkCnt = blockSize;
+
+ /* Loop over the blockSize. */
+ while (blkCnt > 0U)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCur++ = *pSrc++;
+
+ /* Loop over the Interpolation factor. */
+ i = S->L;
+
+ while (i > 0U)
+ {
+ /* Set accumulator to zero */
+ sum0 = 0;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (i - 1U);
+
+ /* Loop over the polyPhase length */
+ tapCnt = phaseLen;
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum0 += ((q63_t) *ptr1++ * *ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = (q31_t) (sum0 >> 31);
+
+ /* Decrement loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 1;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCur = S->pState;
+
+ tapCnt = phaseLen - 1U;
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+/**
+ @} end of FIR_Interpolate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_f32.c
new file mode 100644
index 0000000000..77ea75c15b
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_f32.c
@@ -0,0 +1,453 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_lattice_f32.c
+ * Description: Processing function for floating-point FIR Lattice filter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @defgroup FIR_Lattice Finite Impulse Response (FIR) Lattice Filters
+
+ This set of functions implements Finite Impulse Response (FIR) lattice filters
+ for Q15, Q31 and floating-point data types. Lattice filters are used in a
+ variety of adaptive filter applications. The filter structure is feedforward and
+ the net impulse response is finite length.
+ The functions operate on blocks
+ of input and output data and each call to the function processes
+ blockSize samples through the filter. pSrc and
+ pDst point to input and output arrays containing blockSize values.
+
+ @par Algorithm
+ \image html FIRLattice.gif "Finite Impulse Response Lattice filter"
+ The following difference equation is implemented:
+ @par
+
+ f0[n] = g0[n] = x[n]
+ fm[n] = fm-1[n] + km * gm-1[n-1] for m = 1, 2, ...M
+ gm[n] = km * fm-1[n] + gm-1[n-1] for m = 1, 2, ...M
+ y[n] = fM[n]
+
+ @par
+                   pCoeffs points to the array of reflection coefficients of size numStages.
+ Reflection Coefficients are stored in the following order.
+ @par
+
+ {k1, k2, ..., kM}
+
+ where M is number of stages
+ @par
+ pState points to a state array of size numStages.
+ The state variables (g values) hold previous inputs and are stored in the following order.
+
+ {g0[n], g1[n], g2[n] ...gM-1[n]}
+
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ There are separate instance structure declarations for each of the 3 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+                   To do this manually without calling the init function, assign the following subfields of the instance structure:
+ numStages, pCoeffs, pState. Also set all of the values in pState to zero.
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Set the values in the state buffer to zeros and then manually initialize the instance structure as follows:
+
+ arm_fir_lattice_instance_f32 S = {numStages, pState, pCoeffs};
+ arm_fir_lattice_instance_q31 S = {numStages, pState, pCoeffs};
+ arm_fir_lattice_instance_q15 S = {numStages, pState, pCoeffs};
+
+ @par
+ where numStages is the number of stages in the filter;
+ pState is the address of the state buffer;
+ pCoeffs is the address of the coefficient buffer.
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the fixed-point versions of the FIR Lattice filter functions.
+ In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ @addtogroup FIR_Lattice
+ @{
+ */
+
+/**
+ @brief Processing function for the floating-point FIR lattice filter.
+ @param[in] S points to an instance of the floating-point FIR lattice structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+ */
+
+void arm_fir_lattice_f32(
+  const arm_fir_lattice_instance_f32 * S,
+  const float32_t * pSrc,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+        float32_t *pState = S->pState;                 /* State pointer */
+  const float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
+        float32_t *px;                                 /* Temporary state pointer */
+  const float32_t *pk;                                 /* Temporary coefficient pointer */
+        uint32_t numStages = S->numStages;             /* Number of stages in the filter */
+        uint32_t blkCnt, stageCnt;                     /* Loop counters */
+        float32_t fcurr0, fnext0, gnext0, gcurr0;      /* Temporary variables */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+        float32_t fcurr1, fnext1, gnext1;              /* Temporary variables for second sample in loop unrolling */
+        float32_t fcurr2, fnext2, gnext2;              /* Temporary variables for third sample in loop unrolling */
+        float32_t fcurr3, fnext3, gnext3;              /* Temporary variables for fourth sample in loop unrolling */
+#endif
+
+  gcurr0 = 0.0f;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Read two samples from input buffer */
+    /* f0(n) = x(n) */
+    fcurr0 = *pSrc++;
+    fcurr1 = *pSrc++;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* Initialize coeff pointer */
+    pk = pCoeffs;
+
+    /* Read g0(n-1) from state buffer */
+    gcurr0 = *px;
+
+    /* Process first sample for first tap */
+    /* f1(n) = f0(n) + K1 * g0(n-1) */
+    fnext0 = (gcurr0 * (*pk)) + fcurr0;
+
+    /* g1(n) = f0(n) * K1 + g0(n-1) */
+    gnext0 = (fcurr0 * (*pk)) + gcurr0;
+
+    /* Process second sample for first tap */
+    /* Note: g0(n) = x(n) = fcurr0, so fcurr0 serves as the delayed g0
+       term for the second sample */
+    fnext1 = (fcurr0 * (*pk)) + fcurr1;
+    gnext1 = (fcurr1 * (*pk)) + fcurr0;
+
+    /* Read next two samples from input buffer */
+    /* f0(n+2) = x(n+2) */
+    fcurr2 = *pSrc++;
+    fcurr3 = *pSrc++;
+
+    /* Process third sample for first tap */
+    fnext2 = (fcurr1 * (*pk)) + fcurr2;
+    gnext2 = (fcurr2 * (*pk)) + fcurr1;
+
+    /* Process fourth sample for first tap */
+    fnext3 = (fcurr2 * (*pk )) + fcurr3;
+    gnext3 = (fcurr3 * (*pk++)) + fcurr2;
+
+    /* Copy only last input sample into the state buffer
+       which will be used for next samples processing */
+    *px++ = fcurr3;
+
+    /* Update of f values for next coefficient set processing */
+    fcurr0 = fnext0;
+    fcurr1 = fnext1;
+    fcurr2 = fnext2;
+    fcurr3 = fnext3;
+
+    /* Loop unrolling. Process 4 taps at a time . */
+    stageCnt = (numStages - 1U) >> 2U;
+
+    /* Loop over the number of taps. Unroll by a factor of 4.
+       Repeat until we've computed numStages-3 coefficients. */
+
+    /* Process 2nd, 3rd, 4th and 5th taps ... here */
+    while (stageCnt > 0U)
+    {
+      /* Read g1(n-1), g3(n-1) .... from state */
+      gcurr0 = *px;
+
+      /* save g1(n) in state buffer */
+      *px++ = gnext3;
+
+      /* Process first sample for 2nd, 6th .. tap */
+      /* Sample processing for K2, K6.... */
+      /* f2(n) = f1(n) + K2 * g1(n-1) */
+      fnext0 = (gcurr0 * (*pk)) + fcurr0;
+
+      /* Process second sample for 2nd, 6th .. tap */
+      /* for sample 2 processing */
+      fnext1 = (gnext0 * (*pk)) + fcurr1;
+
+      /* Process third sample for 2nd, 6th .. tap */
+      fnext2 = (gnext1 * (*pk)) + fcurr2;
+
+      /* Process fourth sample for 2nd, 6th .. tap */
+      fnext3 = (gnext2 * (*pk)) + fcurr3;
+
+      /* g2(n) = f1(n) * K2 + g1(n-1) */
+      /* Calculation of state values for next stage.
+         NOTE: gnext3 must be computed first so that gnext2/gnext1/gnext0
+         still hold the previous stage's values when they are consumed. */
+      gnext3 = (fcurr3 * (*pk)) + gnext2;
+
+      gnext2 = (fcurr2 * (*pk)) + gnext1;
+
+      gnext1 = (fcurr1 * (*pk)) + gnext0;
+
+      gnext0 = (fcurr0 * (*pk++)) + gcurr0;
+
+
+      /* Read g2(n-1), g4(n-1) .... from state */
+      gcurr0 = *px;
+
+      /* save g2(n) in state buffer */
+      *px++ = gnext3;
+
+      /* Sample processing for K3, K7.... */
+      /* Process first sample for 3rd, 7th .. tap */
+      /* f3(n) = f2(n) + K3 * g2(n-1) */
+      fcurr0 = (gcurr0 * (*pk)) + fnext0;
+
+      /* Process second sample for 3rd, 7th .. tap */
+      fcurr1 = (gnext0 * (*pk)) + fnext1;
+
+      /* Process third sample for 3rd, 7th .. tap */
+      fcurr2 = (gnext1 * (*pk)) + fnext2;
+
+      /* Process fourth sample for 3rd, 7th .. tap */
+      fcurr3 = (gnext2 * (*pk)) + fnext3;
+
+      /* Calculation of state values for next stage */
+      /* g3(n) = f2(n) * K3 + g2(n-1) */
+      gnext3 = (fnext3 * (*pk)) + gnext2;
+
+      gnext2 = (fnext2 * (*pk)) + gnext1;
+
+      gnext1 = (fnext1 * (*pk)) + gnext0;
+
+      gnext0 = (fnext0 * (*pk++)) + gcurr0;
+
+
+      /* Read g1(n-1), g3(n-1) .... from state */
+      gcurr0 = *px;
+
+      /* save g3(n) in state buffer */
+      *px++ = gnext3;
+
+      /* Sample processing for K4, K8.... */
+      /* Process first sample for 4th, 8th .. tap */
+      /* f4(n) = f3(n) + K4 * g3(n-1) */
+      fnext0 = (gcurr0 * (*pk)) + fcurr0;
+
+      /* Process second sample for 4th, 8th .. tap */
+      /* for sample 2 processing */
+      fnext1 = (gnext0 * (*pk)) + fcurr1;
+
+      /* Process third sample for 4th, 8th .. tap */
+      fnext2 = (gnext1 * (*pk)) + fcurr2;
+
+      /* Process fourth sample for 4th, 8th .. tap */
+      fnext3 = (gnext2 * (*pk)) + fcurr3;
+
+      /* g4(n) = f3(n) * K4 + g3(n-1) */
+      /* Calculation of state values for next stage */
+      gnext3 = (fcurr3 * (*pk)) + gnext2;
+
+      gnext2 = (fcurr2 * (*pk)) + gnext1;
+
+      gnext1 = (fcurr1 * (*pk)) + gnext0;
+
+      gnext0 = (fcurr0 * (*pk++)) + gcurr0;
+
+
+      /* Read g2(n-1), g4(n-1) .... from state */
+      gcurr0 = *px;
+
+      /* save g4(n) in state buffer */
+      *px++ = gnext3;
+
+      /* Sample processing for K5, K9.... */
+      /* Process first sample for 5th, 9th .. tap */
+      /* f5(n) = f4(n) + K5 * g4(n-1) */
+      fcurr0 = (gcurr0 * (*pk)) + fnext0;
+
+      /* Process second sample for 5th, 9th .. tap */
+      fcurr1 = (gnext0 * (*pk)) + fnext1;
+
+      /* Process third sample for 5th, 9th .. tap */
+      fcurr2 = (gnext1 * (*pk)) + fnext2;
+
+      /* Process fourth sample for 5th, 9th .. tap */
+      fcurr3 = (gnext2 * (*pk)) + fnext3;
+
+      /* Calculation of state values for next stage */
+      /* g5(n) = f4(n) * K5 + g4(n-1) */
+      gnext3 = (fnext3 * (*pk)) + gnext2;
+
+      gnext2 = (fnext2 * (*pk)) + gnext1;
+
+      gnext1 = (fnext1 * (*pk)) + gnext0;
+
+      gnext0 = (fnext0 * (*pk++)) + gcurr0;
+
+      stageCnt--;
+    }
+
+    /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */
+    stageCnt = (numStages - 1U) % 0x4U;
+
+    while (stageCnt > 0U)
+    {
+      gcurr0 = *px;
+
+      /* save g value in state buffer */
+      *px++ = gnext3;
+
+      /* Process four samples for the remaining taps here */
+      fnext0 = (gcurr0 * (*pk)) + fcurr0;
+
+      fnext1 = (gnext0 * (*pk)) + fcurr1;
+
+      fnext2 = (gnext1 * (*pk)) + fcurr2;
+
+      fnext3 = (gnext2 * (*pk)) + fcurr3;
+
+      /* gm(n) = fm-1(n) * Km + gm-1(n-1) */
+      gnext3 = (fcurr3 * (*pk)) + gnext2;
+
+      gnext2 = (fcurr2 * (*pk)) + gnext1;
+
+      gnext1 = (fcurr1 * (*pk)) + gnext0;
+
+      gnext0 = (fcurr0 * (*pk++)) + gcurr0;
+
+      /* Update of f values for next coefficient set processing */
+      fcurr0 = fnext0;
+      fcurr1 = fnext1;
+      fcurr2 = fnext2;
+      fcurr3 = fnext3;
+
+      stageCnt--;
+    }
+
+    /* The results in the 4 accumulators, store in the destination buffer. */
+    /* y(n) = fN(n) */
+    *pDst++ = fcurr0;
+    *pDst++ = fcurr1;
+    *pDst++ = fcurr2;
+    *pDst++ = fcurr3;
+
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* f0(n) = x(n) */
+    fcurr0 = *pSrc++;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* Initialize coeff pointer */
+    pk = pCoeffs;
+
+    /* read g0(n-1) from state buffer */
+    gcurr0 = *px;
+
+    /* for sample 1 processing */
+    /* f1(n) = f0(n) + K1 * g0(n-1) */
+    fnext0 = (gcurr0 * (*pk)) + fcurr0;
+
+    /* g1(n) = f0(n) * K1 + g0(n-1) */
+    gnext0 = (fcurr0 * (*pk++)) + gcurr0;
+
+    /* save g0(n) = x(n) in state buffer */
+    *px++ = fcurr0;
+
+    /* f1(n) is saved in fcurr0 for next stage processing */
+    fcurr0 = fnext0;
+
+    stageCnt = (numStages - 1U);
+
+    /* stage loop */
+    while (stageCnt > 0U)
+    {
+      /* read gm-1(n-1) from state buffer */
+      gcurr0 = *px;
+
+      /* save gm-1(n) in state buffer */
+      *px++ = gnext0;
+
+      /* Sample processing for K2, K3.... */
+      /* f2(n) = f1(n) + K2 * g1(n-1) */
+      fnext0 = (gcurr0 * (*pk)) + fcurr0;
+
+      /* g2(n) = f1(n) * K2 + g1(n-1) */
+      gnext0 = (fcurr0 * (*pk++)) + gcurr0;
+
+      /* f1(n) is saved in fcurr0 for next stage processing */
+      fcurr0 = fnext0;
+
+      stageCnt--;
+    }
+
+    /* y(n) = fN(n) */
+    *pDst++ = fcurr0;
+
+    blkCnt--;
+  }
+
+}
+
+/**
+ @} end of FIR_Lattice group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_f32.c
new file mode 100644
index 0000000000..a3a0c224cd
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_f32.c
@@ -0,0 +1,70 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_lattice_init_f32.c
+ * Description: Floating-point FIR Lattice filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Lattice
+ @{
+ */
+
+/**
+ @brief Initialization function for the floating-point FIR lattice filter.
+ @param[in] S points to an instance of the floating-point FIR lattice structure
+ @param[in] numStages number of filter stages
+ @param[in] pCoeffs points to the coefficient buffer. The array is of length numStages
+ @param[in] pState points to the state buffer. The array is of length numStages
+ @return none
+ */
+
+void arm_fir_lattice_init_f32(
+        arm_fir_lattice_instance_f32 * S,
+        uint16_t numStages,
+  const float32_t * pCoeffs,
+        float32_t * pState)
+{
+  /* Assign filter taps (number of lattice stages) */
+  S->numStages = numStages;
+
+  /* Assign coefficient pointer; the buffer is referenced, not copied,
+     so it must remain valid for the lifetime of the instance */
+  S->pCoeffs = pCoeffs;
+
+  /* Clear state buffer; its size is always numStages */
+  memset(pState, 0, (numStages) * sizeof(float32_t));
+
+  /* Assign state pointer */
+  S->pState = pState;
+}
+
+/**
+ @} end of FIR_Lattice group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q15.c
new file mode 100644
index 0000000000..3996d48466
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q15.c
@@ -0,0 +1,70 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_lattice_init_q15.c
+ * Description: Q15 FIR Lattice filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Lattice
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q15 FIR lattice filter.
+ @param[in] S points to an instance of the Q15 FIR lattice structure
+ @param[in] numStages number of filter stages
+ @param[in] pCoeffs points to the coefficient buffer. The array is of length numStages
+ @param[in] pState points to the state buffer. The array is of length numStages
+ @return none
+ */
+
+void arm_fir_lattice_init_q15(
+        arm_fir_lattice_instance_q15 * S,
+        uint16_t numStages,
+  const q15_t * pCoeffs,
+        q15_t * pState)
+{
+  /* Assign filter taps (number of lattice stages) */
+  S->numStages = numStages;
+
+  /* Assign coefficient pointer; the buffer is referenced, not copied,
+     so it must remain valid for the lifetime of the instance */
+  S->pCoeffs = pCoeffs;
+
+  /* Clear state buffer; its size is always numStages */
+  memset(pState, 0, (numStages) * sizeof(q15_t));
+
+  /* Assign state pointer */
+  S->pState = pState;
+}
+
+/**
+ @} end of FIR_Lattice group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q31.c
new file mode 100644
index 0000000000..4a91b59043
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_init_q31.c
@@ -0,0 +1,70 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_lattice_init_q31.c
+ * Description: Q31 FIR lattice filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Lattice
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q31 FIR lattice filter.
+ @param[in] S points to an instance of the Q31 FIR lattice structure
+ @param[in] numStages number of filter stages
+ @param[in] pCoeffs points to the coefficient buffer. The array is of length numStages
+ @param[in] pState points to the state buffer. The array is of length numStages
+ @return none
+ */
+
+void arm_fir_lattice_init_q31(
+        arm_fir_lattice_instance_q31 * S,
+        uint16_t numStages,
+  const q31_t * pCoeffs,
+        q31_t * pState)
+{
+  /* Assign filter taps (number of lattice stages) */
+  S->numStages = numStages;
+
+  /* Assign coefficient pointer; the buffer is referenced, not copied,
+     so it must remain valid for the lifetime of the instance */
+  S->pCoeffs = pCoeffs;
+
+  /* Clear state buffer; its size is always numStages */
+  memset(pState, 0, (numStages) * sizeof(q31_t));
+
+  /* Assign state pointer */
+  S->pState = pState;
+}
+
+/**
+ @} end of FIR_Lattice group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q15.c
new file mode 100644
index 0000000000..14b7076d6c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q15.c
@@ -0,0 +1,506 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_lattice_q15.c
+ * Description: Q15 FIR lattice filter processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Lattice
+ @{
+ */
+
+/**
+ @brief Processing function for Q15 FIR lattice filter.
+ @param[in] S points to an instance of the Q15 FIR lattice structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+ */
+
+void arm_fir_lattice_q15(
+  const arm_fir_lattice_instance_q15 * S,
+  const q15_t * pSrc,
+        q15_t * pDst,
+        uint32_t blockSize)
+{
+        q15_t *pState = S->pState;                     /* State pointer */
+  const q15_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+        q15_t *px;                                     /* Temporary state pointer */
+  const q15_t *pk;                                     /* Temporary coefficient pointer */
+        uint32_t numStages = S->numStages;             /* Number of stages in the filter */
+        uint32_t blkCnt, stageCnt;                     /* Loop counters */
+        q31_t fcurr0, fnext0, gnext0, gcurr0;          /* Temporary variables */
+
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
+
+#if defined (ARM_MATH_LOOPUNROLL)
+        q31_t fcurr1, fnext1, gnext1;                  /* Temporary variables for second sample in loop unrolling */
+        q31_t fcurr2, fnext2, gnext2;                  /* Temporary variables for third sample in loop unrolling */
+        q31_t fcurr3, fnext3, gnext3;                  /* Temporary variables for fourth sample in loop unrolling */
+#endif
+
+  gcurr0 = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Read two samples from input buffer */
+    /* f0(n) = x(n) */
+    fcurr0 = *pSrc++;
+    fcurr1 = *pSrc++;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* Initialize coeff pointer */
+    pk = pCoeffs;
+
+    /* Read g0(n-1) from state buffer */
+    gcurr0 = *px;
+
+    /* Process first sample for first tap */
+    /* f1(n) = f0(n) + K1 * g0(n-1) */
+    fnext0 = (q31_t) ((gcurr0 * (*pk)) >> 15U) + fcurr0;
+    fnext0 = __SSAT(fnext0, 16);
+
+    /* g1(n) = f0(n) * K1 + g0(n-1) */
+    gnext0 = (q31_t) ((fcurr0 * (*pk)) >> 15U) + gcurr0;
+    gnext0 = __SSAT(gnext0, 16);
+
+    /* Process second sample for first tap */
+    fnext1 = (q31_t) ((fcurr0 * (*pk)) >> 15U) + fcurr1;
+    fnext1 = __SSAT(fnext1, 16);
+    gnext1 = (q31_t) ((fcurr1 * (*pk)) >> 15U) + fcurr0;
+    gnext1 = __SSAT(gnext1, 16);
+
+    /* Read next two samples from input buffer */
+    /* f0(n+2) = x(n+2) */
+    fcurr2 = *pSrc++;
+    fcurr3 = *pSrc++;
+
+    /* Process third sample for first tap */
+    fnext2 = (q31_t) ((fcurr1 * (*pk)) >> 15U) + fcurr2;
+    fnext2 = __SSAT(fnext2, 16);
+    gnext2 = (q31_t) ((fcurr2 * (*pk)) >> 15U) + fcurr1;
+    gnext2 = __SSAT(gnext2, 16);
+
+    /* Process fourth sample for first tap */
+    fnext3 = (q31_t) ((fcurr2 * (*pk  )) >> 15U) + fcurr3;
+    fnext3 = __SSAT(fnext3, 16);
+    gnext3 = (q31_t) ((fcurr3 * (*pk++)) >> 15U) + fcurr2;
+    gnext3 = __SSAT(gnext3, 16);
+
+    /* Copy only last input sample into the state buffer
+       which will be used for next samples processing */
+    *px++ = (q15_t) fcurr3;
+
+    /* Update of f values for next coefficient set processing */
+    fcurr0 = fnext0;
+    fcurr1 = fnext1;
+    fcurr2 = fnext2;
+    fcurr3 = fnext3;
+
+    /* Loop unrolling. Process 4 taps at a time . */
+    stageCnt = (numStages - 1U) >> 2U;
+
+    /* Loop over the number of taps. Unroll by a factor of 4.
+       Repeat until we've computed numStages-3 coefficients. */
+
+    /* Process 2nd, 3rd, 4th and 5th taps ... here */
+    while (stageCnt > 0U)
+    {
+      /* Read g1(n-1), g3(n-1) .... from state */
+      gcurr0 = *px;
+
+      /* save g1(n) in state buffer */
+      *px++ = (q15_t) gnext3;
+
+      /* Process first sample for 2nd, 6th .. tap */
+      /* Sample processing for K2, K6.... */
+      /* f1(n) = f0(n) + K1 * g0(n-1) */
+      fnext0 = (q31_t) ((gcurr0 * (*pk)) >> 15U) + fcurr0;
+      fnext0 = __SSAT(fnext0, 16);
+
+      /* Process second sample for 2nd, 6th .. tap */
+      /* for sample 2 processing */
+      fnext1 = (q31_t) ((gnext0 * (*pk)) >> 15U) + fcurr1;
+      fnext1 = __SSAT(fnext1, 16);
+
+      /* Process third sample for 2nd, 6th .. tap */
+      fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fcurr2;
+      fnext2 = __SSAT(fnext2, 16);
+
+      /* Process fourth sample for 2nd, 6th .. tap */
+      fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fcurr3;
+      fnext3 = __SSAT(fnext3, 16);
+
+      /* g1(n) = f0(n) * K1 + g0(n-1) */
+      /* Calculation of state values for next stage */
+      gnext3 = (q31_t) ((fcurr3 * (*pk)) >> 15U) + gnext2;
+      gnext3 = __SSAT(gnext3, 16);
+
+      gnext2 = (q31_t) ((fcurr2 * (*pk)) >> 15U) + gnext1;
+      gnext2 = __SSAT(gnext2, 16);
+
+      gnext1 = (q31_t) ((fcurr1 * (*pk)) >> 15U) + gnext0;
+      gnext1 = __SSAT(gnext1, 16);
+
+      gnext0 = (q31_t) ((fcurr0 * (*pk++)) >> 15U) + gcurr0;
+      gnext0 = __SSAT(gnext0, 16);
+
+
+      /* Read g2(n-1), g4(n-1) .... from state */
+      gcurr0 = *px;
+
+      /* save g1(n) in state buffer */
+      *px++ = (q15_t) gnext3;
+
+      /* Sample processing for K3, K7.... */
+      /* Process first sample for 3rd, 7th .. tap */
+      /* f3(n) = f2(n) + K3 * g2(n-1) */
+      fcurr0 = (q31_t) ((gcurr0 * (*pk)) >> 15U) + fnext0;
+      fcurr0 = __SSAT(fcurr0, 16);
+
+      /* Process second sample for 3rd, 7th .. tap */
+      fcurr1 = (q31_t) ((gnext0 * (*pk)) >> 15U) + fnext1;
+      fcurr1 = __SSAT(fcurr1, 16);
+
+      /* Process third sample for 3rd, 7th .. tap */
+      fcurr2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fnext2;
+      fcurr2 = __SSAT(fcurr2, 16);
+
+      /* Process fourth sample for 3rd, 7th .. tap */
+      fcurr3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fnext3;
+      fcurr3 = __SSAT(fcurr3, 16);
+
+      /* Calculation of state values for next stage */
+      /* g3(n) = f2(n) * K3 + g2(n-1) */
+      gnext3 = (q31_t) ((fnext3 * (*pk)) >> 15U) + gnext2;
+      gnext3 = __SSAT(gnext3, 16);
+
+      gnext2 = (q31_t) ((fnext2 * (*pk)) >> 15U) + gnext1;
+      gnext2 = __SSAT(gnext2, 16);
+
+      gnext1 = (q31_t) ((fnext1 * (*pk)) >> 15U) + gnext0;
+      gnext1 = __SSAT(gnext1, 16);
+
+      gnext0 = (q31_t) ((fnext0 * (*pk++)) >> 15U) + gcurr0;
+      gnext0 = __SSAT(gnext0, 16);
+
+      /* Read g1(n-1), g3(n-1) .... from state */
+      gcurr0 = *px;
+
+      /* save g1(n) in state buffer */
+      *px++ = (q15_t) gnext3;
+
+      /* Sample processing for K4, K8.... */
+      /* Process first sample for 4th, 8th .. tap */
+      /* f4(n) = f3(n) + K4 * g3(n-1) */
+      fnext0 = (q31_t) ((gcurr0 * (*pk)) >> 15U) + fcurr0;
+      fnext0 = __SSAT(fnext0, 16);
+
+      /* Process second sample for 4th, 8th .. tap */
+      /* for sample 2 processing */
+      fnext1 = (q31_t) ((gnext0 * (*pk)) >> 15U) + fcurr1;
+      fnext1 = __SSAT(fnext1, 16);
+
+      /* Process third sample for 4th, 8th .. tap */
+      fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fcurr2;
+      fnext2 = __SSAT(fnext2, 16);
+
+      /* Process fourth sample for 4th, 8th .. tap */
+      fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fcurr3;
+      fnext3 = __SSAT(fnext3, 16);
+
+      /* g4(n) = f3(n) * K4 + g3(n-1) */
+      /* Calculation of state values for next stage */
+      gnext3 = (q31_t) ((fcurr3 * (*pk)) >> 15U) + gnext2;
+      gnext3 = __SSAT(gnext3, 16);
+
+      gnext2 = (q31_t) ((fcurr2 * (*pk)) >> 15U) + gnext1;
+      gnext2 = __SSAT(gnext2, 16);
+
+      gnext1 = (q31_t) ((fcurr1 * (*pk)) >> 15U) + gnext0;
+      gnext1 = __SSAT(gnext1, 16);
+
+      gnext0 = (q31_t) ((fcurr0 * (*pk++)) >> 15U) + gcurr0;
+      gnext0 = __SSAT(gnext0, 16);
+
+      /* Read g2(n-1), g4(n-1) .... from state */
+      gcurr0 = *px;
+
+      /* save g4(n) in state buffer */
+      *px++ = (q15_t) gnext3;
+
+      /* Sample processing for K5, K9.... */
+      /* Process first sample for 5th, 9th .. tap */
+      /* f5(n) = f4(n) + K5 * g4(n-1) */
+      fcurr0 = (q31_t) ((gcurr0 * (*pk)) >> 15U) + fnext0;
+      fcurr0 = __SSAT(fcurr0, 16);
+
+      /* Process second sample for 5th, 9th .. tap */
+      fcurr1 = (q31_t) ((gnext0 * (*pk)) >> 15U) + fnext1;
+      fcurr1 = __SSAT(fcurr1, 16);
+
+      /* Process third sample for 5th, 9th .. tap */
+      fcurr2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fnext2;
+      fcurr2 = __SSAT(fcurr2, 16);
+
+      /* Process fourth sample for 5th, 9th .. tap */
+      fcurr3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fnext3;
+      fcurr3 = __SSAT(fcurr3, 16);
+
+      /* Calculation of state values for next stage */
+      /* g5(n) = f4(n) * K5 + g4(n-1) */
+      gnext3 = (q31_t) ((fnext3 * (*pk)) >> 15U) + gnext2;
+      gnext3 = __SSAT(gnext3, 16);
+
+      gnext2 = (q31_t) ((fnext2 * (*pk)) >> 15U) + gnext1;
+      gnext2 = __SSAT(gnext2, 16);
+
+      gnext1 = (q31_t) ((fnext1 * (*pk)) >> 15U) + gnext0;
+      gnext1 = __SSAT(gnext1, 16);
+
+      gnext0 = (q31_t) ((fnext0 * (*pk++)) >> 15U) + gcurr0;
+      gnext0 = __SSAT(gnext0, 16);
+
+      stageCnt--;
+    }
+
+    /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */
+    stageCnt = (numStages - 1U) % 0x4U;
+
+    while (stageCnt > 0U)
+    {
+      gcurr0 = *px;
+
+      /* save g value in state buffer */
+      *px++ = (q15_t) gnext3;
+
+      /* Process four samples for last three taps here */
+      fnext0 = (q31_t) ((gcurr0 * (*pk)) >> 15U) + fcurr0;
+      fnext0 = __SSAT(fnext0, 16);
+
+      fnext1 = (q31_t) ((gnext0 * (*pk)) >> 15U) + fcurr1;
+      fnext1 = __SSAT(fnext1, 16);
+
+      fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15U) + fcurr2;
+      fnext2 = __SSAT(fnext2, 16);
+
+      fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15U) + fcurr3;
+      fnext3 = __SSAT(fnext3, 16);
+
+      /* g1(n) = f0(n) * K1 + g0(n-1) */
+      gnext3 = (q31_t) ((fcurr3 * (*pk)) >> 15U) + gnext2;
+      gnext3 = __SSAT(gnext3, 16);
+
+      gnext2 = (q31_t) ((fcurr2 * (*pk)) >> 15U) + gnext1;
+      gnext2 = __SSAT(gnext2, 16);
+
+      gnext1 = (q31_t) ((fcurr1 * (*pk)) >> 15U) + gnext0;
+      gnext1 = __SSAT(gnext1, 16);
+
+      gnext0 = (q31_t) ((fcurr0 * (*pk++)) >> 15U) + gcurr0;
+      gnext0 = __SSAT(gnext0, 16);
+
+      /* Update of f values for next coefficient set processing */
+      fcurr0 = fnext0;
+      fcurr1 = fnext1;
+      fcurr2 = fnext2;
+      fcurr3 = fnext3;
+
+      stageCnt--;
+    }
+
+    /* The results in the 4 accumulators, store in the destination buffer. */
+    /* y(n) = fN(n) */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+    write_q15x2_ia (&pDst, __PKHBT(fcurr0, fcurr1, 16));
+    write_q15x2_ia (&pDst, __PKHBT(fcurr2, fcurr3, 16));
+#else
+    write_q15x2_ia (&pDst, __PKHBT(fcurr1, fcurr0, 16));
+    write_q15x2_ia (&pDst, __PKHBT(fcurr3, fcurr2, 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* f0(n) = x(n) */
+    fcurr0 = *pSrc++;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* Initialize coeff pointer */
+    pk = pCoeffs;
+
+    /* read g2(n) from state buffer */
+    gcurr0 = *px;
+
+    /* for sample 1 processing */
+    /* f1(n) = f0(n) + K1 * g0(n-1) */
+    fnext0 = (((q31_t) gcurr0 * (*pk)) >> 15U) + fcurr0;
+    fnext0 = __SSAT(fnext0, 16);
+
+    /* g1(n) = f0(n) * K1 + g0(n-1) */
+    gnext0 = (((q31_t) fcurr0 * (*pk++)) >> 15U) + gcurr0;
+    gnext0 = __SSAT(gnext0, 16);
+
+    /* save g1(n) in state buffer */
+    *px++ = (q15_t) fcurr0;
+
+    /* f1(n) is saved in fcurr0 for next stage processing */
+    fcurr0 = fnext0;
+
+    stageCnt = (numStages - 1U);
+
+    /* stage loop */
+    while (stageCnt > 0U)
+    {
+      /* read g2(n) from state buffer */
+      gcurr0 = *px;
+
+      /* save g1(n) in state buffer */
+      *px++ = (q15_t) gnext0;
+
+      /* Sample processing for K2, K3.... */
+      /* f2(n) = f1(n) + K2 * g1(n-1) */
+      fnext0 = (((q31_t) gcurr0 * (*pk)) >> 15U) + fcurr0;
+      fnext0 = __SSAT(fnext0, 16);
+
+      /* g2(n) = f1(n) * K2 + g1(n-1) */
+      gnext0 = (((q31_t) fcurr0 * (*pk++)) >> 15U) + gcurr0;
+      gnext0 = __SSAT(gnext0, 16);
+
+      /* f1(n) is saved in fcurr0 for next stage processing */
+      fcurr0 = fnext0;
+
+      stageCnt--;
+    }
+
+    /* y(n) = fN(n) */
+    *pDst++ = __SSAT(fcurr0, 16);
+
+    blkCnt--;
+  }
+
+#else
+/* alternate version for CM0_FAMILY */
+
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* f0(n) = x(n) */
+    fcurr0 = *pSrc++;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* Initialize coeff pointer */
+    pk = pCoeffs;
+
+    /* read g0(n-1) from state buffer */
+    gcurr0 = *px;
+
+    /* for sample 1 processing */
+    /* f1(n) = f0(n) + K1 * g0(n-1) */
+    fnext0 = ((gcurr0 * (*pk)) >> 15U) + fcurr0;
+    /* fix: was '__SSAT(fnext, 16)' - 'fnext' is not declared in this function */
+    fnext0 = __SSAT(fnext0, 16);
+
+    /* g1(n) = f0(n) * K1 + g0(n-1) */
+    gnext0 = ((fcurr0 * (*pk++)) >> 15U) + gcurr0;
+    gnext0 = __SSAT(gnext0, 16);
+
+    /* save f0(n) in state buffer */
+    *px++ = (q15_t) fcurr0;
+
+    /* f1(n) is saved in fcurr for next stage processing */
+    fcurr0 = fnext0;
+
+    stageCnt = (numStages - 1U);
+
+    /* stage loop */
+    while (stageCnt > 0U)
+    {
+      /* read g1(n-1) from state buffer */
+      gcurr0 = *px;
+
+      /* save g0(n-1) in state buffer */
+      *px++ = (q15_t) gnext0;
+
+      /* Sample processing for K2, K3.... */
+      /* f2(n) = f1(n) + K2 * g1(n-1) */
+      fnext0 = ((gcurr0 * (*pk)) >> 15U) + fcurr0;
+      fnext0 = __SSAT(fnext0, 16);
+
+      /* g2(n) = f1(n) * K2 + g1(n-1) */
+      gnext0 = ((fcurr0 * (*pk++)) >> 15U) + gcurr0;
+      gnext0 = __SSAT(gnext0, 16);
+
+      /* f1(n) is saved in fcurr0 for next stage processing */
+      fcurr0 = fnext0;
+
+      stageCnt--;
+    }
+
+    /* y(n) = fN(n) */
+    *pDst++ = __SSAT(fcurr0, 16);
+
+    blkCnt--;
+  }
+
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
+}
+
+/**
+ @} end of FIR_Lattice group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q31.c
new file mode 100644
index 0000000000..3b8355185a
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_lattice_q31.c
@@ -0,0 +1,505 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_lattice_q31.c
+ * Description: Q31 FIR lattice filter processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Lattice
+ @{
+ */
+
+/**
+ @brief Processing function for the Q31 FIR lattice filter.
+ @param[in] S points to an instance of the Q31 FIR lattice structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ In order to avoid overflows the input signal must be scaled down by 2*log2(numStages) bits.
+ */
+
+void arm_fir_lattice_q31(
+  const arm_fir_lattice_instance_q31 * S,
+  const q31_t * pSrc,
+        q31_t * pDst,
+        uint32_t blockSize)
+{
+        q31_t *pState = S->pState;                     /* State pointer */
+  const q31_t *pCoeffs = S->pCoeffs;                   /* Coefficient pointer */
+        q31_t *px;                                     /* Temporary state pointer */
+  const q31_t *pk;                                     /* Temporary coefficient pointer */
+        uint32_t numStages = S->numStages;             /* Number of stages in the filter */
+        uint32_t blkCnt, stageCnt;                     /* Loop counters */
+        q31_t fcurr0, fnext0, gnext0, gcurr0;          /* Temporary variables */
+
+#if (1)
+//#if !defined(ARM_MATH_CM0_FAMILY)
+
+#if defined (ARM_MATH_LOOPUNROLL)
+        q31_t fcurr1, fnext1, gnext1;                  /* Temporary variables for second sample in loop unrolling */
+        q31_t fcurr2, fnext2, gnext2;                  /* Temporary variables for third sample in loop unrolling */
+        q31_t fcurr3, fnext3, gnext3;                  /* Temporary variables for fourth sample in loop unrolling */
+#endif
+
+  gcurr0 = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Read two samples from input buffer */
+    /* f0(n) = x(n) */
+    fcurr0 = *pSrc++;
+    fcurr1 = *pSrc++;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* Initialize coeff pointer */
+    pk = pCoeffs;
+
+    /* Read g0(n-1) from state buffer */
+    gcurr0 = *px;
+
+    /* Process first sample for first tap */
+    /* f1(n) = f0(n) + K1 * g0(n-1) */
+    fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+    fnext0 = (fnext0 << 1U) + fcurr0;
+
+    /* g1(n) = f0(n) * K1 + g0(n-1) */
+    gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk)) >> 32U);
+    gnext0 = (gnext0 << 1U) + gcurr0;
+
+    /* Process second sample for first tap */
+    fnext1 = (q31_t) (((q63_t) fcurr0 * (*pk)) >> 32U);
+    fnext1 = (fnext1 << 1U) + fcurr1;
+    gnext1 = (q31_t) (((q63_t) fcurr1 * (*pk)) >> 32U);
+    gnext1 = (gnext1 << 1U) + fcurr0;
+
+    /* Read next two samples from input buffer */
+    /* f0(n+2) = x(n+2) */
+    fcurr2 = *pSrc++;
+    fcurr3 = *pSrc++;
+
+    /* Process third sample for first tap */
+    fnext2 = (q31_t) (((q63_t) fcurr1 * (*pk)) >> 32U);
+    fnext2 = (fnext2 << 1U) + fcurr2;
+    gnext2 = (q31_t) (((q63_t) fcurr2 * (*pk)) >> 32U);
+    gnext2 = (gnext2 << 1U) + fcurr1;
+
+    /* Process fourth sample for first tap */
+    fnext3 = (q31_t) (((q63_t) fcurr2 * (*pk  )) >> 32U);
+    fnext3 = (fnext3 << 1U) + fcurr3;
+    gnext3 = (q31_t) (((q63_t) fcurr3 * (*pk++)) >> 32U);
+    gnext3 = (gnext3 << 1U) + fcurr2;
+
+    /* Copy only last input sample into the state buffer
+       which will be used for next samples processing */
+    *px++ = fcurr3;
+
+    /* Update of f values for next coefficient set processing */
+    fcurr0 = fnext0;
+    fcurr1 = fnext1;
+    fcurr2 = fnext2;
+    fcurr3 = fnext3;
+
+    /* Loop unrolling. Process 4 taps at a time . */
+    stageCnt = (numStages - 1U) >> 2U;
+
+    /* Loop over the number of taps. Unroll by a factor of 4.
+       Repeat until we've computed numStages-3 coefficients. */
+
+    /* Process 2nd, 3rd, 4th and 5th taps ... here */
+    while (stageCnt > 0U)
+    {
+      /* Read g1(n-1), g3(n-1) .... from state */
+      gcurr0 = *px;
+
+      /* save g1(n) in state buffer */
+      *px++ = gnext3;
+
+      /* Process first sample for 2nd, 6th .. tap */
+      /* Sample processing for K2, K6.... */
+      /* f1(n) = f0(n) + K1 * g0(n-1) */
+      fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+      fnext0 = (fnext0 << 1U) + fcurr0;
+
+      /* Process second sample for 2nd, 6th .. tap */
+      /* for sample 2 processing */
+      fnext1 = (q31_t) (((q63_t) gnext0 * (*pk)) >> 32U);
+      fnext1 = (fnext1 << 1U) + fcurr1;
+
+      /* Process third sample for 2nd, 6th .. tap */
+      fnext2 = (q31_t) (((q63_t) gnext1 * (*pk)) >> 32U);
+      fnext2 = (fnext2 << 1U) + fcurr2;
+
+      /* Process fourth sample for 2nd, 6th .. tap */
+      fnext3 = (q31_t) (((q63_t) gnext2 * (*pk)) >> 32U);
+      fnext3 = (fnext3 << 1U) + fcurr3;
+
+      /* g1(n) = f0(n) * K1 + g0(n-1) */
+      /* Calculation of state values for next stage */
+      gnext3 = (q31_t) (((q63_t) fcurr3 * (*pk)) >> 32U);
+      gnext3 = (gnext3 << 1U) + gnext2;
+
+      gnext2 = (q31_t) (((q63_t) fcurr2 * (*pk)) >> 32U);
+      gnext2 = (gnext2 << 1U) + gnext1;
+
+      gnext1 = (q31_t) (((q63_t) fcurr1 * (*pk)) >> 32U);
+      gnext1 = (gnext1 << 1U) + gnext0;
+
+      gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk++)) >> 32U);
+      gnext0 = (gnext0 << 1U) + gcurr0;
+
+
+      /* Read g2(n-1), g4(n-1) .... from state */
+      gcurr0 = *px;
+
+      /* save g1(n) in state buffer */
+      *px++ = gnext3;
+
+      /* Sample processing for K3, K7.... */
+      /* Process first sample for 3rd, 7th .. tap */
+      /* f3(n) = f2(n) + K3 * g2(n-1) */
+      fcurr0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+      fcurr0 = (fcurr0 << 1U) + fnext0;
+
+      /* Process second sample for 3rd, 7th .. tap */
+      fcurr1 = (q31_t) (((q63_t) gnext0 * (*pk)) >> 32U);
+      fcurr1 = (fcurr1 << 1U) + fnext1;
+
+      /* Process third sample for 3rd, 7th .. tap */
+      fcurr2 = (q31_t) (((q63_t) gnext1 * (*pk)) >> 32U);
+      fcurr2 = (fcurr2 << 1U) + fnext2;
+
+      /* Process fourth sample for 3rd, 7th .. tap */
+      fcurr3 = (q31_t) (((q63_t) gnext2 * (*pk)) >> 32U);
+      fcurr3 = (fcurr3 << 1U) + fnext3;
+
+      /* Calculation of state values for next stage */
+      /* g3(n) = f2(n) * K3 + g2(n-1) */
+      gnext3 = (q31_t) (((q63_t) fnext3 * (*pk)) >> 32U);
+      gnext3 = (gnext3 << 1U) + gnext2;
+
+      gnext2 = (q31_t) (((q63_t) fnext2 * (*pk)) >> 32U);
+      gnext2 = (gnext2 << 1U) + gnext1;
+
+      gnext1 = (q31_t) (((q63_t) fnext1 * (*pk)) >> 32U);
+      gnext1 = (gnext1 << 1U) + gnext0;
+
+      gnext0 = (q31_t) (((q63_t) fnext0 * (*pk++)) >> 32U);
+      gnext0 = (gnext0 << 1U) + gcurr0;
+
+      /* Read g1(n-1), g3(n-1) .... from state */
+      gcurr0 = *px;
+
+      /* save g1(n) in state buffer */
+      *px++ = gnext3;
+
+      /* Sample processing for K4, K8.... */
+      /* Process first sample for 4th, 8th .. tap */
+      /* f4(n) = f3(n) + K4 * g3(n-1) */
+      fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+      fnext0 = (fnext0 << 1U) + fcurr0;
+
+      /* Process second sample for 4th, 8th .. tap */
+      /* for sample 2 processing */
+      fnext1 = (q31_t) (((q63_t) gnext0 * (*pk)) >> 32U);
+      fnext1 = (fnext1 << 1U) + fcurr1;
+
+      /* Process third sample for 4th, 8th .. tap */
+      fnext2 = (q31_t) (((q63_t) gnext1 * (*pk)) >> 32U);
+      fnext2 = (fnext2 << 1U) + fcurr2;
+
+      /* Process fourth sample for 4th, 8th .. tap */
+      fnext3 = (q31_t) (((q63_t) gnext2 * (*pk)) >> 32U);
+      fnext3 = (fnext3 << 1U) + fcurr3;
+
+      /* g4(n) = f3(n) * K4 + g3(n-1) */
+      /* Calculation of state values for next stage */
+      gnext3 = (q31_t) (((q63_t) fcurr3 * (*pk)) >> 32U);
+      gnext3 = (gnext3 << 1U) + gnext2;
+
+      gnext2 = (q31_t) (((q63_t) fcurr2 * (*pk)) >> 32U);
+      gnext2 = (gnext2 << 1U) + gnext1;
+
+      gnext1 = (q31_t) (((q63_t) fcurr1 * (*pk)) >> 32U);
+      gnext1 = (gnext1 << 1U) + gnext0;
+
+      gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk++)) >> 32U);
+      gnext0 = (gnext0 << 1U) + gcurr0;
+
+      /* Read g2(n-1), g4(n-1) .... from state */
+      gcurr0 = *px;
+
+      /* save g4(n) in state buffer */
+      *px++ = gnext3;
+
+      /* Sample processing for K5, K9.... */
+      /* Process first sample for 5th, 9th .. tap */
+      /* f5(n) = f4(n) + K5 * g4(n-1) */
+      fcurr0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+      fcurr0 = (fcurr0 << 1U) + fnext0;
+
+      /* Process second sample for 5th, 9th .. tap */
+      fcurr1 = (q31_t) (((q63_t) gnext0 * (*pk)) >> 32U);
+      fcurr1 = (fcurr1 << 1U) + fnext1;
+
+      /* Process third sample for 5th, 9th .. tap */
+      fcurr2 = (q31_t) (((q63_t) gnext1 * (*pk)) >> 32U);
+      fcurr2 = (fcurr2 << 1U) + fnext2;
+
+      /* Process fourth sample for 5th, 9th .. tap */
+      fcurr3 = (q31_t) (((q63_t) gnext2 * (*pk)) >> 32U);
+      fcurr3 = (fcurr3 << 1U) + fnext3;
+
+      /* Calculation of state values for next stage */
+      /* g5(n) = f4(n) * K5 + g4(n-1) */
+      gnext3 = (q31_t) (((q63_t) fnext3 * (*pk)) >> 32U);
+      gnext3 = (gnext3 << 1U) + gnext2;
+
+      gnext2 = (q31_t) (((q63_t) fnext2 * (*pk)) >> 32U);
+      gnext2 = (gnext2 << 1U) + gnext1;
+
+      gnext1 = (q31_t) (((q63_t) fnext1 * (*pk)) >> 32U);
+      gnext1 = (gnext1 << 1U) + gnext0;
+
+      gnext0 = (q31_t) (((q63_t) fnext0 * (*pk++)) >> 32U);
+      gnext0 = (gnext0 << 1U) + gcurr0;
+
+      stageCnt--;
+    }
+
+    /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */
+    stageCnt = (numStages - 1U) % 0x4U;
+
+    while (stageCnt > 0U)
+    {
+      gcurr0 = *px;
+
+      /* save g value in state buffer */
+      *px++ = gnext3;
+
+      /* Process four samples for last three taps here */
+      fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+      fnext0 = (fnext0 << 1U) + fcurr0;
+
+      fnext1 = (q31_t) (((q63_t) gnext0 * (*pk)) >> 32U);
+      fnext1 = (fnext1 << 1U) + fcurr1;
+
+      fnext2 = (q31_t) (((q63_t) gnext1 * (*pk)) >> 32U);
+      fnext2 = (fnext2 << 1U) + fcurr2;
+
+      fnext3 = (q31_t) (((q63_t) gnext2 * (*pk)) >> 32U);
+      fnext3 = (fnext3 << 1U) + fcurr3;
+
+      /* g1(n) = f0(n) * K1 + g0(n-1) */
+      gnext3 = (q31_t) (((q63_t) fcurr3 * (*pk)) >> 32U);
+      gnext3 = (gnext3 << 1U) + gnext2;
+
+      gnext2 = (q31_t) (((q63_t) fcurr2 * (*pk)) >> 32U);
+      gnext2 = (gnext2 << 1U) + gnext1;
+
+      gnext1 = (q31_t) (((q63_t) fcurr1 * (*pk)) >> 32U);
+      gnext1 = (gnext1 << 1U) + gnext0;
+
+      gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk++)) >> 32U);
+      gnext0 = (gnext0 << 1U) + gcurr0;
+
+      /* Update of f values for next coefficient set processing */
+      fcurr0 = fnext0;
+      fcurr1 = fnext1;
+      fcurr2 = fnext2;
+      fcurr3 = fnext3;
+
+      stageCnt--;
+    }
+
+    /* The results in the 4 accumulators, store in the destination buffer. */
+    /* y(n) = fN(n) */
+    *pDst++ = fcurr0;
+    *pDst++ = fcurr1;
+    *pDst++ = fcurr2;
+    *pDst++ = fcurr3;
+
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* f0(n) = x(n) */
+    fcurr0 = *pSrc++;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* Initialize coeff pointer */
+    pk = pCoeffs;
+
+    /* read g2(n) from state buffer */
+    gcurr0 = *px;
+
+    /* for sample 1 processing */
+    /* f1(n) = f0(n) + K1 * g0(n-1) */
+    fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+    fnext0 = (fnext0 << 1U) + fcurr0;
+
+    /* g1(n) = f0(n) * K1 + g0(n-1) */
+    gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk++)) >> 32U);
+    gnext0 = (gnext0 << 1U) + gcurr0;
+
+    /* save g1(n) in state buffer */
+    *px++ = fcurr0;
+
+    /* f1(n) is saved in fcurr0 for next stage processing */
+    fcurr0 = fnext0;
+
+    stageCnt = (numStages - 1U);
+
+    /* stage loop */
+    while (stageCnt > 0U)
+    {
+      /* read g2(n) from state buffer */
+      gcurr0 = *px;
+
+      /* save g1(n) in state buffer */
+      *px++ = gnext0;
+
+      /* Sample processing for K2, K3.... */
+      /* f2(n) = f1(n) + K2 * g1(n-1) */
+      fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+      fnext0 = (fnext0 << 1U) + fcurr0;
+
+      /* g2(n) = f1(n) * K2 + g1(n-1) */
+      gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk++)) >> 32U);
+      gnext0 = (gnext0 << 1U) + gcurr0;
+
+      /* f1(n) is saved in fcurr0 for next stage processing */
+      fcurr0 = fnext0;
+
+      stageCnt--;
+    }
+
+    /* y(n) = fN(n) */
+    *pDst++ = fcurr0;
+
+    blkCnt--;
+  }
+
+#else
+/* alternate version for CM0_FAMILY */
+
+  blkCnt = blockSize;
+
+  while (blkCnt > 0U)
+  {
+    /* f0(n) = x(n) */
+    fcurr0 = *pSrc++;
+
+    /* Initialize state pointer */
+    px = pState;
+
+    /* Initialize coeff pointer */
+    pk = pCoeffs;
+
+    /* read g0(n-1) from state buffer */
+    gcurr0 = *px;
+
+    /* for sample 1 processing */
+    /* f1(n) = f0(n) + K1 * g0(n-1) */
+    fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+    /* fix: was '(fnext << 1U)' - 'fnext' is not declared in this function */
+    fnext0 = (fnext0 << 1U) + fcurr0;
+
+    /* g1(n) = f0(n) * K1 + g0(n-1) */
+    gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk++)) >> 32U);
+    gnext0 = (gnext0 << 1U) + gcurr0;
+
+    /* save f0(n) in state buffer */
+    *px++ = fcurr0;
+
+    /* f1(n) is saved in fcurr for next stage processing */
+    fcurr0 = fnext0;
+
+    stageCnt = (numStages - 1U);
+
+    /* stage loop */
+    while (stageCnt > 0U)
+    {
+      /* read g1(n-1) from state buffer */
+      gcurr0 = *px;
+
+      /* save g0(n-1) in state buffer */
+      *px++ = gnext0;
+
+      /* Sample processing for K2, K3.... */
+      /* f2(n) = f1(n) + K2 * g1(n-1) */
+      fnext0 = (q31_t) (((q63_t) gcurr0 * (*pk)) >> 32U);
+      fnext0 = (fnext0 << 1U) + fcurr0;
+
+      /* g2(n) = f1(n) * K2 + g1(n-1) */
+      gnext0 = (q31_t) (((q63_t) fcurr0 * (*pk++)) >> 32U);
+      gnext0 = (gnext0 << 1U) + gcurr0;
+
+      /* f1(n) is saved in fcurr0 for next stage processing */
+      fcurr0 = fnext0;
+
+      stageCnt--;
+    }
+
+    /* y(n) = fN(n) */
+    *pDst++ = fcurr0;
+
+    blkCnt--;
+  }
+
+#endif /* #if !defined(ARM_MATH_CM0_FAMILY) */
+
+}
+
+/**
+ @} end of FIR_Lattice group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q15.c
new file mode 100644
index 0000000000..2251ff6d5e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q15.c
@@ -0,0 +1,732 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_q15.c
+ * Description: Q15 FIR filter processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR
+ @{
+ */
+
+/**
+ @brief Processing function for the Q15 FIR filter.
+ @param[in] S points to an instance of the Q15 FIR filter structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+ Lastly, the accumulator is saturated to yield a result in 1.15 format.
+
+ @remark
+ Refer to \ref arm_fir_fast_q15() for a faster but less precise implementation of this function.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/* Convert a 64-bit accumulator to Q15: saturating long shift right by 'shift'
+ * (sqrshrl_sat48 saturates the 64-bit value while shifting), then keep the
+ * low 32 bits of the upper word. Result is the saturated 1.15 sample. */
+#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
+
+
+/* Compute nbAcc consecutive Q15 FIR outputs. For output j, multiply-accumulate
+ * nbVecTaps vectors of 8 taps against the samples starting at pSample[j]
+ * (64-bit accumulation via vmlaldavaq), then shift/saturate down to 1.15.
+ * NOTE(review): only element acc[j] of the acc[4] scratch array is live in
+ * each iteration; the array form just keeps the accumulators distinct. */
+#define FIR_Q15_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs) \
+ for (int j = 0; j < nbAcc; j++) { \
+ const q15_t *pSmp = &pSample[j]; \
+ q63_t acc[4]; \
+ \
+ acc[j] = 0; \
+ for (int i = 0; i < nbVecTaps; i++) { \
+ vecIn0 = vld1q(pSmp + 8 * i); \
+ acc[j] = vmlaldavaq(acc[j], vecIn0, vecCoeffs[i]); \
+ } \
+ *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc[j], 15); \
+ }
+
+/* Shared body of the fixed-tap-count MVE specializations below.
+ * The caller must #define NBVECTAPS (number of 8-tap coefficient vectors)
+ * and have S, pSrc, pDst and blockSize in scope. Taps beyond S->numTaps are
+ * still read (NBVECTAPS * 8 samples/coeffs per output) — presumably the
+ * coefficient array is zero-padded by the init function; TODO confirm. */
+#define FIR_Q15_MAIN_CORE() \
+{ \
+ q15_t *pState = S->pState; /* State pointer */ \
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ \
+ q15_t *pStateCur; /* Points to the current sample of the state */ \
+ const q15_t *pSamples; /* Temporary pointer to the sample buffer */ \
+ q15_t *pOutput; /* Temporary pointer to the output buffer */ \
+ const q15_t *pTempSrc; /* Temporary pointer to the source data */ \
+ q15_t *pTempDest; /* Temporary pointer to the destination buffer */\
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */\
+ int32_t blkCnt; \
+ q15x8_t vecIn0; \
+ \
+ /* \
+ * load coefs \
+ */ \
+ q15x8_t vecCoeffs[NBVECTAPS]; \
+ \
+ for (int i = 0; i < NBVECTAPS; i++) \
+ vecCoeffs[i] = vldrhq_s16(pCoeffs + 8 * i); \
+ \
+ /* \
+ * pState points to state array which contains previous frame (numTaps - 1) samples \
+ * pStateCur points to the location where the new input data should be written \
+ */ \
+ pStateCur = &(pState[(numTaps - 1u)]); \
+ pTempSrc = pSrc; \
+ pSamples = pState; \
+ pOutput = pDst; \
+ \
+ blkCnt = blockSize >> 2; \
+ while (blkCnt > 0) { \
+ /* \
+ * Save 4 input samples in the history buffer \
+ */ \
+ vstrhq_s32(pStateCur, vldrhq_s32(pTempSrc)); \
+ pStateCur += 4; \
+ pTempSrc += 4; \
+ \
+ FIR_Q15_CORE(pOutput, 4, NBVECTAPS, pSamples, vecCoeffs); \
+ pSamples += 4; \
+ \
+ blkCnt--; \
+ } \
+ \
+ /* tail */ \
+ int32_t residual = blockSize & 3; \
+ \
+ for (int i = 0; i < residual; i++) \
+ *pStateCur++ = *pTempSrc++; \
+ \
+ FIR_Q15_CORE(pOutput, residual, NBVECTAPS, pSamples, vecCoeffs); \
+ \
+ /* \
+ * Copy the samples back into the history buffer start \
+ */ \
+ pTempSrc = &pState[blockSize]; \
+ pTempDest = pState; \
+ \
+ /* current compiler limitation */ \
+ blkCnt = (numTaps - 1) >> 3; \
+ while (blkCnt > 0) \
+ { \
+ vstrhq_s16(pTempDest, vldrhq_s16(pTempSrc)); \
+ pTempSrc += 8; \
+ pTempDest += 8; \
+ blkCnt--; \
+ } \
+ blkCnt = (numTaps - 1) & 7; \
+ if (blkCnt > 0) \
+ { \
+ mve_pred16_t p = vctp16q(blkCnt); \
+ vstrhq_p_s16(pTempDest, vldrhq_z_s16(pTempSrc, p), p); \
+ } \
+}
+
+/* Fixed-size MVE kernel for 25..32 tap Q15 filters (NBTAPS rounded up to 32). */
+static void arm_fir_q15_25_32_mve(const arm_fir_instance_q15 * S,
+ const q15_t * __restrict pSrc,
+ q15_t * __restrict pDst, uint32_t blockSize)
+{
+ #define NBTAPS 32
+ #define NBVECTAPS (NBTAPS / 8)
+ FIR_Q15_MAIN_CORE();
+ #undef NBVECTAPS
+ #undef NBTAPS
+}
+
+/* Fixed-size MVE kernel for 17..24 tap Q15 filters (NBTAPS rounded up to 24). */
+static void arm_fir_q15_17_24_mve(const arm_fir_instance_q15 * S,
+ const q15_t * __restrict pSrc,
+ q15_t * __restrict pDst, uint32_t blockSize)
+{
+ #define NBTAPS 24
+ #define NBVECTAPS (NBTAPS / 8)
+ FIR_Q15_MAIN_CORE();
+ #undef NBVECTAPS
+ #undef NBTAPS
+}
+
+
+/* Fixed-size MVE kernel for 9..16 tap Q15 filters (NBTAPS rounded up to 16). */
+static void arm_fir_q15_9_16_mve(const arm_fir_instance_q15 * S,
+ const q15_t * __restrict pSrc,
+ q15_t * __restrict pDst, uint32_t blockSize)
+{
+ #define NBTAPS 16
+ #define NBVECTAPS (NBTAPS / 8)
+ FIR_Q15_MAIN_CORE();
+ #undef NBVECTAPS
+ #undef NBTAPS
+}
+
+/* Fixed-size MVE kernel for 1..8 tap Q15 filters (NBTAPS rounded up to 8). */
+static void arm_fir_q15_1_8_mve(const arm_fir_instance_q15 * S,
+ const q15_t * __restrict pSrc,
+ q15_t * __restrict pDst, uint32_t blockSize)
+{
+ #define NBTAPS 8
+ #define NBVECTAPS (NBTAPS / 8)
+ FIR_Q15_MAIN_CORE();
+ #undef NBVECTAPS
+ #undef NBTAPS
+}
+
+
+/* Q15 FIR, Helium/MVE generic path. Filters of 32 taps or fewer are routed to
+ * the dedicated unrolled kernels above; longer filters are processed here,
+ * 4 outputs per loop iteration with 8-tap coefficient blocks.
+ * Fixes vs. previous revision (all aligned with FIR_Q15_MAIN_CORE):
+ * - the main loop consumed 8 input samples per 4 outputs (over-reading pSrc
+ * and overflowing the state buffer); it now copies exactly 4,
+ * - residual outputs used sample offsets [2]/[4] instead of [1]/[2],
+ * - residual case 3 applied an extra asrl() before MVE_ASRL_SAT16 (double
+ * shift, wrong scaling),
+ * - residual history copy moved 8 samples where only 1..3 remain,
+ * - the final history move copies numTaps-1 samples, not numTaps. */
+void arm_fir_q15(
+ const arm_fir_instance_q15 * S,
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCur; /* Points to the current sample of the state */
+ const q15_t *pSamples; /* Temporary pointer to the sample buffer */
+ q15_t *pOutput; /* Temporary pointer to the output buffer */
+ const q15_t *pTempSrc; /* Temporary pointer to the source data */
+ q15_t *pTempDest; /* Temporary pointer to the destination buffer */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t blkCnt;
+ q15x8_t vecIn0;
+ uint32_t tapsBlkCnt = (numTaps + 7) / 8; /* 8-tap coefficient blocks */
+ q63_t acc0, acc1, acc2, acc3;
+
+ /*
+ * Dispatch short filters to the fully unrolled kernels
+ */
+ int32_t nbTaps = (numTaps + 7) >> 3;
+
+ switch (nbTaps) {
+ case 1:
+ arm_fir_q15_1_8_mve(S, pSrc, pDst, blockSize);
+ return;
+ case 2:
+ arm_fir_q15_9_16_mve(S, pSrc, pDst, blockSize);
+ return;
+ case 3:
+ arm_fir_q15_17_24_mve(S, pSrc, pDst, blockSize);
+ return;
+ case 4:
+ arm_fir_q15_25_32_mve(S, pSrc, pDst, blockSize);
+ return;
+ }
+ /*
+ * pState points to state array which contains previous frame (numTaps - 1) samples
+ * pStateCur points to the location where the new input data should be written
+ */
+ pStateCur = &(pState[(numTaps - 1u)]);
+ pTempSrc = pSrc;
+ pSamples = pState;
+ pOutput = pDst;
+ blkCnt = blockSize >> 2;
+
+ while (blkCnt > 0U)
+ {
+ const q15_t *pCoeffsTmp = pCoeffs;
+ const q15_t *pSamplesTmp = pSamples;
+
+ acc0 = 0LL;
+ acc1 = 0LL;
+ acc2 = 0LL;
+ acc3 = 0LL;
+
+ /*
+ * Save 4 input samples in the history buffer: exactly as many inputs
+ * as outputs are produced per iteration (copying 8 here would run off
+ * the end of pSrc and of the state buffer)
+ */
+ vstrhq_s32(pStateCur, vldrhq_s32(pTempSrc));
+ pStateCur += 4;
+ pTempSrc += 4;
+
+ int i = tapsBlkCnt;
+ while (i > 0)
+ {
+ /*
+ * load 8 coefs
+ */
+ q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
+
+ /* 4 consecutive outputs read from offsets 0..3 */
+ vecIn0 = vld1q(pSamplesTmp);
+ acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
+
+ vecIn0 = vld1q(&pSamplesTmp[1]);
+ acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
+
+ vecIn0 = vld1q(&pSamplesTmp[2]);
+ acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
+
+ vecIn0 = vld1q(&pSamplesTmp[3]);
+ acc3 = vmlaldavaq(acc3, vecIn0, vecCoeffs);
+
+ pSamplesTmp += 8;
+ pCoeffsTmp += 8;
+ /*
+ * Decrement the taps block loop counter
+ */
+ i--;
+ }
+
+ /* Shift the 34.30 accumulators down to saturated 1.15 */
+ *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
+ *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
+ *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
+ *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc3, 15);
+
+ pSamples += 4;
+ /*
+ * Decrement the sample block loop counter
+ */
+ blkCnt--;
+ }
+
+ /*
+ * Tail: 1 to 3 remaining samples. Copy exactly the remaining inputs
+ * into the history buffer (a full 8-lane vector copy would read past
+ * the end of pSrc), then compute the remaining outputs.
+ */
+ uint32_t residual = blockSize & 3;
+
+ for (uint32_t k = 0; k < residual; k++)
+ *pStateCur++ = *pTempSrc++;
+
+ switch (residual)
+ {
+ case 3:
+ {
+ const q15_t *pCoeffsTmp = pCoeffs;
+ const q15_t *pSamplesTmp = pSamples;
+
+ acc0 = 0LL;
+ acc1 = 0LL;
+ acc2 = 0LL;
+
+ int i = tapsBlkCnt;
+ while (i > 0)
+ {
+ /*
+ * load 8 coefs
+ */
+ q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
+
+ /* consecutive outputs use sample offsets 0, 1 and 2 */
+ vecIn0 = vld1q(pSamplesTmp);
+ acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
+
+ vecIn0 = vld1q(&pSamplesTmp[1]);
+ acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
+
+ vecIn0 = vld1q(&pSamplesTmp[2]);
+ acc2 = vmlaldavaq(acc2, vecIn0, vecCoeffs);
+
+ pSamplesTmp += 8;
+ pCoeffsTmp += 8;
+ /*
+ * Decrement the taps block loop counter
+ */
+ i--;
+ }
+
+ /* single shift/saturate, same scaling as the main loop */
+ *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
+ *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
+ *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc2, 15);
+ }
+ break;
+
+ case 2:
+ {
+ const q15_t *pCoeffsTmp = pCoeffs;
+ const q15_t *pSamplesTmp = pSamples;
+
+ acc0 = 0LL;
+ acc1 = 0LL;
+
+ int i = tapsBlkCnt;
+ while (i > 0)
+ {
+ /*
+ * load 8 coefs
+ */
+ q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
+
+ /* consecutive outputs use sample offsets 0 and 1 */
+ vecIn0 = vld1q(pSamplesTmp);
+ acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
+
+ vecIn0 = vld1q(&pSamplesTmp[1]);
+ acc1 = vmlaldavaq(acc1, vecIn0, vecCoeffs);
+
+ pSamplesTmp += 8;
+ pCoeffsTmp += 8;
+ /*
+ * Decrement the taps block loop counter
+ */
+ i--;
+ }
+
+ *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
+ *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc1, 15);
+ }
+ break;
+
+ case 1:
+ {
+ const q15_t *pCoeffsTmp = pCoeffs;
+ const q15_t *pSamplesTmp = pSamples;
+
+ acc0 = 0LL;
+
+ int i = tapsBlkCnt;
+ while (i > 0)
+ {
+ /*
+ * load 8 coefs
+ */
+ q15x8_t vecCoeffs = *(q15x8_t *) pCoeffsTmp;
+
+ vecIn0 = vld1q(pSamplesTmp);
+ acc0 = vmlaldavaq(acc0, vecIn0, vecCoeffs);
+
+ pSamplesTmp += 8;
+ pCoeffsTmp += 8;
+ /*
+ * Decrement the taps block loop counter
+ */
+ i--;
+ }
+
+ *pOutput++ = (q15_t) MVE_ASRL_SAT16(acc0, 15);
+ }
+ break;
+ }
+
+ /*
+ * Copy the samples back into the history buffer start; only the last
+ * numTaps - 1 samples are needed for the next call
+ */
+ pTempSrc = &pState[blockSize];
+ pTempDest = pState;
+
+ blkCnt = (numTaps - 1U) >> 3;
+ while (blkCnt > 0U)
+ {
+ vst1q(pTempDest, vld1q(pTempSrc));
+ pTempSrc += 8;
+ pTempDest += 8;
+ blkCnt--;
+ }
+ blkCnt = (numTaps - 1U) & 7;
+ if (blkCnt > 0U)
+ {
+ /* predicated load AND store so neither side runs past the buffer */
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ vstrhq_p_s16(pTempDest, vldrhq_z_s16(pTempSrc, p0), p0);
+ }
+}
+
+#else
+/* Q15 FIR, scalar/DSP-extension path: computes y[n] = sum(b[k] * x[n-k]) with
+ * 64-bit accumulation, using packed 16-bit SIMD reads (__SMLALD/__SMLALDX)
+ * when ARM_MATH_LOOPUNROLL is enabled. */
+void arm_fir_q15(
+ const arm_fir_instance_q15 * S,
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q15_t *px; /* Temporary pointer for state buffer */
+ const q15_t *pb; /* Temporary pointer for coefficient buffer */
+ q63_t acc0; /* Accumulators */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q63_t acc1, acc2, acc3; /* Accumulators */
+ q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */
+#endif
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 output values simultaneously.
+ * The variables acc0 ... acc3 hold output values that are being computed:
+ *
+ * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
+ */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy 4 new input samples into the state buffer. */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */
+ px = pState;
+
+ /* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */
+ pb = pCoeffs;
+
+ /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */
+ x0 = read_q15x2_ia (&px);
+
+ /* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */
+ x2 = read_q15x2_ia (&px);
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ Repeat until we've computed numTaps-(numTaps%4) coefficients. */
+ tapCnt = numTaps >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */
+ c0 = read_q15x2_ia ((q15_t **) &pb);
+
+ /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
+ acc0 = __SMLALD(x0, c0, acc0);
+
+ /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
+ acc2 = __SMLALD(x2, c0, acc2);
+
+ /* pack x[n-N-1] and x[n-N-2] */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x2, x0, 0);
+#else
+ x1 = __PKHBT(x0, x2, 0);
+#endif
+
+ /* Read state x[n-N-4], x[n-N-5] */
+ x0 = read_q15x2_ia (&px);
+
+ /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
+ acc1 = __SMLALDX(x1, c0, acc1);
+
+ /* pack x[n-N-3] and x[n-N-4] */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x0, x2, 0);
+#else
+ x1 = __PKHBT(x2, x0, 0);
+#endif
+
+ /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
+ acc3 = __SMLALDX(x1, c0, acc3);
+
+ /* Read coefficients b[N-2], b[N-3] */
+ c0 = read_q15x2_ia ((q15_t **) &pb);
+
+ /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
+ acc0 = __SMLALD(x2, c0, acc0);
+
+ /* Read state x[n-N-6], x[n-N-7] with offset */
+ x2 = read_q15x2_ia (&px);
+
+ /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
+ acc2 = __SMLALD(x0, c0, acc2);
+
+ /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
+ acc1 = __SMLALDX(x1, c0, acc1);
+
+ /* pack x[n-N-5] and x[n-N-6] */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x2, x0, 0);
+#else
+ x1 = __PKHBT(x0, x2, 0);
+#endif
+
+ /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
+ acc3 = __SMLALDX(x1, c0, acc3);
+
+ /* Decrement tap count */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps.
+ This will always be 2 taps since the filter length is even. */
+ if ((numTaps & 0x3U) != 0U)
+ {
+ /* Read last two coefficients */
+ c0 = read_q15x2_ia ((q15_t **) &pb);
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALD(x0, c0, acc0);
+ acc2 = __SMLALD(x2, c0, acc2);
+
+ /* pack state variables */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x2, x0, 0);
+#else
+ x1 = __PKHBT(x0, x2, 0);
+#endif
+
+ /* Read last state variables */
+ x0 = read_q15x2 (px);
+
+ /* Perform the multiply-accumulates */
+ acc1 = __SMLALDX(x1, c0, acc1);
+
+ /* pack state variables */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x0, x2, 0);
+#else
+ x1 = __PKHBT(x2, x0, 0);
+#endif
+
+ /* Perform the multiply-accumulates */
+ acc3 = __SMLALDX(x1, c0, acc3);
+ }
+
+ /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
+ Then store the 4 outputs in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16));
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16));
+#else
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16));
+ write_q15x2_ia (&pDst, __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 4U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining output samples */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of taps */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Copy one sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc0 = 0;
+
+ /* Use SIMD to hold states and coefficients */
+ px = pState;
+ pb = pCoeffs;
+
+ /* Process two taps per iteration (filter length is even) */
+ tapCnt = numTaps >> 1U;
+
+ while (tapCnt > 0U)
+ {
+ acc0 += (q31_t) *px++ * *pb++;
+ acc0 += (q31_t) *px++ * *pb++;
+
+ tapCnt--;
+ }
+
+
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ Then store the output in the destination buffer. */
+ *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of taps */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Copy remaining data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of FIR group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q31.c
new file mode 100644
index 0000000000..088a9b0938
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q31.c
@@ -0,0 +1,1162 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_q31.c
+ * Description: Q31 FIR filter processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR
+ @{
+ */
+
+/**
+ @brief Processing function for Q31 FIR filter.
+ @param[in] S points to an instance of the Q31 FIR filter structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around rather than clip.
+ In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
+ After all multiply-accumulates are performed, the 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+
+ @remark
+ Refer to \ref arm_fir_fast_q31() for a faster but less precise implementation of this filter.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+
+/* Compute nbAcc consecutive Q31 FIR outputs: for output j, accumulate
+ * nbVecTaps vectors of 4 taps with vrmlaldavhaq, then asrl by 23 to restore
+ * the 1.31 result (vrmlaldavhaq keeps the high part of each product sum).
+ * fix: the macro now uses its pSample parameter; previously it silently read
+ * the caller's 'pSamples' variable, unlike the _STR_PARTIAL/_LD_PARTIAL
+ * siblings (all existing call sites pass pSamples, so expansion is identical).
+ * NOTE(review): only acc[j] of the acc[4] scratch array is live per iteration. */
+#define FIR_Q31_CORE(nbAcc, nbVecTaps, pSample, vecCoeffs) \
+ for (int j = 0; j < nbAcc; j++) { \
+ const q31_t *pSmp = &pSample[j]; \
+ q31x4_t vecIn0; \
+ q63_t acc[4]; \
+ \
+ acc[j] = 0; \
+ for (int i = 0; i < nbVecTaps; i++) { \
+ vecIn0 = vld1q(pSmp + 4 * i); \
+ acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]); \
+ } \
+ *pOutput++ = (q31_t)asrl(acc[j], 23); \
+ }
+
+
+/* First pass for long filters: same MAC kernel as FIR_Q31_CORE, but the raw
+ * 64-bit partial accumulators are stored to arm_fir_partial_accu_ptr instead
+ * of being shifted and written to the output. */
+#define FIR_Q31_CORE_STR_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs) \
+ for (int j = 0; j < nbAcc; j++) { \
+ const q31_t *pSmp = &pSample[j]; \
+ q31x4_t vecIn0; \
+ \
+ acc[j] = 0; \
+ for (int i = 0; i < nbVecTaps; i++) { \
+ vecIn0 = vld1q(pSmp + 4 * i); \
+ acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]); \
+ } \
+ *arm_fir_partial_accu_ptr++ = acc[j]; \
+ }
+
+
+/* Second pass for long filters: reload the 64-bit partial accumulators saved
+ * by FIR_Q31_CORE_STR_PARTIAL, add the remaining taps' contribution, then
+ * shift down by 23 and write the final 1.31 outputs. */
+#define FIR_Q31_CORE_LD_PARTIAL(nbAcc, nbVecTaps, pSample, vecCoeffs) \
+ for (int j = 0; j < nbAcc; j++) { \
+ const q31_t *pSmp = &pSample[j]; \
+ q31x4_t vecIn0; \
+ \
+ acc[j] = *arm_fir_partial_accu_ptr++; \
+ \
+ for (int i = 0; i < nbVecTaps; i++) { \
+ vecIn0 = vld1q(pSmp + 4 * i); \
+ acc[j] = vrmlaldavhaq(acc[j], vecIn0, vecCoeffs[i]); \
+ } \
+ *pOutput++ = (q31_t)asrl(acc[j], 23); \
+ }
+
+
+/* Shared body of the fixed-tap-count Q31 MVE specializations below.
+ * The caller must #define NBVECTAPS (number of 4-tap coefficient vectors)
+ * and have S, pSrc, pDst and blockSize in scope. The state is read starting
+ * at S->pState + 2*ROUND_UP(blockSize, 4); the area below it is scratch used
+ * by the partial-accumulator kernels. */
+#define FIR_Q31_MAIN_CORE() \
+{ \
+ q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4); \
+ q31_t *pState = pRefStatePtr; /* State pointer */ \
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */ \
+ q31_t *pStateCur; /* Points to the current sample of the state */ \
+ const q31_t *pSamples; /* Temporary pointer to the sample buffer */ \
+ q31_t *pOutput; /* Temporary pointer to the output buffer */ \
+ const q31_t *pTempSrc; /* Temporary pointer to the source data */ \
+ q31_t *pTempDest; /* Temporary pointer to the destination buffer */\
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */\
+ int32_t blkCnt; \
+ \
+ /* \
+ * load coefs \
+ */ \
+ q31x4_t vecCoeffs[NBVECTAPS]; \
+ \
+ for (int i = 0; i < NBVECTAPS; i++) \
+ vecCoeffs[i] = vld1q(pCoeffs + 4 * i); \
+ \
+ /* \
+ * pState points to state array which contains previous frame (numTaps - 1) samples \
+ * pStateCur points to the location where the new input data should be written \
+ */ \
+ pStateCur = &(pState[(numTaps - 1u)]); \
+ pTempSrc = pSrc; \
+ pSamples = pState; \
+ pOutput = pDst; \
+ \
+ blkCnt = blockSize >> 2; \
+ while (blkCnt > 0) { \
+ /* \
+ * Save 4 input samples in the history buffer \
+ */ \
+ vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc)); \
+ pStateCur += 4; \
+ pTempSrc += 4; \
+ \
+ FIR_Q31_CORE(4, NBVECTAPS, pSamples, vecCoeffs); \
+ \
+ pSamples += 4; \
+ /* \
+ * Decrement the sample block loop counter \
+ */ \
+ blkCnt--; \
+ } \
+ \
+ /* tail */ \
+ int32_t residual = blockSize & 3; \
+ switch (residual) { \
+ case 3: \
+ { \
+ for (int i = 0; i < residual; i++) \
+ *pStateCur++ = *pTempSrc++; \
+ \
+ FIR_Q31_CORE(3, NBVECTAPS, pSamples, vecCoeffs); \
+ } \
+ break; \
+ \
+ case 2: \
+ { \
+ for (int i = 0; i < residual; i++) \
+ *pStateCur++ = *pTempSrc++; \
+ \
+ FIR_Q31_CORE(2, NBVECTAPS, pSamples, vecCoeffs); \
+ } \
+ break; \
+ \
+ case 1: \
+ { \
+ for (int i = 0; i < residual; i++) \
+ *pStateCur++ = *pTempSrc++; \
+ \
+ FIR_Q31_CORE(1, NBVECTAPS, pSamples, vecCoeffs); \
+ } \
+ break; \
+ } \
+ \
+ /* \
+ * Copy the samples back into the history buffer start \
+ */ \
+ pTempSrc = &pState[blockSize]; \
+ pTempDest = pState; \
+ \
+ blkCnt =(numTaps - 1) >> 2; \
+ while (blkCnt > 0) \
+ { \
+ vstrwq_s32(pTempDest, vldrwq_s32(pTempSrc)); \
+ pTempSrc += 4; \
+ pTempDest += 4; \
+ blkCnt--; \
+ } \
+ blkCnt = (numTaps - 1) & 3; \
+ if (blkCnt > 0) \
+ { \
+ mve_pred16_t p0 = vctp32q(blkCnt); \
+ vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p0), p0); \
+ } \
+}
+
+/* MVE kernel for Q31 filters with 1..4 taps: a single 4-tap coefficient
+ * vector, 4 outputs per loop iteration. Accumulation uses vrmlaldavhq and
+ * the result is shifted down by 23 to restore 1.31 format.
+ * fix: residual case 3 used a raw vector pointer-cast copy; it now uses
+ * vst1q/vld1q like the main loop and the other residual cases. */
+static void arm_fir_q31_1_4_mve(const arm_fir_instance_q31 * S,
+ const q31_t * __restrict pSrc,
+ q31_t * __restrict pDst, uint32_t blockSize)
+{
+ q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
+ q31_t *pState = pRefStatePtr; /* State pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCur; /* Points to the current sample of the state */
+ const q31_t *pSamples; /* Temporary pointer to the sample buffer */
+ q31_t *pOutput; /* Temporary pointer to the output buffer */
+ const q31_t *pTempSrc; /* Temporary pointer to the source data */
+ q31_t *pTempDest; /* Temporary pointer to the destination buffer */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t blkCnt;
+ q31x4_t vecIn0;
+
+
+ /*
+ * pState points to state array which contains previous frame (numTaps - 1) samples
+ * pStateCur points to the location where the new input data should be written
+ */
+ pStateCur = &(pState[(numTaps - 1u)]);
+ pTempSrc = pSrc;
+ pSamples = pState;
+ pOutput = pDst;
+
+ q63_t acc0=0, acc1=0, acc2=0, acc3=0;
+ /*
+ * load 4 coefs
+ */
+ q31x4_t vecCoeffs = *(q31x4_t *) pCoeffs;
+
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0U)
+ {
+ const q31_t *pSamplesTmp = pSamples;
+
+ /*
+ * Save 4 input samples in the history buffer
+ */
+ vst1q(pStateCur, vld1q(pTempSrc));
+ pStateCur += 4;
+ pTempSrc += 4;
+
+ /* 4 consecutive outputs read from sample offsets 0..3 */
+ vecIn0 = vld1q(pSamplesTmp);
+ acc0 = vrmlaldavhq(vecIn0, vecCoeffs);
+
+ vecIn0 = vld1q(&pSamplesTmp[1]);
+ acc1 = vrmlaldavhq(vecIn0, vecCoeffs);
+
+ vecIn0 = vld1q(&pSamplesTmp[2]);
+ acc2 = vrmlaldavhq(vecIn0, vecCoeffs);
+
+ vecIn0 = vld1q(&pSamplesTmp[3]);
+ acc3 = vrmlaldavhq(vecIn0, vecCoeffs);
+
+ /* restore 1.31 format */
+ acc0 = asrl(acc0, 23);
+ acc1 = asrl(acc1, 23);
+ acc2 = asrl(acc2, 23);
+ acc3 = asrl(acc3, 23);
+
+ *pOutput++ = (q31_t) acc0;
+ *pOutput++ = (q31_t) acc1;
+ *pOutput++ = (q31_t) acc2;
+ *pOutput++ = (q31_t) acc3;
+
+ pSamples += 4;
+ /*
+ * Decrement the sample block loop counter
+ */
+ blkCnt--;
+ }
+
+ /*
+ * Tail: 1 to 3 remaining samples. A full 4-lane vector of input is
+ * copied in each case even though fewer samples remain — NOTE(review):
+ * this reads up to 3 elements past the caller's last input sample;
+ * presumably the extra lanes never reach a computed output, but confirm
+ * the source buffer sizing convention before relying on it.
+ */
+ uint32_t residual = blockSize & 3;
+ switch (residual)
+ {
+ case 3:
+ {
+ /*
+ * Save 4 input samples in the history buffer
+ */
+ vst1q(pStateCur, vld1q(pTempSrc));
+ pStateCur += 4;
+ pTempSrc += 4;
+
+ vecIn0 = vld1q(pSamples);
+ acc0 = vrmlaldavhq(vecIn0, vecCoeffs);
+
+ vecIn0 = vld1q(&pSamples[1]);
+ acc1 = vrmlaldavhq(vecIn0, vecCoeffs);
+
+ vecIn0 = vld1q(&pSamples[2]);
+ acc2 = vrmlaldavhq(vecIn0, vecCoeffs);
+
+ acc0 = asrl(acc0, 23);
+ acc1 = asrl(acc1, 23);
+ acc2 = asrl(acc2, 23);
+
+ *pOutput++ = (q31_t) acc0;
+ *pOutput++ = (q31_t) acc1;
+ *pOutput++ = (q31_t) acc2;
+ }
+ break;
+
+ case 2:
+ {
+ /*
+ * Save 4 input samples in the history buffer
+ */
+ vst1q(pStateCur, vld1q(pTempSrc));
+ pStateCur += 4;
+ pTempSrc += 4;
+
+ vecIn0 = vld1q(pSamples);
+ acc0 = vrmlaldavhq(vecIn0, vecCoeffs);
+
+ vecIn0 = vld1q(&pSamples[1]);
+ acc1 = vrmlaldavhq(vecIn0, vecCoeffs);
+
+ acc0 = asrl(acc0, 23);
+ acc1 = asrl(acc1, 23);
+
+ *pOutput++ = (q31_t) acc0;
+ *pOutput++ = (q31_t) acc1;
+ }
+ break;
+
+ case 1:
+ {
+ /*
+ * Save 4 input samples in the history buffer
+ */
+ vst1q(pStateCur, vld1q(pTempSrc));
+ pStateCur += 4;
+ pTempSrc += 4;
+
+ vecIn0 = vld1q(pSamples);
+ acc0 = vrmlaldavhq(vecIn0, vecCoeffs);
+
+ acc0 = asrl(acc0, 23);
+
+ *pOutput++ = (q31_t) acc0;
+ }
+ break;
+ }
+
+ /*
+ * Copy the last numTaps - 1 samples back to the history buffer start
+ */
+ pTempSrc = &pState[blockSize];
+ pTempDest = pState;
+
+ blkCnt = (numTaps-1) >> 2;
+ while (blkCnt > 0U)
+ {
+ vst1q(pTempDest, vld1q(pTempSrc));
+ pTempSrc += 4;
+ pTempDest += 4;
+ blkCnt--;
+ }
+ blkCnt = (numTaps-1) & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);
+ }
+}
+
+
+
+/* Fixed-size MVE kernel for 5..8 tap Q31 filters (NBTAPS rounded up to 8). */
+static void arm_fir_q31_5_8_mve(const arm_fir_instance_q31 * S,
+ const q31_t * __restrict pSrc,
+ q31_t * __restrict pDst, uint32_t blockSize)
+{
+ #define NBTAPS 8
+ #define NBVECTAPS (NBTAPS / 4)
+ FIR_Q31_MAIN_CORE();
+ #undef NBVECTAPS
+ #undef NBTAPS
+}
+
+
+/* Fixed-size MVE kernel for 9..12 tap Q31 filters (NBTAPS rounded up to 12). */
+static void arm_fir_q31_9_12_mve(const arm_fir_instance_q31 * S,
+ const q31_t * __restrict pSrc,
+ q31_t * __restrict pDst, uint32_t blockSize)
+{
+ #define NBTAPS 12
+ #define NBVECTAPS (NBTAPS / 4)
+ FIR_Q31_MAIN_CORE();
+ #undef NBVECTAPS
+ #undef NBTAPS
+}
+
+
+/* Fixed-size MVE kernel for 13..16 tap Q31 filters (NBTAPS rounded up to 16). */
+static void arm_fir_q31_13_16_mve(const arm_fir_instance_q31 * S,
+ const q31_t * __restrict pSrc,
+ q31_t * __restrict pDst, uint32_t blockSize)
+{
+ #define NBTAPS 16
+ #define NBVECTAPS (NBTAPS / 4)
+ FIR_Q31_MAIN_CORE();
+ #undef NBVECTAPS
+ #undef NBTAPS
+}
+
+
+/* Fixed-size MVE kernel for 17..20 tap Q31 filters (NBTAPS rounded up to 20). */
+static void arm_fir_q31_17_20_mve(const arm_fir_instance_q31 * S,
+ const q31_t * __restrict pSrc,
+ q31_t * __restrict pDst, uint32_t blockSize)
+{
+ #define NBTAPS 20
+ #define NBVECTAPS (NBTAPS / 4)
+ FIR_Q31_MAIN_CORE();
+ #undef NBVECTAPS
+ #undef NBTAPS
+}
+
+
+/* Fixed-size MVE kernel for 21..24 tap Q31 filters (NBTAPS rounded up to 24). */
+static void arm_fir_q31_21_24_mve(const arm_fir_instance_q31 * S,
+ const q31_t * __restrict pSrc,
+ q31_t * __restrict pDst, uint32_t blockSize)
+{
+ #define NBTAPS 24
+ #define NBVECTAPS (NBTAPS / 4)
+ FIR_Q31_MAIN_CORE();
+ #undef NBVECTAPS
+ #undef NBTAPS
+}
+
+
+/* Fixed-size MVE kernel for 25..28 tap Q31 filters (NBTAPS rounded up to 28). */
+static void arm_fir_q31_25_28_mve(const arm_fir_instance_q31 * S,
+ const q31_t * __restrict pSrc,
+ q31_t * __restrict pDst, uint32_t blockSize)
+{
+ #define NBTAPS 28
+ #define NBVECTAPS (NBTAPS / 4)
+ FIR_Q31_MAIN_CORE();
+ #undef NBVECTAPS
+ #undef NBTAPS
+}
+
/* Specialized Q31 FIR kernel for filters with 29 to 32 taps (Helium/MVE).
 *
 * Because 32 taps exceed what can be held in registers comfortably, the
 * filter runs in two passes over the block:
 *   pass 1: accumulate the first 28 taps (7 coefficient vectors) for every
 *           output sample and park each 64-bit partial sum in a scratch area;
 *   pass 2: reload each partial sum, fold in the last 4 taps, convert to
 *           Q31 and store to pDst.
 * The scratch area is the first 2*ROUND_UP(blockSize, 4) q31 slots of
 * S->pState (one q63 partial accumulator per output sample); the FIR state
 * itself lives after that region (pRefStatePtr).
 *
 * NOTE(review): there is no residual path — the loops process blockSize>>2
 * groups only, so any trailing (blockSize & 3) samples are never computed.
 * The dispatcher guards this kernel with (blockSize >= 32) but not with a
 * multiple-of-4 condition; confirm blockSize is always a multiple of 4 here.
 */
static void arm_fir_q31_29_32_mve(const arm_fir_instance_q31 * S,
  const q31_t * __restrict pSrc,
  q31_t * __restrict pDst,
  uint32_t blockSize)
{
    q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
    q31_t *pState = pRefStatePtr;              /* State pointer (past the scratch area) */
    const q31_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
    q31_t *pStateCur;                          /* Points to the current sample of the state */
    const q31_t *pSamples;                     /* Temporary pointer to the sample buffer */
    q31_t *pOutput;                            /* Temporary pointer to the output buffer */
    const q31_t *pTempSrc;                     /* Temporary pointer to the source data */
    q31_t *pTempDest;                          /* Temporary pointer to the destination buffer */
    uint32_t numTaps = S->numTaps;             /* Number of filter coefficients in the filter */
    int32_t blkCnt;
    q63_t acc0, acc1, acc2, acc3;

/* Number of 4-lane coefficient vectors processed in pass 1 (7 * 4 = 28 taps).
   NOTE(review): no matching #undef — this macro leaks into the rest of the
   translation unit. */
#define MAX_VECT_BATCH 7

    /*
     * pre-load 28 1st coefs
     */
    q31x4_t vecCoeffs0 = vld1q(pCoeffs + 4 * 0);
    q31x4_t vecCoeffs1 = vld1q(pCoeffs + 4 * 1);
    q31x4_t vecCoeffs2 = vld1q(pCoeffs + 4 * 2);
    q31x4_t vecCoeffs3 = vld1q(pCoeffs + 4 * 3);
    q31x4_t vecCoeffs4 = vld1q(pCoeffs + 4 * 4);
    q31x4_t vecCoeffs5 = vld1q(pCoeffs + 4 * 5);
    q31x4_t vecCoeffs6 = vld1q(pCoeffs + 4 * 6);

    /*
     * pState points to state array which contains previous frame (numTaps - 1) samples
     * pStateCur points to the location where the new input data should be written
     */
    pStateCur = &(pState[(numTaps - 1u)]);
    pTempSrc = pSrc;
    pSamples = pState;

    /* Partial (q63) accumulators are stored at the start of S->pState.
       NOTE(review): this aliases a q31_t buffer as q63_t — assumes the state
       buffer is 8-byte aligned; confirm the allocation guarantees this. */
    q63_t *arm_fir_partial_accu_ptr = (q63_t*)S->pState;

    /* Pass 1: 4 outputs per iteration, first 28 taps only. */
    blkCnt = blockSize >> 2;
    while (blkCnt > 0) {
        /*
         * Save 4 input samples in the history buffer
         */
        vstrwq_s32(pStateCur, vldrwq_s32(pTempSrc));
        pStateCur += 4;
        pTempSrc += 4;

        const q31_t *pSmp;
        q31x4_t vecIn0;

        pSmp = &pSamples[0];

        vecIn0 = vld1q(pSmp);
        acc0 = vrmlaldavhq(vecIn0, vecCoeffs0);
        vecIn0 = vld1q(pSmp + 4 * 1);
        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs1);
        vecIn0 = vld1q(pSmp + 4 * 2);
        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs2);
        vecIn0 = vld1q(pSmp + 4 * 3);
        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs3);
        vecIn0 = vld1q(pSmp + 4 * 4);
        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs4);
        vecIn0 = vld1q(pSmp + 4 * 5);
        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs5);
        vecIn0 = vld1q(pSmp + 4 * 6);
        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs6);

        *arm_fir_partial_accu_ptr++ = acc0;

        pSmp = &pSamples[1];

        vecIn0 = vld1q(pSmp);
        acc1 = vrmlaldavhq(vecIn0, vecCoeffs0);
        vecIn0 = vld1q(pSmp + 4 * 1);
        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs1);
        vecIn0 = vld1q(pSmp + 4 * 2);
        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs2);
        vecIn0 = vld1q(pSmp + 4 * 3);
        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs3);
        vecIn0 = vld1q(pSmp + 4 * 4);
        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs4);
        vecIn0 = vld1q(pSmp + 4 * 5);
        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs5);
        vecIn0 = vld1q(pSmp + 4 * 6);
        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs6);

        *arm_fir_partial_accu_ptr++ = acc1;

        pSmp = &pSamples[2];

        vecIn0 = vld1q(pSmp);
        acc2 = vrmlaldavhq(vecIn0, vecCoeffs0);
        vecIn0 = vld1q(pSmp + 4 * 1);
        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs1);
        vecIn0 = vld1q(pSmp + 4 * 2);
        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs2);
        vecIn0 = vld1q(pSmp + 4 * 3);
        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs3);
        vecIn0 = vld1q(pSmp + 4 * 4);
        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs4);
        vecIn0 = vld1q(pSmp + 4 * 5);
        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs5);
        vecIn0 = vld1q(pSmp + 4 * 6);
        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs6);
        *arm_fir_partial_accu_ptr++ = acc2;

        pSmp = &pSamples[3];

        vecIn0 = vld1q(pSmp);
        acc3 = vrmlaldavhq(vecIn0, vecCoeffs0);
        vecIn0 = vld1q(pSmp + 4 * 1);
        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs1);
        vecIn0 = vld1q(pSmp + 4 * 2);
        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs2);
        vecIn0 = vld1q(pSmp + 4 * 3);
        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs3);
        vecIn0 = vld1q(pSmp + 4 * 4);
        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs4);
        vecIn0 = vld1q(pSmp + 4 * 5);
        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs5);
        vecIn0 = vld1q(pSmp + 4 * 6);
        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs6);

        *arm_fir_partial_accu_ptr++ = acc3;

        pSamples += 4;
        /*
         * Decrement the sample block loop counter
         */
        blkCnt--;
    }


    /* remainder: pass 2 folds in the last 4 coefficients (taps 29..32) */

    /* load last 4 coef */
    vecCoeffs0 = vld1q(pCoeffs + 4 * MAX_VECT_BATCH);
    arm_fir_partial_accu_ptr = (q63_t*)S->pState;   /* rewind to first partial sum */
    pOutput = pDst;
    pSamples = pState + (MAX_VECT_BATCH * 4);       /* samples aligned with tap 29 */


    blkCnt = blockSize >> 2;
    while (blkCnt > 0) {
        q31x4_t vecIn0;

        /* reload intermediate MAC */
        acc0 = *arm_fir_partial_accu_ptr++;
        acc1 = *arm_fir_partial_accu_ptr++;
        acc2 = *arm_fir_partial_accu_ptr++;
        acc3 = *arm_fir_partial_accu_ptr++;


        vecIn0 = vld1q(&pSamples[0]);
        acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs0);

        vecIn0 = vld1q(&pSamples[1]);
        acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs0);

        vecIn0 = vld1q(&pSamples[2]);
        acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs0);

        vecIn0 = vld1q(&pSamples[3]);
        acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs0);

        /* vrmlaldavh keeps the high part of the 2.62 products (.54 result);
           shifting right by 23 yields the 1.31 output. */
        *pOutput++ = asrl(acc0, 23);
        *pOutput++ = asrl(acc1, 23);
        *pOutput++ = asrl(acc2, 23);
        *pOutput++ = asrl(acc3, 23);

        pSamples += 4;
        /*
         * Decrement the sample block loop counter
         */
        blkCnt--;
    }

    /*
     * Copy the samples back into the history buffer start
     * (tail-predicated 4-wide copy of the numTaps - 1 retained samples)
     */
    pTempSrc = &pState[blockSize];
    pTempDest = pState;

    blkCnt = numTaps - 1;
    do {
        mve_pred16_t p = vctp32q(blkCnt);

        vstrwq_p_s32(pTempDest, vldrwq_z_s32(pTempSrc, p), p);
        pTempSrc += 4;
        pTempDest += 4;
        blkCnt -= 4;
    }
    while (blkCnt > 0);
}
+
+
+
/**
  @brief         Processing function for Q31 FIR filter (Helium/MVE build).
  @param[in]     S          points to an instance of the Q31 FIR filter structure
  @param[in]     pSrc       points to the block of input data
  @param[out]    pDst       points to the block of output data
  @param[in]     blockSize  number of samples to process

  Filters with up to 32 taps are dispatched to specialized kernels; the
  generic path below handles numTaps > 32 (and the 29..32-tap case when
  blockSize < 32). It computes 4 outputs per iteration with a switch-based
  residual for the trailing 1..3 samples.

  NOTE(review): tapsBlkCnt rounds numTaps up to a multiple of 4, so the inner
  loop reads up to 3 coefficients past numTaps — presumably the coefficient
  array is zero-padded for MVE; confirm against arm_fir_init_q31.
 */
void arm_fir_q31(
  const arm_fir_instance_q31 * S,
  const q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
    /* The first 2*ROUND_UP(blockSize, 4) q31 slots of S->pState are reserved
       as scratch for the 29..32-tap kernel; the FIR state starts after it. */
    q31_t *pRefStatePtr = S->pState + 2*ROUND_UP(blockSize, 4);
    q31_t *pState = pRefStatePtr;              /* State pointer */
    const q31_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
    q31_t *pStateCur;                          /* Points to the current sample of the state */
    const q31_t *pSamples;                     /* Temporary pointer to the sample buffer */
    q31_t *pOutput;                            /* Temporary pointer to the output buffer */
    const q31_t *pTempSrc;                     /* Temporary pointer to the source data */
    q31_t *pTempDest;                          /* Temporary pointer to the destination buffer */
    uint32_t numTaps = S->numTaps;             /* Number of filter coefficients in the filter */
    uint32_t blkCnt;
    q31x4_t vecIn0;
    uint32_t tapsBlkCnt = (numTaps + 3) / 4;   /* coefficient vectors, rounded up */
    q63_t acc0, acc1, acc2, acc3;
    q31x4_t vecCoeffs;


    /*
     * [1 to 32 taps] specialized routines
     */
    if (numTaps <= 4)
    {
        arm_fir_q31_1_4_mve(S, pSrc, pDst, blockSize);
        return;
    }
    else if (numTaps <= 8)
    {
        arm_fir_q31_5_8_mve(S, pSrc, pDst, blockSize);
        return;
    }
    else if (numTaps <= 12)
    {
        arm_fir_q31_9_12_mve(S, pSrc, pDst, blockSize);
        return;
    }
    else if (numTaps <= 16)
    {
        arm_fir_q31_13_16_mve(S, pSrc, pDst, blockSize);
        return;
    }
    else if (numTaps <= 20)
    {
        arm_fir_q31_17_20_mve(S, pSrc, pDst, blockSize);
        return;
    }
    else if (numTaps <= 24)
    {
        arm_fir_q31_21_24_mve(S, pSrc, pDst, blockSize);
        return;
    }
    else if (numTaps <= 28)
    {
        arm_fir_q31_25_28_mve(S, pSrc, pDst, blockSize);
        return;
    }
    else if ((numTaps <= 32) && (blockSize >= 32))
    {
        /* two-pass kernel; only profitable/valid for large enough blocks */
        arm_fir_q31_29_32_mve(S, pSrc, pDst, blockSize);
        return;
    }

    /*
     * pState points to state array which contains previous frame (numTaps - 1) samples
     * pStateCur points to the location where the new input data should be written
     */
    pStateCur = &(pState[(numTaps - 1u)]);
    pSamples = pState;
    pTempSrc = pSrc;
    pOutput = pDst;
    blkCnt = blockSize >> 2;
    while (blkCnt > 0)
    {
        const q31_t *pCoeffsTmp = pCoeffs;
        const q31_t *pSamplesTmp = pSamples;

        acc0 = 0LL;
        acc1 = 0LL;
        acc2 = 0LL;
        acc3 = 0LL;

        /*
         * Save 4 input samples in the history buffer
         */
        vst1q(pStateCur, vld1q(pTempSrc));
        pStateCur += 4;
        pTempSrc += 4;

        int i = tapsBlkCnt;
        while (i > 0)
        {
            /*
             * load 4 coefs
             */
            vecCoeffs = *(q31x4_t *) pCoeffsTmp;

            vecIn0 = vld1q(pSamplesTmp);
            acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);

            vecIn0 = vld1q(&pSamplesTmp[1]);
            acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);

            vecIn0 = vld1q(&pSamplesTmp[2]);
            acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);

            vecIn0 = vld1q(&pSamplesTmp[3]);
            acc3 = vrmlaldavhaq(acc3, vecIn0, vecCoeffs);

            pSamplesTmp += 4;
            pCoeffsTmp += 4;
            /*
             * Decrement the taps block loop counter
             */
            i--;
        }

        /* .54-> .31 conversion and store accumulators */
        acc0 = asrl(acc0, 23);
        acc1 = asrl(acc1, 23);
        acc2 = asrl(acc2, 23);
        acc3 = asrl(acc3, 23);

        *pOutput++ = (q31_t) acc0;
        *pOutput++ = (q31_t) acc1;
        *pOutput++ = (q31_t) acc2;
        *pOutput++ = (q31_t) acc3;

        pSamples += 4;

        /*
         * Decrement the sample block loop counter
         */
        blkCnt--;
    }

    /* Residual 1..3 output samples.
       NOTE(review): each residual case still copies a full 4-sample vector
       from pTempSrc into the state buffer, i.e. it reads up to 3 samples
       past the caller's input block and writes up to 3 slots past the
       blockSize region of the state buffer — presumably tolerated by the
       over-allocated buffers; confirm the buffer sizing guarantees. */
    int32_t residual = blockSize & 3;
    switch (residual)
    {
      case 3:
      {
          const q31_t *pCoeffsTmp = pCoeffs;
          const q31_t *pSamplesTmp = pSamples;

          acc0 = 0LL;
          acc1 = 0LL;
          acc2 = 0LL;

          /*
           * Save 4 input samples in the history buffer
           */
          *(q31x4_t *) pStateCur = *(q31x4_t *) pTempSrc;
          pStateCur += 4;
          pTempSrc += 4;

          int i = tapsBlkCnt;
          while (i > 0)
          {
              vecCoeffs = *(q31x4_t *) pCoeffsTmp;

              vecIn0 = vld1q(pSamplesTmp);
              acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);

              vecIn0 = vld1q(&pSamplesTmp[1]);
              acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);

              vecIn0 = vld1q(&pSamplesTmp[2]);
              acc2 = vrmlaldavhaq(acc2, vecIn0, vecCoeffs);

              pSamplesTmp += 4;
              pCoeffsTmp += 4;
              i--;
          }

          acc0 = asrl(acc0, 23);
          acc1 = asrl(acc1, 23);
          acc2 = asrl(acc2, 23);

          *pOutput++ = (q31_t) acc0;
          *pOutput++ = (q31_t) acc1;
          *pOutput++ = (q31_t) acc2;
      }
      break;

      case 2:
      {
          const q31_t *pCoeffsTmp = pCoeffs;
          const q31_t *pSamplesTmp = pSamples;

          acc0 = 0LL;
          acc1 = 0LL;

          /*
           * Save 4 input samples in the history buffer
           */
          vst1q(pStateCur, vld1q(pTempSrc));
          pStateCur += 4;
          pTempSrc += 4;

          int i = tapsBlkCnt;
          while (i > 0)
          {
              vecCoeffs = *(q31x4_t *) pCoeffsTmp;

              vecIn0 = vld1q(pSamplesTmp);
              acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);

              vecIn0 = vld1q(&pSamplesTmp[1]);
              acc1 = vrmlaldavhaq(acc1, vecIn0, vecCoeffs);

              pSamplesTmp += 4;
              pCoeffsTmp += 4;
              i--;
          }

          acc0 = asrl(acc0, 23);
          acc1 = asrl(acc1, 23);

          *pOutput++ = (q31_t) acc0;
          *pOutput++ = (q31_t) acc1;
      }
      break;

      case 1:
      {
          const q31_t *pCoeffsTmp = pCoeffs;
          const q31_t *pSamplesTmp = pSamples;

          acc0 = 0LL;

          /*
           * Save 4 input samples in the history buffer
           */
          vst1q(pStateCur, vld1q(pTempSrc));
          pStateCur += 4;
          pTempSrc += 4;

          int i = tapsBlkCnt;
          while (i > 0)
          {
              vecCoeffs = *(q31x4_t *) pCoeffsTmp;

              vecIn0 = vld1q(pSamplesTmp);
              acc0 = vrmlaldavhaq(acc0, vecIn0, vecCoeffs);

              pSamplesTmp += 4;
              pCoeffsTmp += 4;
              i--;
          }

          acc0 = asrl(acc0, 23);

          *pOutput++ = (q31_t) acc0;
      }
      break;
    }

    /*
     * Copy the samples back into the history buffer start
     */
    pTempSrc = &pState[blockSize];
    pTempDest = pState;

    blkCnt = (numTaps - 1U) >> 2;
    while (blkCnt > 0)
    {
        vst1q(pTempDest, vld1q(pTempSrc));
        pTempSrc += 4;
        pTempDest += 4;
        blkCnt--;
    }
    blkCnt = (numTaps - 1U) & 3;
    if (blkCnt > 0)
    {
        /* predicated store for the last 1..3 state samples */
        mve_pred16_t p0 = vctp32q(blkCnt);
        vstrwq_p_s32(pTempDest, vld1q(pTempSrc), p0);
    }
}
+
+#else
/**
  @brief         Processing function for Q31 FIR filter (scalar build).
  @param[in]     S          points to an instance of the Q31 FIR filter structure
  @param[in]     pSrc       points to the block of input data
  @param[out]    pDst       points to the block of output data
  @param[in]     blockSize  number of samples to process
 */
void arm_fir_q31(
  const arm_fir_instance_q31 * S,
  const q31_t * pSrc,
  q31_t * pDst,
  uint32_t blockSize)
{
    q31_t *pState = S->pState;                 /* State pointer */
    const q31_t *pCoeffs = S->pCoeffs;         /* Coefficient pointer */
    q31_t *pStateCurnt;                        /* Points to the current sample of the state */
    q31_t *px;                                 /* Temporary pointer for state buffer */
    const q31_t *pb;                           /* Temporary pointer for coefficient buffer */
    q63_t acc0;                                /* Accumulator */
    uint32_t numTaps = S->numTaps;             /* Number of filter coefficients in the filter */
    uint32_t i, tapCnt, blkCnt;                /* Loop counters */

#if defined (ARM_MATH_LOOPUNROLL)
    q63_t acc1, acc2;                          /* Accumulators */
    q31_t x0, x1, x2;                          /* Temporary variables to hold state values */
    q31_t c0;                                  /* Temporary variable to hold coefficient value */
#endif

    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
    /* pStateCurnt points to the location where the new input data should be written */
    pStateCurnt = &(S->pState[(numTaps - 1U)]);

#if defined (ARM_MATH_LOOPUNROLL)

    /* Loop unrolling: Compute 3 output values simultaneously.
     * The variables acc0 ... acc2 hold output values that are being computed:
     *
     *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
     *    acc1 =  b[numTaps-1] * x[n-numTaps]   + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
     *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps]   + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
     */

    blkCnt = blockSize / 3;

    while (blkCnt > 0U)
    {
        /* Copy 3 new input samples into the state buffer. */
        *pStateCurnt++ = *pSrc++;
        *pStateCurnt++ = *pSrc++;
        *pStateCurnt++ = *pSrc++;

        /* Set all accumulators to zero */
        acc0 = 0;
        acc1 = 0;
        acc2 = 0;

        /* Initialize state pointer */
        px = pState;

        /* Initialize coefficient pointer */
        pb = pCoeffs;

        /* Read the first 2 samples from the state buffer:  x[n-numTaps], x[n-numTaps-1] */
        x0 = *px++;
        x1 = *px++;

        /* Loop unrolling: process 3 taps at a time. */
        tapCnt = numTaps / 3;

        while (tapCnt > 0U)
        {
            /* Read the b[numTaps] coefficient */
            c0 = *pb;

            /* Read x[n-numTaps-2] sample */
            x2 = *(px++);

            /* Perform the multiply-accumulates */
            acc0 += ((q63_t) x0 * c0);
            acc1 += ((q63_t) x1 * c0);
            acc2 += ((q63_t) x2 * c0);

            /* Read the coefficient and state */
            c0 = *(pb + 1U);
            x0 = *(px++);

            /* Perform the multiply-accumulates */
            acc0 += ((q63_t) x1 * c0);
            acc1 += ((q63_t) x2 * c0);
            acc2 += ((q63_t) x0 * c0);

            /* Read the coefficient and state */
            c0 = *(pb + 2U);
            x1 = *(px++);

            /* update coefficient pointer */
            pb += 3U;

            /* Perform the multiply-accumulates */
            acc0 += ((q63_t) x2 * c0);
            acc1 += ((q63_t) x0 * c0);
            acc2 += ((q63_t) x1 * c0);

            /* Decrement loop counter */
            tapCnt--;
        }

        /* Loop unrolling: Compute remaining taps (numTaps % 3) */
        tapCnt = numTaps % 0x3U;

        while (tapCnt > 0U)
        {
            /* Read coefficients */
            c0 = *(pb++);

            /* Fetch 1 state variable */
            x2 = *(px++);

            /* Perform the multiply-accumulates */
            acc0 += ((q63_t) x0 * c0);
            acc1 += ((q63_t) x1 * c0);
            acc2 += ((q63_t) x2 * c0);

            /* Reuse the present sample states for next sample */
            x0 = x1;
            x1 = x2;

            /* Decrement loop counter */
            tapCnt--;
        }

        /* Advance the state pointer by 3 to process the next group of 3 samples */
        pState = pState + 3;

        /* The result is in 2.62 format. Convert to 1.31 (discarding the low
           31 bits; no saturation) and store in destination buffer. */
        *pDst++ = (q31_t) (acc0 >> 31U);
        *pDst++ = (q31_t) (acc1 >> 31U);
        *pDst++ = (q31_t) (acc2 >> 31U);

        /* Decrement loop counter */
        blkCnt--;
    }

    /* Loop unrolling: Compute remaining output samples */
    blkCnt = blockSize % 0x3U;

#else

    /* Initialize blkCnt with the number of output samples */
    blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    while (blkCnt > 0U)
    {
        /* Copy one sample at a time into state buffer */
        *pStateCurnt++ = *pSrc++;

        /* Set the accumulator to zero */
        acc0 = 0;

        /* Initialize state pointer */
        px = pState;

        /* Initialize Coefficient pointer */
        pb = pCoeffs;

        i = numTaps;

        /* Perform the multiply-accumulates */
        do
        {
            /* acc =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
            acc0 += (q63_t) *px++ * *pb++;

            i--;
        } while (i > 0U);

        /* Result is in 2.62 format. Convert to 1.31 and store in destination buffer. */
        *pDst++ = (q31_t) (acc0 >> 31U);

        /* Advance state pointer by 1 for the next sample */
        pState = pState + 1U;

        /* Decrement loop counter */
        blkCnt--;
    }

    /* Processing is complete.
       Now copy the last numTaps - 1 samples to the start of the state buffer.
       This prepares the state buffer for the next function call. */

    /* Points to the start of the state buffer */
    pStateCurnt = S->pState;

#if defined (ARM_MATH_LOOPUNROLL)

    /* Loop unrolling: Copy 4 state values at a time */
    tapCnt = (numTaps - 1U) >> 2U;

    /* Copy data */
    while (tapCnt > 0U)
    {
        *pStateCurnt++ = *pState++;
        *pStateCurnt++ = *pState++;
        *pStateCurnt++ = *pState++;
        *pStateCurnt++ = *pState++;

        /* Decrement loop counter */
        tapCnt--;
    }

    /* Calculate remaining number of copies */
    tapCnt = (numTaps - 1U) % 0x4U;

#else

    /* Initialize tapCnt with number of taps */
    tapCnt = (numTaps - 1U);

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    /* Copy remaining data */
    while (tapCnt > 0U)
    {
        *pStateCurnt++ = *pState++;

        /* Decrement loop counter */
        tapCnt--;
    }

}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of FIR group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q7.c
new file mode 100644
index 0000000000..c05fa32146
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_q7.c
@@ -0,0 +1,714 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_q7.c
+ * Description: Q7 FIR filter processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR
+ @{
+ */
+
+/**
+ @brief Processing function for Q7 FIR filter.
+ @param[in] S points to an instance of the Q7 FIR filter structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
+ The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ The accumulator is converted to 18.7 format by discarding the low 7 bits.
+ Finally, the result is saturated to 1.7 format.
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
/* Computes 'nbAcc' consecutive FIR outputs from 'pSample' (one per start
   offset j), using 'nbVecTaps' pre-loaded 16-lane q7 coefficient vectors.
   Each 32-bit accumulator (18.14 format) is shifted down by 7 and saturated
   to 1.7 before being stored through 'pOutput'.
   NOTE(review): 'acc' is a 4-element array but only acc[j] is touched per
   iteration, so a scalar would suffice; nbAcc must be <= 4. 'vecIn0' is
   expected to be declared by the enclosing scope (FIR_Q7_MAIN_CORE). */
#define FIR_Q7_CORE(pOutput, nbAcc, nbVecTaps, pSample, vecCoeffs)            \
        for (int j = 0; j < nbAcc; j++) {                                     \
            const q7_t    *pSmp = &pSample[j];                                \
            q31_t          acc[4];                                            \
                                                                              \
            acc[j] = 0;                                                       \
            for (int i = 0; i < nbVecTaps; i++) {                             \
                vecIn0 = vld1q(pSmp + 16 * i);                                \
                acc[j] = vmladavaq(acc[j], vecIn0, vecCoeffs[i]);             \
            }                                                                 \
            *pOutput++ = (q7_t) __SSAT((acc[j] >> 7U), 8);                    \
        }
+
/* Shared body for the fixed-tap-count q7 MVE kernels. Expands inside a
   function that provides S / pSrc / pDst / blockSize, with NBTAPS and
   NBVECTAPS (#defined by the caller) selecting the coefficient vector count.
   Pre-loads all coefficients, then produces 4 outputs per loop iteration via
   FIR_Q7_CORE, handles the 0..3-sample residual, and finally slides the last
   (numTaps - 1) samples back to the start of the history buffer.
   NOTE(review): coefficients are read in NBVECTAPS full vectors, so the
   coefficient array is presumably zero-padded up to NBTAPS — confirm against
   arm_fir_init_q7 requirements for MVE. */
#define FIR_Q7_MAIN_CORE()                                                   \
{                                                                            \
    q7_t          *pState = S->pState;     /* State pointer */               \
    const q7_t    *pCoeffs = S->pCoeffs;   /* Coefficient pointer */         \
    q7_t          *pStateCur;        /* Points to the current sample of the state */ \
    const q7_t    *pSamples;         /* Temporary pointer to the sample buffer */    \
    q7_t          *pOutput;          /* Temporary pointer to the output buffer */    \
    const q7_t    *pTempSrc;         /* Temporary pointer to the source data */      \
    q7_t          *pTempDest;        /* Temporary pointer to the destination buffer */ \
    uint32_t       numTaps = S->numTaps; /* Number of filter coefficients in the filter */ \
    int32_t        blkCnt;                                                   \
    q7x16_t        vecIn0;                                                   \
                                                                             \
    /*                                                                       \
     * load coefs                                                            \
     */                                                                      \
    q7x16_t         vecCoeffs[NBVECTAPS];                                    \
                                                                             \
    for (int i = 0; i < NBVECTAPS; i++)                                      \
        vecCoeffs[i] = vldrbq_s8(pCoeffs + 16 * i);                          \
                                                                             \
    /*                                                                       \
     * pState points to state array which contains previous frame (numTaps - 1) samples \
     * pStateCur points to the location where the new input data should be written \
     */                                                                      \
    pStateCur = &(pState[(numTaps - 1u)]);                                   \
    pTempSrc = pSrc;                                                         \
    pSamples = pState;                                                       \
    pOutput = pDst;                                                          \
                                                                             \
    blkCnt = blockSize >> 2;                                                 \
    while (blkCnt > 0) {                                                     \
        /*                                                                   \
         * Save 4 input samples in the history buffer                        \
         * (widening byte load / narrowing store moves exactly 4 q7 values)  \
         */                                                                  \
        vstrbq_s32(pStateCur, vldrbq_s32(pTempSrc));                         \
        pStateCur += 4;                                                      \
        pTempSrc += 4;                                                       \
                                                                             \
        FIR_Q7_CORE(pOutput, 4, NBVECTAPS, pSamples, vecCoeffs);             \
        pSamples += 4;                                                       \
                                                                             \
        blkCnt--;                                                            \
    }                                                                        \
                                                                             \
    /* tail: 0..3 remaining samples (FIR_Q7_CORE is a no-op when residual == 0) */ \
    int32_t        residual = blockSize & 3;                                 \
                                                                             \
    for (int i = 0; i < residual; i++)                                       \
        *pStateCur++ = *pTempSrc++;                                          \
                                                                             \
    FIR_Q7_CORE(pOutput, residual, NBVECTAPS, pSamples, vecCoeffs);          \
                                                                             \
                                                                             \
    /*                                                                       \
     * Copy the samples back into the history buffer start                   \
     * (tail-predicated 16-wide copy of the numTaps - 1 retained samples)    \
     */                                                                      \
    pTempSrc = &pState[blockSize];                                           \
    pTempDest = pState;                                                      \
    blkCnt = numTaps - 1;                                                    \
    do {                                                                     \
        mve_pred16_t    p = vctp8q(blkCnt);                                  \
                                                                             \
        vstrbq_p_s8(pTempDest, vldrbq_z_s8(pTempSrc, p), p);                 \
        pTempSrc += 16;                                                      \
        pTempDest += 16;                                                     \
        blkCnt -= 16;                                                        \
    }                                                                        \
    while (blkCnt > 0);                                                      \
}
+
+
/* Specialized Q7 FIR kernel for filters with 49 to 64 taps (Helium/MVE).
   Tap count rounded up to 64 (4 coefficient vectors of 16). */
static void arm_fir_q7_49_64_mve(const arm_fir_instance_q7 * S,
  const q7_t * __restrict pSrc,
  q7_t * __restrict pDst, uint32_t blockSize)
{
    #define NBTAPS 64
    #define NBVECTAPS (NBTAPS / 16)   /* number of 16-lane coefficient vectors */
    FIR_Q7_MAIN_CORE();
    #undef NBVECTAPS
    #undef NBTAPS
}
+
+
/* Specialized Q7 FIR kernel for filters with 33 to 48 taps (Helium/MVE).
   Tap count rounded up to 48 (3 coefficient vectors of 16).
   NOTE(review): unlike its sibling kernels this one is not declared
   'static' — presumably an oversight; confirm no external caller relies
   on the symbol before internalizing it. */
void arm_fir_q7_33_48_mve(const arm_fir_instance_q7 * S,
  const q7_t * __restrict pSrc,
  q7_t * __restrict pDst, uint32_t blockSize)
{
    #define NBTAPS 48
    #define NBVECTAPS (NBTAPS / 16)   /* number of 16-lane coefficient vectors */
    FIR_Q7_MAIN_CORE();
    #undef NBVECTAPS
    #undef NBTAPS
}
+
/* Specialized Q7 FIR kernel for filters with 17 to 32 taps (Helium/MVE).
   Tap count rounded up to 32 (2 coefficient vectors of 16). */
static void arm_fir_q7_17_32_mve(const arm_fir_instance_q7 * S,
  const q7_t * __restrict pSrc,
  q7_t * __restrict pDst, uint32_t blockSize)
{
    #define NBTAPS 32
    #define NBVECTAPS (NBTAPS / 16)   /* number of 16-lane coefficient vectors */
    FIR_Q7_MAIN_CORE();
    #undef NBVECTAPS
    #undef NBTAPS
}
+
+
/* Specialized Q7 FIR kernel for filters with 1 to 16 taps (Helium/MVE).
   Tap count rounded up to 16 (a single coefficient vector).
   NOTE(review): not declared 'static', unlike most sibling kernels —
   presumably an oversight; confirm before internalizing the symbol. */
void arm_fir_q7_1_16_mve(const arm_fir_instance_q7 * S,
  const q7_t * __restrict pSrc,
  q7_t * __restrict pDst, uint32_t blockSize)
{
    #define NBTAPS 16
    #define NBVECTAPS (NBTAPS / 16)   /* number of 16-lane coefficient vectors */
    FIR_Q7_MAIN_CORE();
    #undef NBVECTAPS
    #undef NBTAPS
}
+
/**
  @brief         Processing function for Q7 FIR filter (Helium/MVE build).
  @param[in]     S          points to an instance of the Q7 FIR filter structure
  @param[in]     pSrc       points to the block of input data
  @param[out]    pDst       points to the block of output data
  @param[in]     blockSize  number of samples to process

  Filters with up to 64 taps are dispatched to specialized kernels; the
  generic path below handles numTaps > 64, computing 4 outputs per iteration
  with a switch-based residual for the trailing 1..3 samples.

  NOTE(review): the main loop copies 16 input samples into the history
  buffer per iteration (pTempSrc += 16) while producing only 4 outputs and
  advancing pSamples by 4 — for blockSize > 4 this walks pTempSrc well past
  the input block. The residual cases also read acc1/acc2 inputs at sample
  offsets [4]/[8] where the main loop uses [1]/[2]/[3]. Both look wrong for
  consecutive-sample FIR outputs; verify against the upstream CMSIS-DSP
  sources before relying on this path (it is only reached for numTaps > 64).
 */
void arm_fir_q7(
  const arm_fir_instance_q7 * S,
  const q7_t * pSrc,
  q7_t * pDst,
  uint32_t blockSize)
{
    q7_t *pState = S->pState;              /* State pointer */
    const q7_t *pCoeffs = S->pCoeffs;      /* Coefficient pointer */
    q7_t *pStateCur;                       /* Points to the current sample of the state */
    const q7_t *pSamples;                  /* Temporary pointer to the sample buffer */
    q7_t *pOutput;                         /* Temporary pointer to the output buffer */
    const q7_t *pTempSrc;                  /* Temporary pointer to the source data */
    q7_t *pTempDest;                       /* Temporary pointer to the destination buffer */
    uint32_t numTaps = S->numTaps;         /* Number of filter coefficients in the filter */
    uint32_t blkCnt;
    q7x16_t vecIn0;
    uint32_t tapsBlkCnt = (numTaps + 15) / 16;   /* coefficient vectors, rounded up */
    q31_t acc0, acc1, acc2, acc3;
    q7x16_t vecCoeffs;

    if (numTaps <= 16)
    {
        /*
         * [1 to 16 taps] specialized routine
         */
        arm_fir_q7_1_16_mve(S, pSrc, pDst, blockSize);
        return;
    }
    else if (numTaps <= 32)
    {
        /*
         * [17 to 32 taps] specialized routine
         */
        arm_fir_q7_17_32_mve(S, pSrc, pDst, blockSize);
        return;
    }
    else if (numTaps <= 48)
    {
        /*
         * [33 to 48 taps] specialized routine
         */
        arm_fir_q7_33_48_mve(S, pSrc, pDst, blockSize);
        return;
    }
    else if (numTaps <= 64)
    {
        /*
         * [49 to 64 taps] specialized routine
         */
        arm_fir_q7_49_64_mve(S, pSrc, pDst, blockSize);
        return;
    }

    /*
     * pState points to state array which contains previous frame (numTaps - 1) samples
     * pStateCur points to the location where the new input data should be written
     */
    pStateCur = &(pState[(numTaps - 1u)]);
    pSamples = pState;
    pTempSrc = pSrc;
    pOutput = pDst;
    blkCnt = blockSize >> 2;

    /*
     * outer samples loop
     */
    while (blkCnt > 0U)
    {
        const q7_t *pCoeffsTmp = pCoeffs;
        const q7_t *pSamplesTmp = pSamples;

        acc0 = 0;
        acc1 = 0;
        acc2 = 0;
        acc3 = 0;
        /*
         * Save 16 input samples in the history buffer
         * NOTE(review): see function header — only 4 outputs are produced
         * per iteration, so this 16-sample advance looks inconsistent.
         */
        vst1q(pStateCur, vld1q(pTempSrc));
        pStateCur += 16;
        pTempSrc += 16;

        /*
         * inner coefficients loop
         */
        int i = tapsBlkCnt;
        while (i > 0)
        {
            /*
             * load 16 coefs
             */
            vecCoeffs = *(q7x16_t *) pCoeffsTmp;

            vecIn0 = vld1q(pSamplesTmp);
            acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);

            vecIn0 = vld1q(&pSamplesTmp[1]);
            acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);

            vecIn0 = vld1q(&pSamplesTmp[2]);
            acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);

            vecIn0 = vld1q(&pSamplesTmp[3]);
            acc3 = vmladavaq(acc3, vecIn0, vecCoeffs);

            pSamplesTmp += 16;
            pCoeffsTmp += 16;
            /*
             * Decrement the taps block loop counter
             */
            i--;
        }
        /*
         * Store the 1.7 format filter output in destination buffer
         * (18.14 accumulator >> 7, saturated to 8 bits)
         */
        *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
        *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
        *pOutput++ = (q7_t) __SSAT((acc2 >> 7U), 8);
        *pOutput++ = (q7_t) __SSAT((acc3 >> 7U), 8);

        pSamples += 4;
        /*
         * Decrement the sample block loop counter
         */
        blkCnt--;
    }

    uint32_t residual = blockSize & 3;
    switch (residual)
    {
      case 3:
      {
          const q7_t *pCoeffsTmp = pCoeffs;
          const q7_t *pSamplesTmp = pSamples;

          acc0 = 0;
          acc1 = 0;
          acc2 = 0;
          /*
           * Save 16 input samples in the history buffer
           */
          vst1q(pStateCur, vld1q(pTempSrc));
          pStateCur += 16;
          pTempSrc += 16;

          int i = tapsBlkCnt;
          while (i > 0)
          {
              vecCoeffs = *(q7x16_t *) pCoeffsTmp;

              vecIn0 = vld1q(pSamplesTmp);
              acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);

              /* NOTE(review): offsets [4]/[8] below disagree with the
                 main loop's [1]/[2] for consecutive outputs — verify. */
              vecIn0 = vld1q(&pSamplesTmp[4]);
              acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);

              vecIn0 = vld1q(&pSamplesTmp[8]);
              acc2 = vmladavaq(acc2, vecIn0, vecCoeffs);

              pSamplesTmp += 16;
              pCoeffsTmp += 16;
              i--;
          }

          *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
          *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
          *pOutput++ = (q7_t) __SSAT((acc2 >> 7U), 8);
      }
      break;

      case 2:
      {
          const q7_t *pCoeffsTmp = pCoeffs;
          const q7_t *pSamplesTmp = pSamples;

          acc0 = 0;
          acc1 = 0;
          /*
           * Save 16 input samples in the history buffer
           */
          vst1q(pStateCur, vld1q(pTempSrc));
          pStateCur += 16;
          pTempSrc += 16;

          int i = tapsBlkCnt;
          while (i > 0)
          {
              vecCoeffs = *(q7x16_t *) pCoeffsTmp;

              vecIn0 = vld1q(pSamplesTmp);
              acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);

              /* NOTE(review): offset [4] — see case 3 note. */
              vecIn0 = vld1q(&pSamplesTmp[4]);
              acc1 = vmladavaq(acc1, vecIn0, vecCoeffs);

              pSamplesTmp += 16;
              pCoeffsTmp += 16;
              i--;
          }

          *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
          *pOutput++ = (q7_t) __SSAT((acc1 >> 7U), 8);
      }
      break;

      case 1:
      {
          const q7_t *pCoeffsTmp = pCoeffs;
          const q7_t *pSamplesTmp = pSamples;

          acc0 = 0;
          /*
           * Save 16 input samples in the history buffer
           */
          vst1q(pStateCur, vld1q(pTempSrc));
          pStateCur += 16;
          pTempSrc += 16;

          int i = tapsBlkCnt;
          while (i > 0)
          {
              vecCoeffs = *(q7x16_t *) pCoeffsTmp;

              vecIn0 = vld1q(pSamplesTmp);
              acc0 = vmladavaq(acc0, vecIn0, vecCoeffs);

              pSamplesTmp += 16;
              pCoeffsTmp += 16;
              i--;
          }
          *pOutput++ = (q7_t) __SSAT((acc0 >> 7U), 8);
      }
      break;
    }

    /*
     * Copy the samples back into the history buffer start
     * NOTE(review): this moves numTaps samples, not the usual numTaps - 1 —
     * harmless only if the state buffer is over-allocated; confirm.
     */
    pTempSrc = &pState[blockSize];
    pTempDest = pState;

    blkCnt = numTaps >> 4;
    while (blkCnt > 0U)
    {
        vst1q(pTempDest, vld1q(pTempSrc));
        pTempSrc += 16;
        pTempDest += 16;
        blkCnt--;
    }
    blkCnt = numTaps & 0xF;
    if (blkCnt > 0U)
    {
        /* predicated store for the last 1..15 state samples */
        mve_pred16_t p0 = vctp8q(blkCnt);
        vstrbq_p_s8(pTempDest, vld1q(pTempSrc), p0);
    }
}
+#else
/**
  @brief         Processing function for Q7 FIR filter (scalar build).
  @param[in]     S          points to an instance of the Q7 FIR filter structure
  @param[in]     pSrc       points to the block of input data
  @param[out]    pDst       points to the block of output data
  @param[in]     blockSize  number of samples to process
 */
void arm_fir_q7(
  const arm_fir_instance_q7 * S,
  const q7_t * pSrc,
  q7_t * pDst,
  uint32_t blockSize)
{
    q7_t *pState = S->pState;              /* State pointer */
    const q7_t *pCoeffs = S->pCoeffs;      /* Coefficient pointer */
    q7_t *pStateCurnt;                     /* Points to the current sample of the state */
    q7_t *px;                              /* Temporary pointer for state buffer */
    const q7_t *pb;                        /* Temporary pointer for coefficient buffer */
    q31_t acc0;                            /* Accumulators */
    uint32_t numTaps = S->numTaps;         /* Number of filter coefficients in the filter */
    uint32_t i, tapCnt, blkCnt;            /* Loop counters */

#if defined (ARM_MATH_LOOPUNROLL)
    q31_t acc1, acc2, acc3;                /* Accumulators */
    q7_t x0, x1, x2, x3, c0;               /* Temporary variables to hold state */
#endif

    /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
    /* pStateCurnt points to the location where the new input data should be written */
    pStateCurnt = &(S->pState[(numTaps - 1U)]);

#if defined (ARM_MATH_LOOPUNROLL)

    /* Loop unrolling: Compute 4 output values simultaneously.
     * The variables acc0 ... acc3 hold output values that are being computed:
     *
     *    acc0 =  b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
     *    acc1 =  b[numTaps-1] * x[n-numTaps]   + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
     *    acc2 =  b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps]   + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
     *    acc3 =  b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps]   +...+ b[0] * x[3]
     */
    blkCnt = blockSize >> 2U;

    while (blkCnt > 0U)
    {
        /* Copy 4 new input samples into the state buffer. */
        *pStateCurnt++ = *pSrc++;
        *pStateCurnt++ = *pSrc++;
        *pStateCurnt++ = *pSrc++;
        *pStateCurnt++ = *pSrc++;

        /* Set all accumulators to zero */
        acc0 = 0;
        acc1 = 0;
        acc2 = 0;
        acc3 = 0;

        /* Initialize state pointer */
        px = pState;

        /* Initialize coefficient pointer */
        pb = pCoeffs;

        /* Read the first 3 samples from the state buffer:
         *  x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
        x0 = *px++;
        x1 = *px++;
        x2 = *px++;

        /* Loop unrolling. Process 4 taps at a time. */
        tapCnt = numTaps >> 2U;

        /* Loop over the number of taps.  Unroll by a factor of 4.
           Repeat until we've computed numTaps-4 coefficients. */
        while (tapCnt > 0U)
        {
            /* Read the b[numTaps] coefficient */
            c0 = *pb;

            /* Read x[n-numTaps-3] sample */
            x3 = *px;

            /* acc0 +=  b[numTaps] * x[n-numTaps] */
            acc0 += ((q15_t) x0 * c0);

            /* acc1 +=  b[numTaps] * x[n-numTaps-1] */
            acc1 += ((q15_t) x1 * c0);

            /* acc2 +=  b[numTaps] * x[n-numTaps-2] */
            acc2 += ((q15_t) x2 * c0);

            /* acc3 +=  b[numTaps] * x[n-numTaps-3] */
            acc3 += ((q15_t) x3 * c0);

            /* Read the b[numTaps-1] coefficient */
            c0 = *(pb + 1U);

            /* Read x[n-numTaps-4] sample */
            x0 = *(px + 1U);

            /* Perform the multiply-accumulates */
            acc0 += ((q15_t) x1 * c0);
            acc1 += ((q15_t) x2 * c0);
            acc2 += ((q15_t) x3 * c0);
            acc3 += ((q15_t) x0 * c0);

            /* Read the b[numTaps-2] coefficient */
            c0 = *(pb + 2U);

            /* Read x[n-numTaps-5] sample */
            x1 = *(px + 2U);

            /* Perform the multiply-accumulates */
            acc0 += ((q15_t) x2 * c0);
            acc1 += ((q15_t) x3 * c0);
            acc2 += ((q15_t) x0 * c0);
            acc3 += ((q15_t) x1 * c0);

            /* Read the b[numTaps-3] coefficients */
            c0 = *(pb + 3U);

            /* Read x[n-numTaps-6] sample */
            x2 = *(px + 3U);

            /* Perform the multiply-accumulates */
            acc0 += ((q15_t) x3 * c0);
            acc1 += ((q15_t) x0 * c0);
            acc2 += ((q15_t) x1 * c0);
            acc3 += ((q15_t) x2 * c0);

            /* update coefficient pointer */
            pb += 4U;
            px += 4U;

            /* Decrement loop counter */
            tapCnt--;
        }

        /* If the filter length is not a multiple of 4, compute the remaining filter taps */
        tapCnt = numTaps % 0x4U;

        while (tapCnt > 0U)
        {
            /* Read coefficients */
            c0 = *(pb++);

            /* Fetch 1 state variable */
            x3 = *(px++);

            /* Perform the multiply-accumulates */
            acc0 += ((q15_t) x0 * c0);
            acc1 += ((q15_t) x1 * c0);
            acc2 += ((q15_t) x2 * c0);
            acc3 += ((q15_t) x3 * c0);

            /* Reuse the present sample states for next sample */
            x0 = x1;
            x1 = x2;
            x2 = x3;

            /* Decrement loop counter */
            tapCnt--;
        }

        /* The results in the 4 accumulators are in 18.14 format. Convert to
           1.7 with saturation, then store the 4 outputs in the destination
           buffer. */
        acc0 = __SSAT((acc0 >> 7U), 8);
        *pDst++ = acc0;
        acc1 = __SSAT((acc1 >> 7U), 8);
        *pDst++ = acc1;
        acc2 = __SSAT((acc2 >> 7U), 8);
        *pDst++ = acc2;
        acc3 = __SSAT((acc3 >> 7U), 8);
        *pDst++ = acc3;

        /* Advance the state pointer by 4 to process the next group of 4 samples */
        pState = pState + 4U;

        /* Decrement loop counter */
        blkCnt--;
    }

    /* Loop unrolling: Compute remaining output samples */
    blkCnt = blockSize % 0x4U;

#else

    /* Initialize blkCnt with the number of output samples */
    blkCnt = blockSize;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    while (blkCnt > 0U)
    {
        /* Copy one sample at a time into state buffer */
        *pStateCurnt++ = *pSrc++;

        /* Set the accumulator to zero */
        acc0 = 0;

        /* Initialize state pointer */
        px = pState;

        /* Initialize Coefficient pointer */
        pb = pCoeffs;

        i = numTaps;

        /* Perform the multiply-accumulates */
        while (i > 0U)
        {
            acc0 += (q15_t) * (px++) * (*(pb++));
            i--;
        }

        /* The result is in 18.14 format. Convert to 1.7 with saturation,
           then store the output in the destination buffer. */
        *pDst++ = __SSAT((acc0 >> 7U), 8);

        /* Advance state pointer by 1 for the next sample */
        pState = pState + 1U;

        /* Decrement loop counter */
        blkCnt--;
    }

    /* Processing is complete.
       Now copy the last numTaps - 1 samples to the start of the state buffer.
       This prepares the state buffer for the next function call. */

    /* Points to the start of the state buffer */
    pStateCurnt = S->pState;

#if defined (ARM_MATH_LOOPUNROLL)

    /* Loop unrolling: Copy 4 state values at a time */
    tapCnt = (numTaps - 1U) >> 2U;

    /* Copy data */
    while (tapCnt > 0U)
    {
        *pStateCurnt++ = *pState++;
        *pStateCurnt++ = *pState++;
        *pStateCurnt++ = *pState++;
        *pStateCurnt++ = *pState++;

        /* Decrement loop counter */
        tapCnt--;
    }

    /* Calculate remaining number of copies */
    tapCnt = (numTaps - 1U) % 0x4U;

#else

    /* Initialize tapCnt with number of taps */
    tapCnt = (numTaps - 1U);

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    /* Copy remaining data */
    while (tapCnt > 0U)
    {
        *pStateCurnt++ = *pState++;

        /* Decrement the loop counter */
        tapCnt--;
    }

}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of FIR group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_f32.c
new file mode 100644
index 0000000000..e6f4d78c84
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_f32.c
@@ -0,0 +1,341 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_sparse_f32.c
+ * Description: Floating-point sparse FIR filter processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @defgroup FIR_Sparse Finite Impulse Response (FIR) Sparse Filters
+
+ This group of functions implements sparse FIR filters.
+ Sparse FIR filters are equivalent to standard FIR filters except that most of the coefficients are equal to zero.
+ Sparse filters are used for simulating reflections in communications and audio applications.
+
+ There are separate functions for Q7, Q15, Q31, and floating-point data types.
+ The functions operate on blocks of input and output data and each call to the function processes
+ blockSize samples through the filter. pSrc and
+ pDst points to input and output arrays respectively containing blockSize values.
+
+ @par Algorithm
+ The sparse filter instance structure contains an array of tap indices pTapDelay which specifies the locations of the non-zero coefficients.
+ This is in addition to the coefficient array b.
+ The implementation essentially skips the multiplications by zero and leads to an efficient realization.
+
+ y[n] = b[0] * x[n-pTapDelay[0]] + b[1] * x[n-pTapDelay[1]] + b[2] * x[n-pTapDelay[2]] + ...+ b[numTaps-1] * x[n-pTapDelay[numTaps-1]]
+
+ @par
+ \image html FIRSparse.gif "Sparse FIR filter. b[n] represents the filter coefficients"
+ @par
+ pCoeffs points to a coefficient array of size numTaps;
+ pTapDelay points to an array of nonzero indices and is also of size numTaps;
+ pState points to a state array of size maxDelay + blockSize, where
+ maxDelay is the largest offset value that is ever used in the pTapDelay array.
+ Some of the processing functions also require temporary working buffers.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient and offset arrays may be shared among several instances while state variable arrays cannot be shared.
+ There are separate instance structure declarations for each of the 4 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ To do this manually without calling the init function, assign the following subfields of the instance structure:
+ numTaps, pCoeffs, pTapDelay, maxDelay, stateIndex, pState. Also set all of the values in pState to zero.
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Set the values in the state buffer to zeros before static initialization.
+ The code below statically initializes each of the 4 different data type filter instance structures
+
+ arm_fir_sparse_instance_f32 S = {numTaps, 0, pState, pCoeffs, maxDelay, pTapDelay};
+ arm_fir_sparse_instance_q31 S = {numTaps, 0, pState, pCoeffs, maxDelay, pTapDelay};
+ arm_fir_sparse_instance_q15 S = {numTaps, 0, pState, pCoeffs, maxDelay, pTapDelay};
+ arm_fir_sparse_instance_q7 S = {numTaps, 0, pState, pCoeffs, maxDelay, pTapDelay};
+
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the fixed-point versions of the sparse FIR filter functions.
+ In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ @addtogroup FIR_Sparse
+ @{
+ */
+
+/**
+ @brief Processing function for the floating-point sparse FIR filter.
+ @param[in] S points to an instance of the floating-point sparse FIR structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] pScratchIn points to a temporary buffer of size blockSize
+ @param[in] blockSize number of input samples to process
+ @return none
+ */
+
+void arm_fir_sparse_f32(
+ arm_fir_sparse_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ float32_t * pScratchIn,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ const float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *px; /* Scratch buffer pointer */
+ float32_t *py = pState; /* Temporary pointers for state buffer */
+ float32_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
+ float32_t *pOut; /* Destination pointer */
+ int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
+ uint32_t delaySize = S->maxDelay + blockSize; /* state length */
+ uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ int32_t readIndex; /* Read index of the state buffer */
+ uint32_t tapCnt, blkCnt; /* loop counters */
+ float32_t coeff = *pCoeffs++; /* Read the first coefficient value */
+
+
+ /* BlockSize of Input samples are copied into the state buffer */
+ /* StateIndex points to the starting position to write in the state buffer */
+ arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1, (int32_t *) pSrc, 1, blockSize);
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if (readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ /* NOTE(review): the int32_t* casts presumably make the circular helpers move raw
+ 32-bit words (bit copies of float32_t) — confirm against arm_circularRead_f32. */
+ arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pOut = pDst;
+
+ /* First tap: products are written straight to pDst ('='); all later taps accumulate ('+='). */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiplications and store in destination buffer */
+ *pOut++ = *px++ * coeff;
+
+ *pOut++ = *px++ * coeff;
+
+ *pOut++ = *px++ * coeff;
+
+ *pOut++ = *px++ * coeff;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiplication and store in destination buffer */
+ *pOut++ = *px++ * coeff;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if (readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Loop over the number of taps. */
+ /* NOTE(review): the unsigned subtraction assumes numTaps >= 2; for numTaps < 2 the
+ count would wrap to a huge value — confirm the instance-init contract guarantees this. */
+ tapCnt = (uint32_t) numTaps - 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pOut = pDst;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ *pOut++ += *px++ * coeff;
+
+ *pOut++ += *px++ * coeff;
+
+ *pOut++ += *px++ * coeff;
+
+ *pOut++ += *px++ * coeff;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ *pOut++ += *px++ * coeff;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if (readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Decrement tap loop counter */
+ tapCnt--;
+ }
+
+ /* Compute last tap without the final read of pTapDelay */
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pOut = pDst;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ *pOut++ += *px++ * coeff;
+ *pOut++ += *px++ * coeff;
+ *pOut++ += *px++ * coeff;
+ *pOut++ += *px++ * coeff;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ *pOut++ += *px++ * coeff;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+
+/**
+ @} end of FIR_Sparse group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_f32.c
new file mode 100644
index 0000000000..c18e0bdce6
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_f32.c
@@ -0,0 +1,93 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_sparse_init_f32.c
+ * Description: Floating-point sparse FIR filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Sparse
+ @{
+ */
+
+/**
+ @brief Initialization function for the floating-point sparse FIR filter.
+ @param[in,out] S points to an instance of the floating-point sparse FIR structure
+ @param[in] numTaps number of nonzero coefficients in the filter
+ @param[in] pCoeffs points to the array of filter coefficients
+ @param[in] pState points to the state buffer
+ @param[in] pTapDelay points to the array of offset times
+ @param[in] maxDelay maximum offset time supported
+ @param[in] blockSize number of samples that will be processed per block
+ @return none
+
+ @par Details
+ pCoeffs holds the filter coefficients and has length numTaps.
+ pState holds the filter's state variables and must be of length
+ maxDelay + blockSize, where maxDelay
+ is the maximum number of delay line values.
+ blockSize is the
+ number of samples processed by the arm_fir_sparse_f32() function.
+ */
+
+void arm_fir_sparse_init_f32(
+ arm_fir_sparse_instance_f32 * S,
+ uint16_t numTaps,
+ const float32_t * pCoeffs,
+ float32_t * pState,
+ int32_t * pTapDelay,
+ uint16_t maxDelay,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign TapDelay pointer */
+ S->pTapDelay = pTapDelay;
+
+ /* Assign MaxDelay */
+ S->maxDelay = maxDelay;
+
+ /* reset the stateIndex to 0 */
+ S->stateIndex = 0U;
+
+ /* Clear state buffer and size is always maxDelay + blockSize */
+ /* Note: blockSize is not stored in the instance; it is used only to size this clear. */
+ memset(pState, 0, (maxDelay + blockSize) * sizeof(float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ @} end of FIR_Sparse group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q15.c
new file mode 100644
index 0000000000..6a48743ed2
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q15.c
@@ -0,0 +1,93 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_sparse_init_q15.c
+ * Description: Q15 sparse FIR filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Sparse
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q15 sparse FIR filter.
+ @param[in,out] S points to an instance of the Q15 sparse FIR structure
+ @param[in] numTaps number of nonzero coefficients in the filter
+ @param[in] pCoeffs points to the array of filter coefficients
+ @param[in] pState points to the state buffer
+ @param[in] pTapDelay points to the array of offset times
+ @param[in] maxDelay maximum offset time supported
+ @param[in] blockSize number of samples that will be processed per block
+ @return none
+
+ @par Details
+ pCoeffs holds the filter coefficients and has length numTaps.
+ pState holds the filter's state variables and must be of length
+ maxDelay + blockSize, where maxDelay
+ is the maximum number of delay line values.
+ blockSize is the
+ number of words processed by arm_fir_sparse_q15() function.
+ */
+
+void arm_fir_sparse_init_q15(
+ arm_fir_sparse_instance_q15 * S,
+ uint16_t numTaps,
+ const q15_t * pCoeffs,
+ q15_t * pState,
+ int32_t * pTapDelay,
+ uint16_t maxDelay,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign TapDelay pointer */
+ S->pTapDelay = pTapDelay;
+
+ /* Assign MaxDelay */
+ S->maxDelay = maxDelay;
+
+ /* reset the stateIndex to 0 */
+ S->stateIndex = 0U;
+
+ /* Clear state buffer and size is always maxDelay + blockSize */
+ /* Note: blockSize is not stored in the instance; it is used only to size this clear. */
+ memset(pState, 0, (maxDelay + blockSize) * sizeof(q15_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ @} end of FIR_Sparse group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q31.c
new file mode 100644
index 0000000000..a333f1cc85
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q31.c
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_sparse_init_q31.c
+ * Description: Q31 sparse FIR filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Sparse
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q31 sparse FIR filter.
+ @param[in,out] S points to an instance of the Q31 sparse FIR structure
+ @param[in] numTaps number of nonzero coefficients in the filter
+ @param[in] pCoeffs points to the array of filter coefficients
+ @param[in] pState points to the state buffer
+ @param[in] pTapDelay points to the array of offset times
+ @param[in] maxDelay maximum offset time supported
+ @param[in] blockSize number of samples that will be processed per block
+ @return none
+
+ @par Details
+ pCoeffs holds the filter coefficients and has length numTaps.
+ pState holds the filter's state variables and must be of length
+ maxDelay + blockSize, where maxDelay
+ is the maximum number of delay line values.
+ blockSize is the number of words processed by arm_fir_sparse_q31() function.
+ */
+
+void arm_fir_sparse_init_q31(
+ arm_fir_sparse_instance_q31 * S,
+ uint16_t numTaps,
+ const q31_t * pCoeffs,
+ q31_t * pState,
+ int32_t * pTapDelay,
+ uint16_t maxDelay,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign TapDelay pointer */
+ S->pTapDelay = pTapDelay;
+
+ /* Assign MaxDelay */
+ S->maxDelay = maxDelay;
+
+ /* reset the stateIndex to 0 */
+ S->stateIndex = 0U;
+
+ /* Clear state buffer and size is always maxDelay + blockSize */
+ /* Note: blockSize is not stored in the instance; it is used only to size this clear. */
+ memset(pState, 0, (maxDelay + blockSize) * sizeof(q31_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ @} end of FIR_Sparse group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q7.c
new file mode 100644
index 0000000000..3e2b9c9a2a
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_init_q7.c
@@ -0,0 +1,93 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_sparse_init_q7.c
+ * Description: Q7 sparse FIR filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Sparse
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q7 sparse FIR filter.
+ @param[in,out] S points to an instance of the Q7 sparse FIR structure
+ @param[in] numTaps number of nonzero coefficients in the filter
+ @param[in] pCoeffs points to the array of filter coefficients
+ @param[in] pState points to the state buffer
+ @param[in] pTapDelay points to the array of offset times
+ @param[in] maxDelay maximum offset time supported
+ @param[in] blockSize number of samples that will be processed per block
+ @return none
+
+ @par Details
+ pCoeffs holds the filter coefficients and has length numTaps.
+ pState holds the filter's state variables and must be of length
+ maxDelay + blockSize, where maxDelay
+ is the maximum number of delay line values.
+ blockSize is the
+ number of samples processed by the arm_fir_sparse_q7() function.
+ */
+
+void arm_fir_sparse_init_q7(
+ arm_fir_sparse_instance_q7 * S,
+ uint16_t numTaps,
+ const q7_t * pCoeffs,
+ q7_t * pState,
+ int32_t * pTapDelay,
+ uint16_t maxDelay,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign TapDelay pointer */
+ S->pTapDelay = pTapDelay;
+
+ /* Assign MaxDelay */
+ S->maxDelay = maxDelay;
+
+ /* reset the stateIndex to 0 */
+ S->stateIndex = 0U;
+
+ /* Clear state buffer and size is always maxDelay + blockSize */
+ /* Note: blockSize is not stored in the instance; it is used only to size this clear. */
+ memset(pState, 0, (maxDelay + blockSize) * sizeof(q7_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ @} end of FIR_Sparse group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c
new file mode 100644
index 0000000000..606692255c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q15.c
@@ -0,0 +1,341 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_sparse_q15.c
+ * Description: Q15 sparse FIR filter processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Sparse
+ @{
+ */
+
+/**
+ @brief Processing function for the Q15 sparse FIR filter.
+ @param[in] S points to an instance of the Q15 sparse FIR structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] pScratchIn points to a temporary buffer of size blockSize
+ @param[in] pScratchOut points to a temporary buffer of size blockSize
+ @param[in] blockSize number of input samples to process per call
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 32-bit accumulator.
+ The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator.
+ Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator.
+ If the accumulator result overflows it will wrap around rather than saturate.
+ After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format.
+ In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
+ */
+
+void arm_fir_sparse_q15(
+ arm_fir_sparse_instance_q15 * S,
+ const q15_t * pSrc,
+ q15_t * pDst,
+ q15_t * pScratchIn,
+ q31_t * pScratchOut,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ const q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *px; /* Temporary pointers for scratch buffer */
+ q15_t *py = pState; /* Temporary pointers for state buffer */
+ q15_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
+ q15_t *pOut = pDst; /* Working pointer for output */
+ int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
+ uint32_t delaySize = S->maxDelay + blockSize; /* state length */
+ uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ int32_t readIndex; /* Read index of the state buffer */
+ uint32_t tapCnt, blkCnt; /* loop counters */
+ q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */
+ q15_t coeff = *pCoeffs++; /* Read the first coefficient value */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t in1, in2; /* Temporary variables */
+#endif
+
+ /* BlockSize of Input samples are copied into the state buffer */
+ /* StateIndex points to the starting position to write in the state buffer */
+ arm_circularWrite_q15(py, (int32_t) delaySize, &S->stateIndex, 1,pSrc, 1, blockSize);
+
+ /* Loop over the number of taps. */
+ tapCnt = numTaps;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if (readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_q15(py, (int32_t) delaySize, &readIndex, 1,
+ pb, pb, (int32_t) blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pScratchOut = pScr2;
+
+ /* First tap: 2.30 products are written to pScratchOut ('='); later taps accumulate ('+='). */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Perform multiplication and store in the scratch buffer */
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiplication and store in the scratch buffer */
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if (readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Loop over the number of taps. */
+ /* NOTE(review): the unsigned subtraction assumes numTaps >= 2; for numTaps < 2 the
+ count would wrap to a huge value — confirm the instance-init contract guarantees this. */
+ tapCnt = (uint32_t) numTaps - 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_q15(py, (int32_t) delaySize, &readIndex, 1,
+ pb, pb, (int32_t) blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pScratchOut = pScr2;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if (readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Compute last tap without the final read of pTapDelay */
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_q15(py, (int32_t) delaySize, &readIndex, 1,
+ pb, pb, (int32_t) blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pScratchOut = pScr2;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ *pScratchOut++ += (q31_t) *px++ * coeff;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* All the output values are in pScratchOut buffer.
+ Convert them into 1.15 format, saturate and store in the destination buffer. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ in1 = *pScr2++;
+ in2 = *pScr2++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 16));
+#else
+ write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ in1 = *pScr2++;
+ in2 = *pScr2++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16), 16));
+#else
+ write_q15x2_ia (&pOut, __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16), 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+
+/**
+ @} end of FIR_Sparse group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q31.c
new file mode 100644
index 0000000000..33c66df47d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q31.c
@@ -0,0 +1,357 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_sparse_q31.c
+ * Description: Q31 sparse FIR filter processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Sparse
+ @{
+ */
+
+/**
+ @brief Processing function for the Q31 sparse FIR filter.
+ @param[in] S points to an instance of the Q31 sparse FIR structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] pScratchIn points to a temporary buffer of size blockSize
+ @param[in] blockSize number of input samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 32-bit accumulator.
+ The 1.31 x 1.31 multiplications are truncated to 2.30 format.
+ This leads to loss of precision on the intermediate multiplications and provides only a single guard bit.
+ If the accumulator result overflows, it wraps around rather than saturate.
+ In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
+ */
+
+void arm_fir_sparse_q31(
+ arm_fir_sparse_instance_q31 * S,
+ const q31_t * pSrc,
+ q31_t * pDst,
+ q31_t * pScratchIn,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ const q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *px; /* Scratch buffer pointer */
+ q31_t *py = pState; /* Temporary pointers for state buffer */
+ q31_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
+ q31_t *pOut; /* Destination pointer */
+ int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
+ uint32_t delaySize = S->maxDelay + blockSize; /* state length */
+ uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ int32_t readIndex; /* Read index of the state buffer */
+ uint32_t tapCnt, blkCnt; /* loop counters */
+ q31_t coeff = *pCoeffs++; /* Read the first coefficient value */
+ q31_t in;
+ q63_t out; /* Temporary output variable */
+
+
+ /* BlockSize of Input samples are copied into the state buffer */
+ /* StateIndex points to the starting position to write in the state buffer */
+ arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1,
+ (int32_t *) pSrc, 1, blockSize);
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if (readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pOut = pDst;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiplications and store in destination buffer */
+ *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
+
+ *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
+
+ *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
+
+ *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiplication and store in destination buffer */
+ *pOut++ = (q31_t) (((q63_t) *px++ * coeff) >> 32);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if (readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Loop over the number of taps. */
+ tapCnt = (uint32_t) numTaps - 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pOut = pDst;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ out = *pOut;
+ out += ((q63_t) *px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ out = *pOut;
+ out += ((q63_t) *px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ out = *pOut;
+ out += ((q63_t) *px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ out = *pOut;
+ out += ((q63_t) *px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ out = *pOut;
+ out += ((q63_t) *px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if (readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Decrement tap loop counter */
+ tapCnt--;
+ }
+
+ /* Compute last tap without the final read of pTapDelay */
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pOut = pDst;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ out = *pOut;
+ out += ((q63_t) * px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ out = *pOut;
+ out += ((q63_t) * px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ out = *pOut;
+ out += ((q63_t) * px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ out = *pOut;
+ out += ((q63_t) * px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ out = *pOut;
+ out += ((q63_t) *px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Working output pointer is updated */
+ pOut = pDst;
+
+ /* Output is converted into 1.31 format. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ in = *pOut << 1;
+ *pOut++ = in;
+ in = *pOut << 1;
+ *pOut++ = in;
+ in = *pOut << 1;
+ *pOut++ = in;
+ in = *pOut << 1;
+ *pOut++ = in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ in = *pOut << 1;
+ *pOut++ = in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+
+/**
+ @} end of FIR_Sparse group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c
new file mode 100644
index 0000000000..e47d889c28
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_fir_sparse_q7.c
@@ -0,0 +1,341 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fir_sparse_q7.c
+ * Description: Q7 sparse FIR filter processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup FIR_Sparse
+ @{
+ */
+
+/**
+ @brief Processing function for the Q7 sparse FIR filter.
+ @param[in] S points to an instance of the Q7 sparse FIR structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] pScratchIn points to a temporary buffer of size blockSize
+ @param[in] pScratchOut points to a temporary buffer of size blockSize
+ @param[in] blockSize number of input samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
+ The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ The accumulator is then converted to 18.7 format by discarding the low 7 bits.
+ Finally, the result is truncated to 1.7 format.
+ */
+
+void arm_fir_sparse_q7(
+ arm_fir_sparse_instance_q7 * S,
+ const q7_t * pSrc,
+ q7_t * pDst,
+ q7_t * pScratchIn,
+ q31_t * pScratchOut,
+ uint32_t blockSize)
+{
+ q7_t *pState = S->pState; /* State pointer */
+ const q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q7_t *px; /* Scratch buffer pointer */
+ q7_t *py = pState; /* Temporary pointers for state buffer */
+ q7_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
+ q7_t *pOut = pDst; /* Destination pointer */
+ int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
+ uint32_t delaySize = S->maxDelay + blockSize; /* state length */
+ uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ int32_t readIndex; /* Read index of the state buffer */
+ uint32_t tapCnt, blkCnt; /* loop counters */
+ q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */
+ q31_t in;
+ q7_t coeff = *pCoeffs++; /* Read the coefficient value */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q7_t in1, in2, in3, in4;
+#endif
+
+ /* BlockSize of Input samples are copied into the state buffer */
+ /* StateIndex points to the starting position to write in the state buffer */
+ arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1, blockSize);
+
+ /* Loop over the number of taps. */
+ tapCnt = numTaps;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if (readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1,
+ pb, pb, (int32_t) blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pScratchOut = pScr2;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Perform multiplication and store in the scratch buffer */
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiplication and store in the scratch buffer */
+ *pScratchOut++ = ((q31_t) *px++ * coeff);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if (readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Loop over the number of taps. */
+ tapCnt = (uint32_t) numTaps - 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1,
+ pb, pb, (int32_t) blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pScratchOut = pScr2;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ in = *pScratchOut + ((q31_t) * px++ * coeff);
+ *pScratchOut++ = in;
+ in = *pScratchOut + ((q31_t) * px++ * coeff);
+ *pScratchOut++ = in;
+ in = *pScratchOut + ((q31_t) * px++ * coeff);
+ *pScratchOut++ = in;
+ in = *pScratchOut + ((q31_t) * px++ * coeff);
+ *pScratchOut++ = in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ in = *pScratchOut + ((q31_t) *px++ * coeff);
+ *pScratchOut++ = in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if (readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Compute last tap without the final read of pTapDelay */
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1,
+ pb, pb, (int32_t) blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pScratchOut = pScr2;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ in = *pScratchOut + ((q31_t) *px++ * coeff);
+ *pScratchOut++ = in;
+ in = *pScratchOut + ((q31_t) *px++ * coeff);
+ *pScratchOut++ = in;
+ in = *pScratchOut + ((q31_t) *px++ * coeff);
+ *pScratchOut++ = in;
+ in = *pScratchOut + ((q31_t) *px++ * coeff);
+ *pScratchOut++ = in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Perform Multiply-Accumulate */
+ in = *pScratchOut + ((q31_t) *px++ * coeff);
+ *pScratchOut++ = in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* All the output values are in pScratchOut buffer.
+ Convert them into 1.7 format, saturate and store in the destination buffer. */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time. */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ in1 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
+ in2 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
+ in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
+ in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
+
+ write_q7x4_ia (&pOut, __PACKq7(in1, in2, in3, in4));
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+
+/**
+ @} end of FIR_Sparse group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_f32.c
new file mode 100644
index 0000000000..6e41d83d3c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_f32.c
@@ -0,0 +1,354 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_iir_lattice_f32.c
+ * Description: Floating-point IIR Lattice filter processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @defgroup IIR_Lattice Infinite Impulse Response (IIR) Lattice Filters
+
+ This set of functions implements lattice filters
+ for Q15, Q31 and floating-point data types. Lattice filters are used in a
+ variety of adaptive filter applications. The filter structure has feedforward and
+ feedback components and the net impulse response is infinite length.
+ The functions operate on blocks
+ of input and output data and each call to the function processes
+ blockSize samples through the filter. pSrc and
+ pDst point to input and output arrays containing blockSize values.
+
+ @par Algorithm
+ \image html IIRLattice.gif "Infinite Impulse Response Lattice filter"
+ @par
+
+ fN(n) = x(n)
+ fm-1(n) = fm(n) - km * gm-1(n-1) for m = N, N-1, ..., 1
+ gm(n) = km * fm-1(n) + gm-1(n-1) for m = N, N-1, ..., 1
+ y(n) = vN * gN(n) + vN-1 * gN-1(n) + ...+ v0 * g0(n)
+
+ @par
+ pkCoeffs points to array of reflection coefficients of size numStages.
+ Reflection Coefficients are stored in time-reversed order.
+ @par
+
+ {kN, kN-1, ..., k1}
+
+ @par
+ pvCoeffs points to the array of ladder coefficients of size (numStages+1).
+ Ladder coefficients are stored in time-reversed order.
+
+ {vN, vN-1, ..., v0}
+
+ @par
+ pState points to a state array of size numStages + blockSize.
+ The state variables shown in the figure above (the g values) are stored in the pState array.
+ The state variables are updated after each block of data is processed; the coefficients are untouched.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter.
+ Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ There are separate instance structure declarations for each of the 3 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ To do this manually without calling the init function, assign the following subfields of the instance structure:
+ numStages, pkCoeffs, pvCoeffs, pState. Also set all of the values in pState to zero.
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Set the values in the state buffer to zeros and then manually initialize the instance structure as follows:
+
+ arm_iir_lattice_instance_f32 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ arm_iir_lattice_instance_q31 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ arm_iir_lattice_instance_q15 S = {numStages, pState, pkCoeffs, pvCoeffs};
+
+ @par
+ where numStages is the number of stages in the filter; pState points to the state buffer array;
+ pkCoeffs points to array of the reflection coefficients; pvCoeffs points to the array of ladder coefficients.
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the fixed-point versions of the IIR lattice filter functions.
+ In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ @addtogroup IIR_Lattice
+ @{
+ */
+
+/**
+ @brief Processing function for the floating-point IIR lattice filter.
+ @param[in] S points to an instance of the floating-point IIR lattice structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+ */
+
+void arm_iir_lattice_f32(
+ const arm_iir_lattice_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pStateCur; /* State current pointer */
+ float32_t acc; /* Accumulator */
+ float32_t fnext1, fnext2, gcurr1, gnext; /* Temporary variables for lattice stages */
+ float32_t *px1, *px2, *pk, *pv; /* Temporary pointers for state and coef */
+ uint32_t numStages = S->numStages; /* Number of stages */
+ uint32_t blkCnt, tapCnt; /* Temporary variables for counts */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ float32_t gcurr2; /* Temporary variables for lattice stages */
+ float32_t k1, k2;
+ float32_t v1, v2, v3, v4;
+#endif
+
+ /* initialise loop count */
+ blkCnt = blockSize;
+
+ /* Sample processing */
+ while (blkCnt > 0U)
+ {
+ /* Read Sample from input buffer */
+ /* fN(n) = x(n) */
+ fnext2 = *pSrc++;
+
+ /* Initialize Ladder coeff pointer */
+ pv = &S->pvCoeffs[0];
+
+ /* Initialize Reflection coeff pointer */
+ pk = &S->pkCoeffs[0];
+
+ /* Initialize state read pointer */
+ px1 = pState;
+
+ /* Initialize state write pointer */
+ px2 = pState;
+
+ /* Set accumulator to zero */
+ acc = 0.0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numStages) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Read gN-1(n-1) from state buffer */
+ gcurr1 = *px1;
+
+ /* read reflection coefficient kN */
+ k1 = *pk;
+
+ /* fN-1(n) = fN(n) - kN * gN-1(n-1) */
+ fnext1 = fnext2 - (k1 * gcurr1);
+
+ /* read ladder coefficient vN */
+ v1 = *pv;
+
+ /* read next reflection coefficient kN-1 */
+ k2 = *(pk + 1U);
+
+ /* Read gN-2(n-1) from state buffer */
+ gcurr2 = *(px1 + 1U);
+
+ /* read next ladder coefficient vN-1 */
+ v2 = *(pv + 1U);
+
+ /* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */
+ fnext2 = fnext1 - (k2 * gcurr2);
+
+ /* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+ gnext = gcurr1 + (k1 * fnext1);
+
+ /* read reflection coefficient kN-2 */
+ k1 = *(pk + 2U);
+
+ /* write gN(n) into state for next sample processing */
+ *px2++ = gnext;
+
+ /* Read gN-3(n-1) from state buffer */
+ gcurr1 = *(px1 + 2U);
+
+ /* y(n) += gN(n) * vN */
+ acc += (gnext * v1);
+
+ /* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */
+ fnext1 = fnext2 - (k1 * gcurr1);
+
+ /* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */
+ gnext = gcurr2 + (k2 * fnext2);
+
+ /* Read gN-4(n-1) from state buffer */
+ gcurr2 = *(px1 + 3U);
+
+ /* y(n) += gN-1(n) * vN-1 */
+ acc += (gnext * v2);
+
+ /* read reflection coefficient kN-3 */
+ k2 = *(pk + 3U);
+
+ /* write gN-1(n) into state for next sample processing */
+ *px2++ = gnext;
+
+ /* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */
+ fnext2 = fnext1 - (k2 * gcurr2);
+
+ /* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */
+ gnext = gcurr1 + (k1 * fnext1);
+
+ /* read ladder coefficient vN-2 */
+ v3 = *(pv + 2U);
+
+ /* y(n) += gN-2(n) * vN-2 */
+ acc += (gnext * v3);
+
+ /* write gN-2(n) into state for next sample processing */
+ *px2++ = gnext;
+
+ /* update pointer */
+ pk += 4U;
+
+ /* gN-3(n) = kN-3 * fN-4(n) + gN-4(n-1) */
+ gnext = (fnext2 * k2) + gcurr2;
+
+ /* read next ladder coefficient vN-3 */
+ v4 = *(pv + 3U);
+
+ /* y(n) += gN-3(n) * vN-3 */
+ acc += (gnext * v4);
+
+ /* write gN-3(n) into state for next sample processing */
+ *px2++ = gnext;
+
+ /* update pointers */
+ px1 += 4U;
+ pv += 4U;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numStages % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of stages */
+ tapCnt = numStages;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ gcurr1 = *px1++;
+ /* Process sample for last taps */
+ fnext1 = fnext2 - ((*pk) * gcurr1);
+ gnext = (fnext1 * (*pk++)) + gcurr1;
+ /* Output samples for last taps */
+ acc += (gnext * (*pv++));
+ *px2++ = gnext;
+ fnext2 = fnext1;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* y(n) += g0(n) * v0 */
+ acc += (fnext2 * (*pv));
+
+ *px2++ = fnext2;
+
+ /* write out into pDst */
+ *pDst++ = acc;
+
+ /* Advance the state pointer by 1 to process the next sample */
+ pState = pState + 1U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete. Now copy last S->numStages samples to start of the buffer
+ in preparation for processing the next frame */
+
+ /* Points to the start of the state buffer */
+ pStateCur = &S->pState[0];
+ pState = &S->pState[blockSize];
+
+ /* Copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numStages >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numStages % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of stages */
+ tapCnt = numStages;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+}
+
+/**
+ @} end of IIR_Lattice group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_f32.c
new file mode 100644
index 0000000000..389e1ea8fd
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_f32.c
@@ -0,0 +1,77 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_iir_lattice_init_f32.c
+ * Description: Floating-point IIR lattice filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup IIR_Lattice
+ @{
+ */
+
+/**
+ @brief Initialization function for the floating-point IIR lattice filter.
+ @param[in] S points to an instance of the floating-point IIR lattice structure
+ @param[in] numStages number of stages in the filter
+ @param[in] pkCoeffs points to reflection coefficient buffer. The array is of length numStages
+ @param[in] pvCoeffs points to ladder coefficient buffer. The array is of length numStages+1
+ @param[in] pState points to state buffer. The array is of length numStages+blockSize
+ @param[in] blockSize number of samples to process
+ @return none
+ */
+
+void arm_iir_lattice_init_f32(
+ arm_iir_lattice_instance_f32 * S,
+ uint16_t numStages,
+ float32_t * pkCoeffs,
+ float32_t * pvCoeffs,
+ float32_t * pState,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numStages = numStages;
+
+ /* Assign reflection coefficient pointer */
+ S->pkCoeffs = pkCoeffs;
+
+ /* Assign ladder coefficient pointer */
+ S->pvCoeffs = pvCoeffs;
+
+ /* Clear state buffer; its size is always (numStages + blockSize) */
+ memset(pState, 0, (numStages + blockSize) * sizeof(float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ @} end of IIR_Lattice group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q15.c
new file mode 100644
index 0000000000..873c342ae5
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q15.c
@@ -0,0 +1,77 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_iir_lattice_init_q15.c
+ * Description: Q15 IIR lattice filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup IIR_Lattice
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q15 IIR lattice filter.
+ @param[in] S points to an instance of the Q15 IIR lattice structure
+ @param[in] numStages number of stages in the filter
+ @param[in] pkCoeffs points to reflection coefficient buffer. The array is of length numStages
+ @param[in] pvCoeffs points to ladder coefficient buffer. The array is of length numStages+1
+ @param[in] pState points to state buffer. The array is of length numStages+blockSize
+ @param[in] blockSize number of samples to process
+ @return none
+ */
+
+void arm_iir_lattice_init_q15(
+ arm_iir_lattice_instance_q15 * S,
+ uint16_t numStages,
+ q15_t * pkCoeffs,
+ q15_t * pvCoeffs,
+ q15_t * pState,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numStages = numStages;
+
+ /* Assign reflection coefficient pointer */
+ S->pkCoeffs = pkCoeffs;
+
+ /* Assign ladder coefficient pointer */
+ S->pvCoeffs = pvCoeffs;
+
+ /* Clear state buffer and size is always blockSize + numStages */
+ memset(pState, 0, (numStages + blockSize) * sizeof(q15_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ @} end of IIR_Lattice group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q31.c
new file mode 100644
index 0000000000..59b26d3c71
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_init_q31.c
@@ -0,0 +1,77 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_iir_lattice_init_q31.c
+ * Description: Initialization function for the Q31 IIR lattice filter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup IIR_Lattice
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q31 IIR lattice filter.
+ @param[in] S points to an instance of the Q31 IIR lattice structure
+ @param[in] numStages number of stages in the filter
+ @param[in] pkCoeffs points to reflection coefficient buffer. The array is of length numStages
+ @param[in] pvCoeffs points to ladder coefficient buffer. The array is of length numStages+1
+ @param[in] pState points to state buffer. The array is of length numStages+blockSize
+ @param[in] blockSize number of samples to process
+ @return none
+ */
+
+void arm_iir_lattice_init_q31(
+ arm_iir_lattice_instance_q31 * S,
+ uint16_t numStages,
+ q31_t * pkCoeffs,
+ q31_t * pvCoeffs,
+ q31_t * pState,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numStages = numStages;
+
+ /* Assign reflection coefficient pointer */
+ S->pkCoeffs = pkCoeffs;
+
+ /* Assign ladder coefficient pointer */
+ S->pvCoeffs = pvCoeffs;
+
+ /* Clear state buffer and size is always blockSize + numStages */
+ memset(pState, 0, (numStages + blockSize) * sizeof(q31_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ @} end of IIR_Lattice group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q15.c
new file mode 100644
index 0000000000..d3cf7f668f
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q15.c
@@ -0,0 +1,396 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_iir_lattice_q15.c
+ * Description: Q15 IIR Lattice filter processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup IIR_Lattice
+ @{
+ */
+
+/**
+ @brief Processing function for the Q15 IIR lattice filter.
+ @param[in] S points to an instance of the Q15 IIR lattice structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+ Lastly, the accumulator is saturated to yield a result in 1.15 format.
+ */
+
+void arm_iir_lattice_q15(
+ const arm_iir_lattice_instance_q15 * S,
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pStateCur; /* State current pointer */
+ q31_t fcurr, fnext = 0, gcurr = 0, gnext; /* Temporary variables for lattice stages */
+ q63_t acc; /* Accumulator */
+ q15_t *px1, *px2, *pk, *pv; /* Temporary pointers for state and coef */
+ uint32_t numStages = S->numStages; /* Number of stages */
+ uint32_t blkCnt, tapCnt; /* Temporary variables for counts */
+ q15_t out; /* Temporary variable for output */
+
+#if defined (ARM_MATH_DSP) && defined (ARM_MATH_LOOPUNROLL)
+ q15_t gnext1, gnext2; /* Temporary variables for lattice stages */
+ q31_t v; /* Temporary variable for ladder coefficient */
+#endif
+
+ /* initialise loop count */
+ blkCnt = blockSize;
+
+#if defined (ARM_MATH_DSP)
+
+ /* Sample processing */
+ while (blkCnt > 0U)
+ {
+ /* Read Sample from input buffer */
+ /* fN(n) = x(n) */
+ fcurr = *pSrc++;
+
+ /* Initialize Ladder coeff pointer */
+ pv = &S->pvCoeffs[0];
+
+ /* Initialize Reflection coeff pointer */
+ pk = &S->pkCoeffs[0];
+
+ /* Initialize state read pointer */
+ px1 = pState;
+
+ /* Initialize state write pointer */
+ px2 = pState;
+
+ /* Set accumulator to zero */
+ acc = 0;
+
+ /* Process sample for first tap */
+ gcurr = *px1++;
+ /* fN-1(n) = fN(n) - kN * gN-1(n-1) */
+ fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15);
+ fnext = __SSAT(fnext, 16);
+
+ /* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+ gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr;
+ gnext = __SSAT(gnext, 16);
+
+ /* write gN(n) into state for next sample processing */
+ *px2++ = (q15_t) gnext;
+
+ /* y(n) += gN(n) * vN */
+ acc += (q31_t) ((gnext * (*pv++)));
+
+ /* Update f values for next coefficient processing */
+ fcurr = fnext;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numStages - 1U) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Process sample for 2nd, 6th ...taps */
+ /* Read gN-2(n-1) from state buffer */
+ gcurr = *px1++;
+ /* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */
+ fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15);
+ fnext = __SSAT(fnext, 16);
+ /* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */
+ gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr;
+ gnext1 = (q15_t) __SSAT(gnext, 16);
+ /* write gN-1(n) into state for next sample processing */
+ *px2++ = (q15_t) gnext1;
+
+ /* Process sample for 3rd, 7th ...taps */
+ /* Read gN-3(n-1) from state buffer */
+ gcurr = *px1++;
+ /* Process sample for 3rd, 7th .. taps */
+ /* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */
+ fcurr = fnext - (((q31_t) gcurr * (*pk)) >> 15);
+ fcurr = __SSAT(fcurr, 16);
+ /* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */
+ gnext = (((q31_t) fcurr * (*pk++)) >> 15) + gcurr;
+ gnext2 = (q15_t) __SSAT(gnext, 16);
+ /* write gN-2(n) into state */
+ *px2++ = (q15_t) gnext2;
+
+ /* Read vN-1 and vN-2 at a time */
+ v = read_q15x2_ia (&pv);
+
+ /* Pack gN-1(n) and gN-2(n) */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ gnext = __PKHBT(gnext1, gnext2, 16);
+#else
+ gnext = __PKHBT(gnext2, gnext1, 16);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* y(n) += gN-1(n) * vN-1 */
+ /* process for gN-5(n) * vN-5, gN-9(n) * vN-9 ... */
+ /* y(n) += gN-2(n) * vN-2 */
+ /* process for gN-6(n) * vN-6, gN-10(n) * vN-10 ... */
+ acc = __SMLALD(gnext, v, acc);
+
+ /* Process sample for 4th, 8th ...taps */
+ /* Read gN-4(n-1) from state buffer */
+ gcurr = *px1++;
+ /* Process sample for 4th, 8th .. taps */
+ /* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */
+ fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15);
+ fnext = __SSAT(fnext, 16);
+ /* gN-3(n) = kN-3 * fN-4(n) + gN-4(n-1) */
+ gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr;
+ gnext1 = (q15_t) __SSAT(gnext, 16);
+ /* write gN-3(n) for the next sample process */
+ *px2++ = (q15_t) gnext1;
+
+ /* Process sample for 5th, 9th ...taps */
+ /* Read gN-5(n-1) from state buffer */
+ gcurr = *px1++;
+ /* Process sample for 5th, 9th .. taps */
+ /* fN-5(n) = fN-4(n) - kN-4 * gN-5(n-1) */
+ fcurr = fnext - (((q31_t) gcurr * (*pk)) >> 15);
+ fcurr = __SSAT(fcurr, 16);
+ /* gN-4(n) = kN-4 * fN-5(n) + gN-5(n-1) */
+ gnext = (((q31_t) fcurr * (*pk++)) >> 15) + gcurr;
+ gnext2 = (q15_t) __SSAT(gnext, 16);
+ /* write gN-4(n) for the next sample process */
+ *px2++ = (q15_t) gnext2;
+
+ /* Read vN-3 and vN-4 at a time */
+ v = read_q15x2_ia (&pv);
+
+ /* Pack gN-3(n) and gN-4(n) */
+#ifndef ARM_MATH_BIG_ENDIAN
+ gnext = __PKHBT(gnext1, gnext2, 16);
+#else
+ gnext = __PKHBT(gnext2, gnext1, 16);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* y(n) += gN-4(n) * vN-4 */
+ /* process for gN-8(n) * vN-8, gN-12(n) * vN-12 ... */
+ /* y(n) += gN-3(n) * vN-3 */
+ /* process for gN-7(n) * vN-7, gN-11(n) * vN-11 ... */
+ acc = __SMLALD(gnext, v, acc);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ fnext = fcurr;
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numStages - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with the number of remaining taps (first tap handled above) */
+ tapCnt = (numStages - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ gcurr = *px1++;
+ /* Process sample for last taps */
+ fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15);
+ fnext = __SSAT(fnext, 16);
+ gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr;
+ gnext = __SSAT(gnext, 16);
+
+ /* Output samples for last taps */
+ acc += (q31_t) (((q31_t) gnext * (*pv++)));
+ *px2++ = (q15_t) gnext;
+ fcurr = fnext;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* y(n) += g0(n) * v0 */
+ acc += (q31_t) (((q31_t) fnext * (*pv++)));
+
+ out = (q15_t) __SSAT(acc >> 15, 16);
+ *px2++ = (q15_t) fnext;
+
+ /* write out into pDst */
+ *pDst++ = out;
+
+ /* Advance the state pointer by 1 to process the next sample */
+ pState = pState + 1U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete. Now copy last S->numStages samples to start of the buffer
+ for the preparation of next frame process */
+
+ /* Points to the start of the state buffer */
+ pStateCur = &S->pState[0];
+ pState = &S->pState[blockSize];
+
+ /* copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numStages >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
+ write_q15x2_ia (&pStateCur, read_q15x2_ia (&pState));
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numStages % 0x4U;
+
+#else
+
+ /* NOTE(review): sets tapCnt; this path copies numStages-1 samples vs numStages in the unrolled and non-DSP paths -- verify */
+ tapCnt = (numStages - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+ /* Sample processing */
+ while (blkCnt > 0U)
+ {
+ /* Read Sample from input buffer */
+ /* fN(n) = x(n) */
+ fcurr = *pSrc++;
+
+ /* Initialize Ladder coeff pointer */
+ pv = &S->pvCoeffs[0];
+
+ /* Initialize Reflection coeff pointer */
+ pk = &S->pkCoeffs[0];
+
+ /* Initialize state read pointer */
+ px1 = pState;
+
+ /* Initialize state write pointer */
+ px2 = pState;
+
+ /* Set accumulator to zero */
+ acc = 0;
+
+ tapCnt = numStages;
+
+ while (tapCnt > 0U)
+ {
+ gcurr = *px1++;
+ /* Process sample */
+ /* fN-1(n) = fN(n) - kN * gN-1(n-1) */
+ fnext = fcurr - ((gcurr * (*pk)) >> 15);
+ fnext = __SSAT(fnext, 16);
+
+ /* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+ gnext = ((fnext * (*pk++)) >> 15) + gcurr;
+ gnext = __SSAT(gnext, 16);
+
+ /* Output samples */
+ /* y(n) += gN(n) * vN */
+ acc += (q31_t) ((gnext * (*pv++)));
+
+ /* write gN(n) into state for next sample processing */
+ *px2++ = (q15_t) gnext;
+
+ /* Update f values for next coefficient processing */
+ fcurr = fnext;
+
+ tapCnt--;
+ }
+
+ /* y(n) += g0(n) * v0 */
+ acc += (q31_t) ((fnext * (*pv++)));
+
+ out = (q15_t) __SSAT(acc >> 15, 16);
+ *px2++ = (q15_t) fnext;
+
+ /* write out into pDst */
+ *pDst++ = out;
+
+ /* Advance the state pointer by 1 to process the next group of samples */
+ pState = pState + 1U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete. Now copy last S->numStages samples to start of the buffer
+ for the preparation of next frame process */
+
+ /* Points to the start of the state buffer */
+ pStateCur = &S->pState[0];
+ pState = &S->pState[blockSize];
+
+ tapCnt = numStages;
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
+
+/**
+ @} end of IIR_Lattice group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q31.c
new file mode 100644
index 0000000000..5c95fb4d87
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_iir_lattice_q31.c
@@ -0,0 +1,356 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_iir_lattice_q31.c
+ * Description: Q31 IIR Lattice filter processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup IIR_Lattice
+ @{
+ */
+
+/**
+ @brief Processing function for the Q31 IIR lattice filter.
+ @param[in] S points to an instance of the Q31 IIR lattice structure
+ @param[in] pSrc points to the block of input data
+ @param[out] pDst points to the block of output data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around rather than clip.
+ In order to avoid overflows completely the input signal must be scaled down by 2*log2(numStages) bits.
+ After all multiply-accumulates are performed, the 2.62 accumulator is saturated to 1.32 format and then truncated to 1.31 format.
+ */
+
+void arm_iir_lattice_q31(
+ const arm_iir_lattice_instance_q31 * S,
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ q31_t *pStateCur; /* State current pointer */
+ q31_t fcurr, fnext = 0, gcurr = 0, gnext; /* Temporary variables for lattice stages */
+ q63_t acc; /* Accumulator */
+ q31_t *px1, *px2, *pk, *pv; /* Temporary pointers for state and coef */
+ uint32_t numStages = S->numStages; /* Number of stages */
+ uint32_t blkCnt, tapCnt; /* Temporary variables for counts */
+
+
+ /* initialise loop count */
+ blkCnt = blockSize;
+
+#if defined (ARM_MATH_DSP)
+
+ /* Sample processing */
+ while (blkCnt > 0U)
+ {
+ /* Read Sample from input buffer */
+ /* fN(n) = x(n) */
+ fcurr = *pSrc++;
+
+ /* Initialize Ladder coeff pointer */
+ pv = &S->pvCoeffs[0];
+
+ /* Initialize Reflection coeff pointer */
+ pk = &S->pkCoeffs[0];
+
+ /* Initialize state read pointer */
+ px1 = pState;
+
+ /* Initialize state write pointer */
+ px2 = pState;
+
+ /* Set accumulator to zero */
+ acc = 0;
+
+ /* Process sample for first tap */
+ gcurr = *px1++;
+ /* fN-1(n) = fN(n) - kN * gN-1(n-1) */
+ fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk )) >> 31));
+
+ /* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+ gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31));
+
+ /* write gN-1(n-1) into state for next sample processing */
+ *px2++ = gnext;
+
+ /* y(n) += gN(n) * vN */
+ acc += ((q63_t) gnext * *pv++);
+
+ /* Update f values for next coefficient processing */
+ fcurr = fnext;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numStages - 1U) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Process sample for 2nd, 6th ...taps */
+ /* Read gN-2(n-1) from state buffer */
+ gcurr = *px1++;
+ /* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */
+ fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk )) >> 31));
+ /* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */
+ gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31));
+ /* y(n) += gN-1(n) * vN-1 */
+ /* process for gN-5(n) * vN-5, gN-9(n) * vN-9 ... */
+ acc += ((q63_t) gnext * *pv++);
+ /* write gN-1(n) into state for next sample processing */
+ *px2++ = gnext;
+
+ /* Process sample for 3rd, 7th ...taps */
+ /* Read gN-3(n-1) from state buffer */
+ gcurr = *px1++;
+ /* Process sample for 3rd, 7th .. taps */
+ /* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */
+ fcurr = __QSUB(fnext, (q31_t) (((q63_t) gcurr * (*pk )) >> 31));
+ /* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */
+ gnext = __QADD(gcurr, (q31_t) (((q63_t) fcurr * (*pk++)) >> 31));
+ /* y(n) += gN-2(n) * vN-2 */
+ /* process for gN-6(n) * vN-6, gN-10(n) * vN-10 ... */
+ acc += ((q63_t) gnext * *pv++);
+ /* write gN-2(n) into state for next sample processing */
+ *px2++ = gnext;
+
+ /* Process sample for 4th, 8th ...taps */
+ /* Read gN-4(n-1) from state buffer */
+ gcurr = *px1++;
+ /* Process sample for 4th, 8th .. taps */
+ /* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */
+ fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk )) >> 31));
+ /* gN-3(n) = kN-3 * fN-4(n) + gN-4(n-1) */
+ gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31));
+ /* y(n) += gN-3(n) * vN-3 */
+ /* process for gN-7(n) * vN-7, gN-11(n) * vN-11 ... */
+ acc += ((q63_t) gnext * *pv++);
+ /* write gN-3(n) into state for next sample processing */
+ *px2++ = gnext;
+
+ /* Process sample for 5th, 9th ...taps */
+ /* Read gN-5(n-1) from state buffer */
+ gcurr = *px1++;
+ /* Process sample for 5th, 9th .. taps */
+ /* fN-5(n) = fN-4(n) - kN-4 * gN-5(n-1) */
+ fcurr = __QSUB(fnext, (q31_t) (((q63_t) gcurr * (*pk )) >> 31));
+ /* gN-4(n) = kN-4 * fN-5(n) + gN-5(n-1) */
+ gnext = __QADD(gcurr, (q31_t) (((q63_t) fcurr * (*pk++)) >> 31));
+ /* y(n) += gN-4(n) * vN-4 */
+ /* process for gN-8(n) * vN-8, gN-12(n) * vN-12 ... */
+ acc += ((q63_t) gnext * *pv++);
+
+ /* write gN-4(n) into state for next sample processing */
+ *px2++ = gnext;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ fnext = fcurr;
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numStages - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with the number of remaining taps (first tap handled above) */
+ tapCnt = (numStages - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ gcurr = *px1++;
+ /* Process sample for last taps */
+ fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk )) >> 31));
+ gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31));
+
+ /* Output samples for last taps */
+ acc += ((q63_t) gnext * *pv++);
+ *px2++ = gnext;
+ fcurr = fnext;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* y(n) += g0(n) * v0 */
+ acc += ((q63_t) fnext * *pv++);
+
+ *px2++ = fnext;
+
+ /* write out into pDst */
+ *pDst++ = (q31_t) (acc >> 31U);
+
+ /* Advance the state pointer by 1 to process the next sample */
+ pState = pState + 1U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete. Now copy last S->numStages samples to start of the buffer
+ for the preparation of next frame process */
+
+ /* Points to the start of the state buffer */
+ pStateCur = &S->pState[0];
+ pState = &S->pState[blockSize];
+
+ /* Copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numStages >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numStages % 0x4U;
+
+#else
+
+ /* NOTE(review): sets tapCnt; this path copies numStages-1 samples vs numStages in the unrolled and non-DSP paths -- verify */
+ tapCnt = (numStages - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+ /* Sample processing */
+ while (blkCnt > 0U)
+ {
+ /* Read Sample from input buffer */
+ /* fN(n) = x(n) */
+ fcurr = *pSrc++;
+
+ /* Initialize Ladder coeff pointer */
+ pv = &S->pvCoeffs[0];
+
+ /* Initialize Reflection coeff pointer */
+ pk = &S->pkCoeffs[0];
+
+ /* Initialize state read pointer */
+ px1 = pState;
+
+ /* Initialize state write pointer */
+ px2 = pState;
+
+ /* Set accumulator to zero */
+ acc = 0;
+
+ tapCnt = numStages;
+
+ while (tapCnt > 0U)
+ {
+ gcurr = *px1++;
+ /* Process sample */
+ /* fN-1(n) = fN(n) - kN * gN-1(n-1) */
+ fnext = clip_q63_to_q31(((q63_t) fcurr - ((q31_t) (((q63_t) gcurr * (*pk )) >> 31))));
+
+ /* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+ gnext = clip_q63_to_q31(((q63_t) gcurr + ((q31_t) (((q63_t) fnext * (*pk++)) >> 31))));
+
+ /* Output samples */
+ /* y(n) += gN(n) * vN */
+ acc += ((q63_t) gnext * *pv++);
+
+ /* write gN-1(n-1) into state for next sample processing */
+ *px2++ = gnext;
+
+ /* Update f values for next coefficient processing */
+ fcurr = fnext;
+
+ tapCnt--;
+ }
+
+ /* y(n) += g0(n) * v0 */
+ acc += ((q63_t) fnext * *pv++);
+
+ *px2++ = fnext;
+
+ /* write out into pDst */
+ *pDst++ = (q31_t) (acc >> 31U);
+
+ /* Advance the state pointer by 1 to process the next group of samples */
+ pState = pState + 1U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete. Now copy last S->numStages samples to start of the buffer
+ for the preparation of next frame process */
+
+ /* Points to the start of the state buffer */
+ pStateCur = &S->pState[0];
+ pState = &S->pState[blockSize];
+
+ tapCnt = numStages;
+
+ /* Copy data */
+ while (tapCnt > 0U)
+ {
+ *pStateCur++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
+
+/**
+ @} end of IIR_Lattice group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_f16.c
new file mode 100644
index 0000000000..420c84224d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_f16.c
@@ -0,0 +1,276 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_levinson_durbin_f16.c
+ * Description: f16 version of Levinson Durbin algorithm
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions_f16.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @defgroup LD Levinson Durbin Algorithm
+
+ */
+
+/**
+ @addtogroup LD
+ @{
+ */
+
+/**
+ @brief Levinson Durbin
+ @param[in] phi autocovariance vector starting with lag 0 (length is nbCoefs + 1)
+ @param[out] a autoregressive coefficients
+ @param[out] err prediction error (variance)
+ @param[in] nbCoefs number of autoregressive coefficients
+ @return none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_levinson_durbin_f16 built. Helium version has build issues with gcc."
+#endif
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
+
+#include "arm_helium_utils.h"
+
+#define LANE4567_MASK 0xFF00
+
+void arm_levinson_durbin_f16(const float16_t *phi,
+ float16_t *a,
+ float16_t *err,
+ int nbCoefs)
+{
+ _Float16 e;
+ static const uint16_t revOffsetArray[8] = {7,6,5,4,3,2,1,0};
+
+ a[0] = (_Float16)phi[1] / (_Float16)phi[0];
+
+ e = (_Float16)phi[0] - (_Float16)phi[1] * (_Float16)a[0];
+ for(int p=1; p < nbCoefs; p++)
+ {
+ _Float16 suma = 0.0f16;
+ _Float16 sumb = 0.0f16;
+ f16x8_t vecA,vecRevPhi,vecPhi,vecSumA, vecSumB;
+ _Float16 k;
+ uint32_t blkCnt;
+ const float16_t *pPhi,*pRevPhi,*pA;
+ uint16x8_t revOffset;
+
+ int nb,j,i;
+
+ revOffset = vld1q(revOffsetArray);
+ vecSumA = vdupq_n_f16(0.0f16);
+ vecSumB = vdupq_n_f16(0.0f16);
+
+ pRevPhi = &phi[p-7];
+ pPhi = &phi[1];
+ pA = a;
+
+ i = 0;
+ blkCnt = p >> 3;
+ while(blkCnt > 0)
+ {
+ vecA = vld1q(pA);
+ pA += 8;
+
+ vecPhi = vld1q(pPhi);
+ pPhi += 8;
+
+ vecRevPhi = vldrhq_gather_shifted_offset_f16(pRevPhi,revOffset);
+ pRevPhi -= 8;
+
+ vecSumA = vfmaq(vecSumA,vecA,vecRevPhi);
+ vecSumB = vfmaq(vecSumB,vecA,vecPhi);
+
+ i += 8;
+ blkCnt--;
+
+ }
+
+ suma = vecAddAcrossF16Mve(vecSumA);
+ sumb = vecAddAcrossF16Mve(vecSumB);
+
+ blkCnt = p & 7;
+ while(blkCnt > 0)
+ {
+ suma += (_Float16)a[i] * (_Float16)phi[p - i];
+ sumb += (_Float16)a[i] * (_Float16)phi[i + 1];
+
+ i++;
+ blkCnt--;
+ }
+
+ k = ((_Float16)phi[p+1] - suma)/((_Float16)phi[0] - sumb);
+
+ f16x8_t vecRevA,tmp;
+ static uint16_t orgOffsetArray[8]={0,1,2,3,-1,-2,-3,-4};
+ static const uint16_t offsetIncArray[8]={4,4,4,4,-4,-4,-4,-4};
+
+ uint16x8_t offset,offsetInc,vecTmp;
+
+
+ offset = vld1q(orgOffsetArray);
+ vecTmp = vdupq_n_u16(p);
+
+ offset = vaddq_m_u16(offset,offset,vecTmp,LANE4567_MASK);
+ offsetInc = vld1q(offsetIncArray);
+
+ nb = p >> 3;
+ j=0;
+ for(int i = 0; i < nb ; i++)
+ {
+
+ /*
+ x0=a[j] - k * a[p-1-j];
+ x1=a[j+1] - k * a[p-2-j];
+ x2=a[p-1-j] - k * a[j];
+ x3=a[p-2-j] - k * a[j+1];
+
+ a[j] = x0;
+ a[j+1] = x1;
+ a[p-1-j] = x2;
+ a[p-2-j] = x3;
+ */
+
+ uint64_t tmpa,tmpb;
+ vecA = vldrhq_gather_shifted_offset_f16(a,offset);
+
+
+ tmpa = vgetq_lane_u64((uint64x2_t)vecA,0);
+ tmpb = vgetq_lane_u64((uint64x2_t)vecA,1);
+ vecRevA = (f16x8_t) vsetq_lane_u64(tmpb,(uint64x2_t)vecRevA,0);
+ vecRevA = (f16x8_t) vsetq_lane_u64(tmpa,(uint64x2_t)vecRevA,1);
+
+
+ tmp = vsubq(vecA,vmulq_n_f16(vecRevA,k));
+ vstrhq_scatter_shifted_offset_f16(a, offset, tmp);
+
+ offset = vaddq(offset,offsetInc);
+
+ j+=4;
+
+ }
+
+ blkCnt = p & 7;
+
+ if (blkCnt)
+ {
+ nb = blkCnt >> 1;
+ for(int i =0;i < nb ; i++)
+ {
+ _Float16 x,y;
+
+ x=(_Float16)a[j] - (_Float16)k * (_Float16)a[p-1-j];
+ y=(_Float16)a[p-1-j] - (_Float16)k * (_Float16)a[j];
+
+ a[j] = x;
+ a[p-1-j] = y;
+
+ j++;
+ }
+
+ nb = blkCnt & 1;
+ if (nb)
+ {
+ a[j]=(_Float16)a[j]- (_Float16)k * (_Float16)a[p-1-j];
+ }
+ }
+
+
+ a[p] = k;
+ e = e * (1.0f16 - k*k);
+
+
+ }
+ *err = e;
+}
+
+#else
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+void arm_levinson_durbin_f16(const float16_t *phi,
+ float16_t *a,
+ float16_t *err,
+ int nbCoefs)
+{
+ _Float16 e;
+
+ a[0] = (_Float16)phi[1] / (_Float16)phi[0];
+
+ e = (_Float16)phi[0] - (_Float16)phi[1] * (_Float16)a[0];
+ for(int p=1; p < nbCoefs; p++)
+ {
+ _Float16 suma=0.0f16;
+ _Float16 sumb=0.0f16;
+ _Float16 k;
+ int nb,j;
+
+ for(int i=0; i < p; i++)
+ {
+ suma += (_Float16)a[i] * (_Float16)phi[p - i];
+ sumb += (_Float16)a[i] * (_Float16)phi[i + 1];
+ }
+
+ k = ((_Float16)phi[p+1]-suma)/((_Float16)phi[0] - sumb);
+
+
+ nb = p >> 1;
+ j=0;
+ for(int i =0;i < nb ; i++)
+ {
+ _Float16 x,y;
+
+ x=(_Float16)a[j] - (_Float16)k * (_Float16)a[p-1-j];
+ y=(_Float16)a[p-1-j] - (_Float16)k * (_Float16)a[j];
+
+ a[j] = x;
+ a[p-1-j] = y;
+
+ j++;
+ }
+
+ nb = p & 1;
+ if (nb)
+ {
+ a[j]=(_Float16)a[j]- (_Float16)k * (_Float16)a[p-1-j];
+ }
+
+ a[p] = k;
+ e = e * (1.0f16 - k*k);
+
+
+ }
+ *err = e;
+}
+#endif /* defined(ARM_FLOAT16_SUPPORTED) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+ @} end of LD group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_f32.c
new file mode 100644
index 0000000000..014810e702
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_f32.c
@@ -0,0 +1,278 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_levinson_durbin_f32.c
+ * Description: f32 version of Levinson Durbin algorithm
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @defgroup LD Levinson Durbin Algorithm
+
+ */
+
+/**
+ @addtogroup LD
+ @{
+ */
+
+/**
+ @brief Levinson Durbin
+ @param[in] phi autocovariance vector starting with lag 0 (length is nbCoefs + 1)
+ @param[out] a autoregressive coefficients
+ @param[out] err prediction error (variance)
+ @param[in] nbCoefs number of autoregressive coefficients
+ @return none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_levinson_durbin_f32 built. Helium version has build issues with gcc."
+#endif
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
+
+#include "arm_helium_utils.h"
+
+#define LANE23_MASK 0xFF00
+
/**
  @brief         Levinson-Durbin recursion — Helium (MVE) float32 implementation.
  @param[in]     phi      autocovariance vector starting with lag 0 (length nbCoefs + 1)
  @param[out]    a        autoregressive coefficients (length nbCoefs)
  @param[out]    err      final prediction error (variance)
  @param[in]     nbCoefs  number of autoregressive coefficients
 */
void arm_levinson_durbin_f32(const float32_t *phi,
  float32_t *a,
  float32_t *err,
  int nbCoefs)
{
   float32_t e;
   /* Gather offsets {3,2,1,0}: loading 4 words through these offsets reverses
      the 4-element group, so phi can be traversed back-to-front. */
   static const uint32_t revOffsetArray[4] = {3,2,1,0};

   /* Order-1 solution. */
   a[0] = phi[1] / phi[0];

   e = phi[0] - phi[1] * a[0];
   for(int p=1; p < nbCoefs; p++)
   {
      float32_t suma = 0.0f;
      float32_t sumb = 0.0f;
      f32x4_t vecA,vecRevPhi,vecPhi,vecSumA, vecSumB;
      float32_t k;                          /* reflection coefficient */
      uint32_t blkCnt;
      const float32_t *pPhi,*pRevPhi,*pA;
      uint32x4_t revOffset;

      int nb,j,i;

      revOffset = vld1q(revOffsetArray);
      vecSumA = vdupq_n_f32(0.0f);
      vecSumB = vdupq_n_f32(0.0f);

      /* pRevPhi + revOffset yields {phi[p], phi[p-1], phi[p-2], phi[p-3]}. */
      pRevPhi = &phi[p-3];
      pPhi = &phi[1];
      pA = a;

      i = 0;
      blkCnt = p >> 2;
      while(blkCnt > 0)
      {
         vecA = vld1q(pA);
         pA += 4;

         vecPhi = vld1q(pPhi);
         pPhi += 4;

         /* Reversed gather: lanes hold phi[p-i] .. phi[p-i-3]. */
         vecRevPhi = vldrwq_gather_shifted_offset_f32(pRevPhi,revOffset);
         pRevPhi -= 4;

         /* suma += a[i]*phi[p-i]; sumb += a[i]*phi[i+1]; four lanes at once. */
         vecSumA = vfmaq(vecSumA,vecA,vecRevPhi);
         vecSumB = vfmaq(vecSumB,vecA,vecPhi);

         i += 4;
         blkCnt--;

      }

      /* Horizontal reduction of the 4-lane accumulators. */
      suma = vecAddAcrossF32Mve(vecSumA);
      sumb = vecAddAcrossF32Mve(vecSumB);

      /* Scalar tail: remaining (p & 3) terms. */
      blkCnt = p & 3;
      while(blkCnt > 0)
      {
         suma += a[i] * phi[p - i];
         sumb += a[i] * phi[i + 1];

         i++;
         blkCnt--;
      }

      k = (phi[p+1] - suma)/(phi[0] - sumb);

      f32x4_t vecRevA,tmp;
      /* Base offsets {0, 1, -1, -2} stored in uint32: the negative entries use
         well-defined unsigned wrap-around and become {p-1, p-2} once p is added
         to lanes 2-3 below. NOTE(review): array is never written; could likely
         be const like offsetIncArray. */
      static uint32_t orgOffsetArray[4]={0,1,-1,-2};
      static const uint32_t offsetIncArray[4]={2,2,-2,-2};

      uint32x4_t offset,offsetInc,vecTmp;


      offset = vld1q(orgOffsetArray);
      vecTmp = vdupq_n_u32(p);

      /* LANE23_MASK (0xFF00) predicates the byte lanes of elements 2 and 3, so
         only those lanes get p added: offset = {j, j+1, p-1-j, p-2-j}. */
      offset = vaddq_m_u32(offset,offset,vecTmp,LANE23_MASK);
      offsetInc = vld1q(offsetIncArray);

      /* Each iteration updates two symmetric pairs of a[] (4 elements). */
      nb = p >> 2;
      j=0;
      for(int i = 0; i < nb ; i++)
      {

         /*
         x0=a[j] - k * a[p-1-j];
         x1=a[j+1] - k * a[p-2-j];
         x3=a[p-1-j] - k * a[j];
         x4=a[p-2-j] - k * a[j+1];

         a[j] = x0;
         a[j+1] = x1;
         a[p-1-j] = x2;
         a[p-2-j] = x3;
         */

         uint64_t tmpa,tmpb;
         /* Gather vecA = {a[j], a[j+1], a[p-1-j], a[p-2-j]}. */
         vecA = vldrwq_gather_shifted_offset_f32(a,offset);

         /* Swap the two 64-bit halves so vecRevA = {a[p-1-j], a[p-2-j], a[j],
            a[j+1]} — each lane faces its update partner. vecRevA starts
            uninitialized but both halves are written before any use. */
         tmpa = vgetq_lane_u64((uint64x2_t)vecA,0);
         tmpb = vgetq_lane_u64((uint64x2_t)vecA,1);
         vecRevA = (f32x4_t) vsetq_lane_u64(tmpb,(uint64x2_t)vecRevA,0);
         vecRevA = (f32x4_t) vsetq_lane_u64(tmpa,(uint64x2_t)vecRevA,1);


         /* All four pair updates at once, scattered back through the same offsets. */
         tmp = vsubq(vecA,vmulq_n_f32(vecRevA,k));
         vstrwq_scatter_shifted_offset_f32(a, offset, tmp);

         offset = vaddq(offset,offsetInc);

         j+=2;

      }

      /* Tail: p & 3 elements of a[0..p-1] still need updating. */
      switch(p & 3)
      {
         case 3:
         {
            /* One symmetric pair plus the self-paired middle element. */
            float32_t x,y;
            x = a[j] - k * a[p-1-j];
            y = a[p-1-j] - k * a[j];

            a[j] = x;
            a[p-1-j] = y;

            a[j+1] = a[j+1] - k * a[p-1-(j+1)];
         }
         break;

         case 2:
         {
            float32_t x,y;
            x = a[j] - k * a[p-1-j];
            y = a[p-1-j] - k * a[j];

            a[j] = x;
            a[p-1-j] = y;
         }
         break;

         case 1:
            /* Middle element pairs with itself (p-1-j == j here). */
            a[j] = a[j]- k * a[p-1-j];
            break;
      }

      a[p] = k;
      /* Prediction error shrinks by (1 - k^2) at each order. */
      e = e * (1.0f - k*k);


   }
   *err = e;
}
+
+#else
/**
  @brief         Levinson-Durbin recursion — portable scalar float32 implementation.
  @param[in]     phi      autocovariance vector starting with lag 0 (length nbCoefs + 1)
  @param[out]    a        autoregressive coefficients (length nbCoefs)
  @param[out]    err      final prediction error (variance)
  @param[in]     nbCoefs  number of autoregressive coefficients
 */
void arm_levinson_durbin_f32(const float32_t *phi,
  float32_t *a,
  float32_t *err,
  int nbCoefs)
{
    float32_t error;

    /* Order-1 seed: first coefficient and its prediction error. */
    a[0] = phi[1] / phi[0];
    error = phi[0] - phi[1] * a[0];

    for (int p = 1; p < nbCoefs; p++)
    {
        float32_t num = 0.0f;   /* accumulates a[i] * phi[p-i] */
        float32_t den = 0.0f;   /* accumulates a[i] * phi[i+1] */

        for (int i = 0; i < p; i++)
        {
            num += a[i] * phi[p - i];
            den += a[i] * phi[i + 1];
        }

        /* Reflection coefficient for order p+1. */
        const float32_t k = (phi[p + 1] - num) / (phi[0] - den);

        /* Symmetric in-place update of a[0..p-1], walking inward from both ends. */
        int lo = 0;
        int hi = p - 1;
        while (lo < hi)
        {
            const float32_t front = a[lo] - k * a[hi];
            const float32_t back  = a[hi] - k * a[lo];
            a[lo] = front;
            a[hi] = back;
            lo++;
            hi--;
        }
        if (lo == hi)
        {
            /* Odd count: the middle element pairs with itself. */
            a[lo] = a[lo] - k * a[lo];
        }

        a[p] = k;
        /* Prediction error shrinks by (1 - k^2) at each order. */
        error = error * (1.0f - k * k);
    }
    *err = error;
}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of LD group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_q31.c
new file mode 100644
index 0000000000..6384c555b7
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_levinson_durbin_q31.c
@@ -0,0 +1,378 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_levinson_durbin_q31.c
+ * Description: q31 version of Levinson Durbin algorithm
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+#define ONE_Q31 0x7FFFFFFFL
+#define TWO_Q30 0x7FFFFFFFL
+
+#define HALF_Q31 0x00008000L
+#define ONE_Q15 0x7FFF
+#define HALF_Q15 0x3FFF
+#define LOWPART_MASK 0x07FFF
+
/* Q31 x Q15 -> Q31 multiply: widen both operands to 64 bits, then drop the
   15 fractional bits contributed by the Q15 operand (truncating shift). */
__STATIC_FORCEINLINE q31_t mul32x16(q31_t a, q15_t b)
{
    q63_t wide = (q63_t)a * (q63_t)b;

    return (q31_t)(wide >> 15);
}
+
/* Q31 x Q31 -> Q31 multiply: the 64-bit product carries 62 fractional bits,
   so shifting right by 31 restores Q31. NOTE(review): result is truncated,
   not saturated — the saturating __SSAT variant was dropped upstream. */
__STATIC_FORCEINLINE q31_t mul32x32(q31_t a, q31_t b)
{
    q63_t product = (q63_t)a * (q63_t)b;

    return (q31_t)(product >> 31);
}
+
/* Approximate Q31 division r = n/d.
   Builds a Q14 reciprocal estimate of the high half of d via arm_divide_q15,
   applies one multiplicative correction step, then rescales.
   NOTE(review): d>>16 is handed to arm_divide_q15 as a q15 value, so this
   presumably expects d > 0 with significant magnitude in the top halfword —
   confirm with callers. */
__STATIC_FORCEINLINE q31_t divide(q31_t n, q31_t d)
{
    arm_status status;
    int16_t shift;
    q15_t inverse;
    q31_t r;
    // We are computing:
    // n / d = n / (h + l) where h and l are the high end and low end part.
    // 1 / (h + l) = 1 / h (1 - l / h)
    // Our division algorithm has a shift. So it is returning a scaled value sh.
    // So we need a << shift to convert 1/ sh to 1/h.
    // In below code, we are organizing the computation differently. Instead of computing:
    // 1 / h (1 - l / h)
    // we are computing
    // 1 / h (2 - (l + h) / h)
    // 1 / h (2 - d / h)
    // Also, we are not computing 1/h in Q15 but in Q14.
    // 2 is expressed in Q30.
    // So at the end of all computation we need a << 2

    // Result is in Q14 because of use of HALF_Q15 instead of ONE_Q15.
    status=arm_divide_q15(HALF_Q15,d>>16,&inverse,&shift);
    (void)status;   // division status deliberately ignored

    // d is used instead of l
    // So we will need to subtract from 2 instead of 1.
    r = mul32x16(d,inverse);
    r = TWO_Q30 - (r << shift);
    r = mul32x16(r, inverse);
    r = mul32x32(r,n) ;
    r = r << (shift + 2);

    return(r);

}
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @defgroup LD Levinson Durbin Algorithm
+
+ */
+
+/**
+ @addtogroup LD
+ @{
+ */
+
+/**
+ @brief Levinson Durbin
+ @param[in] phi autocovariance vector starting with lag 0 (length is nbCoefs + 1)
+ @param[out] a autoregressive coefficients
+ @param[out] err prediction error (variance)
+ @param[in] nbCoefs number of autoregressive coefficients
+ @return none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_levinson_durbin_q31 built. Helium version has build issues with gcc."
+#endif
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
+
+#define LANE23_MASK 0xFF00
+
+#include "arm_helium_utils.h"
/**
  @brief         Levinson-Durbin recursion — Helium (MVE) Q31 implementation.
  @param[in]     phi      autocovariance vector starting with lag 0 (length nbCoefs + 1)
  @param[out]    a        autoregressive coefficients (length nbCoefs)
  @param[out]    err      final prediction error (variance)
  @param[in]     nbCoefs  number of autoregressive coefficients
 */
void arm_levinson_durbin_q31(const q31_t *phi,
  q31_t *a,
  q31_t *err,
  int nbCoefs)
{
   q31_t e;

   /* Gather offsets {3,2,1,0}: a 4-word gather through these offsets reverses
      the group, so phi can be traversed back-to-front. */
   static const uint32_t revOffsetArray[4] = {3,2,1,0};

   //a[0] = phi[1] / phi[0];
   a[0] = divide(phi[1], phi[0]);


   //e = phi[0] - phi[1] * a[0];
   e = phi[0] - mul32x32(phi[1],a[0]);

   for(int p=1; p < nbCoefs; p++)
   {
      q63_t suma=0;          /* Q62 accumulator for a[i]*phi[p-i] */
      q63_t sumb=0;          /* Q62 accumulator for a[i]*phi[i+1] */
      q31x4_t vecA,vecRevPhi,vecPhi;
      q31_t k;               /* reflection coefficient */
      uint32_t blkCnt;
      const q31_t *pPhi,*pRevPhi,*pA;
      uint32x4_t revOffset;


      int nb,j,i;

      revOffset = vld1q(revOffsetArray);

      /* pRevPhi + revOffset yields {phi[p], phi[p-1], phi[p-2], phi[p-3]}. */
      pRevPhi = &phi[p-3];
      pPhi = &phi[1];
      pA = a;

      i = 0;
      blkCnt = p >> 2;
      while(blkCnt > 0)
      {
         vecA = vld1q(pA);
         pA += 4;

         vecPhi = vld1q(pPhi);
         pPhi += 4;

         vecRevPhi = vldrwq_gather_shifted_offset_s32(pRevPhi,revOffset);
         pRevPhi -= 4;

         /* 64-bit multiply-accumulate across all four lanes. */
         suma = vmlaldavaq(suma,vecA,vecRevPhi);
         sumb = vmlaldavaq(sumb,vecA,vecPhi);

         i += 4;
         blkCnt--;
      }


      /* Scalar tail: remaining (p & 3) terms. */
      blkCnt = p & 3;
      while(blkCnt > 0)
      {
         suma += ((q63_t)a[i] * phi[p - i]);
         sumb += ((q63_t)a[i] * phi[i + 1]);

         i++;
         blkCnt--;
      }

      /* Arithmetic shift right by 31: back from Q62 to Q31. */
      suma = asrl(suma, 31);
      sumb = asrl(sumb, 31);



      //k = (phi[p+1]-suma)/(phi[0] - sumb);
      k = divide(phi[p+1]-(q31_t)suma,phi[0] - (q31_t)sumb);

      q31x4_t vecRevA,tmp;
      /* Base offsets {0, 1, -1, -2} stored in uint32: the negative entries use
         well-defined unsigned wrap-around and become {p-1, p-2} once p is added
         to lanes 2-3 below. NOTE(review): array is never written; could likely
         be const like offsetIncArray. */
      static uint32_t orgOffsetArray[4]={0,1,-1,-2};
      static const uint32_t offsetIncArray[4]={2,2,-2,-2};

      uint32x4_t offset,offsetInc,vecTmp;


      offset = vld1q(orgOffsetArray);
      vecTmp = vdupq_n_u32(p);

      /* LANE23_MASK (0xFF00) predicates the byte lanes of elements 2 and 3, so
         only those lanes get p added: offset = {j, j+1, p-1-j, p-2-j}. */
      offset = vaddq_m_u32(offset,offset,vecTmp,LANE23_MASK);
      offsetInc = vld1q(offsetIncArray);


      /* Each iteration updates two symmetric pairs of a[] (4 elements). */
      nb = p >> 2;
      j=0;
      for(int i =0;i < nb ; i++)
      {
         /*
         q31_t x0,x1,x2,x3;

         //x = a[j] - k * a[p-1-j];
         x0 = a[j] - mul32x32(k,a[p-1-j]);
         x1 = a[j+1] - mul32x32(k,a[p-2-j]);

         //y = a[p-1-j] - k * a[j];
         x2 = a[p-1-j] - mul32x32(k , a[j]);
         x3 = a[p-2-j] - mul32x32(k , a[j+1]);

         a[j] = x0;
         a[j+1] = x1;
         a[p-1-j] = x2;
         a[p-2-j] = x3;
         */

         uint64_t tmpa,tmpb;
         /* Gather vecA = {a[j], a[j+1], a[p-1-j], a[p-2-j]}. */
         vecA = vldrwq_gather_shifted_offset_s32(a,offset);

         /* Swap the two 64-bit halves so vecRevA = {a[p-1-j], a[p-2-j], a[j],
            a[j+1]} — each lane faces its update partner. vecRevA starts
            uninitialized but both halves are written before any use. */
         tmpa = vgetq_lane_u64((uint64x2_t)vecA,0);
         tmpb = vgetq_lane_u64((uint64x2_t)vecA,1);
         vecRevA = (q31x4_t) vsetq_lane_u64(tmpb,(uint64x2_t)vecRevA,0);
         vecRevA = (q31x4_t) vsetq_lane_u64(tmpa,(uint64x2_t)vecRevA,1);


         /* vqdmulh(x,k) = (2*x*k) >> 32, i.e. the Q31 product x*k.
            NOTE(review): saturating form — matches mul32x32 except at extreme
            operand values; presumably acceptable here. */
         tmp = vsubq(vecA,vqdmulhq_n_s32(vecRevA,k));
         vstrwq_scatter_shifted_offset_s32(a, offset, tmp);

         offset = vaddq(offset,offsetInc);

         j+=2;
      }

      /* Tail: p & 3 elements of a[0..p-1] still need updating. */
      switch(p & 3)
      {
         case 3:
         {
            /* One symmetric pair plus the self-paired middle element. */
            q31_t x,y;

            //x = a[j] - k * a[p-1-j];
            x = a[j] - mul32x32(k,a[p-1-j]);

            //y = a[p-1-j] - k * a[j];
            y = a[p-1-j] - mul32x32(k , a[j]);

            a[j] = x;
            a[p-1-j] = y;

            //a[j] = a[j]- k * a[p-1-j];
            a[j+1] = a[j+1] - mul32x32(k,a[p-2-j]);
         }
         break;

         case 2:
         {
            q31_t x,y;

            //x = a[j] - k * a[p-1-j];
            x = a[j] - mul32x32(k,a[p-1-j]);

            //y = a[p-1-j] - k * a[j];
            y = a[p-1-j] - mul32x32(k , a[j]);

            a[j] = x;
            a[p-1-j] = y;
         }
         break;

         case 1:
            /* Middle element pairs with itself (p-1-j == j here). */
            //a[j] = a[j]- k * a[p-1-j];
            a[j] = a[j] - mul32x32(k,a[p-1-j]);
            break;
      }

      a[p] = k;

      /* Prediction error shrinks by (1 - k^2) at each order. */
      // e = e * (1 - k*k);
      e = mul32x32(e,ONE_Q31 - mul32x32(k,k));


   }
   *err = e;
}
+
+#else
+
+void arm_levinson_durbin_q31(const q31_t *phi,
+ q31_t *a,
+ q31_t *err,
+ int nbCoefs)
+{
+ q31_t e;
+
+ //a[0] = phi[1] / phi[0];
+ a[0] = divide(phi[1], phi[0]);
+
+
+ //e = phi[0] - phi[1] * a[0];
+ e = phi[0] - mul32x32(phi[1],a[0]);
+
+ for(int p=1; p < nbCoefs; p++)
+ {
+ q63_t suma=0;
+ q63_t sumb=0;
+ q31_t k;
+ int nb,j;
+
+ for(int i=0; i < p; i++)
+ {
+ suma += ((q63_t)a[i] * phi[p - i]);
+ sumb += ((q63_t)a[i] * phi[i + 1]);
+ }
+
+ suma = suma >> 31;
+ sumb = sumb >> 31;
+
+
+
+ //k = (phi[p+1]-suma)/(phi[0] - sumb);
+ k = divide(phi[p+1]-(q31_t)suma,phi[0] - (q31_t)sumb);
+
+
+ nb = p >> 1;
+ j=0;
+ for(int i =0;i < nb ; i++)
+ {
+ q31_t x,y;
+
+ //x = a[j] - k * a[p-1-j];
+ x = a[j] - mul32x32(k,a[p-1-j]);
+
+ //y = a[p-1-j] - k * a[j];
+ y = a[p-1-j] - mul32x32(k , a[j]);
+
+ a[j] = x;
+ a[p-1-j] = y;
+
+ j++;
+ }
+
+ nb = p & 1;
+ if (nb)
+ {
+ //a[j] = a[j]- k * a[p-1-j];
+ a[j] = a[j] - mul32x32(k,a[p-1-j]);
+ }
+
+ a[p] = k;
+
+ // e = e * (1 - k*k);
+ e = mul32x32(e,ONE_Q31 - mul32x32(k,k));
+
+
+ }
+ *err = e;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of LD group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_f32.c
new file mode 100644
index 0000000000..ae6f01b8e1
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_f32.c
@@ -0,0 +1,533 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_lms_f32.c
+ * Description: Processing function for the floating-point LMS filter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @defgroup LMS Least Mean Square (LMS) Filters
+
+ LMS filters are a class of adaptive filters that are able to "learn" an unknown transfer functions.
+ LMS filters use a gradient descent method in which the filter coefficients are updated based on the instantaneous error signal.
+ Adaptive filters are often used in communication systems, equalizers, and noise removal.
+ The CMSIS DSP Library contains LMS filter functions that operate on Q15, Q31, and floating-point data types.
+ The library also contains normalized LMS filters in which the filter coefficient adaptation is indepedent of the level of the input signal.
+
+ An LMS filter consists of two components as shown below.
+ The first component is a standard transversal or FIR filter.
+ The second component is a coefficient update mechanism.
+ The LMS filter has two input signals.
+ The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter.
+ That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input.
+ The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input.
+ This "error signal" tends towards zero as the filter adapts.
+ The LMS processing functions accept the input and reference input signals and generate the filter output and error signal.
+ \image html LMS.gif "Internal structure of the Least Mean Square filter"
+
+ The functions operate on blocks of data and each call to the function processes
+ blockSize samples through the filter.
+ pSrc points to input signal, pRef points to reference signal,
+ pOut points to output signal and pErr points to error signal.
+ All arrays contain blockSize values.
+
+ The functions operate on a block-by-block basis.
+ Internally, the filter coefficients b[n] are updated on a sample-by-sample basis.
+ The convergence of the LMS filter is slower compared to the normalized LMS algorithm.
+
+ @par Algorithm
+ The output signal y[n] is computed by a standard FIR filter:
+
+ y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
+
+
+ @par
+ The error signal equals the difference between the reference signal d[n] and the filter output:
+
+ e[n] = d[n] - y[n].
+
+
+ @par
+ After each sample of the error signal is computed, the filter coefficients b[k] are updated on a sample-by-sample basis:
+
+ b[k] = b[k] + e[n] * mu * x[n-k], for k=0, 1, ..., numTaps-1
+
+ where mu is the step size and controls the rate of coefficient convergence.
+ @par
+ In the APIs, pCoeffs points to a coefficient array of size numTaps.
+ Coefficients are stored in time reversed order.
+ @par
+
+      {b[numTaps-1], b[numTaps-2], ..., b[1], b[0]}
+
+ @par
+ pState points to a state array of size numTaps + blockSize - 1.
+ Samples in the state buffer are stored in the order:
+ @par
+
+ {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
+
+ @par
+ Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1 samples.
+ The increased state buffer length allows circular addressing, which is traditionally used in FIR filters,
+ to be avoided and yields a significant speed improvement.
+ The state variables are updated after each block of data is processed.
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter and
+ coefficient and state arrays cannot be shared among instances.
+ There are separate instance structure declarations for each of the 3 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ To do this manually without calling the init function, assign the follow subfields of the instance structure:
+ numTaps, pCoeffs, mu, postShift (not for f32), pState. Also set all of the values in pState to zero.
+
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Set the values in the state buffer to zeros before static initialization.
+ The code below statically initializes each of the 3 different data type filter instance structures
+
+ arm_lms_instance_f32 S = {numTaps, pState, pCoeffs, mu};
+ arm_lms_instance_q31 S = {numTaps, pState, pCoeffs, mu, postShift};
+ arm_lms_instance_q15 S = {numTaps, pState, pCoeffs, mu, postShift};
+
+ where numTaps is the number of filter coefficients in the filter; pState is the address of the state buffer;
+ pCoeffs is the address of the coefficient buffer; mu is the step size parameter; and postShift is the shift applied to coefficients.
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the Q15 and Q31 versions of the LMS filter.
+ The following issues must be considered:
+ - Scaling of coefficients
+ - Overflow and saturation
+
+ @par Scaling of Coefficients
+ Filter coefficients are represented as fractional values and
+ coefficients are restricted to lie in the range [-1 +1).
+ The fixed-point functions have an additional scaling parameter postShift.
+ At the output of the filter's accumulator is a shift register which shifts the result by postShift bits.
+ This essentially scales the filter coefficients by 2^postShift and
+ allows the filter coefficients to exceed the range [+1 -1).
+ The value of postShift is set by the user based on the expected gain through the system being modeled.
+
+ @par Overflow and Saturation
+ Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are
+ described separately as part of the function specific documentation below.
+ */
+
+/**
+ @addtogroup LMS
+ @{
+ */
+
+/**
+ @brief Processing function for floating-point LMS filter.
+ @param[in] S points to an instance of the floating-point LMS filter structure
+ @param[in] pSrc points to the block of input data
+ @param[in] pRef points to the block of reference data
+ @param[out] pOut points to the block of output data
+ @param[out] pErr points to the block of error data
+ @param[in] blockSize number of samples to process
+ @return none
+ */
+#if defined(ARM_MATH_NEON)
/**
  @brief  Floating-point LMS filter — Neon implementation.
          Per sample: FIR output, error against the reference, then a
          coefficient update weighted by (error * mu).
 */
void arm_lms_f32(
  const arm_lms_instance_f32 * S,
  const float32_t * pSrc,
  float32_t * pRef,
  float32_t * pOut,
  float32_t * pErr,
  uint32_t blockSize)
{
  float32_t *pState = S->pState;                 /* State pointer */
  float32_t *pCoeffs = S->pCoeffs;               /* Coefficient pointer */
  float32_t *pStateCurnt;                        /* Points to the current sample of the state */
  float32_t *px, *pb;                            /* Temporary pointers for state and coefficient buffers */
  float32_t mu = S->mu;                          /* Adaptive factor */
  uint32_t numTaps = S->numTaps;                 /* Number of filter coefficients in the filter */
  uint32_t tapCnt, blkCnt;                       /* Loop counters */
  float32_t sum, e, d;                           /* accumulator, error, reference data sample */
  float32_t w = 0.0f;                            /* weight factor */

  float32x4_t tempV, sumV, xV, bV;
  float32x2_t tempV2;

  /* NOTE(review): e, d (and w above) are pre-initialized but always
     overwritten before first use inside the loop. */
  e = 0.0f;
  d = 0.0f;

  /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
  /* pStateCurnt points to the location where the new input data should be written */
  pStateCurnt = &(S->pState[(numTaps - 1U)]);

  blkCnt = blockSize;

  while (blkCnt > 0U)
  {
    /* Copy the new input sample into the state buffer */
    *pStateCurnt++ = *pSrc++;

    /* Initialize pState pointer */
    px = pState;

    /* Initialize coeff pointer */
    pb = (pCoeffs);

    /* Set the accumulator to zero */
    sum = 0.0f;
    sumV = vdupq_n_f32(0.0);

    /* Process 4 taps at a time. */
    tapCnt = numTaps >> 2;

    while (tapCnt > 0U)
    {
      /* Perform the multiply-accumulate */
      xV = vld1q_f32(px);
      bV = vld1q_f32(pb);
      sumV = vmlaq_f32(sumV, xV, bV);

      px += 4;
      pb += 4;

      /* Decrement the loop counter */
      tapCnt--;
    }
    /* Horizontal reduction: pairwise-add the four lanes down to a scalar. */
    tempV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
    sum = vget_lane_f32(tempV2, 0) + vget_lane_f32(tempV2, 1);


    /* If the filter length is not a multiple of 4, compute the remaining filter taps */
    tapCnt = numTaps % 0x4U;

    while (tapCnt > 0U)
    {
      /* Perform the multiply-accumulate */
      sum += (*px++) * (*pb++);

      /* Decrement the loop counter */
      tapCnt--;
    }

    /* The result in the accumulator, store in the destination buffer. */
    *pOut++ = sum;

    /* Compute and store error */
    d = (float32_t) (*pRef++);
    e = d - sum;
    *pErr++ = e;

    /* Calculation of Weighting factor for the updating filter coefficients */
    w = e * mu;

    /* Initialize pState pointer */
    px = pState;

    /* Initialize coeff pointer */
    pb = (pCoeffs);

    /* Process 4 taps at a time. */
    tapCnt = numTaps >> 2;

    /* Update filter coefficients: b[k] += w * x[n-k] */
    while (tapCnt > 0U)
    {
      /* Perform the multiply-accumulate */
      xV = vld1q_f32(px);
      bV = vld1q_f32(pb);
      px += 4;
      bV = vmlaq_n_f32(bV,xV,w);

      vst1q_f32(pb,bV);
      pb += 4;


      /* Decrement the loop counter */
      tapCnt--;
    }

    /* If the filter length is not a multiple of 4, compute the remaining filter taps */
    tapCnt = numTaps % 0x4U;

    while (tapCnt > 0U)
    {
      /* Perform the multiply-accumulate */
      *pb = *pb + (w * (*px++));
      pb++;

      /* Decrement the loop counter */
      tapCnt--;
    }

    /* Advance state pointer by 1 for the next sample */
    pState = pState + 1;

    /* Decrement the loop counter */
    blkCnt--;
  }


  /* Processing is complete. Now copy the last numTaps - 1 samples to the
     start of the state buffer. This prepares the state buffer for the
     next function call. */

  /* Points to the start of the pState buffer */
  pStateCurnt = S->pState;

  /* Process 4 taps at a time for (numTaps - 1U) samples copy */
  tapCnt = (numTaps - 1U) >> 2U;

  /* copy data */
  while (tapCnt > 0U)
  {
    tempV = vld1q_f32(pState);
    vst1q_f32(pStateCurnt,tempV);
    pState += 4;
    pStateCurnt += 4;

    /* Decrement the loop counter */
    tapCnt--;
  }

  /* Calculate remaining number of copies */
  tapCnt = (numTaps - 1U) % 0x4U;

  /* Copy the remaining samples */
  while (tapCnt > 0U)
  {
    *pStateCurnt++ = *pState++;

    /* Decrement the loop counter */
    tapCnt--;
  }


}
+#else
/**
  @brief  Floating-point LMS filter — portable scalar implementation.
          For each input sample: compute the FIR output, the error against the
          reference signal, then update every coefficient by (error * mu * x).
 */
void arm_lms_f32(
  const arm_lms_instance_f32 * S,
  const float32_t * pSrc,
  float32_t * pRef,
  float32_t * pOut,
  float32_t * pErr,
  uint32_t blockSize)
{
    float32_t *pState = S->pState;          /* Start of the current FIR window */
    float32_t *pCoeffs = S->pCoeffs;        /* Coefficient buffer */
    float32_t *pNewest;                     /* Where the newest sample lands */
    float32_t *pX;                          /* Walks the state window */
    float32_t *pB;                          /* Walks the coefficients */
    float32_t mu = S->mu;                   /* Adaptation step size */
    float32_t fir;                          /* FIR accumulator */
    float32_t errVal;                       /* Instantaneous error */
    float32_t step;                         /* errVal * mu */
    uint32_t numTaps = S->numTaps;
    uint32_t tapCnt, blkCnt;

    /* The state buffer already holds numTaps-1 history samples; each new
       input sample is appended right after them. */
    pNewest = &(S->pState[(numTaps - 1U)]);

    for (blkCnt = blockSize; blkCnt > 0U; blkCnt--)
    {
        /* Append the new input sample to the state history. */
        *pNewest++ = *pSrc++;

        /* ---- FIR part: y[n] = sum b[k] * x[n-k] ---- */
        pX = pState;
        pB = pCoeffs;
        fir = 0.0f;

#if defined (ARM_MATH_LOOPUNROLL)

        /* Loop unrolling: four taps per iteration. */
        tapCnt = numTaps >> 2U;

        while (tapCnt > 0U)
        {
            fir += (*pX++) * (*pB++);
            fir += (*pX++) * (*pB++);
            fir += (*pX++) * (*pB++);
            fir += (*pX++) * (*pB++);

            tapCnt--;
        }

        tapCnt = numTaps % 0x4U;

#else

        tapCnt = numTaps;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

        while (tapCnt > 0U)
        {
            fir += (*pX++) * (*pB++);
            tapCnt--;
        }

        *pOut++ = fir;

        /* ---- Error and adaptation weight ---- */
        errVal = (float32_t) *pRef++ - fir;
        *pErr++ = errVal;
        step = errVal * mu;

        /* Re-scan the same window for the coefficient update; the window
           start itself advances by one for the next sample. */
        pX = pState++;
        pB = pCoeffs;

#if defined (ARM_MATH_LOOPUNROLL)

        /* Loop unrolling: four coefficient updates per iteration. */
        tapCnt = numTaps >> 2U;

        while (tapCnt > 0U)
        {
            *pB += step * (*pX++);
            pB++;

            *pB += step * (*pX++);
            pB++;

            *pB += step * (*pX++);
            pB++;

            *pB += step * (*pX++);
            pB++;

            tapCnt--;
        }

        tapCnt = numTaps % 0x4U;

#else

        tapCnt = numTaps;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

        while (tapCnt > 0U)
        {
            *pB += step * (*pX++);
            pB++;

            tapCnt--;
        }
    }

    /* Slide the last numTaps-1 samples back to the start of the state buffer
       so the next call sees the correct history. */
    pNewest = S->pState;

#if defined (ARM_MATH_LOOPUNROLL)

    /* Loop unrolling: four copies per iteration. */
    tapCnt = (numTaps - 1U) >> 2U;

    while (tapCnt > 0U)
    {
        *pNewest++ = *pState++;
        *pNewest++ = *pState++;
        *pNewest++ = *pState++;
        *pNewest++ = *pState++;

        tapCnt--;
    }

    tapCnt = (numTaps - 1U) % 0x4U;

#else

    tapCnt = (numTaps - 1U);

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */

    while (tapCnt > 0U)
    {
        *pNewest++ = *pState++;
        tapCnt--;
    }

}
+#endif /* #if defined(ARM_MATH_NEON) */
+
+/**
+ @} end of LMS group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_f32.c
new file mode 100644
index 0000000000..cf472afde0
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_f32.c
@@ -0,0 +1,81 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_lms_init_f32.c
+ * Description: Floating-point LMS filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @addtogroup LMS
+ @{
+ */
+
+/**
+ @brief Initialization function for floating-point LMS filter.
+ @param[in] S points to an instance of the floating-point LMS filter structure
+ @param[in] numTaps number of filter coefficients
+ @param[in] pCoeffs points to coefficient buffer
+ @param[in] pState points to state buffer
+ @param[in] mu step size that controls filter coefficient updates
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+      {b[numTaps-1], b[numTaps-2], ..., b[1], b[0]}
+
+ The initial filter coefficients serve as a starting point for the adaptive filter.
+ pState points to an array of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_lms_f32().
+ */
+
void arm_lms_init_f32(
  arm_lms_instance_f32 * S,
  uint16_t numTaps,
  float32_t * pCoeffs,
  float32_t * pState,
  float32_t mu,
  uint32_t blockSize)
{
    /* Record the filter configuration in the instance. */
    S->numTaps = numTaps;
    S->pCoeffs = pCoeffs;
    S->mu = mu;

    /* The state buffer holds numTaps + blockSize - 1 samples; clear it so the
       first call starts from an all-zero history. */
    memset(pState, 0, (numTaps + (blockSize - 1)) * sizeof(float32_t));
    S->pState = pState;
}
+
+/**
+ @} end of LMS group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q15.c
new file mode 100644
index 0000000000..31c2bff244
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q15.c
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_lms_init_q15.c
+ * Description: Q15 LMS filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup LMS
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q15 LMS filter.
+ @param[in] S points to an instance of the Q15 LMS filter structure.
+ @param[in] numTaps number of filter coefficients.
+ @param[in] pCoeffs points to coefficient buffer.
+ @param[in] pState points to state buffer.
+ @param[in] mu step size that controls filter coefficient updates.
+ @param[in] blockSize number of samples to process.
+ @param[in] postShift bit shift applied to coefficients.
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], ..., b[1], b[0]}
+
+ The initial filter coefficients serve as a starting point for the adaptive filter.
+ pState points to the array of state variables and size of array is
+ numTaps+blockSize-1 samples, where blockSize is the number of
+ input samples processed by each call to arm_lms_q15().
+ */
+
+void arm_lms_init_q15(
+ arm_lms_instance_q15 * S,
+ uint16_t numTaps,
+ q15_t * pCoeffs,
+ q15_t * pState,
+ q15_t mu,
+ uint32_t blockSize,
+ uint32_t postShift)
+{
+ /* Record the number of filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Zero the state buffer; its length is always numTaps + blockSize - 1 samples */
+ memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q15_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ /* Assign the adaptation step size mu */
+ S->mu = mu;
+
+ /* Assign the coefficient post-shift (output scaling) value */
+ S->postShift = postShift;
+}
+
+/**
+ @} end of LMS group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q31.c
new file mode 100644
index 0000000000..e657a93824
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_init_q31.c
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_lms_init_q31.c
+ * Description: Q31 LMS filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup LMS
+ @{
+ */
+
+/**
+ @brief Initialization function for Q31 LMS filter.
+ @param[in] S points to an instance of the Q31 LMS filter structure
+ @param[in] numTaps number of filter coefficients
+ @param[in] pCoeffs points to coefficient buffer
+ @param[in] pState points to state buffer
+ @param[in] mu step size that controls filter coefficient updates
+ @param[in] blockSize number of samples to process
+ @param[in] postShift bit shift applied to coefficients
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], ..., b[1], b[0]}
+
+ The initial filter coefficients serve as a starting point for the adaptive filter.
+ pState points to an array of length numTaps+blockSize-1 samples,
+ where blockSize is the number of input samples processed by each call to
+ arm_lms_q31().
+ */
+
+void arm_lms_init_q31(
+ arm_lms_instance_q31 * S,
+ uint16_t numTaps,
+ q31_t * pCoeffs,
+ q31_t * pState,
+ q31_t mu,
+ uint32_t blockSize,
+ uint32_t postShift)
+{
+ /* Record the number of filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Zero the state buffer; its length is always numTaps + blockSize - 1 samples */
+ memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q31_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ /* Assign the adaptation step size mu */
+ S->mu = mu;
+
+ /* Assign the coefficient post-shift (output scaling) value */
+ S->postShift = postShift;
+}
+
+/**
+ @} end of LMS group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_f32.c
new file mode 100644
index 0000000000..ebee5e9a54
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_f32.c
@@ -0,0 +1,564 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_lms_norm_f32.c
+ * Description: Processing function for the floating-point NLMS filter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @defgroup LMS_NORM Normalized LMS Filters
+
+ This set of functions implements a commonly used adaptive filter.
+ It is related to the Least Mean Square (LMS) adaptive filter and includes an additional normalization
+ factor which increases the adaptation rate of the filter.
+ The CMSIS DSP Library contains normalized LMS filter functions that operate on Q15, Q31, and floating-point data types.
+
+ A normalized least mean square (NLMS) filter consists of two components as shown below.
+ The first component is a standard transversal or FIR filter.
+ The second component is a coefficient update mechanism.
+ The NLMS filter has two input signals.
+ The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter.
+ That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input.
+ The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input.
+ This "error signal" tends towards zero as the filter adapts.
+ The NLMS processing functions accept the input and reference input signals and generate the filter output and error signal.
+ \image html LMS.gif "Internal structure of the NLMS adaptive filter"
+
+ The functions operate on blocks of data and each call to the function processes
+ blockSize samples through the filter.
+ pSrc points to input signal, pRef points to reference signal,
+ pOut points to output signal and pErr points to error signal.
+ All arrays contain blockSize values.
+
+ The functions operate on a block-by-block basis.
+ Internally, the filter coefficients b[n] are updated on a sample-by-sample basis.
+ The convergence of the LMS filter is slower compared to the normalized LMS algorithm.
+
+ @par Algorithm
+ The output signal y[n] is computed by a standard FIR filter:
+
+ y[n] = b[0] * x[n] + b[1] * x[n-1] + b[2] * x[n-2] + ...+ b[numTaps-1] * x[n-numTaps+1]
+
+
+ @par
+ The error signal equals the difference between the reference signal d[n] and the filter output:
+
+ e[n] = d[n] - y[n].
+
+
+ @par
+ After each sample of the error signal is computed, the instantaneous energy of the filter state variables is calculated:
+
+ E = x[n]^2 + x[n-1]^2 + ... + x[n-numTaps+1]^2.
+
+ The filter coefficients b[k] are then updated on a sample-by-sample basis:
+
+ b[k] = b[k] + e[n] * (mu/E) * x[n-k], for k=0, 1, ..., numTaps-1
+
+ where mu is the step size and controls the rate of coefficient convergence.
+ @par
+ In the APIs, pCoeffs points to a coefficient array of size numTaps.
+ Coefficients are stored in time reversed order.
+ @par
+
+ {b[numTaps-1], b[numTaps-2], ..., b[1], b[0]}
+
+ @par
+ pState points to a state array of size numTaps + blockSize - 1.
+ Samples in the state buffer are stored in the order:
+ @par
+
+ {x[n-numTaps+1], x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2]....x[0], x[1], ..., x[blockSize-1]}
+
+ @par
+ Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1 samples.
+ The increased state buffer length allows circular addressing, which is traditionally used in FIR filters,
+ to be avoided and yields a significant speed improvement.
+ The state variables are updated after each block of data is processed.
+
+ @par Instance Structure
+ The coefficients and state variables for a filter are stored together in an instance data structure.
+ A separate instance structure must be defined for each filter and
+ coefficient and state arrays cannot be shared among instances.
+ There are separate instance structure declarations for each of the 3 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Zeros out the values in the state buffer.
+ To do this manually without calling the init function, assign the following subfields of the instance structure:
+ numTaps, pCoeffs, mu, energy, x0, pState. Also set all of the values in pState to zero.
+ For Q7, Q15, and Q31 the following fields must also be initialized:
+ recipTable, postShift
+ @par
+ Instance structure cannot be placed into a const data section and it is recommended to use the initialization function.
+ @par Fixed-Point Behavior
+ Care must be taken when using the Q15 and Q31 versions of the normalised LMS filter.
+ The following issues must be considered:
+ - Scaling of coefficients
+ - Overflow and saturation
+
+ @par Scaling of Coefficients (fixed point versions)
+ Filter coefficients are represented as fractional values and
+ coefficients are restricted to lie in the range [-1 +1).
+ The fixed-point functions have an additional scaling parameter postShift.
+ At the output of the filter's accumulator is a shift register which shifts the result by postShift bits.
+ This essentially scales the filter coefficients by 2^postShift and
+ allows the filter coefficients to exceed the range [+1 -1).
+ The value of postShift is set by the user based on the expected gain through the system being modeled.
+
+ @par Overflow and Saturation (fixed point versions)
+ Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are
+ described separately as part of the function specific documentation below.
+ */
+
+/**
+ @addtogroup LMS_NORM
+ @{
+ */
+
+/**
+ @brief Processing function for floating-point normalized LMS filter.
+ @param[in] S points to an instance of the floating-point normalized LMS filter structure
+ @param[in] pSrc points to the block of input data
+ @param[in] pRef points to the block of reference data
+ @param[out] pOut points to the block of output data
+ @param[out] pErr points to the block of error data
+ @param[in] blockSize number of samples to process
+ @return none
+ */
+
+#if defined(ARM_MATH_NEON)
+void arm_lms_norm_f32(
+ arm_lms_norm_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pRef,
+ float32_t * pOut,
+ float32_t * pErr,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ float32_t mu = S->mu; /* Adaptive factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ float32_t energy; /* Energy of the input */
+ float32_t sum, e, d; /* accumulator, error, reference data sample */
+ float32_t w, x0, in; /* weight factor, temporary variable to hold input sample and state */
+
+ float32x4_t tempV, sumV, xV, bV;
+ float32x2_t tempV2;
+
+ /* Initialise the error, reference sample and coefficient-update weight */
+ e = 0.0f;
+ d = 0.0f;
+ w = 0.0f;
+
+ energy = S->energy;
+ x0 = S->x0;
+
+ /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = (pCoeffs);
+
+ /* Read the sample from input buffer */
+ in = *pSrc++;
+
+ /* Update the energy calculation */
+ energy -= x0 * x0;
+ energy += in * in;
+
+ /* Set the accumulator to zero */
+ sum = 0.0f;
+ sumV = vdupq_n_f32(0.0);
+
+ /* Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ xV = vld1q_f32(px);
+ bV = vld1q_f32(pb);
+ sumV = vmlaq_f32(sumV, xV, bV);
+
+ px += 4;
+ pb += 4;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+ tempV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
+ sum = vget_lane_f32(tempV2, 0) + vget_lane_f32(tempV2, 1);
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4U;
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (*px++) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Store the accumulated FIR output in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Compute and store error */
+ d = (float32_t) (*pRef++);
+ e = d - sum;
+ *pErr++ = e;
+
+ /* Calculation of Weighting factor for updating filter coefficients */
+ /* epsilon value 0.000000119209289f keeps the division well-defined when energy is zero */
+ w = (e * mu) / (energy + 0.000000119209289f);
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = (pCoeffs);
+
+ /* Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Update filter coefficients */
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ xV = vld1q_f32(px);
+ bV = vld1q_f32(pb);
+ px += 4;
+ bV = vmlaq_n_f32(bV,xV,w);
+
+ vst1q_f32(pb,bV);
+ pb += 4;
+
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4U;
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ *pb += w * (*px++);
+ pb++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ x0 = *pState; /* Save the oldest state sample; it is subtracted from the energy next iteration */
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ S->energy = energy;
+ S->x0 = x0;
+
+ /* Processing is complete. Now copy the last numTaps - 1 samples to the
+ start of the state buffer. This prepares the state buffer for the
+ next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* Process 4 taps at a time for (numTaps - 1U)/4 samples copy */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ /* copy data */
+ while (tapCnt > 0U)
+ {
+ tempV = vld1q_f32(pState);
+ vst1q_f32(pStateCurnt,tempV);
+ pState += 4;
+ pStateCurnt += 4;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+ /* Copy the remaining float32_t data */
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+}
+#else
+void arm_lms_norm_f32(
+ arm_lms_norm_instance_f32 * S,
+ const float32_t * pSrc,
+ float32_t * pRef,
+ float32_t * pOut,
+ float32_t * pErr,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ float32_t mu = S->mu; /* Adaptive factor */
+ float32_t acc, e; /* Accumulator, error */
+ float32_t w; /* Weight factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ float32_t energy; /* Energy of the input */
+ float32_t x0, in; /* Temporary variable to hold input sample and state */
+
+ /* Initialise the error and coefficient-update weight */
+ e = 0.0f;
+ w = 0.0f;
+
+ energy = S->energy;
+ x0 = S->x0;
+
+ /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+ /* initialise loop count */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Read the sample from input buffer */
+ in = *pSrc++;
+
+ /* Update the energy calculation */
+ energy -= x0 * x0;
+ energy += in * in;
+
+ /* Set the accumulator to zero */
+ acc = 0.0f;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ acc += (*px++) * (*pb++);
+
+ acc += (*px++) * (*pb++);
+
+ acc += (*px++) * (*pb++);
+
+ acc += (*px++) * (*pb++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ acc += (*px++) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = acc;
+
+ /* Compute and store error */
+ e = (float32_t) *pRef++ - acc;
+ *pErr++ = e;
+
+ /* Calculation of Weighting factor for updating filter coefficients */
+ /* epsilon value 0.000000119209289f keeps the division well-defined when energy is zero */
+ w = (e * mu) / (energy + 0.000000119209289f);
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
+
+ /* Update filter coefficients */
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ *pb += w * (*px++);
+ pb++;
+
+ *pb += w * (*px++);
+ pb++;
+
+ *pb += w * (*px++);
+ pb++;
+
+ *pb += w * (*px++);
+ pb++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ *pb += w * (*px++);
+ pb++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ x0 = *pState; /* Save the oldest state sample; it is subtracted from the energy next iteration */
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Save energy and x0 values for the next frame */
+ S->energy = energy;
+ S->x0 = x0;
+
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+/**
+ @} end of LMS_NORM group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_f32.c
new file mode 100644
index 0000000000..b0b7ea505b
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_f32.c
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_lms_norm_init_f32.c
+ * Description: Floating-point NLMS filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup LMS_NORM
+ @{
+ */
+
+/**
+ @brief Initialization function for floating-point normalized LMS filter.
+ @param[in] S points to an instance of the floating-point LMS filter structure
+ @param[in] numTaps number of filter coefficients
+ @param[in] pCoeffs points to coefficient buffer
+ @param[in] pState points to state buffer
+ @param[in] mu step size that controls filter coefficient updates
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], ..., b[1], b[0]}
+
+ The initial filter coefficients serve as a starting point for the adaptive filter.
+ pState points to an array of length numTaps+blockSize-1 samples,
+ where blockSize is the number of input samples processed by each call to arm_lms_norm_f32().
+ */
+
+void arm_lms_norm_init_f32(
+ arm_lms_norm_instance_f32 * S,
+ uint16_t numTaps,
+ float32_t * pCoeffs,
+ float32_t * pState,
+ float32_t mu,
+ uint32_t blockSize)
+{
+ /* Record the number of filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Zero the state buffer; its length is always numTaps + blockSize - 1 samples */
+ memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ /* Assign the adaptation step size mu */
+ S->mu = mu;
+
+ /* Initialise Energy to zero */
+ S->energy = 0.0f;
+
+ /* Initialise x0 (oldest state sample) to zero */
+ S->x0 = 0.0f;
+}
+
+/**
+ @} end of LMS_NORM group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q15.c
new file mode 100644
index 0000000000..5bbfc42052
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q15.c
@@ -0,0 +1,98 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_lms_norm_init_q15.c
+ * Description: Q15 NLMS filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+#include "arm_common_tables.h"
+
+/**
+ @addtogroup LMS_NORM
+ @{
+ */
+
+/**
+ @brief Initialization function for Q15 normalized LMS filter.
+ @param[in] S points to an instance of the Q15 normalized LMS filter structure.
+ @param[in] numTaps number of filter coefficients.
+ @param[in] pCoeffs points to coefficient buffer.
+ @param[in] pState points to state buffer.
+ @param[in] mu step size that controls filter coefficient updates.
+ @param[in] blockSize number of samples to process.
+ @param[in] postShift bit shift applied to coefficients.
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], ..., b[1], b[0]}
+
+ The initial filter coefficients serve as a starting point for the adaptive filter.
+ pState points to the array of state variables and size of array is
+ numTaps+blockSize-1 samples, where blockSize is the number of input samples processed
+ by each call to arm_lms_norm_q15().
+ */
+
+void arm_lms_norm_init_q15(
+ arm_lms_norm_instance_q15 * S,
+ uint16_t numTaps,
+ q15_t * pCoeffs,
+ q15_t * pState,
+ q15_t mu,
+ uint32_t blockSize,
+ uint8_t postShift)
+{
+ /* Record the number of filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Zero the state buffer; its length is always numTaps + blockSize - 1 samples */
+ memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q15_t));
+
+ /* Assign the coefficient post-shift (output scaling) value */
+ S->postShift = postShift;
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ /* Assign the adaptation step size mu */
+ S->mu = mu;
+
+ /* Point recipTable at the shared Q15 reciprocal lookup table */
+ S->recipTable = (q15_t *) armRecipTableQ15;
+
+ /* Initialise Energy to zero */
+ S->energy = 0;
+
+ /* Initialise x0 (oldest state sample) to zero */
+ S->x0 = 0;
+}
+
+/**
+ @} end of LMS_NORM group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q31.c
new file mode 100644
index 0000000000..cc429b0097
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_init_q31.c
@@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_lms_norm_init_q31.c
+ * Description: Q31 NLMS filter initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+#include "arm_common_tables.h"
+
+/**
+ @addtogroup LMS_NORM
+ @{
+ */
+
+/**
+ @brief Initialization function for Q31 normalized LMS filter.
+ @param[in] S points to an instance of the Q31 normalized LMS filter structure.
+ @param[in] numTaps number of filter coefficients.
+ @param[in] pCoeffs points to coefficient buffer.
+ @param[in] pState points to state buffer.
+ @param[in] mu step size that controls filter coefficient updates.
+ @param[in] blockSize number of samples to process.
+ @param[in] postShift bit shift applied to coefficients.
+ @return none
+
+ @par Details
+ pCoeffs points to the array of filter coefficients stored in time reversed order:
+
+ {b[numTaps-1], b[numTaps-2], ..., b[1], b[0]}
+
+ The initial filter coefficients serve as a starting point for the adaptive filter.
+ pState points to an array of length numTaps+blockSize-1 samples,
+ where blockSize is the number of input samples processed by each call to arm_lms_norm_q31().
+ */
+
+void arm_lms_norm_init_q31(
+ arm_lms_norm_instance_q31 * S,
+ uint16_t numTaps,
+ q31_t * pCoeffs,
+ q31_t * pState,
+ q31_t mu,
+ uint32_t blockSize,
+ uint8_t postShift)
+{
+ /* Record the number of filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Zero the state buffer; its length is always numTaps + blockSize - 1 samples */
+ memset(pState, 0, (numTaps + (blockSize - 1U)) * sizeof(q31_t));
+
+ /* Assign the coefficient post-shift (output scaling) value */
+ S->postShift = postShift;
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ /* Assign the adaptation step size mu */
+ S->mu = mu;
+
+ /* Point recipTable at the shared Q31 reciprocal lookup table */
+ S->recipTable = (q31_t *) armRecipTableQ31;
+
+ /* Initialise Energy to zero */
+ S->energy = 0;
+
+ /* Initialise x0 (oldest state sample) to zero */
+ S->x0 = 0;
+}
+
+/**
+ @} end of LMS_NORM group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q15.c
new file mode 100644
index 0000000000..8d7a0cdd88
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q15.c
@@ -0,0 +1,297 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_lms_norm_q15.c
+ * Description: Processing function for Q15 normalized LMS filter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup LMS_NORM
+ @{
+ */
+
+/**
+ @brief Processing function for Q15 normalized LMS filter.
+ @param[in] S points to an instance of the Q15 normalized LMS filter structure
+ @param[in] pSrc points to the block of input data
+ @param[in] pRef points to the block of reference data
+ @param[out] pOut points to the block of output data
+ @param[out] pErr points to the block of error data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ Both coefficients and state variables are represented in 1.15 format and
+ multiplications yield a 2.30 result. The 2.30 intermediate results are
+ accumulated in a 64-bit accumulator in 34.30 format.
+ There is no risk of internal overflow with this approach and the full
+ precision of intermediate multiplications is preserved. After all additions
+ have been performed, the accumulator is truncated to 34.15 format by
+ discarding low 15 bits. Lastly, the accumulator is saturated to yield a
+ result in 1.15 format.
+ @par
+ In this filter, filter coefficients are updated for each sample and the
+ update of the filter coefficients is saturated.
+ */
+
+void arm_lms_norm_q15(
+ arm_lms_norm_instance_q15 * S,
+ const q15_t * pSrc,
+ q15_t * pRef,
+ q15_t * pOut,
+ q15_t * pErr,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q15_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ q15_t mu = S->mu; /* Adaptive factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ q63_t acc; /* Accumulator */
+ q31_t energy; /* Energy of the input */
+ q15_t e = 0, d = 0; /* Error, reference data sample */
+ q15_t w = 0, in; /* Weight factor and state */
+ q15_t x0; /* Temporary variable to hold input sample */
+ q15_t errorXmu, oneByEnergy; /* Temporary variables to store error and mu product and reciprocal of energy */
+ q15_t postShift; /* Post shift to be applied to weight after reciprocal calculation */
+ q31_t coef; /* Temporary variable for coefficient */
+ q31_t acc_l, acc_h; /* Temporary input */
+ int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */
+ int32_t uShift = (32 - lShift);
+
+ energy = S->energy;
+ x0 = S->x0;
+
+ /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+ /* initialise loop count */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Read the sample from input buffer */
+ in = *pSrc++;
+
+ /* Update the energy calculation */
+ energy -= (((q31_t) x0 * (x0)) >> 15);
+ energy += (((q31_t) in * (in)) >> 15);
+
+ /* Set the accumulator to zero */
+ acc = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* acc += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
+ acc = __SMLALD(read_q15x2_ia (&px), read_q15x2_ia (&pb), acc);
+ acc = __SMLALD(read_q15x2_ia (&px), read_q15x2_ia (&pb), acc);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ acc += (q63_t) (((q31_t) (*px++) * (*pb++)));
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Converting the result to 1.15 format and saturate the output */
+ acc = __SSAT(acc, 16U);
+
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = (q15_t) acc;
+
+ /* Compute and store error */
+ d = *pRef++;
+ e = d - (q15_t) acc;
+ *pErr++ = e;
+
+ /* Calculation of 1/energy */
+ postShift = arm_recip_q15((q15_t) energy + DELTA_Q15, &oneByEnergy, S->recipTable);
+
+ /* Calculation of e * mu value */
+ errorXmu = (q15_t) (((q31_t) e * mu) >> 15);
+
+ /* Calculation of (e * mu) * (1/energy) value */
+ acc = (((q31_t) errorXmu * oneByEnergy) >> (15 - postShift));
+
+ /* Weighting factor for the normalized version */
+ w = (q15_t) __SSAT((q31_t) acc, 16);
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
+
+ /* Update filter coefficients */
+ while (tapCnt > 0U)
+ {
+ coef = (q31_t) *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT(coef, 16);
+
+ coef = (q31_t) *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT(coef, 16);
+
+ coef = (q31_t) *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT(coef, 16);
+
+ coef = (q31_t) *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT(coef, 16);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ coef = (q31_t) *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT(coef, 16);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ x0 = *pState;
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Save energy and x0 values for the next frame */
+ S->energy = (q15_t) energy;
+ S->x0 = x0;
+
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ write_q15x2_ia (&pStateCurnt, read_q15x2_ia (&pState));
+ write_q15x2_ia (&pStateCurnt, read_q15x2_ia (&pState));
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+}
+
+/**
+ @} end of LMS_NORM group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q31.c
new file mode 100644
index 0000000000..d5d78a9208
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_norm_q31.c
@@ -0,0 +1,311 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_lms_norm_q31.c
+ * Description: Processing function for the Q31 NLMS filter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup LMS_NORM
+ @{
+ */
+
+/**
+ @brief Processing function for Q31 normalized LMS filter.
+ @param[in] S points to an instance of the Q31 normalized LMS filter structure
+ @param[in] pSrc points to the block of input data
+ @param[in] pRef points to the block of reference data
+ @param[out] pOut points to the block of output data
+ @param[out] pErr points to the block of error data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate
+ multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around rather than clips.
+ In order to avoid overflows completely the input signal must be scaled down by
+ log2(numTaps) bits. The reference signal should not be scaled down.
+ After all multiply-accumulates are performed, the 2.62 accumulator is shifted
+ and saturated to 1.31 format to yield the final result.
+ The output signal and error signal are in 1.31 format.
+ @par
+ In this filter, filter coefficients are updated for each sample and the
+ update of the filter coefficients is saturated.
+ */
+
+void arm_lms_norm_q31(
+ arm_lms_norm_instance_q31 * S,
+ const q31_t * pSrc,
+ q31_t * pRef,
+ q31_t * pOut,
+ q31_t * pErr,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCurnt; /* Points to the current sample of the state */
+ q31_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ q31_t mu = S->mu; /* Adaptive factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ q63_t acc; /* Accumulator */
+ q63_t energy; /* Energy of the input */
+ q31_t e = 0; /* Error data sample */
+ q31_t w = 0, in; /* Weight factor and state */
+ q31_t x0; /* Temporary variable to hold input sample */
+ q31_t errorXmu, oneByEnergy; /* Temporary variables to store error and mu product and reciprocal of energy */
+ q31_t postShift; /* Post shift to be applied to weight after reciprocal calculation */
+ q31_t coef; /* Temporary variable for coef */
+ q31_t acc_l, acc_h; /* Temporary input */
+ uint32_t uShift = ((uint32_t) S->postShift + 1U);
+ uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
+
+ energy = S->energy;
+ x0 = S->x0;
+
+ /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+ /* initialise loop count */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Read the sample from input buffer */
+ in = *pSrc++;
+
+ /* Update the energy calculation */
+ energy = (q31_t) ((((q63_t) energy << 32) - (((q63_t) x0 * x0) << 1)) >> 32);
+ energy = (q31_t) (((((q63_t) in * in) << 1) + (energy << 32)) >> 32);
+
+ /* Set the accumulator to zero */
+ acc = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* acc += b[N] * x[n-N] */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* acc += b[N-1] * x[n-N-1] */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* acc += b[N-2] * x[n-N-2] */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* acc += b[N-3] * x[n-N-3] */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Converting the result to 1.31 format */
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = (q31_t) acc;
+
+ /* Compute and store error */
+ e = *pRef++ - (q31_t) acc;
+ *pErr++ = e;
+
+ /* Calculates the reciprocal of energy */
+ postShift = arm_recip_q31(energy + DELTA_Q31, &oneByEnergy, &S->recipTable[0]);
+
+ /* Calculation of product of (e * mu) */
+ errorXmu = (q31_t) (((q63_t) e * mu) >> 31);
+
+ /* Weighting factor for the normalized version */
+ w = clip_q63_to_q31(((q63_t) errorXmu * oneByEnergy) >> (31 - postShift));
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
+
+ /* Update filter coefficients */
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+
+ /* coef is in 2.30 format */
+ coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
+ /* get coef in 1.31 format by left shifting */
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
+ /* update coefficient buffer to next coefficient */
+ pb++;
+
+ coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
+ pb++;
+
+ coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
+ pb++;
+
+ coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
+ pb++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
+ pb++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Read the sample from state buffer */
+ x0 = *pState;
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Save energy and x0 values for the next frame */
+ S->energy = (q31_t) energy;
+ S->x0 = x0;
+
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+}
+
+/**
+ @} end of LMS_NORM group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q15.c
new file mode 100644
index 0000000000..76cab0320a
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q15.c
@@ -0,0 +1,262 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_lms_q15.c
+ * Description: Processing function for Q15 LMS filter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup LMS
+ @{
+ */
+
+/**
+ @brief Processing function for Q15 LMS filter.
+ @param[in] S points to an instance of the Q15 LMS filter structure
+ @param[in] pSrc points to the block of input data
+ @param[in] pRef points to the block of reference data
+ @param[out] pOut points to the block of output data
+ @param[out] pErr points to the block of error data
+ @param[in] blockSize number of samples to process
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+ Lastly, the accumulator is saturated to yield a result in 1.15 format.
+ @par
+ In this filter, filter coefficients are updated for each sample and
+ the update of the filter coefficients is saturated.
+ */
+
+void arm_lms_q15(
+ const arm_lms_instance_q15 * S,
+ const q15_t * pSrc,
+ q15_t * pRef,
+ q15_t * pOut,
+ q15_t * pErr,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q15_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ q15_t mu = S->mu; /* Adaptive factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ q63_t acc; /* Accumulator */
+ q15_t e = 0; /* Error of data sample */
+ q15_t alpha; /* Intermediate constant for taps update */
+ q31_t coef; /* Temporary variable for coefficient */
+ q31_t acc_l, acc_h; /* Temporary input */
+ int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */
+ int32_t uShift = (32 - lShift);
+
+ /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+ /* initialise loop count */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Set the accumulator to zero */
+ acc = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* acc += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
+ acc = __SMLALD(read_q15x2_ia (&px), read_q15x2_ia (&pb), acc);
+ acc = __SMLALD(read_q15x2_ia (&px), read_q15x2_ia (&pb), acc);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ acc += (q63_t) (((q31_t) (*px++) * (*pb++)));
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Converting the result to 1.15 format and saturate the output */
+ acc = __SSAT(acc, 16U);
+
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = (q15_t) acc;
+
+ /* Compute and store error */
+ e = *pRef++ - (q15_t) acc;
+ *pErr++ = (q15_t) e;
+
+ /* Compute alpha i.e. intermediate constant for taps update */
+ alpha = (q15_t) (((q31_t) e * (mu)) >> 15);
+
+ /* Initialize pState pointer */
+ /* Advance state pointer by 1 for the next sample */
+ px = pState++;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
+
+ /* Update filter coefficients */
+ while (tapCnt > 0U)
+ {
+ coef = (q31_t) *pb + (((q31_t) alpha * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+
+ coef = (q31_t) *pb + (((q31_t) alpha * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+
+ coef = (q31_t) *pb + (((q31_t) alpha * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+
+ coef = (q31_t) *pb + (((q31_t) alpha * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ coef = (q31_t) *pb + (((q31_t) alpha * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ write_q15x2_ia (&pStateCurnt, read_q15x2_ia (&pState));
+ write_q15x2_ia (&pStateCurnt, read_q15x2_ia (&pState));
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+}
+
+/**
+ @} end of LMS group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q31.c
new file mode 100644
index 0000000000..8e58405c03
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/FilteringFunctions/arm_lms_q31.c
@@ -0,0 +1,283 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_lms_q31.c
+ * Description: Processing function for the Q31 LMS filter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/filtering_functions.h"
+
+/**
+ @ingroup groupFilters
+ */
+
+/**
+ @addtogroup LMS
+ @{
+ */
+
+/**
+ @brief Processing function for Q31 LMS filter.
+ @param[in] S points to an instance of the Q31 LMS filter structure.
+ @param[in] pSrc points to the block of input data.
+ @param[in] pRef points to the block of reference data.
+ @param[out] pOut points to the block of output data.
+ @param[out] pErr points to the block of error data.
+ @param[in] blockSize number of samples to process.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate
+ multiplication results but provides only a single guard bit.
+ Thus, if the accumulator result overflows it wraps around rather than clips.
+ In order to avoid overflows completely the input signal must be scaled down by
+ log2(numTaps) bits.
+ The reference signal should not be scaled down.
+ After all multiply-accumulates are performed, the 2.62 accumulator is shifted
+ and saturated to 1.31 format to yield the final result.
+ The output signal and error signal are in 1.31 format.
+ @par
+ In this filter, filter coefficients are updated for each sample and
+ the update of the filter coefficients is saturated.
+ */
+
+void arm_lms_q31(
+ const arm_lms_instance_q31 * S,
+ const q31_t * pSrc,
+ q31_t * pRef,
+ q31_t * pOut,
+ q31_t * pErr,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCurnt; /* Points to the current sample of the state */
+ q31_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ q31_t mu = S->mu; /* Adaptive factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ q63_t acc; /* Accumulator */
+ q31_t e = 0; /* Error of data sample */
+ q31_t alpha; /* Intermediate constant for taps update */
+ q31_t coef; /* Temporary variable for coef */
+ q31_t acc_l, acc_h; /* Temporary input */
+ uint32_t uShift = ((uint32_t) S->postShift + 1U);
+ uint32_t lShift = 32U - uShift; /* Shift to be applied to the output */
+
+ /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1U)]);
+
+ /* initialise loop count */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Set the accumulator to zero */
+ acc = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ /* acc += b[N] * x[n-N] */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* acc += b[N-1] * x[n-N-1] */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* acc += b[N-2] * x[n-N-2] */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* acc += b[N-3] * x[n-N-3] */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Converting the result to 1.31 format */
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = (q31_t) acc;
+
+ /* Compute and store error */
+ e = *pRef++ - (q31_t) acc;
+ *pErr++ = e;
+
+ /* Compute alpha i.e. intermediate constant for taps update */
+ alpha = (q31_t) (((q63_t) e * mu) >> 31);
+
+ /* Initialize pState pointer */
+ /* Advance state pointer by 1 for the next sample */
+ px = pState++;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = numTaps >> 2U;
+
+ /* Update filter coefficients */
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+
+ /* coef is in 2.30 format */
+ coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32));
+ /* get coef in 1.31 format by left shifting */
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
+ /* update coefficient buffer to next coefficient */
+ pb++;
+
+ coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
+ pb++;
+
+ coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
+ pb++;
+
+ coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
+ pb++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = numTaps % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = numTaps;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ /* Perform the multiply-accumulate */
+ coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1U));
+ pb++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ Now copy the last numTaps - 1 samples to the start of the state buffer.
+ This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* copy data */
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 taps at a time. */
+ tapCnt = (numTaps - 1U) >> 2U;
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining taps */
+ tapCnt = (numTaps - 1U) % 0x4U;
+
+#else
+
+ /* Initialize tapCnt with number of samples */
+ tapCnt = (numTaps - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (tapCnt > 0U)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement loop counter */
+ tapCnt--;
+ }
+
+}
+
+/**
+ @} end of LMS group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/InterpolationFunctions.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/InterpolationFunctions.c
new file mode 100644
index 0000000000..8462395eda
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/InterpolationFunctions.c
@@ -0,0 +1,41 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: InterpolationFunctions.c
+ * Description: Combination of all interpolation function source files.
+ *
+ * $Date: 22. July 2020
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_bilinear_interp_f32.c"
+#include "arm_bilinear_interp_q15.c"
+#include "arm_bilinear_interp_q31.c"
+#include "arm_bilinear_interp_q7.c"
+#include "arm_linear_interp_f32.c"
+#include "arm_linear_interp_q15.c"
+#include "arm_linear_interp_q31.c"
+#include "arm_linear_interp_q7.c"
+#include "arm_spline_interp_f32.c"
+#include "arm_spline_interp_init_f32.c"
+
+
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/InterpolationFunctionsF16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/InterpolationFunctionsF16.c
new file mode 100644
index 0000000000..d778de9da0
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/InterpolationFunctionsF16.c
@@ -0,0 +1,33 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: InterpolationFunctionsF16.c
+ * Description: Combination of all interpolation function source files.
+ *
+ * $Date: 22. July 2020
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_bilinear_interp_f16.c"
+#include "arm_linear_interp_f16.c"
+
+
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_f16.c
new file mode 100644
index 0000000000..519697a735
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_f16.c
@@ -0,0 +1,167 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_bilinear_interp_f16.c
+ * Description: Floating-point bilinear interpolation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupInterpolation
+ */
+
+/**
+ * @defgroup BilinearInterpolate Bilinear Interpolation
+ *
+ * Bilinear interpolation is an extension of linear interpolation applied to a two dimensional grid.
+ * The underlying function f(x, y) is sampled on a regular grid and the interpolation process
+ * determines values between the grid points.
+ * Bilinear interpolation is equivalent to two step linear interpolation, first in the x-dimension and then in the y-dimension.
+ * Bilinear interpolation is often used in image processing to rescale images.
+ * The CMSIS DSP library provides bilinear interpolation functions for Q7, Q15, Q31, and floating-point data types.
+ *
+ * Algorithm
+ * \par
+ * The instance structure used by the bilinear interpolation functions describes a two dimensional data table.
+ * For floating-point, the instance structure is defined as:
+ *
+ * typedef struct
+ * {
+ * uint16_t numRows;
+ * uint16_t numCols;
+ * float16_t *pData;
+ * } arm_bilinear_interp_instance_f16;
+ *
+ *
+ * \par
+ * where numRows specifies the number of rows in the table;
+ * numCols specifies the number of columns in the table;
+ * and pData points to an array of size numRows*numCols values.
+ * The data table pTable is organized in row order and the supplied data values fall on integer indexes.
+ * That is, table element (x,y) is located at pTable[x + y*numCols] where x and y are integers.
+ *
+ * \par
+ * Let (x, y) specify the desired interpolation point. Then define:
+ *
+ * XF = floor(x)
+ * YF = floor(y)
+ *
+ * \par
+ * The interpolated output point is computed as:
+ *
+ * f(x, y) = f(XF, YF) * (1-(x-XF)) * (1-(y-YF))
+ * + f(XF+1, YF) * (x-XF)*(1-(y-YF))
+ * + f(XF, YF+1) * (1-(x-XF))*(y-YF)
+ * + f(XF+1, YF+1) * (x-XF)*(y-YF)
+ *
+ * Note that the coordinates (x, y) contain integer and fractional components.
+ * The integer components specify which portion of the table to use while the
+ * fractional components control the interpolation process.
+ *
+ * \par
+ * if (x,y) are outside of the table boundary, Bilinear interpolation returns zero output.
+ */
+
+
+ /**
+ * @addtogroup BilinearInterpolate
+ * @{
+ */
+
+
+ /**
+ * @brief Floating-point bilinear interpolation.
+ * @param[in,out] S points to an instance of the interpolation structure.
+ * @param[in] X interpolation coordinate.
+ * @param[in] Y interpolation coordinate.
+ * @return out interpolated value.
+ */
+ float16_t arm_bilinear_interp_f16(
+ const arm_bilinear_interp_instance_f16 * S,
+ float16_t X,
+ float16_t Y)
+ {
+ float16_t out; /* Interpolated result */
+ float16_t f00, f01, f10, f11; /* Four nearest table samples */
+ float16_t *pData = S->pData; /* Base pointer of the 2-D table (row-major) */
+ int32_t xIndex, yIndex, index; /* Integer grid coordinates and flat table index */
+ float16_t xdiff, ydiff; /* Fractional parts of X and Y */
+ float16_t b1, b2, b3, b4; /* Bilinear expansion coefficients */
+
+ xIndex = (int32_t) X; /* Truncation equals floor() for the valid non-negative range */
+ yIndex = (int32_t) Y;
+
+ /* Care taken for table outside boundary */
+ /* Returns zero output when values are outside table boundary */
+ if (xIndex < 0 || xIndex > (S->numCols - 2) || yIndex < 0 || yIndex > (S->numRows - 2))
+ {
+ return (0);
+ }
+
+ /* Calculation of index for two nearest points in X-direction */
+ index = (xIndex ) + (yIndex ) * S->numCols; /* row yIndex, column xIndex */
+
+
+ /* Read two nearest points in X-direction */
+ f00 = pData[index];
+ f01 = pData[index + 1];
+
+ /* Calculation of index for two nearest points in Y-direction */
+ index = (xIndex ) + (yIndex+1) * S->numCols; /* same column, next row */
+
+
+ /* Read two nearest points in Y-direction */
+ f10 = pData[index];
+ f11 = pData[index + 1];
+
+ /* Calculation of intermediate values (expansion of the bilinear formula) */
+ b1 = f00;
+ b2 = f01 - f00;
+ b3 = f10 - f00;
+ b4 = f00 - f01 - f10 + f11;
+
+ /* Calculation of fractional part in X */
+ xdiff = X - xIndex;
+
+ /* Calculation of fractional part in Y */
+ ydiff = Y - yIndex;
+
+ /* Calculation of bi-linear interpolated output */
+ out = b1 + b2 * xdiff + b3 * ydiff + b4 * xdiff * ydiff;
+
+ /* return to application */
+ return (out);
+ }
+
+ /**
+ * @} end of BilinearInterpolate group
+ */
+
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_f32.c
new file mode 100644
index 0000000000..fc001d72d6
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_f32.c
@@ -0,0 +1,161 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_bilinear_interp_f32.c
+ * Description: Floating-point bilinear interpolation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+ @ingroup groupInterpolation
+ */
+
+/**
+ * @defgroup BilinearInterpolate Bilinear Interpolation
+ *
+ * Bilinear interpolation is an extension of linear interpolation applied to a two dimensional grid.
+ * The underlying function f(x, y) is sampled on a regular grid and the interpolation process
+ * determines values between the grid points.
+ * Bilinear interpolation is equivalent to two step linear interpolation, first in the x-dimension and then in the y-dimension.
+ * Bilinear interpolation is often used in image processing to rescale images.
+ * The CMSIS DSP library provides bilinear interpolation functions for Q7, Q15, Q31, and floating-point data types.
+ *
+ * Algorithm
+ * \par
+ * The instance structure used by the bilinear interpolation functions describes a two dimensional data table.
+ * For floating-point, the instance structure is defined as:
+ *
+ * typedef struct
+ * {
+ * uint16_t numRows;
+ * uint16_t numCols;
+ * float32_t *pData;
+ * } arm_bilinear_interp_instance_f32;
+ *
+ *
+ * \par
+ * where numRows specifies the number of rows in the table;
+ * numCols specifies the number of columns in the table;
+ * and pData points to an array of size numRows*numCols values.
+ * The data table pTable is organized in row order and the supplied data values fall on integer indexes.
+ * That is, table element (x,y) is located at pTable[x + y*numCols] where x and y are integers.
+ *
+ * \par
+ * Let (x, y) specify the desired interpolation point. Then define:
+ *
+ * XF = floor(x)
+ * YF = floor(y)
+ *
+ * \par
+ * The interpolated output point is computed as:
+ *
+ * f(x, y) = f(XF, YF) * (1-(x-XF)) * (1-(y-YF))
+ * + f(XF+1, YF) * (x-XF)*(1-(y-YF))
+ * + f(XF, YF+1) * (1-(x-XF))*(y-YF)
+ * + f(XF+1, YF+1) * (x-XF)*(y-YF)
+ *
+ * Note that the coordinates (x, y) contain integer and fractional components.
+ * The integer components specify which portion of the table to use while the
+ * fractional components control the interpolation process.
+ *
+ * \par
+ * if (x,y) are outside of the table boundary, Bilinear interpolation returns zero output.
+ */
+
+
+ /**
+ * @addtogroup BilinearInterpolate
+ * @{
+ */
+
+
+ /**
+ * @brief Floating-point bilinear interpolation.
+ * @param[in,out] S points to an instance of the interpolation structure.
+ * @param[in] X interpolation coordinate.
+ * @param[in] Y interpolation coordinate.
+ * @return out interpolated value.
+ */
+ float32_t arm_bilinear_interp_f32(
+ const arm_bilinear_interp_instance_f32 * S,
+ float32_t X,
+ float32_t Y)
+ {
+ float32_t out; /* Interpolated result */
+ float32_t f00, f01, f10, f11; /* Four nearest table samples */
+ float32_t *pData = S->pData; /* Base pointer of the 2-D table (row-major) */
+ int32_t xIndex, yIndex, index; /* Integer grid coordinates and flat table index */
+ float32_t xdiff, ydiff; /* Fractional parts of X and Y */
+ float32_t b1, b2, b3, b4; /* Bilinear expansion coefficients */
+
+ xIndex = (int32_t) X; /* Truncation equals floor() for the valid non-negative range */
+ yIndex = (int32_t) Y;
+
+ /* Care taken for table outside boundary */
+ /* Returns zero output when values are outside table boundary */
+ if (xIndex < 0 || xIndex > (S->numCols - 2) || yIndex < 0 || yIndex > (S->numRows - 2))
+ {
+ return (0);
+ }
+
+ /* Calculation of index for two nearest points in X-direction */
+ index = (xIndex ) + (yIndex ) * S->numCols; /* row yIndex, column xIndex */
+
+
+ /* Read two nearest points in X-direction */
+ f00 = pData[index];
+ f01 = pData[index + 1];
+
+ /* Calculation of index for two nearest points in Y-direction */
+ index = (xIndex ) + (yIndex+1) * S->numCols; /* same column, next row */
+
+
+ /* Read two nearest points in Y-direction */
+ f10 = pData[index];
+ f11 = pData[index + 1];
+
+ /* Calculation of intermediate values (expansion of the bilinear formula) */
+ b1 = f00;
+ b2 = f01 - f00;
+ b3 = f10 - f00;
+ b4 = f00 - f01 - f10 + f11;
+
+ /* Calculation of fractional part in X */
+ xdiff = X - xIndex;
+
+ /* Calculation of fractional part in Y */
+ ydiff = Y - yIndex;
+
+ /* Calculation of bi-linear interpolated output */
+ out = b1 + b2 * xdiff + b3 * ydiff + b4 * xdiff * ydiff;
+
+ /* return to application */
+ return (out);
+ }
+
+ /**
+ * @} end of BilinearInterpolate group
+ */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q15.c
new file mode 100644
index 0000000000..6726736655
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q15.c
@@ -0,0 +1,121 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_bilinear_interp_q15.c
+ * Description: Q15 bilinear interpolation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+ @ingroup groupInterpolation
+ */
+
+/**
+ * @addtogroup BilinearInterpolate
+ * @{
+ */
+
+ /**
+ * @brief Q15 bilinear interpolation.
+ * @param[in,out] S points to an instance of the interpolation structure.
+ * @param[in] X interpolation coordinate in 12.20 format.
+ * @param[in] Y interpolation coordinate in 12.20 format.
+ * @return out interpolated value.
+ */
+ q15_t arm_bilinear_interp_q15(
+ arm_bilinear_interp_instance_q15 * S,
+ q31_t X,
+ q31_t Y)
+ {
+ q63_t acc = 0; /* Accumulator in 13.51 format */
+ q31_t out; /* Temporary output */
+ q15_t x1, x2, y1, y2; /* Nearest output values */
+ q31_t xfract, yfract; /* X, Y fractional parts */
+ int32_t rI, cI; /* Row and column indices */
+ q15_t *pYData = S->pData; /* pointer to output table values */
+ uint32_t nCols = S->numCols; /* number of columns in the table */
+
+ /* Input is in 12.20 format */
+ /* 12 bits for the table index */
+ /* Index value calculation */
+ rI = ((X & (q31_t)0xFFF00000) >> 20);
+
+ /* Input is in 12.20 format */
+ /* 12 bits for the table index */
+ /* Index value calculation */
+ cI = ((Y & (q31_t)0xFFF00000) >> 20);
+
+ /* Care taken for table outside boundary */
+ /* Returns zero output when values are outside table boundary */
+ if (rI < 0 || rI > (S->numCols - 2) || cI < 0 || cI > (S->numRows - 2))
+ {
+ return (0);
+ }
+
+ /* 20 bits for the fractional part */
+ /* xfract should be in 12.20 format */
+ xfract = (X & 0x000FFFFF);
+
+ /* Read two nearest output values from the index */
+ x1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI) ];
+ x2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI) + 1];
+
+ /* 20 bits for the fractional part */
+ /* yfract should be in 12.20 format */
+ yfract = (Y & 0x000FFFFF);
+
+ /* Read two nearest output values from the index */
+ y1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1) ];
+ y2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1) + 1];
+
+ /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 13.51 format */
+
+ /* x1 is in 1.15(q15), xfract in 12.20 format and out is in 13.35 format */
+ /* convert 13.35 to 13.31 by right shifting and out is in 1.31 */
+ out = (q31_t) (((q63_t) x1 * (0x0FFFFF - xfract)) >> 4U);
+ acc = ((q63_t) out * (0x0FFFFF - yfract));
+
+ /* x2 * (xfract) * (1-yfract) in 1.51 and adding to acc */
+ out = (q31_t) (((q63_t) x2 * (0x0FFFFF - yfract)) >> 4U);
+ acc += ((q63_t) out * (xfract));
+
+ /* y1 * (1 - xfract) * (yfract) in 1.51 and adding to acc */
+ out = (q31_t) (((q63_t) y1 * (0x0FFFFF - xfract)) >> 4U);
+ acc += ((q63_t) out * (yfract));
+
+ /* y2 * (xfract) * (yfract) in 1.51 and adding to acc */
+ out = (q31_t) (((q63_t) y2 * (xfract)) >> 4U);
+ acc += ((q63_t) out * (yfract));
+
+ /* acc is in 13.51 format and down shift acc by 36 times */
+ /* Convert out to 1.15 format */
+ return ((q15_t)(acc >> 36));
+ }
+
+
+ /**
+ * @} end of BilinearInterpolate group
+ */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q31.c
new file mode 100644
index 0000000000..cc8560ec84
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q31.c
@@ -0,0 +1,119 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_bilinear_interp_q31.c
+ * Description: Q31 bilinear interpolation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+ @ingroup groupInterpolation
+ */
+
+
+/**
+ * @addtogroup BilinearInterpolate
+ * @{
+ */
+
+ /**
+ * @brief Q31 bilinear interpolation.
+ * @param[in,out] S points to an instance of the interpolation structure.
+ * @param[in] X interpolation coordinate in 12.20 format.
+ * @param[in] Y interpolation coordinate in 12.20 format.
+ * @return out interpolated value.
+ */
+ q31_t arm_bilinear_interp_q31(
+ arm_bilinear_interp_instance_q31 * S,
+ q31_t X,
+ q31_t Y)
+ {
+ q31_t out; /* Temporary output */
+ q31_t acc = 0; /* Accumulated result in 3.29 format */
+ q31_t xfract, yfract; /* X, Y fractional parts */
+ q31_t x1, x2, y1, y2; /* Nearest output values */
+ int32_t rI, cI; /* Row and column indices */
+ q31_t *pYData = S->pData; /* pointer to output table values */
+ uint32_t nCols = S->numCols; /* number of columns in the table */
+
+ /* Input is in 12.20 format */
+ /* 12 bits for the table index */
+ /* Index value calculation */
+ rI = ((X & (q31_t)0xFFF00000) >> 20);
+
+ /* Input is in 12.20 format */
+ /* 12 bits for the table index */
+ /* Index value calculation */
+ cI = ((Y & (q31_t)0xFFF00000) >> 20);
+
+ /* Care taken for table outside boundary */
+ /* Returns zero output when values are outside table boundary */
+ if (rI < 0 || rI > (S->numCols - 2) || cI < 0 || cI > (S->numRows - 2))
+ {
+ return (0);
+ }
+
+ /* 20 bits for the fractional part */
+ /* shift left xfract by 11 to keep 1.31 format */
+ xfract = (X & 0x000FFFFF) << 11U;
+
+ /* Read two nearest output values from the index */
+ x1 = pYData[(rI) + (int32_t)nCols * (cI) ];
+ x2 = pYData[(rI) + (int32_t)nCols * (cI) + 1];
+
+ /* 20 bits for the fractional part */
+ /* shift left yfract by 11 to keep 1.31 format */
+ yfract = (Y & 0x000FFFFF) << 11U;
+
+ /* Read two nearest output values from the index */
+ y1 = pYData[(rI) + (int32_t)nCols * (cI + 1) ];
+ y2 = pYData[(rI) + (int32_t)nCols * (cI + 1) + 1];
+
+ /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 3.29(q29) format */
+ out = ((q31_t) (((q63_t) x1 * (0x7FFFFFFF - xfract)) >> 32));
+ acc = ((q31_t) (((q63_t) out * (0x7FFFFFFF - yfract)) >> 32));
+
+ /* x2 * (xfract) * (1-yfract) in 3.29(q29) and adding to acc */
+ out = ((q31_t) ((q63_t) x2 * (0x7FFFFFFF - yfract) >> 32));
+ acc += ((q31_t) ((q63_t) out * (xfract) >> 32));
+
+ /* y1 * (1 - xfract) * (yfract) in 3.29(q29) and adding to acc */
+ out = ((q31_t) ((q63_t) y1 * (0x7FFFFFFF - xfract) >> 32));
+ acc += ((q31_t) ((q63_t) out * (yfract) >> 32));
+
+ /* y2 * (xfract) * (yfract) in 3.29(q29) and adding to acc */
+ out = ((q31_t) ((q63_t) y2 * (xfract) >> 32));
+ acc += ((q31_t) ((q63_t) out * (yfract) >> 32));
+
+ /* Convert acc to 1.31(q31) format */
+ return ((q31_t)(acc << 2));
+ }
+
+
+
+ /**
+ * @} end of BilinearInterpolate group
+ */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q7.c
new file mode 100644
index 0000000000..204342e333
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_bilinear_interp_q7.c
@@ -0,0 +1,117 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_bilinear_interp_q7.c
+ * Description: Q7 bilinear interpolation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+ @ingroup groupInterpolation
+ */
+
+
+/**
+ * @addtogroup BilinearInterpolate
+ * @{
+ */
+
+/**
+ * @brief Q7 bilinear interpolation.
+ * @param[in,out] S points to an instance of the interpolation structure.
+ * @param[in] X interpolation coordinate in 12.20 format.
+ * @param[in] Y interpolation coordinate in 12.20 format.
+ * @return out interpolated value.
+ */
+ q7_t arm_bilinear_interp_q7(
+ arm_bilinear_interp_instance_q7 * S,
+ q31_t X,
+ q31_t Y)
+ {
+ q63_t acc = 0; /* Accumulator in 16.47 format */
+ q31_t out; /* Temporary output */
+ q31_t xfract, yfract; /* X, Y fractional parts */
+ q7_t x1, x2, y1, y2; /* Nearest output values */
+ int32_t rI, cI; /* Row and column indices */
+ q7_t *pYData = S->pData; /* pointer to output table values */
+ uint32_t nCols = S->numCols; /* number of columns in the table */
+
+ /* Input is in 12.20 format */
+ /* 12 bits for the table index */
+ /* Index value calculation */
+ rI = ((X & (q31_t)0xFFF00000) >> 20);
+
+ /* Input is in 12.20 format */
+ /* 12 bits for the table index */
+ /* Index value calculation */
+ cI = ((Y & (q31_t)0xFFF00000) >> 20);
+
+ /* Care taken for table outside boundary */
+ /* Returns zero output when values are outside table boundary */
+ if (rI < 0 || rI > (S->numCols - 2) || cI < 0 || cI > (S->numRows - 2))
+ {
+ return (0);
+ }
+
+ /* 20 bits for the fractional part */
+ /* xfract should be in 12.20 format */
+ xfract = (X & (q31_t)0x000FFFFF);
+
+ /* Read two nearest output values from the index */
+ x1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI) ];
+ x2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI) + 1];
+
+ /* 20 bits for the fractional part */
+ /* yfract should be in 12.20 format */
+ yfract = (Y & (q31_t)0x000FFFFF);
+
+ /* Read two nearest output values from the index */
+ y1 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1) ];
+ y2 = pYData[((uint32_t)rI) + nCols * ((uint32_t)cI + 1) + 1];
+
+ /* Calculation of x1 * (1-xfract ) * (1-yfract) and acc is in 16.47 format */
+ out = ((x1 * (0xFFFFF - xfract)));
+ acc = (((q63_t) out * (0xFFFFF - yfract)));
+
+ /* x2 * (xfract) * (1-yfract) in 2.22 and adding to acc */
+ out = ((x2 * (0xFFFFF - yfract)));
+ acc += (((q63_t) out * (xfract)));
+
+ /* y1 * (1 - xfract) * (yfract) in 2.22 and adding to acc */
+ out = ((y1 * (0xFFFFF - xfract)));
+ acc += (((q63_t) out * (yfract)));
+
+ /* y2 * (xfract) * (yfract) in 2.22 and adding to acc */
+ out = ((y2 * (yfract)));
+ acc += (((q63_t) out * (xfract)));
+
+ /* acc in 16.47 format and down shift by 40 to convert to 1.7 format */
+ return ((q7_t)(acc >> 40));
+ }
+
+ /**
+ * @} end of BilinearInterpolate group
+ */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_f16.c
new file mode 100644
index 0000000000..05cd18b9ab
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_f16.c
@@ -0,0 +1,131 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_linear_interp_f16.c
+ * Description: Floating-point linear interpolation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupInterpolation
+ */
+
+/**
+ * @defgroup LinearInterpolate Linear Interpolation
+ *
+ * Linear interpolation is a method of curve fitting using linear polynomials.
+ * Linear interpolation works by effectively drawing a straight line between two neighboring samples and returning the appropriate point along that line
+ *
+ * \par
+ * \image html LinearInterp.gif "Linear interpolation"
+ *
+ * \par
+ * A Linear Interpolate function calculates an output value(y), for the input(x)
+ * using linear interpolation of the input values x0, x1( nearest input values) and the output values y0 and y1(nearest output values)
+ *
+ * \par Algorithm:
+ *
+ * y = y0 + (x - x0) * ((y1 - y0)/(x1-x0))
+ * where x0, x1 are nearest values of input x
+ * y0, y1 are nearest values to output y
+ *
+ *
+ * \par
+ * This set of functions implements Linear interpolation process
+ * for Q7, Q15, Q31, and floating-point data types. The functions operate on a single
+ * sample of data and each call to the function returns a single processed value.
+ * S points to an instance of the Linear Interpolate function data structure.
+ * x is the input sample value. The functions returns the output value.
+ *
+ * \par
+ * if x is outside of the table boundary, Linear interpolation returns first value of the table
+ * if x is below input range and returns last value of table if x is above range.
+ */
+
+/**
+ * @addtogroup LinearInterpolate
+ * @{
+ */
+
+ /**
+ * @brief Process function for the floating-point Linear Interpolation Function.
+ * @param[in,out] S is an instance of the floating-point Linear Interpolation structure
+ * @param[in] x input sample to process
+ * @return y processed output sample.
+ *
+ */
+ float16_t arm_linear_interp_f16(
+ arm_linear_interp_instance_f16 * S,
+ float16_t x)
+ {
+ float16_t y; /* Interpolated output */
+ float16_t x0, x1; /* Nearest input values */
+ float16_t y0, y1; /* Nearest output values */
+ float16_t xSpacing = S->xSpacing; /* spacing between input values */
+ int32_t i; /* Index variable */
+ float16_t *pYData = S->pYData; /* pointer to output table */
+
+ /* Calculation of index (S->x1 is the abscissa of the first table entry) */
+ i = (int32_t) ((x - S->x1) / xSpacing);
+
+ if (i < 0)
+ {
+ /* Initialize output for below specified range as first output value of table */
+ y = pYData[0];
+ }
+ else if ((uint32_t)i >= (S->nValues - 1))
+ {
+ /* Initialize output for above specified range as last output value of table */
+ y = pYData[S->nValues - 1];
+ }
+ else
+ {
+ /* Calculation of nearest input values */
+ x0 = S->x1 + i * xSpacing;
+ x1 = S->x1 + (i + 1) * xSpacing;
+
+ /* Read of nearest output values */
+ y0 = pYData[i];
+ y1 = pYData[i + 1];
+
+ /* Calculation of output: y = y0 + (x - x0) * slope */
+ y = y0 + (x - x0) * ((y1 - y0) / (x1 - x0));
+
+ }
+
+ /* returns output value */
+ return (y);
+ }
+
+ /**
+ * @} end of LinearInterpolate group
+ */
+
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_f32.c
new file mode 100644
index 0000000000..fc60bec454
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_f32.c
@@ -0,0 +1,125 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_linear_interp_f32.c
+ * Description: Floating-point linear interpolation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+ @ingroup groupInterpolation
+ */
+
+/**
+ * @defgroup LinearInterpolate Linear Interpolation
+ *
+ * Linear interpolation is a method of curve fitting using linear polynomials.
+ * Linear interpolation works by effectively drawing a straight line between two neighboring samples and returning the appropriate point along that line
+ *
+ * \par
+ * \image html LinearInterp.gif "Linear interpolation"
+ *
+ * \par
+ * A Linear Interpolate function calculates an output value(y), for the input(x)
+ * using linear interpolation of the input values x0, x1( nearest input values) and the output values y0 and y1(nearest output values)
+ *
+ * \par Algorithm:
+ *
+ * y = y0 + (x - x0) * ((y1 - y0)/(x1-x0))
+ * where x0, x1 are nearest values of input x
+ * y0, y1 are nearest values to output y
+ *
+ *
+ * \par
+ * This set of functions implements Linear interpolation process
+ * for Q7, Q15, Q31, and floating-point data types. The functions operate on a single
+ * sample of data and each call to the function returns a single processed value.
+ * S points to an instance of the Linear Interpolate function data structure.
+ * x is the input sample value. The functions returns the output value.
+ *
+ * \par
+ * if x is outside of the table boundary, Linear interpolation returns first value of the table
+ * if x is below input range and returns last value of table if x is above range.
+ */
+
+/**
+ * @addtogroup LinearInterpolate
+ * @{
+ */
+
+ /**
+ * @brief Process function for the floating-point Linear Interpolation Function.
+ * @param[in,out] S is an instance of the floating-point Linear Interpolation structure
+ * @param[in] x input sample to process
+ * @return y processed output sample.
+ *
+ */
+ float32_t arm_linear_interp_f32(
+ arm_linear_interp_instance_f32 * S,
+ float32_t x)
+ {
+   float32_t *pYData = S->pYData;     /* Output (y) sample table */
+   float32_t xSpacing = S->xSpacing;  /* Spacing between consecutive x samples */
+
+   /* Index of the interval that contains x */
+   int32_t i = (int32_t) ((x - S->x1) / xSpacing);
+
+   if (i < 0)
+   {
+     /* x lies below the table range: clamp to the first table entry */
+     return pYData[0];
+   }
+
+   if ((uint32_t)i >= (S->nValues - 1))
+   {
+     /* x lies at or above the table range: clamp to the last table entry */
+     return pYData[S->nValues - 1];
+   }
+
+   /* Nearest input samples bracketing x */
+   float32_t x0 = S->x1 + i * xSpacing;
+   float32_t x1 = S->x1 + (i + 1) * xSpacing;
+
+   /* Corresponding output samples */
+   float32_t y0 = pYData[i];
+   float32_t y1 = pYData[i + 1];
+
+   /* Linear interpolation: y = y0 + (x - x0) * slope */
+   return y0 + (x - x0) * ((y1 - y0) / (x1 - x0));
+ }
+
+ /**
+ * @} end of LinearInterpolate group
+ */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q15.c
new file mode 100644
index 0000000000..70019b666b
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q15.c
@@ -0,0 +1,101 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_linear_interp_q15.c
+ * Description: Q15 linear interpolation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+ @ingroup groupInterpolation
+ */
+
+/**
+ * @addtogroup LinearInterpolate
+ * @{
+ */
+
+ /**
+ *
+ * @brief Process function for the Q15 Linear Interpolation Function.
+ * @param[in] pYData pointer to Q15 Linear Interpolation table
+ * @param[in] x input sample to process
+ * @param[in] nValues number of table values
+ * @return y processed output sample.
+ *
+ * \par
+ * Input sample x is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
+ * This function can support maximum of table size 2^12.
+ *
+ */
+ q15_t arm_linear_interp_q15(
+ q15_t * pYData,
+ q31_t x,
+ uint32_t nValues)
+ {
+   /* x is in 12.20 fixed point: the upper 12 bits select the table
+      interval, the lower 20 bits give the fractional position in it. */
+   int32_t idx = ((x & (int32_t)0xFFF00000) >> 20);
+
+   if (idx < 0)
+   {
+     /* Below the table range: clamp to the first entry */
+     return (pYData[0]);
+   }
+
+   if (idx >= (int32_t)(nValues - 1))
+   {
+     /* At or above the last interval: clamp to the last entry */
+     return (pYData[nValues - 1]);
+   }
+
+   /* 20-bit fractional part, still in 12.20 format */
+   q31_t frac = (x & 0x000FFFFF);
+
+   /* Two table values bracketing x, in 1.15 format */
+   q15_t yA = pYData[idx];
+   q15_t yB = pYData[idx + 1];
+
+   /* y = yA*(1-frac) + yB*frac, accumulated in 64 bit (13.35 format) */
+   q63_t acc = ((q63_t) yA * (0xFFFFF - frac)) + ((q63_t) yB * frac);
+
+   /* Scale the 13.35 result back to 1.15 */
+   return (q15_t) (acc >> 20);
+ }
+
+
+ /**
+ * @} end of LinearInterpolate group
+ */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q31.c
new file mode 100644
index 0000000000..55348edab1
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q31.c
@@ -0,0 +1,103 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_linear_interp_q31.c
+ * Description: Q31 linear interpolation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+ @ingroup groupInterpolation
+ */
+
+
+/**
+ * @addtogroup LinearInterpolate
+ * @{
+ */
+
+ /**
+ *
+ * @brief Process function for the Q31 Linear Interpolation Function.
+ * @param[in] pYData pointer to Q31 Linear Interpolation table
+ * @param[in] x input sample to process
+ * @param[in] nValues number of table values
+ * @return y processed output sample.
+ *
+ * \par
+ * Input sample x is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
+ * This function can support maximum of table size 2^12.
+ *
+ */
+ q31_t arm_linear_interp_q31(
+ q31_t * pYData,
+ q31_t x,
+ uint32_t nValues)
+ {
+   /* x is in 12.20 fixed point: the upper 12 bits select the table
+      interval, the lower 20 bits give the fractional position in it. */
+   int32_t idx = ((x & (q31_t)0xFFF00000) >> 20);
+
+   if (idx < 0)
+   {
+     /* Below the table range: clamp to the first entry */
+     return (pYData[0]);
+   }
+
+   if (idx >= (int32_t)(nValues - 1))
+   {
+     /* At or above the last interval: clamp to the last entry */
+     return (pYData[nValues - 1]);
+   }
+
+   /* Promote the 20 fractional bits to 1.31 by shifting left by 11 */
+   q31_t frac = (x & 0x000FFFFF) << 11;
+
+   /* Two table values bracketing x, in 1.31 format */
+   q31_t yA = pYData[idx];
+   q31_t yB = pYData[idx + 1];
+
+   /* y = yA*(1-frac) + yB*frac; each product is kept in 2.30 format */
+   q31_t acc = (q31_t) ((q63_t) yA * (0x7FFFFFFF - frac) >> 32);
+   acc += (q31_t) (((q63_t) yB * frac) >> 32);
+
+   /* Convert the 2.30 result back to 1.31 */
+   return (acc << 1U);
+ }
+
+
+
+ /**
+ * @} end of LinearInterpolate group
+ */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q7.c
new file mode 100644
index 0000000000..db34b01eca
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_linear_interp_q7.c
@@ -0,0 +1,99 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_linear_interp_q7.c
+ * Description: Q7 linear interpolation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+ @ingroup groupInterpolation
+ */
+
+
+/**
+ * @addtogroup LinearInterpolate
+ * @{
+ */
+
+ /**
+ *
+ * @brief Process function for the Q7 Linear Interpolation Function.
+ * @param[in] pYData pointer to Q7 Linear Interpolation table
+ * @param[in] x input sample to process
+ * @param[in] nValues number of table values
+ * @return y processed output sample.
+ *
+ * \par
+ * Input sample x is in 12.20 format which contains 12 bits for table index and 20 bits for fractional part.
+ * This function can support maximum of table size 2^12.
+ */
+ q7_t arm_linear_interp_q7(
+ q7_t * pYData,
+ q31_t x,
+ uint32_t nValues)
+ {
+   /* Negative x is below the table range: clamp to the first entry */
+   if (x < 0)
+   {
+     return (pYData[0]);
+   }
+
+   /* x is in 12.20 fixed point: extract the 12 index bits */
+   uint32_t idx = (x >> 20) & 0xfff;
+
+   if (idx >= (nValues - 1))
+   {
+     /* At or above the last interval: clamp to the last entry */
+     return (pYData[nValues - 1]);
+   }
+
+   /* 20-bit fractional part, still in 12.20 format */
+   q31_t frac = (x & 0x000FFFFF);
+
+   /* Two table values bracketing x, in 1.7 format */
+   q7_t yA = pYData[idx];
+   q7_t yB = pYData[idx + 1];
+
+   /* y = yA*(1-frac) + yB*frac in 13.27 format */
+   q31_t acc = (yA * (0xFFFFF - frac)) + (yB * frac);
+
+   /* Truncate the 13.27 result back to 1.7 */
+   return (q7_t) (acc >> 20);
+ }
+ /**
+ * @} end of LinearInterpolate group
+ */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_spline_interp_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_spline_interp_f32.c
new file mode 100644
index 0000000000..868cec5bcc
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_spline_interp_f32.c
@@ -0,0 +1,283 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_spline_interp_f32.c
+ * Description: Floating-point cubic spline interpolation
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+ @ingroup groupInterpolation
+ */
+
+/**
+ @defgroup SplineInterpolate Cubic Spline Interpolation
+
+ Spline interpolation is a method of interpolation where the interpolant
+ is a piecewise-defined polynomial called "spline".
+
+ @par Introduction
+
+ Given a function f defined on the interval [a,b], a set of n nodes x(i)
+ where a=x(1)<x(2)<...<x(n)=b and a set of n values y(i) = f(x(i)),
+ a cubic spline interpolator S(x) is defined as:
+
+
+        S1(x)   x(1) < x < x(2)
+ S(x) = ...
+        Sn-1(x) x(n-1) < x < x(n)
+
+
+ where
+
+
+ Si(x) = a_i+b_i(x-xi)+c_i(x-xi)^2+d_i(x-xi)^3 i=1, ..., n-1
+
+
+ @par Algorithm
+
+ Having defined h(i) = x(i+1) - x(i)
+
+
+ h(i-1)c(i-1)+2[h(i-1)+h(i)]c(i)+h(i)c(i+1) = 3/h(i)*[a(i+1)-a(i)]-3/h(i-1)*[a(i)-a(i-1)] i=2, ..., n-1
+
+
+ It is possible to write the previous conditions in matrix form (Ax=B).
+ In order to solve the system two boundary conditions are needed.
+ - Natural spline: S1''(x1)=2*c(1)=0 ; Sn''(xn)=2*c(n)=0
+ In matrix form:
+
+
+ | 1 0 0 ... 0 0 0 || c(1) | | 0 |
+ | h(0) 2[h(0)+h(1)] h(1) ... 0 0 0 || c(2) | | 3/h(2)*[a(3)-a(2)]-3/h(1)*[a(2)-a(1)] |
+ | ... ... ... ... ... ... ... || ... |=| ... |
+ | 0 0 0 ... h(n-2) 2[h(n-2)+h(n-1)] h(n-1) || c(n-1) | | 3/h(n-1)*[a(n)-a(n-1)]-3/h(n-2)*[a(n-1)-a(n-2)] |
+ | 0 0 0 ... 0 0 1 || c(n) | | 0 |
+
+
+ - Parabolic runout spline: S1''(x1)=2*c(1)=S2''(x2)=2*c(2) ; Sn-1''(xn-1)=2*c(n-1)=Sn''(xn)=2*c(n)
+ In matrix form:
+
+
+ | 1 -1 0 ... 0 0 0 || c(1) | | 0 |
+ | h(0) 2[h(0)+h(1)] h(1) ... 0 0 0 || c(2) | | 3/h(2)*[a(3)-a(2)]-3/h(1)*[a(2)-a(1)] |
+ | ... ... ... ... ... ... ... || ... |=| ... |
+ | 0 0 0 ... h(n-2) 2[h(n-2)+h(n-1)] h(n-1) || c(n-1) | | 3/h(n-1)*[a(n)-a(n-1)]-3/h(n-2)*[a(n-1)-a(n-2)] |
+ | 0 0 0 ... 0 -1 1 || c(n) | | 0 |
+
+
+ A is a tridiagonal matrix (a band matrix of bandwidth 3) of size N=n+1. The factorization
+ algorithms (A=LU) can be simplified considerably because a large number of zeros appear
+ in regular patterns. The Crout method has been used:
+ 1) Solve LZ=B
+
+
+ u(1,2) = A(1,2)/A(1,1)
+ z(1) = B(1)/l(11)
+
+ FOR i=2, ..., N-1
+ l(i,i) = A(i,i)-A(i,i-1)u(i-1,i)
+ u(i,i+1) = a(i,i+1)/l(i,i)
+ z(i) = [B(i)-A(i,i-1)z(i-1)]/l(i,i)
+
+ l(N,N) = A(N,N)-A(N,N-1)u(N-1,N)
+ z(N) = [B(N)-A(N,N-1)z(N-1)]/l(N,N)
+
+
+ 2) Solve UX=Z
+
+
+ c(N)=z(N)
+
+ FOR i=N-1, ..., 1
+ c(i)=z(i)-u(i,i+1)c(i+1)
+
+
+ c(i) for i=1, ..., n-1 are needed to compute the n-1 polynomials.
+ b(i) and d(i) are computed as:
+ - b(i) = [y(i+1)-y(i)]/h(i)-h(i)*[c(i+1)+2*c(i)]/3
+ - d(i) = [c(i+1)-c(i)]/[3*h(i)]
+ Moreover, a(i)=y(i).
+
+ @par Behaviour outside the given intervals
+
+ It is possible to compute the interpolated vector for x values outside the
+ input range (xq<x(1); xq>x(n)). The coefficients used to compute the y values for
+ xq<x(1) are going to be the ones used for the first interval, while for xq>x(n)
+ the coefficients used for the last interval are used.
+
+ */
+
+/**
+ @addtogroup SplineInterpolate
+ @{
+ */
+
+/**
+ * @brief Processing function for the floating-point cubic spline interpolation.
+ * @param[in] S points to an instance of the floating-point spline structure.
+ * @param[in] xq points to the x values of the interpolated data points.
+ * @param[out] pDst points to the block of output data.
+ * @param[in] blockSize number of samples of output data.
+ */
+
+void arm_spline_f32(
+        arm_spline_instance_f32 * S,
+  const float32_t * xq,
+        float32_t * pDst,
+        uint32_t blockSize)
+{
+  const float32_t * x = S->x;
+  const float32_t * y = S->y;
+  int32_t n = S->n_x;
+
+  /* Coefficients (a==y for i<=n-1) */
+  float32_t * b = (S->coeffs);
+  float32_t * c = (S->coeffs)+(n-1);
+  float32_t * d = (S->coeffs)+(2*(n-1));
+
+  const float32_t * pXq = xq;
+  int32_t blkCnt = (int32_t)blockSize;
+  int32_t blkCnt2;
+  int32_t i;
+  float32_t x_sc;
+
+#ifdef ARM_MATH_NEON
+  float32x4_t xiv;
+  float32x4_t aiv;
+  float32x4_t biv;
+  float32x4_t civ;
+  float32x4_t div;
+
+  float32x4_t xqv;
+
+  float32x4_t temp;
+  float32x4_t diff;
+  float32x4_t yv;
+#endif
+
+  /* NOTE(review): the loop header and the NEON inner-loop condition of this
+     function were destroyed in a previous import (angle-bracketed text was
+     stripped); they have been restored from the upstream CMSIS-DSP v1.9.0
+     sources -- please verify against the upstream file. */
+
+  /* Create output for x(i) < x < x(i+1) */
+  for (i = 0; i < n-1; i++)
+  {
+#ifdef ARM_MATH_NEON
+    /* Broadcast x(i) and the i-th interval coefficients to all lanes */
+    xiv = vdupq_n_f32(x[i]);
+
+    aiv = vdupq_n_f32(y[i]);
+    biv = vdupq_n_f32(b[i]);
+    civ = vdupq_n_f32(c[i]);
+    div = vdupq_n_f32(d[i]);
+
+    /* Compute 4 outputs at a time while they all fall inside interval i */
+    while( *(pXq+4) <= x[i+1] && blkCnt > 4 )
+    {
+      /* Load [xq(k) xq(k+1) xq(k+2) xq(k+3)] */
+      xqv = vld1q_f32(pXq);
+      pXq+=4;
+
+      /* Compute [xq(k)-x(i) xq(k+1)-x(i) xq(k+2)-x(i) xq(k+3)-x(i)] */
+      diff = vsubq_f32(xqv, xiv);
+      temp = diff;
+
+      /* y(i) = a(i) + ... */
+      yv = aiv;
+      /* ... + b(i)*(x-x(i)) + ... */
+      yv = vmlaq_f32(yv, biv, temp);
+      /* ... + c(i)*(x-x(i))^2 + ... */
+      temp = vmulq_f32(temp, diff);
+      yv = vmlaq_f32(yv, civ, temp);
+      /* ... + d(i)*(x-x(i))^3 */
+      temp = vmulq_f32(temp, diff);
+      yv = vmlaq_f32(yv, div, temp);
+
+      /* Store [y(k) y(k+1) y(k+2) y(k+3)] */
+      vst1q_f32(pDst, yv);
+      pDst+=4;
+
+      blkCnt-=4;
+    }
+#endif
+    /* Scalar path: evaluate S_i(x) = a + b*t + c*t^2 + d*t^3, t = x - x(i) */
+    while( *pXq <= x[i+1] && blkCnt > 0 )
+    {
+      x_sc = *pXq++;
+
+      *pDst = y[i]+b[i]*(x_sc-x[i])+c[i]*(x_sc-x[i])*(x_sc-x[i])+d[i]*(x_sc-x[i])*(x_sc-x[i])*(x_sc-x[i]);
+
+      pDst++;
+      blkCnt--;
+    }
+  }
+
+  /* Create output for remaining samples (x>=x(n)), extrapolated with the
+     coefficients of the last interval (still loaded from the loop above) */
+#ifdef ARM_MATH_NEON
+  /* Compute 4 outputs at a time */
+  blkCnt2 = blkCnt >> 2;
+
+  while(blkCnt2 > 0)
+  {
+    /* Load [xq(k) xq(k+1) xq(k+2) xq(k+3)] */
+    xqv = vld1q_f32(pXq);
+    pXq+=4;
+
+    /* Compute [xq(k)-x(i) xq(k+1)-x(i) xq(k+2)-x(i) xq(k+3)-x(i)] */
+    diff = vsubq_f32(xqv, xiv);
+    temp = diff;
+
+    /* y(i) = a(i) + ... */
+    yv = aiv;
+    /* ... + b(i)*(x-x(i)) + ... */
+    yv = vmlaq_f32(yv, biv, temp);
+    /* ... + c(i)*(x-x(i))^2 + ... */
+    temp = vmulq_f32(temp, diff);
+    yv = vmlaq_f32(yv, civ, temp);
+    /* ... + d(i)*(x-x(i))^3 */
+    temp = vmulq_f32(temp, diff);
+    yv = vmlaq_f32(yv, div, temp);
+
+    /* Store [y(k) y(k+1) y(k+2) y(k+3)] */
+    vst1q_f32(pDst, yv);
+    pDst+=4;
+
+    blkCnt2--;
+  }
+
+  /* Tail */
+  blkCnt2 = blkCnt & 3;
+#else
+  blkCnt2 = blkCnt;
+#endif
+
+  while(blkCnt2 > 0)
+  {
+    x_sc = *pXq++;
+
+    /* Last interval index is i-1 after the for loop above terminates */
+    *pDst = y[i-1]+b[i-1]*(x_sc-x[i-1])+c[i-1]*(x_sc-x[i-1])*(x_sc-x[i-1])+d[i-1]*(x_sc-x[i-1])*(x_sc-x[i-1])*(x_sc-x[i-1]);
+
+    pDst++;
+    blkCnt2--;
+  }
+}
+
+/**
+ @} end of SplineInterpolate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_spline_interp_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_spline_interp_init_f32.c
new file mode 100644
index 0000000000..4fa8cf9a59
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/InterpolationFunctions/arm_spline_interp_init_f32.c
@@ -0,0 +1,175 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_spline_interp_init_f32.c
+ * Description: Floating-point cubic spline initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/interpolation_functions.h"
+
+/**
+ @ingroup groupInterpolation
+ */
+
+/**
+  @addtogroup SplineInterpolate
+  @{
+
+  @par Initialization function
+
+  The initialization function takes as input two arrays that the user has to allocate:
+  coeffs will contain the b, c, and d coefficients for the (n-1) intervals
+  (n is the number of known points), hence its size must be 3*(n-1); tempBuffer
+  is temporally used for internal computations and its size is n+n-1.
+
+  @par
+
+  The x input array must be strictly sorted in ascending order and it must
+  not contain twice the same value (x(i) < x(i+1)).
+ */
+
+/**
+ * @brief Initialization function for the floating-point cubic spline interpolation.
+ * @param[in,out] S          points to an instance of the floating-point spline structure.
+ * @param[in]     type       type of cubic spline interpolation (boundary conditions).
+ * @param[in]     x          points to the x values of the known data points.
+ * @param[in]     y          points to the y values of the known data points.
+ * @param[in]     n          number of known data points.
+ * @param[in]     coeffs     coefficients array for b, c, and d.
+ * @param[in]     tempBuffer buffer array for internal computations.
+ *
+ * NOTE(review): the doc-comment terminator, the function signature and the
+ * local declarations were destroyed in a previous import (angle-bracketed
+ * text was stripped); they have been restored from the upstream CMSIS-DSP
+ * v1.9.0 sources -- please verify against the upstream file.
+ */
+void arm_spline_init_f32(
+        arm_spline_instance_f32 * S,
+        arm_spline_type type,
+  const float32_t * x,
+  const float32_t * y,
+        uint32_t n,
+        float32_t * coeffs,
+        float32_t * tempBuffer)
+{
+  /*** COEFFICIENTS COMPUTATION ***/
+  /* Type (boundary conditions):
+     - Natural spline          ( S1''(x1) = 0 ; Sn''(xn) = 0 )
+     - Parabolic runout spline ( S1''(x1) = S2''(x2) ; Sn-1''(xn-1) = Sn''(xn) ) */
+
+  /* (n-1)-long slices of the user-provided coefficient buffer */
+  float32_t * b = coeffs;
+  float32_t * c = coeffs+(n-1);
+  float32_t * d = coeffs+(2*(n-1));
+
+  /* Scratch storage: u(i) upper-diagonal factors ((n-1) entries) and
+     z(i) forward-substitution terms (n entries) */
+  float32_t * u = tempBuffer;
+  float32_t * z = tempBuffer+(n-1);
+
+  float32_t hi, hm1;  /* Interval widths h(i) and h(i-1) */
+  float32_t Bi, li;   /* Right-hand side B(i) and pivot l(i) */
+  float32_t cp1;      /* c(i+1) during back substitution */
+  int32_t i;
+
+  S->type = type;     /* NOTE(review): field exists in the instance; confirm upstream sets it here */
+  S->x = x;
+  S->y = y;
+  S->n_x = n;
+
+  /* == Solve LZ=B to obtain z(i) and u(i) == */
+
+  /* -- Row 1 -- */
+  /* B(0) = 0, not computed */
+  /* u(1,2) = a(1,2)/a(1,1) = a(1,2) */
+  if(type == ARM_SPLINE_NATURAL)
+    u[0] = 0;  /* a(1,2) = 0 */
+  else if(type == ARM_SPLINE_PARABOLIC_RUNOUT)
+    u[0] = -1; /* a(1,2) = -1 */
+
+  z[0] = 0; /* z(1) = B(1)/a(1,1) = 0 always */
+
+  /* -- Rows 2 to N-1 (N=n+1) -- */
+  hm1 = x[1] - x[0]; /* Initialize h(i-1) = h(1) = x(2)-x(1) */
+
+  for (i=1; i<(int32_t)n-1; i++)
+  {
+    /* Compute B(i) */
+    hi = x[i+1]-x[i];
+    Bi = 3*(y[i+1]-y[i])/hi - 3*(y[i]-y[i-1])/hm1;
+
+    /* l(i) = a(i)-a(i,i-1)*u(i-1) = 2[h(i-1)+h(i)]-h(i-1)*u(i-1) */
+    li = 2*(hi+hm1) - hm1*u[i-1];
+
+    /* u(i) = a(i,i+1)/l(i) = h(i)/l(i) */
+    u[i] = hi/li;
+
+    /* z(i) = [B(i)-h(i-1)*z(i-1)]/l(i) */
+    z[i] = (Bi-hm1*z[i-1])/li;
+
+    /* Update h(i-1) for next iteration */
+    hm1 = hi;
+  }
+
+  /* -- Row N -- */
+  /* l(N) = a(N,N)-a(N,N-1)u(N-1) */
+  /* z(N) = [-a(N,N-1)z(N-1)]/l(N) */
+  if(type == ARM_SPLINE_NATURAL)
+  {
+    /* li = 1; a(N,N) = 1; a(N,N-1) = 0 */
+    z[n-1] = 0; /* a(N,N-1) = 0 */
+  }
+  else if(type == ARM_SPLINE_PARABOLIC_RUNOUT)
+  {
+    li = 1+u[n-2];      /* a(N,N) = 1; a(N,N-1) = -1 */
+    z[n-1] = z[n-2]/li; /* a(N,N-1) = -1 */
+  }
+
+  /* == Solve UX = Z to obtain c(i) and */
+  /* compute b(i) and d(i) from c(i) == */
+
+  cp1 = z[n-1]; /* Initialize c(i+1) = c(N) = z(N) */
+
+  for (i=n-2; i>=0; i--)
+  {
+    /* c(i) = z(i)-u(i+1)c(i+1) */
+    c[i] = z[i]-u[i]*cp1;
+
+    hi = x[i+1]-x[i];
+    /* b(i) = [y(i+1)-y(i)]/h(i)-h(i)*[c(i+1)+2*c(i)]/3 */
+    b[i] = (y[i+1]-y[i])/hi-hi*(cp1+2*c[i])/3;
+
+    /* d(i) = [c(i+1)-c(i)]/[3*h(i)] */
+    d[i] = (cp1-c[i])/(3*hi);
+
+    /* Update c(i+1) for next iteration */
+    cp1 = c[i];
+  }
+
+  /* == Finally, store the coefficients in the instance == */
+
+  S->coeffs = coeffs;
+}
+
+/**
+ @} end of SplineInterpolate group
+ */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/MatrixFunctions.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/MatrixFunctions.c
new file mode 100644
index 0000000000..cad08519c8
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/MatrixFunctions.c
@@ -0,0 +1,73 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: MatrixFunctions.c
+ * Description: Combination of all matrix function source files.
+ *
+ * $Date: 18. March 2019
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_mat_add_f32.c"
+#include "arm_mat_add_q15.c"
+#include "arm_mat_add_q31.c"
+#include "arm_mat_cmplx_mult_f32.c"
+#include "arm_mat_cmplx_mult_q15.c"
+#include "arm_mat_cmplx_mult_q31.c"
+#include "arm_mat_init_f32.c"
+#include "arm_mat_init_q15.c"
+#include "arm_mat_init_q31.c"
+#include "arm_mat_inverse_f32.c"
+#include "arm_mat_inverse_f64.c"
+#include "arm_mat_mult_f64.c"
+#include "arm_mat_mult_f32.c"
+#include "arm_mat_mult_fast_q15.c"
+#include "arm_mat_mult_fast_q31.c"
+#include "arm_mat_mult_q7.c"
+#include "arm_mat_mult_q15.c"
+#include "arm_mat_mult_q31.c"
+#include "arm_mat_scale_f32.c"
+#include "arm_mat_scale_q15.c"
+#include "arm_mat_scale_q31.c"
+#include "arm_mat_sub_f64.c"
+#include "arm_mat_sub_f32.c"
+#include "arm_mat_sub_q15.c"
+#include "arm_mat_sub_q31.c"
+#include "arm_mat_trans_f32.c"
+#include "arm_mat_trans_f64.c"
+#include "arm_mat_trans_q7.c"
+#include "arm_mat_trans_q15.c"
+#include "arm_mat_trans_q31.c"
+#include "arm_mat_vec_mult_f32.c"
+#include "arm_mat_vec_mult_q31.c"
+#include "arm_mat_vec_mult_q15.c"
+#include "arm_mat_vec_mult_q7.c"
+#include "arm_mat_cmplx_trans_f32.c"
+#include "arm_mat_cmplx_trans_q31.c"
+#include "arm_mat_cmplx_trans_q15.c"
+#include "arm_mat_cholesky_f64.c"
+#include "arm_mat_cholesky_f32.c"
+#include "arm_mat_solve_upper_triangular_f32.c"
+#include "arm_mat_solve_lower_triangular_f32.c"
+#include "arm_mat_solve_upper_triangular_f64.c"
+#include "arm_mat_solve_lower_triangular_f64.c"
+#include "arm_mat_ldlt_f32.c"
+#include "arm_mat_ldlt_f64.c"
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/MatrixFunctionsF16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/MatrixFunctionsF16.c
new file mode 100644
index 0000000000..9d3a41fb3d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/MatrixFunctionsF16.c
@@ -0,0 +1,41 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: MatrixFunctions.c
+ * Description: Combination of all matrix function f16 source files.
+ *
+ * $Date: 18. March 2020
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_mat_add_f16.c"
+#include "arm_mat_sub_f16.c"
+#include "arm_mat_trans_f16.c"
+#include "arm_mat_scale_f16.c"
+#include "arm_mat_mult_f16.c"
+#include "arm_mat_vec_mult_f16.c"
+#include "arm_mat_cmplx_trans_f16.c"
+#include "arm_mat_cmplx_mult_f16.c"
+#include "arm_mat_inverse_f16.c"
+#include "arm_mat_init_f16.c"
+#include "arm_mat_cholesky_f16.c"
+#include "arm_mat_solve_upper_triangular_f16.c"
+#include "arm_mat_solve_lower_triangular_f16.c"
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f16.c
new file mode 100644
index 0000000000..a674094012
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f16.c
@@ -0,0 +1,217 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_add_f16.c
+ * Description: Floating-point matrix addition
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupMatrix
+ */
+
+
+/**
+ @addtogroup MatrixAdd
+ @{
+ */
+
+
+/**
+ @brief Floating-point matrix addition.
+ @param[in] pSrcA points to first input matrix structure
+ @param[in] pSrcB points to second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/* Helium/MVE implementation: treats the matrix as a flat array and adds
+ 8 float16 lanes per iteration, with a predicated tail for the remainder. */
+arm_status arm_mat_add_f16(
+ const arm_matrix_instance_f16 * pSrcA,
+ const arm_matrix_instance_f16 * pSrcB,
+ arm_matrix_instance_f16 * pDst)
+{
+ arm_status status;
+ uint32_t numSamples; /* total number of elements in the matrix */
+ float16_t *pDataA, *pDataB, *pDataDst;
+ f16x8_t vecA, vecB, vecDst;
+ float16_t const *pSrcAVec;
+ float16_t const *pSrcBVec;
+ uint32_t blkCnt; /* loop counters */
+
+ pDataA = pSrcA->pData;
+ pDataB = pSrcB->pData;
+ pDataDst = pDst->pData;
+ pSrcAVec = (float16_t const *) pDataA;
+ pSrcBVec = (float16_t const *) pDataB;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif
+ {
+ /*
+ * Total number of samples in the input matrix
+ */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+ blkCnt = numSamples >> 3;
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+ /* Add and then store the results in the destination buffer. */
+ vecA = vld1q(pSrcAVec);
+ pSrcAVec += 8;
+ vecB = vld1q(pSrcBVec);
+ pSrcBVec += 8;
+ vecDst = vaddq(vecA, vecB);
+ vst1q(pDataDst, vecDst);
+ pDataDst += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ */
+ blkCnt = numSamples & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ /* NOTE(review): when numSamples < 8 the main loop never ran and vecDst
+ is read uninitialized by vaddq_m below; its inactive lanes are never
+ stored by the predicated vstrhq_p, but the unpredicated vld1q tail
+ loads may read past the end of the matrices -- confirm buffer padding. */
+ vecA = vld1q(pSrcAVec);
+ vecB = vld1q(pSrcBVec);
+ vecDst = vaddq_m(vecDst, vecA, vecB, p0);
+ vstrhq_p(pDataDst, vecDst, p0);
+ }
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+ return (status);
+}
+#else
+
+/* Portable scalar implementation: element-wise float16 addition over all
+ numRows * numCols entries, with optional 4x manual unrolling. */
+arm_status arm_mat_add_f16(
+ const arm_matrix_instance_f16 * pSrcA,
+ const arm_matrix_instance_f16 * pSrcB,
+ arm_matrix_instance_f16 * pDst)
+{
+ float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float16_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ float16_t *pOut = pDst->pData; /* output data matrix pointer */
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix addition */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Total number of samples in input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+
+ /* Add and store result in destination buffer. */
+ *pOut++ = *pInA++ + *pInB++;
+
+ *pOut++ = *pInA++ + *pInB++;
+
+ *pOut++ = *pInA++ + *pInB++;
+
+ *pOut++ = *pInA++ + *pInB++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Handles the 0..3 leftover samples (or all samples when not unrolling) */
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+
+ /* Add and store result in destination buffer. */
+ *pOut++ = *pInA++ + *pInB++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixAdd group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f32.c
new file mode 100644
index 0000000000..6963504e46
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_f32.c
@@ -0,0 +1,304 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_add_f32.c
+ * Description: Floating-point matrix addition
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @defgroup MatrixAdd Matrix Addition
+
+ Adds two matrices.
+ \image html MatrixAddition.gif "Addition of two 3 x 3 matrices"
+
+ The functions check to make sure that
+ pSrcA, pSrcB, and pDst have the same
+ number of rows and columns.
+ */
+
+/**
+ @addtogroup MatrixAdd
+ @{
+ */
+
+
+/**
+ @brief Floating-point matrix addition.
+ @param[in] pSrcA points to first input matrix structure
+ @param[in] pSrcB points to second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* Helium/MVE implementation: flat element-wise add, 4 float32 lanes per
+ iteration, predicated tail for the 0..3 remaining samples. */
+arm_status arm_mat_add_f32(
+ const arm_matrix_instance_f32 * pSrcA,
+ const arm_matrix_instance_f32 * pSrcB,
+ arm_matrix_instance_f32 * pDst)
+{
+ arm_status status;
+ uint32_t numSamples; /* total number of elements in the matrix */
+ float32_t *pDataA, *pDataB, *pDataDst;
+ f32x4_t vecA, vecB, vecDst;
+ float32_t const *pSrcAVec;
+ float32_t const *pSrcBVec;
+ uint32_t blkCnt; /* loop counters */
+
+ pDataA = pSrcA->pData;
+ pDataB = pSrcB->pData;
+ pDataDst = pDst->pData;
+ pSrcAVec = (float32_t const *) pDataA;
+ pSrcBVec = (float32_t const *) pDataB;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif
+ {
+ /*
+ * Total number of samples in the input matrix
+ */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+ blkCnt = numSamples >> 2;
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+ /* Add and then store the results in the destination buffer. */
+ vecA = vld1q(pSrcAVec);
+ pSrcAVec += 4;
+ vecB = vld1q(pSrcBVec);
+ pSrcBVec += 4;
+ vecDst = vaddq(vecA, vecB);
+ vst1q(pDataDst, vecDst);
+ pDataDst += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ */
+ blkCnt = numSamples & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ /* NOTE(review): vecDst is uninitialized here if numSamples < 4 (main
+ loop skipped); inactive lanes are not stored by vstrwq_p, but the
+ unpredicated vld1q loads may read past the matrix end. */
+ vecA = vld1q(pSrcAVec);
+ vecB = vld1q(pSrcBVec);
+ vecDst = vaddq_m(vecDst, vecA, vecB, p0);
+ vstrwq_p(pDataDst, vecDst, p0);
+ }
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+ return (status);
+}
+#else
+#if defined(ARM_MATH_NEON)
+/*
+
+Neon version is assuming the matrix is small enough.
+So no blocking is used for taking into account cache effects.
+For big matrix, there exist better libraries for Neon.
+
+*/
+/* NEON implementation: vectorized 4-wide float32 add over the flattened
+ matrix, followed by a scalar loop for the 1..3 leftover samples. */
+arm_status arm_mat_add_f32(
+ const arm_matrix_instance_f32 * pSrcA,
+ const arm_matrix_instance_f32 * pSrcB,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
+ float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix addition */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif
+ {
+ float32x4_t vec1;
+ float32x4_t vec2;
+ float32x4_t res;
+
+ /* Total number of samples in the input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+ blkCnt = numSamples >> 2U;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+ /* Add and then store the results in the destination buffer. */
+ vec1 = vld1q_f32(pIn1);
+ vec2 = vld1q_f32(pIn2);
+ res = vaddq_f32(vec1, vec2);
+ vst1q_f32(pOut, res);
+
+ /* update pointers to process next samples */
+ pIn1 += 4U;
+ pIn2 += 4U;
+ pOut += 4U;
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = numSamples % 0x4U;
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+ /* Add and then store the results in the destination buffer. */
+ *pOut++ = (*pIn1++) + (*pIn2++);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#else
+/* Portable scalar implementation: element-wise float32 addition over all
+ numRows * numCols entries, with optional 4x manual unrolling. */
+arm_status arm_mat_add_f32(
+ const arm_matrix_instance_f32 * pSrcA,
+ const arm_matrix_instance_f32 * pSrcB,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix addition */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Total number of samples in input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+
+ /* Add and store result in destination buffer. */
+ *pOut++ = *pInA++ + *pInB++;
+
+ *pOut++ = *pInA++ + *pInB++;
+
+ *pOut++ = *pInA++ + *pInB++;
+
+ *pOut++ = *pInA++ + *pInB++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Handles the 0..3 leftover samples (or all samples when not unrolling) */
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+
+ /* Add and store result in destination buffer. */
+ *pOut++ = *pInA++ + *pInB++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixAdd group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_q15.c
new file mode 100644
index 0000000000..597a8eac61
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_q15.c
@@ -0,0 +1,227 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_add_q15.c
+ * Description: Q15 matrix addition
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixAdd
+ @{
+ */
+
+/**
+ @brief Q15 matrix addition.
+ @param[in] pSrcA points to first input matrix structure
+ @param[in] pSrcB points to second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/* Helium/MVE implementation: 8 Q15 lanes per iteration using saturating
+ vector add (vqaddq), predicated tail for the remainder. */
+arm_status arm_mat_add_q15(
+ const arm_matrix_instance_q15 * pSrcA,
+ const arm_matrix_instance_q15 * pSrcB,
+ arm_matrix_instance_q15 * pDst)
+{
+ uint32_t numSamples; /* total number of elements in the matrix */
+ q15_t *pDataA, *pDataB, *pDataDst;
+ q15x8_t vecA, vecB, vecDst;
+ q15_t const *pSrcAVec;
+ q15_t const *pSrcBVec;
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix addition */
+
+ pDataA = pSrcA->pData;
+ pDataB = pSrcB->pData;
+ pDataDst = pDst->pData;
+ pSrcAVec = (q15_t const *) pDataA;
+ pSrcBVec = (q15_t const *) pDataB;
+
+ #ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+
+ /*
+ * Total number of samples in the input matrix
+ */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+ blkCnt = numSamples >> 3;
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+ /* Add and then store the results in the destination buffer. */
+ vecA = vld1q(pSrcAVec); pSrcAVec += 8;
+ vecB = vld1q(pSrcBVec); pSrcBVec += 8;
+ vecDst = vqaddq(vecA, vecB);
+ vst1q(pDataDst, vecDst); pDataDst += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numSamples & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ /* NOTE(review): vecDst is uninitialized here when numSamples < 8;
+ inactive lanes are never stored, but the unpredicated vld1q loads
+ may read past the matrix end -- confirm buffer padding. */
+ vecA = vld1q(pSrcAVec); pSrcAVec += 8;
+ vecB = vld1q(pSrcBVec); pSrcBVec += 8;
+ vecDst = vqaddq_m(vecDst, vecA, vecB, p0);
+ vstrhq_p(pDataDst, vecDst, p0);
+ }
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+/* Portable/DSP-extension implementation: element-wise saturating Q15
+ addition; __QADD16 performs two packed 16-bit saturating adds at once. */
+arm_status arm_mat_add_q15(
+ const arm_matrix_instance_q15 * pSrcA,
+ const arm_matrix_instance_q15 * pSrcB,
+ arm_matrix_instance_q15 * pDst)
+{
+ q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix addition */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Total number of samples in input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+
+ /* Add, saturate and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+ /* SIMD path: each __QADD16 handles 2 Q15 samples packed in 32 bits */
+ write_q15x2_ia (&pOut, __QADD16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
+
+ write_q15x2_ia (&pOut, __QADD16(read_q15x2_ia (&pInA), read_q15x2_ia (&pInB)));
+#else
+ *pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
+
+ *pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
+
+ *pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
+
+ *pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
+#endif
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+
+ /* Add, saturate and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+ /* Only the low halfword of the __QADD16 result is kept here */
+ *pOut++ = (q15_t) __QADD16(*pInA++, *pInB++);
+#else
+ *pOut++ = (q15_t) __SSAT(((q31_t) *pInA++ + *pInB++), 16);
+#endif
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of MatrixAdd group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_q31.c
new file mode 100644
index 0000000000..3b7c8f15c2
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_add_q31.c
@@ -0,0 +1,216 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_add_q31.c
+ * Description: Q31 matrix addition
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixAdd
+ @{
+ */
+
+/**
+ @brief Q31 matrix addition.
+ @param[in] pSrcA points to first input matrix structure
+ @param[in] pSrcB points to second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* Helium/MVE implementation: 4 Q31 lanes per iteration using saturating
+ vector add (vqaddq), predicated tail for the remainder. */
+arm_status arm_mat_add_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ arm_status status; /* status of matrix addition */
+ uint32_t numSamples; /* total number of elements in the matrix */
+ q31_t *pDataA, *pDataB, *pDataDst;
+ q31x4_t vecA, vecB, vecDst;
+ q31_t const *pSrcAVec;
+ q31_t const *pSrcBVec;
+ uint32_t blkCnt; /* loop counters */
+
+ pDataA = pSrcA->pData;
+ pDataB = pSrcB->pData;
+ pDataDst = pDst->pData;
+ pSrcAVec = (q31_t const *) pDataA;
+ pSrcBVec = (q31_t const *) pDataB;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+ /*
+ * Total number of samples in the input matrix
+ */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+ blkCnt = numSamples >> 2;
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+ /* Add and then store the results in the destination buffer. */
+ vecA = vld1q(pSrcAVec);
+ pSrcAVec += 4;
+ vecB = vld1q(pSrcBVec);
+ pSrcBVec += 4;
+ vecDst = vqaddq(vecA, vecB);
+ vst1q(pDataDst, vecDst);
+ pDataDst += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ */
+ blkCnt = numSamples & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ /* NOTE(review): vecDst is uninitialized here when numSamples < 4;
+ inactive lanes are never stored, but the unpredicated vld1q loads
+ may read past the matrix end -- confirm buffer padding. */
+ vecA = vld1q(pSrcAVec);
+ pSrcAVec += 4;
+ vecB = vld1q(pSrcBVec);
+ pSrcBVec += 4;
+ vecDst = vqaddq_m(vecDst, vecA, vecB, p0);
+ vstrwq_p(pDataDst, vecDst, p0);
+ }
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+/* Portable scalar implementation: element-wise saturating Q31 addition
+ via __QADD, with optional 4x manual unrolling. */
+arm_status arm_mat_add_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix addition */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Total number of samples in input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+
+ /* Add, saturate and store result in destination buffer. */
+ *pOut++ = __QADD(*pInA++, *pInB++);
+
+ *pOut++ = __QADD(*pInA++, *pInB++);
+
+ *pOut++ = __QADD(*pInA++, *pInB++);
+
+ *pOut++ = __QADD(*pInA++, *pInB++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ /* Handles the 0..3 leftover samples (or all samples when not unrolling) */
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+
+ /* Add, saturate and store result in destination buffer. */
+ *pOut++ = __QADD(*pInA++, *pInB++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of MatrixAdd group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f16.c
new file mode 100644
index 0000000000..0fa259edb6
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f16.c
@@ -0,0 +1,253 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_cholesky_f16.c
+ * Description: Floating-point Cholesky decomposition
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixChol
+ @{
+ */
+
+/**
+ * @brief Floating-point Cholesky decomposition of positive-definite matrix.
+ * @param[in] pSrc points to the instance of the input floating-point matrix structure.
+ * @param[out] pDst points to the instance of the output floating-point matrix structure.
+ * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+ * @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ - \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
+ * @par
+ * If the matrix is ill conditioned or only semi-definite, then it is better to use the LDL^t decomposition.
+ * The decomposition of A returns a lower triangular matrix U such that A = U U^t
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/* Helium/MVE path: column-wise lower-triangular Cholesky (A = G G^t).
+ Rows are blocked four at a time; the k-loop dot products use zeroing
+ predicated loads (vldrhq_z) so no scalar remainder loop is needed. */
+arm_status arm_mat_cholesky_f16(
+ const arm_matrix_instance_f16 * pSrc,
+ arm_matrix_instance_f16 * pDst)
+{
+
+ arm_status status; /* status of matrix inverse */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ int i,j,k;
+ int n = pSrc->numRows;
+ _Float16 invSqrtVj;
+ float16_t *pA,*pG;
+ int kCnt;
+
+ mve_pred16_t p0;
+
+ f16x8_t acc, acc0, acc1, acc2, acc3;
+ f16x8_t vecGi;
+ f16x8_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
+
+
+ pA = pSrc->pData;
+ pG = pDst->pData;
+
+ for(i=0 ;i < n ; i++)
+ {
+ /* 4-row block: compute G[j..j+3][i] = A[j..j+3][i] - sum_k G[i][k]*G[j..j+3][k] */
+ for(j=i ; j+3 < n ; j+=4)
+ {
+ /* seed lane 0 of each accumulator with the A entry; remaining lanes stay 0 */
+ acc0 = vdupq_n_f16(0.0f16);
+ acc0[0]=pA[(j + 0) * n + i];
+
+ acc1 = vdupq_n_f16(0.0f16);
+ acc1[0]=pA[(j + 1) * n + i];
+
+ acc2 = vdupq_n_f16(0.0f16);
+ acc2[0]=pA[(j + 2) * n + i];
+
+ acc3 = vdupq_n_f16(0.0f16);
+ acc3[0]=pA[(j + 3) * n + i];
+
+ /* kCnt counts remaining k elements; vctp16q disables lanes past it */
+ kCnt = i;
+ for(k=0; k < i ; k+=8)
+ {
+ p0 = vctp16q(kCnt);
+
+ vecGi=vldrhq_z_f16(&pG[i * n + k],p0);
+
+ vecGj0=vldrhq_z_f16(&pG[(j + 0) * n + k],p0);
+ vecGj1=vldrhq_z_f16(&pG[(j + 1) * n + k],p0);
+ vecGj2=vldrhq_z_f16(&pG[(j + 2) * n + k],p0);
+ vecGj3=vldrhq_z_f16(&pG[(j + 3) * n + k],p0);
+
+ /* predicated fused multiply-subtract: acc -= vecGi * vecGj */
+ acc0 = vfmsq_m(acc0, vecGi, vecGj0, p0);
+ acc1 = vfmsq_m(acc1, vecGi, vecGj1, p0);
+ acc2 = vfmsq_m(acc2, vecGi, vecGj2, p0);
+ acc3 = vfmsq_m(acc3, vecGi, vecGj3, p0);
+
+ kCnt -= 8;
+ }
+ /* horizontal reduction of each 8-lane accumulator to one scalar */
+ pG[(j + 0) * n + i] = vecAddAcrossF16Mve(acc0);
+ pG[(j + 1) * n + i] = vecAddAcrossF16Mve(acc1);
+ pG[(j + 2) * n + i] = vecAddAcrossF16Mve(acc2);
+ pG[(j + 3) * n + i] = vecAddAcrossF16Mve(acc3);
+ }
+
+ /* leftover rows (fewer than 4 remaining) handled one at a time */
+ for(; j < n ; j++)
+ {
+
+ kCnt = i;
+ acc = vdupq_n_f16(0.0f16);
+ acc[0] = pA[j * n + i];
+
+ for(k=0; k < i ; k+=8)
+ {
+ p0 = vctp16q(kCnt);
+
+ vecGi=vldrhq_z_f16(&pG[i * n + k],p0);
+ vecGj=vldrhq_z_f16(&pG[j * n + k],p0);
+
+ acc = vfmsq_m(acc, vecGi, vecGj,p0);
+
+ kCnt -= 8;
+ }
+ pG[j * n + i] = vecAddAcrossF16Mve(acc);
+ }
+
+ /* non-positive pivot: matrix is not positive definite */
+ if (pG[i * n + i] <= 0.0f16)
+ {
+ return(ARM_MATH_DECOMPOSITION_FAILURE);
+ }
+
+ /* 1/sqrt of the pivot; sqrtf computes in float32, result narrows to _Float16 */
+ invSqrtVj = (_Float16)1.0f/sqrtf(pG[i * n + i]);
+ for(j=i; j < n ; j++)
+ {
+ pG[j * n + i] = (_Float16)pG[j * n + i] * invSqrtVj ;
+ }
+ }
+
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+/* Portable scalar Cholesky (Cholesky-Crout, column by column):
+ writes the lower-triangular factor G into pDst such that A = G G^t.
+ Only the lower triangle of pDst is written. */
+arm_status arm_mat_cholesky_f16(
+ const arm_matrix_instance_f16 * pSrc,
+ arm_matrix_instance_f16 * pDst)
+{
+
+ arm_status status; /* status of matrix inverse */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ int i,j,k;
+ int n = pSrc->numRows;
+ float16_t invSqrtVj;
+ float16_t *pA,*pG;
+
+ pA = pSrc->pData;
+ pG = pDst->pData;
+
+
+ for(i=0 ; i < n ; i++)
+ {
+ /* column i: G[j][i] = A[j][i] - sum_{k<i} G[i][k] * G[j][k] */
+ for(j=i ; j < n ; j++)
+ {
+ pG[j * n + i] = pA[j * n + i];
+
+ for(k=0; k < i ; k++)
+ {
+ pG[j * n + i] = pG[j * n + i] - pG[i * n + k] * pG[j * n + k];
+ }
+ }
+
+ /* non-positive pivot: matrix is not positive definite */
+ if (pG[i * n + i] <= 0.0f)
+ {
+ return(ARM_MATH_DECOMPOSITION_FAILURE);
+ }
+
+ /* scale column i by 1/sqrt(pivot); sqrtf computes in float32 */
+ invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
+ for(j=i ; j < n ; j++)
+ {
+ pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
+ }
+ }
+
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
+}
+
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixChol group
+ */
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f32.c
new file mode 100644
index 0000000000..a5e76791e9
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f32.c
@@ -0,0 +1,436 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_cholesky_f32.c
+ * Description: Floating-point Cholesky decomposition
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @defgroup MatrixChol Cholesky and LDLT decompositions
+
+ Computes the Cholesky or LDL^t decomposition of a matrix.
+
+
+ If the input matrix does not have a decomposition, then the
+ algorithm terminates and returns error status ARM_MATH_DECOMPOSITION_FAILURE.
+ */
+
+/**
+ @addtogroup MatrixChol
+ @{
+ */
+
+/**
+ * @brief Floating-point Cholesky decomposition of positive-definite matrix.
+ * @param[in] pSrc points to the instance of the input floating-point matrix structure.
+ * @param[out] pDst points to the instance of the output floating-point matrix structure.
+ * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+ * @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ - \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
+ * @par
+ * If the matrix is ill conditioned or only semi-definite, then it is better to use the LDL^t decomposition.
+ * The decomposition of A returns a lower triangular matrix U such that A = U U^t.
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/* Helium (MVE) float32 Cholesky decomposition. Rows of the output column are
+ processed four at a time; the dot products over k use tail-predicated
+ vector loops so no scalar remainder loop is needed. */
+arm_status arm_mat_cholesky_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ arm_matrix_instance_f32 * pDst)
+{
+
+ arm_status status; /* status of matrix inverse */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ int i,j,k;
+ int n = pSrc->numRows;
+ float32_t invSqrtVj;
+ float32_t *pA,*pG;
+ int kCnt;
+
+ mve_pred16_t p0;
+
+ f32x4_t acc, acc0, acc1, acc2, acc3;
+ f32x4_t vecGi;
+ f32x4_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
+
+
+ pA = pSrc->pData;
+ pG = pDst->pData;
+
+ for(i=0 ;i < n ; i++)
+ {
+ /* Main block: compute four rows (j..j+3) of column i at once. */
+ for(j=i ; j+3 < n ; j+=4)
+ {
+ pG[(j + 0) * n + i] = pA[(j + 0) * n + i];
+ pG[(j + 1) * n + i] = pA[(j + 1) * n + i];
+ pG[(j + 2) * n + i] = pA[(j + 2) * n + i];
+ pG[(j + 3) * n + i] = pA[(j + 3) * n + i];
+
+ kCnt = i;
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ acc2 = vdupq_n_f32(0.0f);
+ acc3 = vdupq_n_f32(0.0f);
+
+ /* kCnt counts the remaining k elements down by 4 each pass;
+ vctp32q(kCnt) enables min(kCnt, 4) lanes, so the last partial
+ group is handled by predication instead of a scalar tail. */
+ for(k=0; k < i ; k+=4)
+ {
+ p0 = vctp32q(kCnt);
+
+ vecGi=vldrwq_z_f32(&pG[i * n + k],p0);
+
+ vecGj0=vldrwq_z_f32(&pG[(j + 0) * n + k],p0);
+ vecGj1=vldrwq_z_f32(&pG[(j + 1) * n + k],p0);
+ vecGj2=vldrwq_z_f32(&pG[(j + 2) * n + k],p0);
+ vecGj3=vldrwq_z_f32(&pG[(j + 3) * n + k],p0);
+
+ acc0 = vfmaq_m(acc0, vecGi, vecGj0, p0);
+ acc1 = vfmaq_m(acc1, vecGi, vecGj1, p0);
+ acc2 = vfmaq_m(acc2, vecGi, vecGj2, p0);
+ acc3 = vfmaq_m(acc3, vecGi, vecGj3, p0);
+
+ kCnt -= 4;
+ }
+ /* Horizontal reduction of each accumulator gives the dot product
+ sum_{k<i} G[i][k]*G[j][k] to subtract. */
+ pG[(j + 0) * n + i] -= vecAddAcrossF32Mve(acc0);
+ pG[(j + 1) * n + i] -= vecAddAcrossF32Mve(acc1);
+ pG[(j + 2) * n + i] -= vecAddAcrossF32Mve(acc2);
+ pG[(j + 3) * n + i] -= vecAddAcrossF32Mve(acc3);
+ }
+
+ /* Remaining 1..3 rows of the column, one at a time. */
+ for(; j < n ; j++)
+ {
+ pG[j * n + i] = pA[j * n + i];
+
+ kCnt = i;
+ acc = vdupq_n_f32(0.0f);
+
+ for(k=0; k < i ; k+=4)
+ {
+ p0 = vctp32q(kCnt);
+
+ vecGi=vldrwq_z_f32(&pG[i * n + k],p0);
+ vecGj=vldrwq_z_f32(&pG[j * n + k],p0);
+
+ acc = vfmaq_m(acc, vecGi, vecGj,p0);
+
+ kCnt -= 4;
+ }
+ pG[j * n + i] -= vecAddAcrossF32Mve(acc);
+ }
+
+ /* Non-positive pivot: input is not positive definite. */
+ if (pG[i * n + i] <= 0.0f)
+ {
+ return(ARM_MATH_DECOMPOSITION_FAILURE);
+ }
+
+ invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
+ for(j=i; j < n ; j++)
+ {
+ pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
+ }
+ }
+
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/* NEON float32 Cholesky decomposition. Same Cholesky-Crout scheme as the
+ scalar version, with the inner dot products vectorized 4-wide and the
+ non-multiple-of-4 remainder handled by scalar tail loops. */
+arm_status arm_mat_cholesky_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ arm_matrix_instance_f32 * pDst)
+{
+
+ arm_status status; /* status of matrix inverse */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ int i,j,k;
+ int n = pSrc->numRows;
+ float32_t invSqrtVj;
+ float32_t *pA,*pG;
+ int kCnt;
+
+
+ f32x4_t acc, acc0, acc1, acc2, acc3;
+ f32x4_t vecGi;
+ f32x4_t vecGj,vecGj0,vecGj1,vecGj2,vecGj3;
+ f32x2_t tmp = vdup_n_f32(0);
+ float32_t sum=0.0f;
+ float32_t sum0=0.0f,sum1=0.0f,sum2=0.0f,sum3=0.0f;
+
+
+ pA = pSrc->pData;
+ pG = pDst->pData;
+
+ for(i=0 ;i < n ; i++)
+ {
+ /* Main block: four rows (j..j+3) of column i at a time. */
+ for(j=i ; j+3 < n ; j+=4)
+ {
+ pG[(j + 0) * n + i] = pA[(j + 0) * n + i];
+ pG[(j + 1) * n + i] = pA[(j + 1) * n + i];
+ pG[(j + 2) * n + i] = pA[(j + 2) * n + i];
+ pG[(j + 3) * n + i] = pA[(j + 3) * n + i];
+
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ acc2 = vdupq_n_f32(0.0f);
+ acc3 = vdupq_n_f32(0.0f);
+
+ /* Vectorized part of the k-dot-product: i >> 2 full groups of 4. */
+ kCnt = i >> 2;
+ k=0;
+ while(kCnt > 0)
+ {
+
+ vecGi=vld1q_f32(&pG[i * n + k]);
+
+ vecGj0=vld1q_f32(&pG[(j + 0) * n + k]);
+ vecGj1=vld1q_f32(&pG[(j + 1) * n + k]);
+ vecGj2=vld1q_f32(&pG[(j + 2) * n + k]);
+ vecGj3=vld1q_f32(&pG[(j + 3) * n + k]);
+
+ acc0 = vfmaq_f32(acc0, vecGi, vecGj0);
+ acc1 = vfmaq_f32(acc1, vecGi, vecGj1);
+ acc2 = vfmaq_f32(acc2, vecGi, vecGj2);
+ acc3 = vfmaq_f32(acc3, vecGi, vecGj3);
+
+ kCnt--;
+ k+=4;
+ }
+
+/* Horizontal add of each 4-lane accumulator. AArch64 has vpadds_f32 to
+ reduce a pair to a scalar directly; AArch32 reduces via vpadd_f32 and
+ two lane extractions. */
+#if __aarch64__
+ sum0 = vpadds_f32(vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0)));
+ sum1 = vpadds_f32(vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1)));
+ sum2 = vpadds_f32(vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2)));
+ sum3 = vpadds_f32(vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3)));
+
+#else
+ tmp = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
+ sum0 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
+
+ tmp = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
+ sum1 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
+
+ tmp = vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2));
+ sum2 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
+
+ tmp = vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3));
+ sum3 = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
+#endif
+
+ /* Scalar tail: remaining i & 3 elements of the dot product. */
+ kCnt = i & 3;
+ while(kCnt > 0)
+ {
+
+ sum0 = sum0 + pG[i * n + k] * pG[(j + 0) * n + k];
+ sum1 = sum1 + pG[i * n + k] * pG[(j + 1) * n + k];
+ sum2 = sum2 + pG[i * n + k] * pG[(j + 2) * n + k];
+ sum3 = sum3 + pG[i * n + k] * pG[(j + 3) * n + k];
+ kCnt--;
+ k++;
+ }
+
+ pG[(j + 0) * n + i] -= sum0;
+ pG[(j + 1) * n + i] -= sum1;
+ pG[(j + 2) * n + i] -= sum2;
+ pG[(j + 3) * n + i] -= sum3;
+ }
+
+ /* Remaining 1..3 rows of the column, one at a time. */
+ for(; j < n ; j++)
+ {
+ pG[j * n + i] = pA[j * n + i];
+
+ acc = vdupq_n_f32(0.0f);
+
+ kCnt = i >> 2;
+ k=0;
+ while(kCnt > 0)
+ {
+
+ vecGi=vld1q_f32(&pG[i * n + k]);
+ vecGj=vld1q_f32(&pG[j * n + k]);
+
+ acc = vfmaq_f32(acc, vecGi, vecGj);
+
+ kCnt--;
+ k+=4;
+ }
+
+#if __aarch64__
+ sum = vpadds_f32(vpadd_f32(vget_low_f32(acc), vget_high_f32(acc)));
+#else
+ tmp = vpadd_f32(vget_low_f32(acc), vget_high_f32(acc));
+ sum = vget_lane_f32(tmp, 0) + vget_lane_f32(tmp, 1);
+#endif
+
+ kCnt = i & 3;
+ while(kCnt > 0)
+ {
+ sum = sum + pG[i * n + k] * pG[(j + 0) * n + k];
+
+
+ kCnt--;
+ k++;
+ }
+
+ pG[j * n + i] -= sum;
+ }
+
+ /* Non-positive pivot: input is not positive definite. */
+ if (pG[i * n + i] <= 0.0f)
+ {
+ return(ARM_MATH_DECOMPOSITION_FAILURE);
+ }
+
+ invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
+ for(j=i; j < n ; j++)
+ {
+ pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
+ }
+ }
+
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+/* Portable scalar float32 Cholesky decomposition (fallback when neither MVE
+ nor NEON is available): pDst receives the lower-triangular factor G with
+ A = G G^t. */
+arm_status arm_mat_cholesky_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ arm_matrix_instance_f32 * pDst)
+{
+
+ arm_status status; /* status of matrix inverse */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ int i,j,k;
+ int n = pSrc->numRows;
+ float32_t invSqrtVj;
+ float32_t *pA,*pG;
+
+ pA = pSrc->pData;
+ pG = pDst->pData;
+
+
+ /* Cholesky-Crout, column by column: first form the unnormalized column
+ G[j][i] = A[j][i] - sum_{k<i} G[i][k]*G[j][k], then scale the column
+ by the reciprocal square root of its diagonal entry. */
+ for(i=0 ; i < n ; i++)
+ {
+ for(j=i ; j < n ; j++)
+ {
+ pG[j * n + i] = pA[j * n + i];
+
+ for(k=0; k < i ; k++)
+ {
+ pG[j * n + i] = pG[j * n + i] - pG[i * n + k] * pG[j * n + k];
+ }
+ }
+
+ /* A non-positive pivot means the matrix is not positive definite:
+ no Cholesky factor exists, abort immediately. */
+ if (pG[i * n + i] <= 0.0f)
+ {
+ return(ARM_MATH_DECOMPOSITION_FAILURE);
+ }
+
+ invSqrtVj = 1.0f/sqrtf(pG[i * n + i]);
+ for(j=i ; j < n ; j++)
+ {
+ pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
+ }
+ }
+
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixChol group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f64.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f64.c
new file mode 100644
index 0000000000..514ea95b9c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cholesky_f64.c
@@ -0,0 +1,122 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_cholesky_f64.c
+ * Description: Floating-point Cholesky decomposition
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixChol
+ @{
+ */
+
+/**
+ * @brief Floating-point Cholesky decomposition of positive-definite matrix.
+ * @param[in] pSrc points to the instance of the input floating-point matrix structure.
+ * @param[out] pDst points to the instance of the output floating-point matrix structure.
+ * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+ * @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ - \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
+ * @par
+ * If the matrix is ill conditioned or only semi-definite, then it is better to use the LDL^t decomposition.
+ * The decomposition of A returns a lower triangular matrix U such that A = U U^t.
+ */
+
+
+/* Scalar float64 Cholesky decomposition: pDst receives the lower-triangular
+ factor G with A = G G^t. */
+arm_status arm_mat_cholesky_f64(
+ const arm_matrix_instance_f64 * pSrc,
+ arm_matrix_instance_f64 * pDst)
+{
+
+ arm_status status; /* status of matrix inverse */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ int i,j,k;
+ int n = pSrc->numRows;
+ float64_t invSqrtVj;
+ float64_t *pA,*pG;
+
+ pA = pSrc->pData;
+ pG = pDst->pData;
+
+
+ /* Cholesky-Crout, column by column: first form the unnormalized column
+ G[j][i] = A[j][i] - sum_{k<i} G[i][k]*G[j][k], then scale the column
+ by the reciprocal square root of its diagonal entry. */
+ for(i=0 ; i < n ; i++)
+ {
+ for(j=i ; j < n ; j++)
+ {
+ pG[j * n + i] = pA[j * n + i];
+
+ for(k=0; k < i ; k++)
+ {
+ pG[j * n + i] = pG[j * n + i] - pG[i * n + k] * pG[j * n + k];
+ }
+ }
+
+ /* A non-positive pivot means the matrix is not positive definite.
+ Use a double-precision literal for consistency with the
+ double-precision arithmetic in this function. */
+ if (pG[i * n + i] <= 0.0)
+ {
+ return(ARM_MATH_DECOMPOSITION_FAILURE);
+ }
+
+ /* Double-precision reciprocal square root of the pivot. */
+ invSqrtVj = 1.0/sqrt(pG[i * n + i]);
+ for(j=i ; j < n ; j++)
+ {
+ pG[j * n + i] = pG[j * n + i] * invSqrtVj ;
+ }
+ }
+
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ @} end of MatrixChol group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c
new file mode 100644
index 0000000000..38175514b0
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f16.c
@@ -0,0 +1,932 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_cmplx_mult_f16.c
+ * Description: Floating-point matrix multiplication
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupMatrix
+ */
+
+
+/**
+ @addtogroup CmplxMatrixMult
+ @{
+ */
+
+/**
+ @brief Floating-point Complex matrix multiplication.
+ @param[in] pSrcA points to first input complex matrix structure
+ @param[in] pSrcB points to second input complex matrix structure
+ @param[out] pDst points to output complex matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_mat_cmplx_mult_f16 built. Helium version has build issues with gcc."
+#endif
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
+
+#include "arm_helium_utils.h"
+
+#define DONTCARE 0 /* inactive lane content */
+
+
+/* Helium specialization: 2x2 complex f16 matrix multiply. Each 8-lane f16
+ vector holds one full row of A (2 complex values) replicated across two
+ column blocks of B gathered via offsetB0. */
+__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_2x2_mve(
+ const arm_matrix_instance_f16 * pSrcA,
+ const arm_matrix_instance_f16 * pSrcB,
+ arm_matrix_instance_f16 * pDst)
+{
+ const uint16_t MATRIX_DIM = 2;
+ float16_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
+ float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float16_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint16x8_t vecColBOffs0,vecColAOffs0,vecColAOffs1;
+ float16_t *pInA0 = pInA;
+ f16x8_t acc0, acc1;
+ f16x8_t vecB, vecA0, vecA1;
+ f16x8_t vecTmp;
+ uint16_t tmp;
+ /* Gather offsets: lanes 0-3 pick column 0 of B (both rows), lanes 4-7
+ pick column 1, so one gather loads all of B in column-major pairs. */
+ static const uint16_t offsetB0[8] = { 0, 1,
+ MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
+ 2, 3,
+ MATRIX_DIM * CMPLX_DIM + 2 , MATRIX_DIM * CMPLX_DIM + 3,
+ };
+
+
+ vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
+
+ tmp = 0;
+ vecColAOffs0 = viwdupq_u16(tmp, 4, 1); /* {0,1,2,3,0,1,2,3}: row 0 of A twice */
+
+ /* NOTE(review): this store to tmp is dead — vecColAOffs1 is computed from
+ the constant expression directly on the next line. */
+ tmp = (CMPLX_DIM * MATRIX_DIM);
+ vecColAOffs1 = vecColAOffs0 + (uint16_t)(CMPLX_DIM * MATRIX_DIM);
+
+
+ pInB = (float16_t const *)pSrcB->pData;
+
+ vecA0 = vldrhq_gather_shifted_offset_f16(pInA0, vecColAOffs0);
+ vecA1 = vldrhq_gather_shifted_offset_f16(pInA0, vecColAOffs1);
+
+
+ vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
+
+ /* vcmulq + vcmlaq_rot90 together form the full complex product
+ (re*re - im*im, re*im + im*re) per lane pair. */
+ acc0 = vcmulq(vecA0, vecB);
+ acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
+
+ acc1 = vcmulq(vecA1, vecB);
+ acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
+
+
+ /*
+ * Compute
+ * re0+re1 | im0+im1 | re0+re1 | im0+im1
+ * re2+re3 | im2+im3 | re2+re3 | im2+im3
+ */
+
+ vecTmp = (f16x8_t) vrev64q_s32((int32x4_t) acc0);
+ vecTmp = vaddq(vecTmp, acc0);
+
+
+ /* Each 32-bit store writes one complex f16 pair (re,im) at once.
+ NOTE(review): this type-puns f16 lanes through float32_t pointers —
+ presumably safe with the supported compilers; confirm aliasing rules. */
+ *(float32_t *)(&pOut[0 * CMPLX_DIM * MATRIX_DIM]) = ((f32x4_t)vecTmp)[0];
+ *(float32_t *)(&pOut[0 * CMPLX_DIM * MATRIX_DIM + CMPLX_DIM]) = ((f32x4_t)vecTmp)[2];
+
+ vecTmp = (f16x8_t) vrev64q_s32((int32x4_t) acc1);
+ vecTmp = vaddq(vecTmp, acc1);
+
+ *(float32_t *)(&pOut[1 * CMPLX_DIM * MATRIX_DIM]) = ((f32x4_t)vecTmp)[0];
+ *(float32_t *)(&pOut[1 * CMPLX_DIM * MATRIX_DIM + CMPLX_DIM]) = ((f32x4_t)vecTmp)[2];
+
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+
+
+/* Helium specialization: 3x3 complex f16 matrix multiply. One gather loads a
+ full column of B (3 complex values = 6 lanes); the top 2 lanes are disabled
+ by predication. The column computation is repeated 3 times, once per
+ output column. */
+__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_3x3_mve(
+ const arm_matrix_instance_f16 * pSrcA,
+ const arm_matrix_instance_f16 * pSrcB,
+ arm_matrix_instance_f16 * pDst)
+{
+ const uint16_t MATRIX_DIM = 3;
+ float16_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
+ float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float16_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint16x8_t vecColBOffs0;
+ float16_t *pInA0 = pInA;
+ float16_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM;
+ float16_t *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM;
+ f16x8_t acc0, acc1, acc2;
+ f16x8_t vecB, vecA0, vecA1, vecA2;
+ /* Gather offsets for one column of B: (row0 re,im), (row1 re,im),
+ (row2 re,im); the last two lanes are don't-care (predicated off). */
+ static const uint16_t offsetB0[8] = { 0, 1,
+ MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
+ 2 * MATRIX_DIM * CMPLX_DIM, 2 * MATRIX_DIM * CMPLX_DIM + 1,
+ DONTCARE, DONTCARE
+ };
+
+
+ /* enable predication to disable upper half complex vector element */
+ mve_pred16_t p0 = vctp16q(MATRIX_DIM * CMPLX_DIM);
+
+ vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
+
+ pInB = (float16_t const *)pSrcB->pData;
+
+ /* Rows of A stay resident in registers across all three columns of B. */
+ vecA0 = vldrhq_f16(pInA0);
+ vecA1 = vldrhq_f16(pInA1);
+ vecA2 = vldrhq_f16(pInA2);
+
+ /* --- output column 0 --- */
+ vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
+
+ /* vcmulq + vcmlaq_rot90 form the full complex product per lane pair. */
+ acc0 = vcmulq(vecA0, vecB);
+ acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
+
+ acc1 = vcmulq(vecA1, vecB);
+ acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
+
+ acc2 = vcmulq(vecA2, vecB);
+ acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
+
+ mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
+ pOut += CMPLX_DIM;
+ /*
+ * move to next B column
+ */
+ pInB = pInB + CMPLX_DIM;
+
+ /* --- output column 1 --- */
+ vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
+
+ acc0 = vcmulq(vecA0, vecB);
+ acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
+
+ acc1 = vcmulq(vecA1, vecB);
+ acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
+
+ acc2 = vcmulq(vecA2, vecB);
+ acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
+
+ mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
+ pOut += CMPLX_DIM;
+ /*
+ * move to next B column
+ */
+ pInB = pInB + CMPLX_DIM;
+
+ /* --- output column 2 --- */
+ vecB = vldrhq_gather_shifted_offset_z(pInB, vecColBOffs0, p0);
+
+ acc0 = vcmulq(vecA0, vecB);
+ acc0 = vcmlaq_rot90(acc0, vecA0, vecB);
+
+ acc1 = vcmulq(vecA1, vecB);
+ acc1 = vcmlaq_rot90(acc1, vecA1, vecB);
+
+ acc2 = vcmulq(vecA2, vecB);
+ acc2 = vcmlaq_rot90(acc2, vecA2, vecB);
+
+ mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+
+
+
+/* Helium specialization: 4x4 complex f16 matrix multiply. A full 8-lane f16
+ vector holds exactly one row of A (4 complex values), so no predication is
+ needed; the per-column computation is repeated 4 times. */
+__STATIC_FORCEINLINE arm_status arm_mat_cmplx_mult_f16_4x4_mve(
+ const arm_matrix_instance_f16 * pSrcA,
+ const arm_matrix_instance_f16 * pSrcB,
+ arm_matrix_instance_f16 * pDst)
+{
+ const uint16_t MATRIX_DIM = 4;
+ float16_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
+ float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float16_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint16x8_t vecColBOffs0;
+ float16_t *pInA0 = pInA;
+ float16_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM;
+ float16_t *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM;
+ float16_t *pInA3 = pInA2 + CMPLX_DIM * MATRIX_DIM;
+ f16x8_t acc0, acc1, acc2, acc3;
+ f16x8_t vecB, vecA;
+ /* Gather offsets for one column of B: (re,im) of rows 0..3. */
+ static const uint16_t offsetB0[8] = { 0, 1,
+ MATRIX_DIM * CMPLX_DIM, MATRIX_DIM * CMPLX_DIM + 1,
+ 2 * MATRIX_DIM * CMPLX_DIM, 2 * MATRIX_DIM * CMPLX_DIM + 1,
+ 3 * MATRIX_DIM * CMPLX_DIM, 3 * MATRIX_DIM * CMPLX_DIM + 1
+ };
+
+ vecColBOffs0 = vldrhq_u16((uint16_t const *) offsetB0);
+
+ pInB = (float16_t const *)pSrcB->pData;
+
+ /* --- output column 0 --- */
+ vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
+
+ /* vcmulq + vcmlaq_rot90 form the full complex product per lane pair. */
+ vecA = vldrhq_f16(pInA0);
+ acc0 = vcmulq(vecA, vecB);
+ acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+ vecA = vldrhq_f16(pInA1);
+ acc1 = vcmulq(vecA, vecB);
+ acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+ vecA = vldrhq_f16(pInA2);
+ acc2 = vcmulq(vecA, vecB);
+ acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+ vecA = vldrhq_f16(pInA3);
+ acc3 = vcmulq(vecA, vecB);
+ acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+
+ mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
+ pOut += CMPLX_DIM;
+ /*
+ * move to next B column
+ */
+ pInB = pInB + CMPLX_DIM;
+
+ /* --- output column 1 --- */
+ vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
+
+ vecA = vldrhq_f16(pInA0);
+ acc0 = vcmulq(vecA, vecB);
+ acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+ vecA = vldrhq_f16(pInA1);
+ acc1 = vcmulq(vecA, vecB);
+ acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+ vecA = vldrhq_f16(pInA2);
+ acc2 = vcmulq(vecA, vecB);
+ acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+ vecA = vldrhq_f16(pInA3);
+ acc3 = vcmulq(vecA, vecB);
+ acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+
+ mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
+ pOut += CMPLX_DIM;
+ /*
+ * move to next B column
+ */
+ pInB = pInB + CMPLX_DIM;
+
+ /* --- output column 2 --- */
+ vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
+
+ vecA = vldrhq_f16(pInA0);
+ acc0 = vcmulq(vecA, vecB);
+ acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+ vecA = vldrhq_f16(pInA1);
+ acc1 = vcmulq(vecA, vecB);
+ acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+ vecA = vldrhq_f16(pInA2);
+ acc2 = vcmulq(vecA, vecB);
+ acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+ vecA = vldrhq_f16(pInA3);
+ acc3 = vcmulq(vecA, vecB);
+ acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+
+ mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
+ pOut += CMPLX_DIM;
+ /*
+ * move to next B column
+ */
+ pInB = pInB + CMPLX_DIM;
+
+ /* --- output column 3 --- */
+ vecB = vldrhq_gather_shifted_offset(pInB, vecColBOffs0);
+
+ vecA = vldrhq_f16(pInA0);
+ acc0 = vcmulq(vecA, vecB);
+ acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+ vecA = vldrhq_f16(pInA1);
+ acc1 = vcmulq(vecA, vecB);
+ acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+ vecA = vldrhq_f16(pInA2);
+ acc2 = vcmulq(vecA, vecB);
+ acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+ vecA = vldrhq_f16(pInA3);
+ acc3 = vcmulq(vecA, vecB);
+ acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+
+ mve_cmplx_sum_intra_vec_f16(acc0, &pOut[0 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc1, &pOut[1 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc2, &pOut[2 * CMPLX_DIM * MATRIX_DIM]);
+ mve_cmplx_sum_intra_vec_f16(acc3, &pOut[3 * CMPLX_DIM * MATRIX_DIM]);
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+
+
+/* Helium (MVE) complex f16 matrix multiply for general sizes. Small square
+ matrices (1x1..4x4) dispatch to the specialized kernels above; otherwise
+ rows of A are processed 4 at a time, then 1 at a time, with columns of B
+ gathered 4 complex rows per vector. */
+arm_status arm_mat_cmplx_mult_f16(
+ const arm_matrix_instance_f16 * pSrcA,
+ const arm_matrix_instance_f16 * pSrcB,
+ arm_matrix_instance_f16 * pDst)
+{
+ float16_t const *pInB = (float16_t const *) pSrcB->pData; /* input data matrix pointer B */
+ float16_t const *pInA = (float16_t const *) pSrcA->pData; /* input data matrix pointer A */
+ float16_t *pOut = pDst->pData; /* output data matrix pointer */
+ float16_t *px; /* Temporary output data matrix pointer */
+ uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+ uint16_t col, i = 0U, row = numRowsA; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+ uint16x8_t vecOffs, vecColBOffs;
+ uint32_t blkCnt,rowCnt; /* loop counters */
+
+ #ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+
+ /*
+ * small squared matrix specialized routines
+ */
+ if (numRowsA == numColsB && numColsB == numColsA)
+ {
+ if (numRowsA == 1)
+ {
+ /* 1x1: single complex multiply done directly in scalar code. */
+ pOut[0] = pInA[0] * pInB[0] - pInA[1] * pInB[1];
+ pOut[1] = pInA[0] * pInB[1] + pInA[1] * pInB[0];
+ return (ARM_MATH_SUCCESS);
+ }
+ else if (numRowsA == 2)
+ return arm_mat_cmplx_mult_f16_2x2_mve(pSrcA, pSrcB, pDst);
+ else if (numRowsA == 3)
+ return arm_mat_cmplx_mult_f16_3x3_mve(pSrcA, pSrcB, pDst);
+ else if (numRowsA == 4)
+ return arm_mat_cmplx_mult_f16_4x4_mve(pSrcA, pSrcB, pDst);
+ }
+
+ /* Gather offsets selecting (re,im) of 4 consecutive rows of one B
+ column; runtime-dependent on numColsB, so built lane by lane. */
+ vecColBOffs[0] = 0;
+ vecColBOffs[1] = 1;
+ vecColBOffs[2] = numColsB * CMPLX_DIM;
+ vecColBOffs[3] = (numColsB * CMPLX_DIM) + 1;
+ vecColBOffs[4] = 2*numColsB * CMPLX_DIM;
+ vecColBOffs[5] = 2*(numColsB * CMPLX_DIM) + 1;
+ vecColBOffs[6] = 3*numColsB * CMPLX_DIM;
+ vecColBOffs[7] = 3*(numColsB * CMPLX_DIM) + 1;
+
+ /*
+ * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
+ */
+
+ /*
+ * row loop: 4 rows of A per iteration
+ */
+ rowCnt = row >> 2;
+ while (rowCnt > 0u)
+ {
+ /*
+ * Output pointer is set to starting address of the row being processed
+ */
+ px = pOut + i * CMPLX_DIM;
+ i = i + 4 * numColsB;
+ /*
+ * For every row wise process, the column loop counter is to be initiated
+ */
+ col = numColsB;
+ /*
+ * For every row wise process, the pInB pointer is set
+ * to the starting address of the pSrcB data
+ */
+ pInB = (float16_t const *) pSrcB->pData;
+ /*
+ * column loop
+ */
+ while (col > 0u)
+ {
+ /*
+ * generate 4 columns elements
+ */
+ /*
+ * Matrix A columns number of MAC operations are to be performed
+ */
+
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
+ float16_t const *pInA0 = pInA;
+ float16_t const *pInA1 = pInA0 + numColsA * CMPLX_DIM;
+ float16_t const *pInA2 = pInA1 + numColsA * CMPLX_DIM;
+ float16_t const *pInA3 = pInA2 + numColsA * CMPLX_DIM;
+ f16x8_t acc0, acc1, acc2, acc3;
+
+ acc0 = vdupq_n_f16(0.0f16);
+ acc1 = vdupq_n_f16(0.0f16);
+ acc2 = vdupq_n_f16(0.0f16);
+ acc3 = vdupq_n_f16(0.0f16);
+
+ pSrcA0Vec = (float16_t const *) pInA0;
+ pSrcA1Vec = (float16_t const *) pInA1;
+ pSrcA2Vec = (float16_t const *) pInA2;
+ pSrcA3Vec = (float16_t const *) pInA3;
+
+ vecOffs = vecColBOffs;
+
+ /*
+ * process 1 x 4 block output
+ */
+ blkCnt = (numColsA * CMPLX_DIM) >> 3;
+ while (blkCnt > 0U)
+ {
+ f16x8_t vecB, vecA;
+
+ vecB = vldrhq_gather_shifted_offset_f16(pInB, vecOffs);
+ /*
+ * move Matrix B read offsets, 4 rows down
+ */
+ vecOffs = vaddq_n_u16(vecOffs , (uint16_t) (numColsB * 4 * CMPLX_DIM));
+
+ /* vcmlaq + vcmlaq_rot90 accumulate the full complex
+ multiply-accumulate per lane pair. */
+ vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 8;
+ acc0 = vcmlaq(acc0, vecA, vecB);
+ acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+ vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 8;
+ acc1 = vcmlaq(acc1, vecA, vecB);
+ acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+ vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 8;
+ acc2 = vcmlaq(acc2, vecA, vecB);
+ acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+ vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 8;
+ acc3 = vcmlaq(acc3, vecA, vecB);
+ acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+ blkCnt--;
+ }
+ /*
+ * Unsupported addressing mode compiler crash
+ */
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = (numColsA * CMPLX_DIM) & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecB, vecA;
+
+ vecB = vldrhq_gather_shifted_offset_z_f16(pInB, vecOffs, p0);
+ /*
+ * move Matrix B read offsets, 4 rows down
+ */
+ vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
+
+ /* Tail A loads are plain (unpredicated): the predicated B gather
+ zeroes the inactive lanes, so the extra A lanes contribute 0. */
+ vecA = vld1q(pSrcA0Vec);
+ acc0 = vcmlaq(acc0, vecA, vecB);
+ acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+ vecA = vld1q(pSrcA1Vec);
+ acc1 = vcmlaq(acc1, vecA, vecB);
+ acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+ vecA = vld1q(pSrcA2Vec);
+ acc2 = vcmlaq(acc2, vecA, vecB);
+ acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+ vecA = vld1q(pSrcA3Vec);
+ acc3 = vcmlaq(acc3, vecA, vecB);
+ acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+ }
+
+
+ mve_cmplx_sum_intra_vec_f16(acc0, &px[0 * CMPLX_DIM * numColsB + 0]);
+ mve_cmplx_sum_intra_vec_f16(acc1, &px[1 * CMPLX_DIM * numColsB + 0]);
+ mve_cmplx_sum_intra_vec_f16(acc2, &px[2 * CMPLX_DIM * numColsB + 0]);
+ mve_cmplx_sum_intra_vec_f16(acc3, &px[3 * CMPLX_DIM * numColsB + 0]);
+
+ px += CMPLX_DIM;
+ /*
+ * Decrement the column loop counter
+ */
+ col--;
+ /*
+ * Update the pointer pInB to point to the starting address of the next column
+ */
+ pInB = (float16_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
+ }
+
+ /*
+ * Update the pointer pInA to point to the starting address of the next row
+ */
+ pInA += (numColsA * 4) * CMPLX_DIM;
+ /*
+ * Decrement the row loop counter
+ */
+ rowCnt --;
+
+ }
+
+ /* Remaining 1..3 rows of A, one at a time. */
+ rowCnt = row & 3;
+ while (rowCnt > 0u)
+ {
+ /*
+ * Output pointer is set to starting address of the row being processed
+ */
+ px = pOut + i * CMPLX_DIM;
+ i = i + numColsB;
+ /*
+ * For every row wise process, the column loop counter is to be initiated
+ */
+ col = numColsB;
+ /*
+ * For every row wise process, the pInB pointer is set
+ * to the starting address of the pSrcB data
+ */
+ pInB = (float16_t const *) pSrcB->pData;
+ /*
+ * column loop
+ */
+ while (col > 0u)
+ {
+ /*
+ * generate 4 columns elements
+ */
+ /*
+ * Matrix A columns number of MAC operations are to be performed
+ */
+
+ float16_t const *pSrcA0Vec;
+ float16_t const *pInA0 = pInA;
+ f16x8_t acc0;
+
+ acc0 = vdupq_n_f16(0.0f16);
+
+ pSrcA0Vec = (float16_t const *) pInA0;
+
+ vecOffs = vecColBOffs;
+
+ /*
+ * process 1 x 4 block output
+ */
+ blkCnt = (numColsA * CMPLX_DIM) >> 3;
+ while (blkCnt > 0U)
+ {
+ f16x8_t vecB, vecA;
+
+ vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
+ /*
+ * move Matrix B read offsets, 4 rows down
+ */
+ vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (4*numColsB * CMPLX_DIM));
+
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vcmlaq(acc0, vecA, vecB);
+ acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+
+ blkCnt--;
+ }
+
+
+ /*
+ * tail
+ */
+ blkCnt = (numColsA * CMPLX_DIM) & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecB, vecA;
+
+ vecB = vldrhq_gather_shifted_offset_z(pInB, vecOffs, p0);
+
+ vecA = vld1q(pSrcA0Vec);
+ acc0 = vcmlaq(acc0, vecA, vecB);
+ acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+ }
+
+ mve_cmplx_sum_intra_vec_f16(acc0, &px[0]);
+
+
+ px += CMPLX_DIM;
+ /*
+ * Decrement the column loop counter
+ */
+ col--;
+ /*
+ * Update the pointer pInB to point to the starting address of the next column
+ */
+ pInB = (float16_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
+ }
+
+ /*
+ * Update the pointer pInA to point to the starting address of the next row
+ */
+ pInA += numColsA * CMPLX_DIM;
+ rowCnt--;
+ }
+
+ /*
+ * set status as ARM_MATH_SUCCESS
+ */
+ status = ARM_MATH_SUCCESS;
+ }
+ /*
+ * Return to application
+ */
+ return (status);
+}
+#else
+
+/* Scalar (non-Helium) fallback: each output element is the complex dot
+   product of one row of A with one column of B, computed as
+   (a+jb)(c+jd) = (ac - bd) + j(bc + ad). */
+arm_status arm_mat_cmplx_mult_f16(
+  const arm_matrix_instance_f16 * pSrcA,
+  const arm_matrix_instance_f16 * pSrcB,
+        arm_matrix_instance_f16 * pDst)
+{
+  float16_t *pIn1 = pSrcA->pData;                /* Input data matrix pointer A */
+  float16_t *pIn2 = pSrcB->pData;                /* Input data matrix pointer B */
+  float16_t *pInA = pSrcA->pData;                /* Input data matrix pointer A */
+  float16_t *pOut = pDst->pData;                 /* Output data matrix pointer */
+  float16_t *px;                                 /* Temporary output data matrix pointer */
+  uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
+  uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
+  uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
+  _Float16 sumReal, sumImag;                     /* Accumulator (real and imaginary parts) */
+  _Float16 a1, b1, c1, d1;                       /* a,b: re/im of A element; c,d: re/im of B element */
+  uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
+  arm_status status;                             /* status of matrix multiplication */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  _Float16 a0, b0, c0, d0;                       /* second working set for the unrolled loop */
+#endif
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows) ||
+      (pSrcB->numCols != pDst->numCols)  )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+    /* row loop */
+    do
+    {
+      /* Output pointer is set to starting address of the row being processed
+         (factor 2 accounts for interleaved real/imag storage) */
+      px = pOut + 2 * i;
+
+      /* For every row wise process, the column loop counter is to be initiated */
+      col = numColsB;
+
+      /* For every row wise process, the pIn2 pointer is set
+       ** to the starting address of the pSrcB data */
+      pIn2 = pSrcB->pData;
+
+      j = 0U;
+
+      /* column loop */
+      do
+      {
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sumReal = 0.0f16;
+        sumImag = 0.0f16;
+
+        /* Initiate pointer pIn1 to point to starting address of column being processed */
+        pIn1 = pInA;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+        /* Apply loop unrolling and compute 4 MACs simultaneously. */
+        colCnt = numColsA >> 2U;
+
+        /* matrix multiplication */
+        while (colCnt > 0U)
+        {
+
+          /* Reading real part of complex matrix A */
+          a0 = *pIn1;
+
+          /* Reading real part of complex matrix B */
+          c0 = *pIn2;
+
+          /* Reading imaginary part of complex matrix A */
+          b0 = *(pIn1 + 1U);
+
+          /* Reading imaginary part of complex matrix B */
+          d0 = *(pIn2 + 1U);
+
+          /* Multiply and Accumulate */
+          sumReal += a0 * c0;
+          sumImag += b0 * c0;
+
+          /* update pointers: A advances one complex element, B one complex row */
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          /* Multiply and Accumulate */
+          sumReal -= b0 * d0;
+          sumImag += a0 * d0;
+
+          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+          /* read real and imag values from pSrcA and pSrcB buffer */
+          a1 = *(pIn1     );
+          c1 = *(pIn2     );
+          b1 = *(pIn1 + 1U);
+          d1 = *(pIn2 + 1U);
+
+          /* Multiply and Accumulate */
+          sumReal += a1 * c1;
+          sumImag += b1 * c1;
+
+          /* update pointers */
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          /* Multiply and Accumulate */
+          sumReal -= b1 * d1;
+          sumImag += a1 * d1;
+
+          a0 = *(pIn1     );
+          c0 = *(pIn2     );
+          b0 = *(pIn1 + 1U);
+          d0 = *(pIn2 + 1U);
+
+          /* Multiply and Accumulate */
+          sumReal += a0 * c0;
+          sumImag += b0 * c0;
+
+          /* update pointers */
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          /* Multiply and Accumulate */
+          sumReal -= b0 * d0;
+          sumImag += a0 * d0;
+
+          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+          a1 = *(pIn1     );
+          c1 = *(pIn2     );
+          b1 = *(pIn1 + 1U);
+          d1 = *(pIn2 + 1U);
+
+          /* Multiply and Accumulate */
+          sumReal += a1 * c1;
+          sumImag += b1 * c1;
+
+          /* update pointers */
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          /* Multiply and Accumulate */
+          sumReal -= b1 * d1;
+          sumImag += a1 * d1;
+
+          /* Decrement loop count */
+          colCnt--;
+        }
+
+        /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+         ** No loop unrolling is used. */
+        colCnt = numColsA % 0x4U;
+
+#else
+
+        /* Initialize blkCnt with number of samples */
+        colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+          a1 = *(pIn1     );
+          c1 = *(pIn2     );
+          b1 = *(pIn1 + 1U);
+          d1 = *(pIn2 + 1U);
+
+          /* Multiply and Accumulate */
+          sumReal += a1 * c1;
+          sumImag += b1 * c1;
+
+          /* update pointers */
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          /* Multiply and Accumulate */
+          sumReal -= b1 * d1;
+          sumImag += a1 * d1;
+
+          /* Decrement loop counter */
+          colCnt--;
+        }
+
+        /* Store result in destination buffer (interleaved real, imag) */
+        *px++ = sumReal;
+        *px++ = sumImag;
+
+        /* Update pointer pIn2 to point to starting address of next column */
+        j++;
+        pIn2 = pSrcB->pData + 2U * j;
+
+        /* Decrement column loop counter */
+        col--;
+
+      } while (col > 0U);
+
+      /* Update pointer pInA to point to starting address of next row */
+      i = i + numColsB;
+      pInA = pInA + 2 * numColsA;
+
+      /* Decrement row loop counter */
+      row--;
+
+    } while (row > 0U);
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixMult group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c
new file mode 100644
index 0000000000..a166e2e63c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_f32.c
@@ -0,0 +1,1407 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_cmplx_mult_f32.c
+ * Description: Floating-point matrix multiplication
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @defgroup CmplxMatrixMult Complex Matrix Multiplication
+
+ Complex Matrix multiplication is only defined if the number of columns of the
+ first matrix equals the number of rows of the second matrix.
+ Multiplying an M x N matrix with an N x P matrix results
+ in an M x P matrix.
+ @par
+ When matrix size checking is enabled, the functions check:
+ - that the inner dimensions of pSrcA and pSrcB are equal;
+ - that the size of the output matrix equals the outer dimensions of pSrcA and pSrcB.
+ */
+
+
+/**
+ @addtogroup CmplxMatrixMult
+ @{
+ */
+
+/**
+ @brief Floating-point Complex matrix multiplication.
+ @param[in] pSrcA points to first input complex matrix structure
+ @param[in] pSrcB points to second input complex matrix structure
+ @param[out] pDst points to output complex matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+#define MATRIX_DIM2 2
+#define MATRIX_DIM3 3
+#define MATRIX_DIM4 4
+
+/* Specialized 2x2 complex multiply (Helium): each B column (2 complex values)
+   is gathered into one vector; vcmulq + vcmlaq_rot90 is ARM's two-instruction
+   idiom for a full complex multiply of interleaved {re,im} pairs. */
+__STATIC_INLINE arm_status arm_mat_cmplx_mult_f32_2x2_mve(
+    const arm_matrix_instance_f32 * pSrcA,
+    const arm_matrix_instance_f32 * pSrcB,
+    arm_matrix_instance_f32 * pDst)
+{
+    float32_t const *pInB = pSrcB->pData;  /* input data matrix pointer B */
+    float32_t *pInA = pSrcA->pData;        /* input data matrix pointer A */
+    float32_t *pOut = pDst->pData;         /* output data matrix pointer */
+    uint32x4_t vecColBOffs0;
+    float32_t *pInA0 = pInA;                            /* row 0 of A */
+    float32_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM2; /* row 1 of A */
+    f32x4_t acc0, acc1;
+    f32x4_t vecB, vecA;
+
+    /* gather offsets for one B column: { B[0][c].re, B[0][c].im, B[1][c].re, B[1][c].im } */
+    static const uint32_t offsetB0[4] = { 0, 1,
+        MATRIX_DIM2 * CMPLX_DIM, MATRIX_DIM2 * CMPLX_DIM + 1
+    };
+
+    vecColBOffs0 = vldrwq_u32((uint32_t const *) offsetB0);
+
+    pInB = (float32_t const *)pSrcB->pData;
+
+    /* ---- column 0 ---- */
+    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+
+    vecA = vldrwq_f32(pInA0);
+    acc0 = vcmulq(vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA1);
+    acc1 = vcmulq(vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    /* each accumulator holds two complex partial products; fold lanes {0,2} and {1,3} */
+    pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 0] = acc0[0] + acc0[2];
+    pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 1] = acc0[1] + acc0[3];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 0] = acc1[0] + acc1[2];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 1] = acc1[1] + acc1[3];
+    pOut += CMPLX_DIM;
+
+    /*
+     * move to next B column
+     */
+    pInB = pInB + CMPLX_DIM;
+
+    /* ---- column 1 ---- */
+    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+
+    vecA = vldrwq_f32(pInA0);
+    acc0 = vcmulq(vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA1);
+    acc1 = vcmulq(vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 0] = acc0[0] + acc0[2];
+    pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 1] = acc0[1] + acc0[3];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 0] = acc1[0] + acc1[2];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 1] = acc1[1] + acc1[3];
+    /*
+     * Return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+
+/* Specialized 3x3 complex multiply (Helium): a B column (3 complex values =
+   6 floats) is gathered in two vectors; the second gather covers only row 2,
+   so its upper two lanes are predicated off (zeroed) and contribute nothing. */
+__STATIC_INLINE arm_status arm_mat_cmplx_mult_f32_3x3_mve(
+    const arm_matrix_instance_f32 * pSrcA,
+    const arm_matrix_instance_f32 * pSrcB,
+    arm_matrix_instance_f32 * pDst)
+{
+    float32_t const *pInB = pSrcB->pData;  /* input data matrix pointer B */
+    float32_t *pInA = pSrcA->pData;        /* input data matrix pointer A */
+    float32_t *pOut = pDst->pData;         /* output data matrix pointer */
+    uint32x4_t vecColBOffs0, vecColBOffs1;
+    float32_t *pInA0 = pInA;                            /* row 0 of A */
+    float32_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM3; /* row 1 of A */
+    float32_t *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM3; /* row 2 of A */
+    f32x4_t acc0, acc1, acc2;
+    f32x4_t vecB, vecA;
+    /* enable predication to disable upper half complex vector element */
+    mve_pred16_t p0 = vctp32q(CMPLX_DIM);
+
+    /* gather offsets for rows 0-1 of a B column */
+    static const uint32_t offsetB0[4] = { 0, 1,
+        MATRIX_DIM3 * CMPLX_DIM, MATRIX_DIM3 * CMPLX_DIM + 1
+    };
+    /* gather offsets for row 2; upper lanes are inactive under p0 */
+    static const uint32_t offsetB1[4] = { 2 * MATRIX_DIM3 * CMPLX_DIM, 2 * MATRIX_DIM3 * CMPLX_DIM + 1,
+        INACTIVELANE, INACTIVELANE
+    };
+
+    vecColBOffs0 = vldrwq_u32((uint32_t const *) offsetB0);
+    vecColBOffs1 = vldrwq_u32((uint32_t const *) offsetB1);
+
+    pInB = (float32_t const *)pSrcB->pData;
+
+    /* ---- column 0: rows 0-1 of B ---- */
+    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+
+    vecA = vldrwq_f32(pInA0);
+    acc0 = vcmulq(vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA1);
+    acc1 = vcmulq(vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA2);
+    acc2 = vcmulq(vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    /* ---- column 0: row 2 of B (predicated gather) ---- */
+    vecB = vldrwq_gather_shifted_offset_z(pInB, vecColBOffs1, p0);
+
+    vecA = vldrwq_f32(&pInA0[4]);
+    acc0 = vcmlaq(acc0, vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA1[4]);
+    acc1 = vcmlaq(acc1, vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA2[4]);
+    acc2 = vcmlaq(acc2, vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    /* fold the two complex partial products per accumulator */
+    pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc0[0] + acc0[2];
+    pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc0[1] + acc0[3];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc1[0] + acc1[2];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc1[1] + acc1[3];
+    pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc2[0] + acc2[2];
+    pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc2[1] + acc2[3];
+    pOut += CMPLX_DIM;
+
+    /*
+     * move to next B column
+     */
+    pInB = pInB + CMPLX_DIM;
+
+    /* ---- column 1 ---- */
+    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+
+    vecA = vldrwq_f32(pInA0);
+    acc0 = vcmulq(vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA1);
+    acc1 = vcmulq(vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA2);
+    acc2 = vcmulq(vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    vecB = vldrwq_gather_shifted_offset_z(pInB, vecColBOffs1, p0);
+
+    vecA = vldrwq_f32(&pInA0[4]);
+    acc0 = vcmlaq(acc0, vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA1[4]);
+    acc1 = vcmlaq(acc1, vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA2[4]);
+    acc2 = vcmlaq(acc2, vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc0[0] + acc0[2];
+    pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc0[1] + acc0[3];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc1[0] + acc1[2];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc1[1] + acc1[3];
+    pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc2[0] + acc2[2];
+    pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc2[1] + acc2[3];
+    pOut += CMPLX_DIM;
+
+    /*
+     * move to next B column
+     */
+    pInB = pInB + CMPLX_DIM;
+
+    /* ---- column 2 ---- */
+    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+
+    vecA = vldrwq_f32(pInA0);
+    acc0 = vcmulq(vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA1);
+    acc1 = vcmulq(vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA2);
+    acc2 = vcmulq(vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    vecB = vldrwq_gather_shifted_offset_z(pInB, vecColBOffs1, p0);
+
+    vecA = vldrwq_f32(&pInA0[4]);
+    acc0 = vcmlaq(acc0, vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA1[4]);
+    acc1 = vcmlaq(acc1, vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA2[4]);
+    acc2 = vcmlaq(acc2, vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc0[0] + acc0[2];
+    pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc0[1] + acc0[3];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc1[0] + acc1[2];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc1[1] + acc1[3];
+    pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 0] = acc2[0] + acc2[2];
+    pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 1] = acc2[1] + acc2[3];
+    /*
+     * Return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+
+
+/* Specialized 4x4 complex multiply (Helium): each B column (4 complex values
+   = 8 floats) is gathered in two full vectors (rows 0-1, then rows 2-3), so
+   unlike the 3x3 variant no predication is needed. */
+__STATIC_INLINE arm_status arm_mat_cmplx_mult_f32_4x4_mve(
+    const arm_matrix_instance_f32 * pSrcA,
+    const arm_matrix_instance_f32 * pSrcB,
+    arm_matrix_instance_f32 * pDst)
+{
+    float32_t const *pInB = pSrcB->pData;  /* input data matrix pointer B */
+    float32_t *pInA = pSrcA->pData;        /* input data matrix pointer A */
+    float32_t *pOut = pDst->pData;         /* output data matrix pointer */
+    uint32x4_t vecColBOffs0, vecColBOffs1;
+    float32_t *pInA0 = pInA;                            /* row 0 of A */
+    float32_t *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM4; /* row 1 of A */
+    float32_t *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM4; /* row 2 of A */
+    float32_t *pInA3 = pInA2 + CMPLX_DIM * MATRIX_DIM4; /* row 3 of A */
+    f32x4_t acc0, acc1, acc2, acc3;
+    f32x4_t vecB, vecA;
+
+    /* gather offsets for rows 0-1 of a B column */
+    static const uint32_t offsetB0[4] = { 0, 1,
+        MATRIX_DIM4 * CMPLX_DIM, MATRIX_DIM4 * CMPLX_DIM + 1
+    };
+    /* gather offsets for rows 2-3 of the same column */
+    static const uint32_t offsetB1[4] = { 2 * MATRIX_DIM4 * CMPLX_DIM, 2 * MATRIX_DIM4 * CMPLX_DIM + 1,
+        3 * MATRIX_DIM4 * CMPLX_DIM, 3 * MATRIX_DIM4 * CMPLX_DIM + 1
+    };
+
+    vecColBOffs0 = vldrwq_u32((uint32_t const *) offsetB0);
+    vecColBOffs1 = vldrwq_u32((uint32_t const *) offsetB1);
+
+    pInB = (float32_t const *)pSrcB->pData;
+
+    /* ---- column 0 ---- */
+    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+
+    vecA = vldrwq_f32(pInA0);
+    acc0 = vcmulq(vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA1);
+    acc1 = vcmulq(vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA2);
+    acc2 = vcmulq(vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA3);
+    acc3 = vcmulq(vecA, vecB);
+    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
+
+    vecA = vldrwq_f32(&pInA0[4]);
+    acc0 = vcmlaq(acc0, vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA1[4]);
+    acc1 = vcmlaq(acc1, vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA2[4]);
+    acc2 = vcmlaq(acc2, vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA3[4]);
+    acc3 = vcmlaq(acc3, vecA, vecB);
+    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+    /* fold the two complex partial products per accumulator */
+    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc0[0] + acc0[2];
+    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc0[1] + acc0[3];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc1[0] + acc1[2];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc1[1] + acc1[3];
+    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc2[0] + acc2[2];
+    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc2[1] + acc2[3];
+    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc3[0] + acc3[2];
+    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc3[1] + acc3[3];
+    pOut += CMPLX_DIM;
+
+    /*
+     * move to next B column
+     */
+    pInB = pInB + CMPLX_DIM;
+
+    /* ---- column 1 ---- */
+    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+
+    vecA = vldrwq_f32(pInA0);
+    acc0 = vcmulq(vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA1);
+    acc1 = vcmulq(vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA2);
+    acc2 = vcmulq(vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA3);
+    acc3 = vcmulq(vecA, vecB);
+    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
+
+    vecA = vldrwq_f32(&pInA0[4]);
+    acc0 = vcmlaq(acc0, vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA1[4]);
+    acc1 = vcmlaq(acc1, vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA2[4]);
+    acc2 = vcmlaq(acc2, vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA3[4]);
+    acc3 = vcmlaq(acc3, vecA, vecB);
+    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc0[0] + acc0[2];
+    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc0[1] + acc0[3];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc1[0] + acc1[2];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc1[1] + acc1[3];
+    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc2[0] + acc2[2];
+    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc2[1] + acc2[3];
+    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc3[0] + acc3[2];
+    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc3[1] + acc3[3];
+    pOut += CMPLX_DIM;
+
+    /*
+     * move to next B column
+     */
+    pInB = pInB + CMPLX_DIM;
+
+    /* ---- column 2 ---- */
+    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+
+    vecA = vldrwq_f32(pInA0);
+    acc0 = vcmulq(vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA1);
+    acc1 = vcmulq(vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA2);
+    acc2 = vcmulq(vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA3);
+    acc3 = vcmulq(vecA, vecB);
+    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
+
+    vecA = vldrwq_f32(&pInA0[4]);
+    acc0 = vcmlaq(acc0, vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA1[4]);
+    acc1 = vcmlaq(acc1, vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA2[4]);
+    acc2 = vcmlaq(acc2, vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA3[4]);
+    acc3 = vcmlaq(acc3, vecA, vecB);
+    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc0[0] + acc0[2];
+    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc0[1] + acc0[3];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc1[0] + acc1[2];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc1[1] + acc1[3];
+    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc2[0] + acc2[2];
+    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc2[1] + acc2[3];
+    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc3[0] + acc3[2];
+    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc3[1] + acc3[3];
+    pOut += CMPLX_DIM;
+
+    /*
+     * move to next B column
+     */
+    pInB = pInB + CMPLX_DIM;
+
+    /* ---- column 3 ---- */
+    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+
+    vecA = vldrwq_f32(pInA0);
+    acc0 = vcmulq(vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA1);
+    acc1 = vcmulq(vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA2);
+    acc2 = vcmulq(vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    vecA = vldrwq_f32(pInA3);
+    acc3 = vcmulq(vecA, vecB);
+    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+    vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
+
+    vecA = vldrwq_f32(&pInA0[4]);
+    acc0 = vcmlaq(acc0, vecA, vecB);
+    acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA1[4]);
+    acc1 = vcmlaq(acc1, vecA, vecB);
+    acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA2[4]);
+    acc2 = vcmlaq(acc2, vecA, vecB);
+    acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+
+    vecA = vldrwq_f32(&pInA3[4]);
+    acc3 = vcmlaq(acc3, vecA, vecB);
+    acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc0[0] + acc0[2];
+    pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc0[1] + acc0[3];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc1[0] + acc1[2];
+    pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc1[1] + acc1[3];
+    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc2[0] + acc2[2];
+    pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc2[1] + acc2[3];
+    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = acc3[0] + acc3[2];
+    pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = acc3[1] + acc3[3];
+    /*
+     * Return to application
+     */
+    return (ARM_MATH_SUCCESS);
+}
+
+/* Generic Helium/MVE complex matrix multiply: dispatches 1x1..4x4 square
+   cases to the specialized routines above, otherwise processes 4 rows of A
+   per pass (plus a 1-row remainder loop), gathering B column elements with
+   vector offsets and accumulating via vcmlaq / vcmlaq_rot90. */
+arm_status arm_mat_cmplx_mult_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+  arm_matrix_instance_f32 * pDst)
+{
+    float32_t const *pInB = (float32_t const *) pSrcB->pData; /* input data matrix pointer B */
+    float32_t const *pInA = (float32_t const *) pSrcA->pData; /* input data matrix pointer A */
+    float32_t *pOut = pDst->pData;       /* output data matrix pointer */
+    float32_t *px;                       /* Temporary output data matrix pointer */
+    uint16_t numRowsA = pSrcA->numRows;  /* number of rows of input matrix A */
+    uint16_t numColsB = pSrcB->numCols;  /* number of columns of input matrix B */
+    uint16_t numColsA = pSrcA->numCols;  /* number of columns of input matrix A */
+    /* NOTE(review): i is uint16_t and accumulates 4*numColsB per row group;
+       could wrap for very large outputs - confirm intended matrix size range */
+    uint16_t col, i = 0U, row = numRowsA; /* loop counters */
+    arm_status status;                   /* status of matrix multiplication */
+    uint32x4_t vecOffs, vecColBOffs;
+    uint32_t blkCnt, rowCnt;             /* loop counters */
+
+  #ifdef ARM_MATH_MATRIX_CHECK
+
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+  {
+
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /*
+     * small squared matrix specialized routines
+     */
+    if (numRowsA == numColsB && numColsB == numColsA)
+    {
+        if (numRowsA == 1)
+        {
+            /* 1x1: a single scalar complex product */
+            pOut[0] = pInA[0] * pInB[0] - pInA[1] * pInB[1];
+            pOut[1] = pInA[0] * pInB[1] + pInA[1] * pInB[0];
+            return (ARM_MATH_SUCCESS);
+        }
+        else if (numRowsA == 2)
+            return arm_mat_cmplx_mult_f32_2x2_mve(pSrcA, pSrcB, pDst);
+        else if (numRowsA == 3)
+            return arm_mat_cmplx_mult_f32_3x3_mve(pSrcA, pSrcB, pDst);
+        else if (numRowsA == 4)
+            return arm_mat_cmplx_mult_f32_4x4_mve(pSrcA, pSrcB, pDst);
+    }
+
+    /* gather offsets picking two consecutive complex B elements of one column
+       (rows r and r+1): { 0, 1, rowStride, rowStride+1 } in float units */
+    vecColBOffs[0] = 0;
+    vecColBOffs[1] = 1;
+    vecColBOffs[2] = numColsB * CMPLX_DIM;
+    vecColBOffs[3] = (numColsB * CMPLX_DIM) + 1;
+
+    /*
+     * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
+     */
+
+    /*
+     * row loop: process 4 rows of A per iteration
+     */
+    rowCnt = row >> 2;
+    while (rowCnt > 0u)
+    {
+        /*
+         * Output pointer is set to starting address of the row being processed
+         */
+        px = pOut + i * CMPLX_DIM;
+        i = i + 4 * numColsB;
+        /*
+         * For every row wise process, the column loop counter is to be initiated
+         */
+        col = numColsB;
+        /*
+         * For every row wise process, the pInB pointer is set
+         * to the starting address of the pSrcB data
+         */
+        pInB = (float32_t const *) pSrcB->pData;
+        /*
+         * column loop
+         */
+        while (col > 0u)
+        {
+            /*
+             * generate one output element of the current column for each of the 4 rows
+             */
+            /*
+             * Matrix A columns number of MAC operations are to be performed
+             */
+
+            float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
+            float32_t const *pInA0 = pInA;
+            float32_t const *pInA1 = pInA0 + numColsA * CMPLX_DIM;
+            float32_t const *pInA2 = pInA1 + numColsA * CMPLX_DIM;
+            float32_t const *pInA3 = pInA2 + numColsA * CMPLX_DIM;
+            f32x4_t acc0, acc1, acc2, acc3;
+
+            acc0 = vdupq_n_f32(0.0f);
+            acc1 = vdupq_n_f32(0.0f);
+            acc2 = vdupq_n_f32(0.0f);
+            acc3 = vdupq_n_f32(0.0f);
+
+            pSrcA0Vec = (float32_t const *) pInA0;
+            pSrcA1Vec = (float32_t const *) pInA1;
+            pSrcA2Vec = (float32_t const *) pInA2;
+            pSrcA3Vec = (float32_t const *) pInA3;
+
+            vecOffs = vecColBOffs;
+
+            /*
+             * process 1 x 4 block output
+             * (each vector carries 2 complex elements, i.e. 4 floats)
+             */
+            blkCnt = (numColsA * CMPLX_DIM) >> 2;
+            while (blkCnt > 0U)
+            {
+                f32x4_t vecB, vecA;
+
+                vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
+                /*
+                 * move Matrix B read offsets, 2 complex rows down
+                 */
+                vecOffs = vecOffs + (uint32_t) (numColsB * 2 * CMPLX_DIM);
+
+                vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
+                acc0 = vcmlaq(acc0, vecA, vecB);
+                acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+                vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 4;
+                acc1 = vcmlaq(acc1, vecA, vecB);
+                acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+                vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 4;
+                acc2 = vcmlaq(acc2, vecA, vecB);
+                acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+                vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 4;
+                acc3 = vcmlaq(acc3, vecA, vecB);
+                acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+                blkCnt--;
+            }
+
+
+            /*
+             * tail
+             * (could be merged through tail predication)
+             */
+            blkCnt = (numColsA * CMPLX_DIM) & 3;
+            if (blkCnt > 0U)
+            {
+                /* predicated gather zeroes the inactive B lanes, so the
+                   corresponding A lanes contribute nothing */
+                mve_pred16_t p0 = vctp32q(blkCnt);
+                f32x4_t vecB, vecA;
+
+                vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
+                /*
+                 * move Matrix B read offsets, 2 complex rows down
+                 */
+                vecOffs = vecOffs + (uint32_t) (numColsB * 2 * CMPLX_DIM);
+
+                vecA = vld1q(pSrcA0Vec);
+                acc0 = vcmlaq(acc0, vecA, vecB);
+                acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+                vecA = vld1q(pSrcA1Vec);
+                acc1 = vcmlaq(acc1, vecA, vecB);
+                acc1 = vcmlaq_rot90(acc1, vecA, vecB);
+                vecA = vld1q(pSrcA2Vec);
+                acc2 = vcmlaq(acc2, vecA, vecB);
+                acc2 = vcmlaq_rot90(acc2, vecA, vecB);
+                vecA = vld1q(pSrcA3Vec);
+                acc3 = vcmlaq(acc3, vecA, vecB);
+                acc3 = vcmlaq_rot90(acc3, vecA, vecB);
+
+            }
+
+            /* fold the two complex partial products held in each accumulator */
+            px[0 * CMPLX_DIM * numColsB + 0] = acc0[0] + acc0[2];
+            px[0 * CMPLX_DIM * numColsB + 1] = acc0[1] + acc0[3];
+            px[1 * CMPLX_DIM * numColsB + 0] = acc1[0] + acc1[2];
+            px[1 * CMPLX_DIM * numColsB + 1] = acc1[1] + acc1[3];
+            px[2 * CMPLX_DIM * numColsB + 0] = acc2[0] + acc2[2];
+            px[2 * CMPLX_DIM * numColsB + 1] = acc2[1] + acc2[3];
+            px[3 * CMPLX_DIM * numColsB + 0] = acc3[0] + acc3[2];
+            px[3 * CMPLX_DIM * numColsB + 1] = acc3[1] + acc3[3];
+            px += CMPLX_DIM;
+            /*
+             * Decrement the column loop counter
+             */
+            col--;
+            /*
+             * Update the pointer pInB to point to the starting address of the next column
+             */
+            pInB = (float32_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
+        }
+
+        /*
+         * Update the pointer pInA to point to the starting address of the next row
+         * (skip the 4 rows just consumed)
+         */
+        pInA += (numColsA * 4) * CMPLX_DIM;
+        /*
+         * Decrement the row loop counter
+         */
+        rowCnt --;
+
+    }
+
+    /* remainder: handle the last (numRowsA mod 4) rows one at a time */
+    rowCnt = row & 3;
+    while (rowCnt > 0u)
+    {
+        /*
+         * Output pointer is set to starting address of the row being processed
+         */
+        px = pOut + i * CMPLX_DIM;
+        i = i + numColsB;
+        /*
+         * For every row wise process, the column loop counter is to be initiated
+         */
+        col = numColsB;
+        /*
+         * For every row wise process, the pInB pointer is set
+         * to the starting address of the pSrcB data
+         */
+        pInB = (float32_t const *) pSrcB->pData;
+        /*
+         * column loop
+         */
+        while (col > 0u)
+        {
+            /*
+             * generate one output element of the current column
+             */
+            /*
+             * Matrix A columns number of MAC operations are to be performed
+             */
+
+            float32_t const *pSrcA0Vec;
+            float32_t const *pInA0 = pInA;
+            f32x4_t acc0;
+
+            acc0 = vdupq_n_f32(0.0f);
+
+            pSrcA0Vec = (float32_t const *) pInA0;
+
+            vecOffs = vecColBOffs;
+
+            /*
+             * process 1 x 4 block output
+             */
+            blkCnt = (numColsA * CMPLX_DIM) >> 2;
+            while (blkCnt > 0U)
+            {
+                f32x4_t vecB, vecA;
+
+                vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
+                /*
+                 * move Matrix B read offsets, 2 complex rows down
+                 */
+                vecOffs = vecOffs + (uint32_t) (numColsB * 2 * CMPLX_DIM);
+
+                vecA = vld1q(pSrcA0Vec);
+                pSrcA0Vec += 4;
+                acc0 = vcmlaq(acc0, vecA, vecB);
+                acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+
+                blkCnt--;
+            }
+
+
+            /*
+             * tail
+             */
+            blkCnt = (numColsA * CMPLX_DIM) & 3;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp32q(blkCnt);
+                f32x4_t vecB, vecA;
+
+                vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
+
+                vecA = vld1q(pSrcA0Vec);
+                acc0 = vcmlaq(acc0, vecA, vecB);
+                acc0 = vcmlaq_rot90(acc0, vecA, vecB);
+
+            }
+
+            px[0] = acc0[0] + acc0[2];
+            px[1] = acc0[1] + acc0[3];
+
+            px += CMPLX_DIM;
+            /*
+             * Decrement the column loop counter
+             */
+            col--;
+            /*
+             * Update the pointer pInB to point to the starting address of the next column
+             */
+            pInB = (float32_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
+        }
+
+        /*
+         * Update the pointer pInA to point to the starting address of the next row
+         */
+        pInA += numColsA * CMPLX_DIM;
+        rowCnt--;
+    }
+
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+/**
+  @brief         Floating-point complex matrix multiplication (Neon version).
+                 Two rows of pSrcA are processed per outer iteration; a second
+                 scalar row loop handles an odd trailing row.
+  @param[in]     pSrcA  points to first input complex matrix structure
+  @param[in]     pSrcB  points to second input complex matrix structure
+  @param[out]    pDst   points to output complex matrix structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+arm_status arm_mat_cmplx_mult_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+  arm_matrix_instance_f32 * pDst)
+{
+  float32_t *pIn1 = pSrcA->pData;                /* input data matrix pointer A */
+  float32_t *pIn2 = pSrcB->pData;                /* input data matrix pointer B */
+  float32_t *pInA = pSrcA->pData;                /* input data matrix pointer A */
+  float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
+  float32_t *px;                                 /* Temporary output data matrix pointer */
+  uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A */
+  uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
+  uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
+  float32_t sumReal1, sumImag1;                  /* accumulator */
+  float32_t a1, a1B,b1, b1B, c1, d1;
+  float32_t sumReal2, sumImag2;                  /* accumulator */
+
+
+  float32x4x2_t a0V, a1V;
+  float32x4_t accR0,accI0, accR1,accI1;
+  /* Every lane of tempR / tempI is written with vsetq_lane_f32() before the
+     vectors are used in arithmetic, but vsetq_lane_f32() itself reads the
+     previous vector value.  Zero-initialize them so the very first call does
+     not read an indeterminate value (undefined behavior / -Wuninitialized). */
+  float32x4_t tempR = vdupq_n_f32(0.0f);
+  float32x4_t tempI = vdupq_n_f32(0.0f);
+  float32x2_t accum = vdup_n_f32(0);
+  float32_t *pIn1B = pSrcA->pData;
+
+  uint16_t col, i = 0U, j, rowCnt, row = numRowsA, colCnt;  /* loop counters */
+  arm_status status;                             /* status of matrix multiplication */
+  float32_t sumReal1B, sumImag1B;
+  float32_t sumReal2B, sumImag2B;
+  float32_t *pxB;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+     (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+  {
+
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+
+    /* Two rows of A are consumed per iteration of the main row loop */
+    rowCnt = row >> 1;
+
+    /* Row loop */
+    while (rowCnt > 0U)
+    {
+      /* Output pointer is set to starting address of the row being processed */
+      px = pOut + 2 * i;
+      pxB = px + 2 * numColsB;
+
+      /* For every row wise process, the column loop counter is to be initiated */
+      col = numColsB;
+
+      /* For every row wise process, the pIn2 pointer is set
+       ** to the starting address of the pSrcB data */
+      pIn2 = pSrcB->pData;
+
+      j = 0U;
+
+      /* Column loop */
+      while (col > 0U)
+      {
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sumReal1 = 0.0f;
+        sumImag1 = 0.0f;
+        sumReal1B = 0.0f;
+        sumImag1B = 0.0f;
+
+        sumReal2 = 0.0f;
+        sumImag2 = 0.0f;
+        sumReal2B = 0.0f;
+        sumImag2B = 0.0f;
+
+        /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+        pIn1 = pInA;
+        pIn1B = pIn1 + 2*numColsA;
+
+        accR0 = vdupq_n_f32(0.0);
+        accI0 = vdupq_n_f32(0.0);
+        accR1 = vdupq_n_f32(0.0);
+        accI1 = vdupq_n_f32(0.0);
+
+        /* Compute 4 MACs simultaneously. */
+        colCnt = numColsA >> 2;
+
+        /* Matrix multiplication */
+        while (colCnt > 0U)
+        {
+          /* Reading real part of complex matrix A */
+          a0V = vld2q_f32(pIn1);      // load & separate real/imag pSrcA (de-interleave 2)
+          a1V = vld2q_f32(pIn1B);     // load & separate real/imag pSrcA (de-interleave 2)
+
+          pIn1 += 8;
+          pIn1B += 8;
+
+          /* Gather 4 complex elements of the current B column (stride = one row) */
+          tempR = vsetq_lane_f32(*pIn2,tempR,0);
+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,0);
+          pIn2 += 2 * numColsB;
+
+
+          tempR = vsetq_lane_f32(*pIn2,tempR,1);
+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,1);
+          pIn2 += 2 * numColsB;
+
+          tempR = vsetq_lane_f32(*pIn2,tempR,2);
+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,2);
+          pIn2 += 2 * numColsB;
+
+          tempR = vsetq_lane_f32(*pIn2,tempR,3);
+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,3);
+          pIn2 += 2 * numColsB;
+
+          /* Complex MAC: real += aR*bR - aI*bI ; imag += aI*bR + aR*bI */
+          accR0 = vmlaq_f32(accR0,a0V.val[0],tempR);
+          accR0 = vmlsq_f32(accR0,a0V.val[1],tempI);
+
+          accI0 = vmlaq_f32(accI0,a0V.val[1],tempR);
+          accI0 = vmlaq_f32(accI0,a0V.val[0],tempI);
+
+          accR1 = vmlaq_f32(accR1,a1V.val[0],tempR);
+          accR1 = vmlsq_f32(accR1,a1V.val[1],tempI);
+
+          accI1 = vmlaq_f32(accI1,a1V.val[1],tempR);
+          accI1 = vmlaq_f32(accI1,a1V.val[0],tempI);
+
+          /* Decrement the loop count */
+          colCnt--;
+        }
+
+        /* Horizontal reduction of the 4-lane accumulators */
+        accum = vpadd_f32(vget_low_f32(accR0), vget_high_f32(accR0));
+        sumReal1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
+
+        accum = vpadd_f32(vget_low_f32(accI0), vget_high_f32(accI0));
+        sumImag1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
+
+        accum = vpadd_f32(vget_low_f32(accR1), vget_high_f32(accR1));
+        sumReal1B += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
+
+        accum = vpadd_f32(vget_low_f32(accI1), vget_high_f32(accI1));
+        sumImag1B += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
+
+        /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+        ** No loop unrolling is used. */
+        colCnt = numColsA & 3;
+
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+          a1 = *pIn1;
+          a1B = *pIn1B;
+
+          c1 = *pIn2;
+
+          b1 = *(pIn1 + 1U);
+          b1B = *(pIn1B + 1U);
+
+          d1 = *(pIn2 + 1U);
+
+          sumReal1 += a1 * c1;
+          sumImag1 += b1 * c1;
+
+          sumReal1B += a1B * c1;
+          sumImag1B += b1B * c1;
+
+          pIn1 += 2U;
+          pIn1B += 2U;
+          pIn2 += 2 * numColsB;
+
+          sumReal2 -= b1 * d1;
+          sumImag2 += a1 * d1;
+
+          sumReal2B -= b1B * d1;
+          sumImag2B += a1B * d1;
+
+          /* Decrement the loop counter */
+          colCnt--;
+        }
+
+        sumReal1 += sumReal2;
+        sumImag1 += sumImag2;
+
+        sumReal1B += sumReal2B;
+        sumImag1B += sumImag2B;
+
+        /* Store the result in the destination buffer */
+        *px++ = sumReal1;
+        *px++ = sumImag1;
+        *pxB++ = sumReal1B;
+        *pxB++ = sumImag1B;
+
+        /* Update the pointer pIn2 to point to the starting address of the next column */
+        j++;
+        pIn2 = pSrcB->pData + 2U * j;
+
+        /* Decrement the column loop counter */
+        col--;
+      }
+
+      /* Update the pointer pInA to point to the starting address of the next 2 row */
+      i = i + 2*numColsB;
+      pInA = pInA + 4 * numColsA;
+
+      /* Decrement the row loop counter */
+      rowCnt--;
+    }
+
+    /* Scalar pass for the odd trailing row of A, if any */
+    rowCnt = row & 1;
+    while (rowCnt > 0U)
+    {
+      /* Output pointer is set to starting address of the row being processed */
+      px = pOut + 2 * i;
+
+      /* For every row wise process, the column loop counter is to be initiated */
+      col = numColsB;
+
+      /* For every row wise process, the pIn2 pointer is set
+       ** to the starting address of the pSrcB data */
+      pIn2 = pSrcB->pData;
+
+      j = 0U;
+
+      /* Column loop */
+      while (col > 0U)
+      {
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sumReal1 = 0.0f;
+        sumImag1 = 0.0f;
+
+        sumReal2 = 0.0f;
+        sumImag2 = 0.0f;
+
+        /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+        pIn1 = pInA;
+
+        accR0 = vdupq_n_f32(0.0);
+        accI0 = vdupq_n_f32(0.0);
+
+        /* Compute 4 MACs simultaneously. */
+        colCnt = numColsA >> 2;
+
+        /* Matrix multiplication */
+        while (colCnt > 0U)
+        {
+          /* Reading real part of complex matrix A */
+          a0V = vld2q_f32(pIn1);      // load & separate real/imag pSrcA (de-interleave 2)
+          pIn1 += 8;
+
+          tempR = vsetq_lane_f32(*pIn2,tempR,0);
+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,0);
+          pIn2 += 2 * numColsB;
+
+          tempR = vsetq_lane_f32(*pIn2,tempR,1);
+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,1);
+          pIn2 += 2 * numColsB;
+
+          tempR = vsetq_lane_f32(*pIn2,tempR,2);
+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,2);
+          pIn2 += 2 * numColsB;
+
+          tempR = vsetq_lane_f32(*pIn2,tempR,3);
+          tempI = vsetq_lane_f32(*(pIn2 + 1U),tempI,3);
+          pIn2 += 2 * numColsB;
+
+          accR0 = vmlaq_f32(accR0,a0V.val[0],tempR);
+          accR0 = vmlsq_f32(accR0,a0V.val[1],tempI);
+
+          accI0 = vmlaq_f32(accI0,a0V.val[1],tempR);
+          accI0 = vmlaq_f32(accI0,a0V.val[0],tempI);
+
+          /* Decrement the loop count */
+          colCnt--;
+        }
+
+        accum = vpadd_f32(vget_low_f32(accR0), vget_high_f32(accR0));
+        sumReal1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
+
+        accum = vpadd_f32(vget_low_f32(accI0), vget_high_f32(accI0));
+        sumImag1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);
+
+        /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+        ** No loop unrolling is used. */
+        colCnt = numColsA & 3;
+
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
+          a1 = *pIn1;
+          c1 = *pIn2;
+
+          b1 = *(pIn1 + 1U);
+          d1 = *(pIn2 + 1U);
+
+          sumReal1 += a1 * c1;
+          sumImag1 += b1 * c1;
+
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          sumReal2 -= b1 * d1;
+          sumImag2 += a1 * d1;
+
+          /* Decrement the loop counter */
+          colCnt--;
+        }
+
+        sumReal1 += sumReal2;
+        sumImag1 += sumImag2;
+
+        /* Store the result in the destination buffer */
+        *px++ = sumReal1;
+        *px++ = sumImag1;
+
+        /* Update the pointer pIn2 to point to the starting address of the next column */
+        j++;
+        pIn2 = pSrcB->pData + 2U * j;
+
+        /* Decrement the column loop counter */
+        col--;
+
+      }
+
+      /* Update the pointer pInA to point to the starting address of the next row */
+      i = i + numColsB;
+      pInA = pInA + 2 * numColsA;
+
+      /* Decrement the row loop counter */
+      rowCnt--;
+
+    }
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#else
+/**
+  @brief         Floating-point complex matrix multiplication (generic core version).
+                 Dot product of each complex row of pSrcA with each complex column
+                 of pSrcB; optional 4x manual unrolling under ARM_MATH_LOOPUNROLL.
+  @param[in]     pSrcA  points to first input complex matrix structure
+  @param[in]     pSrcB  points to second input complex matrix structure
+  @param[out]    pDst   points to output complex matrix structure
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+arm_status arm_mat_cmplx_mult_f32(
+  const arm_matrix_instance_f32 * pSrcA,
+  const arm_matrix_instance_f32 * pSrcB,
+  arm_matrix_instance_f32 * pDst)
+{
+  float32_t *pIn1 = pSrcA->pData;                /* Input data matrix pointer A */
+  float32_t *pIn2 = pSrcB->pData;                /* Input data matrix pointer B */
+  float32_t *pInA = pSrcA->pData;                /* Input data matrix pointer A */
+  float32_t *pOut = pDst->pData;                 /* Output data matrix pointer */
+  float32_t *px;                                 /* Temporary output data matrix pointer */
+  uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
+  uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
+  uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
+  float32_t sumReal, sumImag;                    /* Accumulator */
+  float32_t a1, b1, c1, d1;
+  uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
+  arm_status status;                             /* status of matrix multiplication */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  float32_t a0, b0, c0, d0;
+#endif
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows) ||
+      (pSrcB->numCols != pDst->numCols)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+    /* row loop */
+    do
+    {
+      /* Output pointer is set to starting address of the row being processed */
+      px = pOut + 2 * i;
+
+      /* For every row wise process, the column loop counter is to be initiated */
+      col = numColsB;
+
+      /* For every row wise process, the pIn2 pointer is set
+       ** to the starting address of the pSrcB data */
+      pIn2 = pSrcB->pData;
+
+      j = 0U;
+
+      /* column loop */
+      do
+      {
+        /* Set the variable sum, that acts as accumulator, to zero */
+        sumReal = 0.0f;
+        sumImag = 0.0f;
+
+        /* Initiate pointer pIn1 to point to starting address of column being processed */
+        pIn1 = pInA;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+        /* Apply loop unrolling and compute 4 MACs simultaneously. */
+        colCnt = numColsA >> 2U;
+
+        /* matrix multiplication */
+        while (colCnt > 0U)
+        {
+
+          /* Reading real part of complex matrix A */
+          a0 = *pIn1;
+
+          /* Reading real part of complex matrix B */
+          c0 = *pIn2;
+
+          /* Reading imaginary part of complex matrix A */
+          b0 = *(pIn1 + 1U);
+
+          /* Reading imaginary part of complex matrix B */
+          d0 = *(pIn2 + 1U);
+
+          /* Multiply and Accumulate: (a+jb)(c+jd) = (ac-bd) + j(bc+ad) */
+          sumReal += a0 * c0;
+          sumImag += b0 * c0;
+
+          /* update pointers */
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          /* Multiply and Accumulate */
+          sumReal -= b0 * d0;
+          sumImag += a0 * d0;
+
+          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+          /* read real and imag values from pSrcA and pSrcB buffer */
+          a1 = *(pIn1     );
+          c1 = *(pIn2     );
+          b1 = *(pIn1 + 1U);
+          d1 = *(pIn2 + 1U);
+
+          /* Multiply and Accumulate */
+          sumReal += a1 * c1;
+          sumImag += b1 * c1;
+
+          /* update pointers */
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          /* Multiply and Accumulate */
+          sumReal -= b1 * d1;
+          sumImag += a1 * d1;
+
+          a0 = *(pIn1     );
+          c0 = *(pIn2     );
+          b0 = *(pIn1 + 1U);
+          d0 = *(pIn2 + 1U);
+
+          /* Multiply and Accumulate */
+          sumReal += a0 * c0;
+          sumImag += b0 * c0;
+
+          /* update pointers */
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          /* Multiply and Accumulate */
+          sumReal -= b0 * d0;
+          sumImag += a0 * d0;
+
+          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+          a1 = *(pIn1     );
+          c1 = *(pIn2     );
+          b1 = *(pIn1 + 1U);
+          d1 = *(pIn2 + 1U);
+
+          /* Multiply and Accumulate */
+          sumReal += a1 * c1;
+          sumImag += b1 * c1;
+
+          /* update pointers */
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          /* Multiply and Accumulate */
+          sumReal -= b1 * d1;
+          sumImag += a1 * d1;
+
+          /* Decrement loop count */
+          colCnt--;
+        }
+
+        /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+        ** No loop unrolling is used. */
+        colCnt = numColsA % 0x4U;
+
+#else
+
+        /* Initialize blkCnt with number of samples */
+        colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+          a1 = *(pIn1     );
+          c1 = *(pIn2     );
+          b1 = *(pIn1 + 1U);
+          d1 = *(pIn2 + 1U);
+
+          /* Multiply and Accumulate */
+          sumReal += a1 * c1;
+          sumImag += b1 * c1;
+
+          /* update pointers */
+          pIn1 += 2U;
+          pIn2 += 2 * numColsB;
+
+          /* Multiply and Accumulate */
+          sumReal -= b1 * d1;
+          sumImag += a1 * d1;
+
+          /* Decrement loop counter */
+          colCnt--;
+        }
+
+        /* Store result in destination buffer */
+        *px++ = sumReal;
+        *px++ = sumImag;
+
+        /* Update pointer pIn2 to point to starting address of next column */
+        j++;
+        pIn2 = pSrcB->pData + 2U * j;
+
+        /* Decrement column loop counter */
+        col--;
+
+      } while (col > 0U);
+
+      /* Update pointer pInA to point to starting address of next row */
+      i = i + numColsB;
+      pInA = pInA + 2 * numColsA;
+
+      /* Decrement row loop counter */
+      row--;
+
+    } while (row > 0U);
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+  @} end of CmplxMatrixMult group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c
new file mode 100644
index 0000000000..55e5a979a1
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q15.c
@@ -0,0 +1,595 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title:        arm_mat_cmplx_mult_q15.c
+ * Description: Q15 complex matrix multiplication
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup CmplxMatrixMult
+ @{
+ */
+
+/**
+ @brief Q15 Complex matrix multiplication.
+ @param[in] pSrcA points to first input complex matrix structure
+ @param[in] pSrcB points to second input complex matrix structure
+ @param[out] pDst points to output complex matrix structure
+ @param[in] pScratch points to an array for storing intermediate results
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Conditions for optimum performance
+ Input, output and state buffers should be aligned by 32-bit
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator. The inputs to the
+ multiplications are in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ This approach provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
+ truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
+
+/**
+  @brief         Q15 complex matrix multiplication (Helium/MVE version).
+                 Rows of pSrcA are loaded contiguously; columns of pSrcB are
+                 fetched with gather loads, so no scratch transpose is needed
+                 (pScratch is unused).  Two output columns are produced per
+                 inner-loop iteration, with a tail pass for an odd column.
+  @param[in]     pSrcA     points to first input complex matrix structure
+  @param[in]     pSrcB     points to second input complex matrix structure
+  @param[out]    pDst      points to output complex matrix structure
+  @param[in]     pScratch  unused in this variant (kept for API compatibility)
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+arm_status arm_mat_cmplx_mult_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+  arm_matrix_instance_q15 * pDst,
+  q15_t * pScratch)
+{
+    q15_t const *pInA = (q15_t const *) pSrcA->pData; /* input data matrix pointer A of Q15 type */
+    q15_t const *pInB = (q15_t const *) pSrcB->pData; /* input data matrix pointer B of Q15 type */
+    q15_t const *pInB2;
+    q15_t *px;                 /* Temporary output data matrix pointer */
+    uint32_t  numRowsA = pSrcA->numRows;    /* number of rows of input matrix A */
+    uint32_t  numColsB = pSrcB->numCols;    /* number of columns of input matrix B */
+    uint32_t  numColsA = pSrcA->numCols;    /* number of columns of input matrix A */
+    uint32_t  numRowsB = pSrcB->numRows;    /* number of rows of input matrix B */
+    uint32_t  col, i = 0u, j, row = numRowsB;  /* loop counters */
+    uint32_t  blkCnt;           /* loop counters */
+    uint16x8_t vecOffs, vecColBOffs;
+    arm_status status;                     /* Status of matrix multiplication */
+    (void)pScratch;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows) ||
+      (pSrcB->numCols != pDst->numCols)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Gather offsets selecting the {real, imag} pair of 4 consecutive rows
+       of a single B column (row stride = numColsB * CMPLX_DIM elements) */
+    vecColBOffs[0] = 0;
+    vecColBOffs[1] = 1;
+    vecColBOffs[2] = numColsB * CMPLX_DIM;
+    vecColBOffs[3] = (numColsB * CMPLX_DIM) + 1;
+    vecColBOffs[4] = 2 * numColsB * CMPLX_DIM;
+    vecColBOffs[5] = 2 * (numColsB * CMPLX_DIM) + 1;
+    vecColBOffs[6] = 3 * numColsB * CMPLX_DIM;
+    vecColBOffs[7] = 3 * (numColsB * CMPLX_DIM) + 1;
+
+    /*
+     * Reset the variables for the usage in the following multiplication process
+     */
+    i = 0;
+    row = numRowsA;
+    px = pDst->pData;
+
+    /*
+     * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
+     */
+
+    /*
+     * row loop
+     */
+    while (row > 0u)
+    {
+        /*
+         * For every row wise process, the column loop counter is to be initiated.
+         * Two B columns are handled per iteration of this loop.
+         */
+        col = numColsB >> 1;
+        j = 0;
+        /*
+         * column loop
+         */
+        while (col > 0u)
+        {
+            q15_t const *pSrcAVec;
+            //, *pSrcBVec, *pSrcB2Vec;
+            q15x8_t vecA, vecB, vecB2;
+            q63_t     acc0, acc1, acc2, acc3;
+
+            /*
+             * Initiate the pointer pIn1 to point to the starting address of the column being processed
+             */
+            pInA = pSrcA->pData + i;
+            pInB = pSrcB->pData + j;
+            pInB2 = pInB + CMPLX_DIM;
+
+            j += 2 * CMPLX_DIM;
+            /*
+             * Decrement the column loop counter
+             */
+            col--;
+
+            /*
+             * Initiate the pointers
+             * - current Matrix A rows
+             * - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
+             */
+            pSrcAVec = (q15_t const *) pInA;
+
+            acc0 = 0LL;
+            acc1 = 0LL;
+            acc2 = 0LL;
+            acc3 = 0LL;
+
+            vecOffs = vecColBOffs;
+
+
+            blkCnt = (numColsA * CMPLX_DIM) >> 3;
+            while (blkCnt > 0U)
+            {
+                vecA = vld1q(pSrcAVec);
+                pSrcAVec += 8;
+                vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
+
+                /* vmlsldavaq: acc += aR*bR - aI*bI (real part);
+                   vmlaldavaxq: acc += aR*bI + aI*bR (imaginary part) */
+                acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
+                acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
+                vecB2 = vldrhq_gather_shifted_offset(pInB2, vecOffs);
+                /*
+                 * move Matrix B read offsets, 4 rows down
+                 */
+                vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
+
+                acc2 = vmlsldavaq_s16(acc2, vecA, vecB2);
+                acc3 = vmlaldavaxq_s16(acc3, vecA, vecB2);
+
+                blkCnt--;
+            }
+
+            /*
+             * tail (predicated to the remaining 1..7 elements)
+             */
+            blkCnt = (numColsA * CMPLX_DIM) & 7;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp16q(blkCnt);
+                vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
+
+                vecA = vldrhq_z_s16(pSrcAVec, p0);
+
+                acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
+                acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
+                vecB2 = vldrhq_gather_shifted_offset(pInB2, vecOffs);
+
+                /*
+                 * move Matrix B read offsets, 4 rows down
+                 */
+                vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
+
+                acc2 = vmlsldavaq_s16(acc2, vecA, vecB2);
+                acc3 = vmlaldavaxq_s16(acc3, vecA, vecB2);
+
+            }
+            /*
+             * Convert to 1.15, Store the results (1 x 2 block) in the destination buffer
+             */
+            *px++ = (q15_t)MVE_ASRL_SAT16(acc0, 15);
+            *px++ = (q15_t)MVE_ASRL_SAT16(acc1, 15);
+            *px++ = (q15_t)MVE_ASRL_SAT16(acc2, 15);
+            *px++ = (q15_t)MVE_ASRL_SAT16(acc3, 15);
+        }
+
+        /* Tail pass for an odd trailing B column, if any */
+        col = numColsB & 1;
+        /*
+         * column loop
+         */
+        while (col > 0u)
+        {
+
+            q15_t const *pSrcAVec;
+            //, *pSrcBVec, *pSrcB2Vec;
+            q15x8_t vecA, vecB;
+            q63_t     acc0, acc1;
+
+            /*
+             * Initiate the pointer pIn1 to point to the starting address of the column being processed
+             */
+            pInA = pSrcA->pData + i;
+            pInB = pSrcB->pData + j;
+
+            j += CMPLX_DIM;
+            /*
+             * Decrement the column loop counter
+             */
+            col--;
+
+            /*
+             * Initiate the pointers
+             * - current Matrix A rows
+             * - 2 x consecutive Matrix B' rows (j increment is 2 x numRowsB)
+             */
+            pSrcAVec = (q15_t const *) pInA;
+
+            acc0 = 0LL;
+            acc1 = 0LL;
+
+
+            vecOffs = vecColBOffs;
+
+
+
+            blkCnt = (numColsA * CMPLX_DIM) >> 3;
+            while (blkCnt > 0U)
+            {
+                vecA = vld1q(pSrcAVec);
+                pSrcAVec += 8;
+                vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
+
+                acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
+                acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
+                /*
+                 * move Matrix B read offsets, 4 rows down
+                 */
+                vecOffs = vaddq_n_u16(vecOffs, (uint16_t) (numColsB * 4 * CMPLX_DIM));
+
+                blkCnt--;
+            }
+
+            /*
+             * tail
+             */
+            blkCnt = (numColsA * CMPLX_DIM) & 7;
+            if (blkCnt > 0U)
+            {
+                mve_pred16_t p0 = vctp16q(blkCnt);
+                vecB = vldrhq_gather_shifted_offset(pInB, vecOffs);
+                vecA = vldrhq_z_s16(pSrcAVec, p0);
+
+                acc0 = vmlsldavaq_s16(acc0, vecA, vecB);
+                acc1 = vmlaldavaxq_s16(acc1, vecA, vecB);
+
+            }
+            /*
+             * Convert to 1.15, Store the results (1 x 2 block) in the destination buffer
+             */
+            *px++ = (q15_t)MVE_ASRL_SAT16(acc0, 15);
+            *px++ = (q15_t)MVE_ASRL_SAT16(acc1, 15);
+
+        }
+
+        /* Advance to the start of the next A row (i indexes q15 elements) */
+        i = i + numColsA * CMPLX_DIM;
+
+        /*
+         * Decrement the row loop counter
+         */
+        row--;
+    }
+
+
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#else
+/**
+  @brief         Q15 complex matrix multiplication (generic core version).
+                 First transposes pSrcB into pScratch so both operands can be
+                 walked linearly, then dot-products rows of A against rows of
+                 the transposed B.  Accumulation is done in 64-bit; results are
+                 shifted down by 15 and saturated to 1.15.
+  @param[in]     pSrcA     points to first input complex matrix structure
+  @param[in]     pSrcB     points to second input complex matrix structure
+  @param[out]    pDst      points to output complex matrix structure
+  @param[in]     pScratch  scratch buffer used to hold the transpose of pSrcB
+  @return        execution status
+                   - \ref ARM_MATH_SUCCESS       : Operation successful
+                   - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+arm_status arm_mat_cmplx_mult_q15(
+  const arm_matrix_instance_q15 * pSrcA,
+  const arm_matrix_instance_q15 * pSrcB,
+        arm_matrix_instance_q15 * pDst,
+        q15_t                   * pScratch)
+{
+  q15_t *pSrcBT = pScratch;                      /* input data matrix pointer for transpose */
+  q15_t *pInA = pSrcA->pData;                    /* input data matrix pointer A of Q15 type */
+  q15_t *pInB = pSrcB->pData;                    /* input data matrix pointer B of Q15 type */
+  q15_t *px;                                     /* Temporary output data matrix pointer */
+  uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A */
+  uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
+  uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
+  uint16_t numRowsB = pSrcB->numRows;            /* number of rows of input matrix B */
+  q63_t sumReal, sumImag;                        /* accumulator */
+  uint32_t col, i = 0U, row = numRowsB, colCnt;  /* Loop counters */
+  arm_status status;                             /* Status of matrix multiplication */
+
+#if defined (ARM_MATH_DSP)
+  q31_t prod1, prod2;
+  q31_t pSourceA, pSourceB;
+#else
+  q15_t a, b, c, d;
+#endif /* #if defined (ARM_MATH_DSP) */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrcA->numCols != pSrcB->numRows) ||
+      (pSrcA->numRows != pDst->numRows) ||
+      (pSrcB->numCols != pDst->numCols)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Matrix transpose: copy each row of B into a column of pSrcBT,
+       moving one complex element (2 x q15 = 32 bits) at a time */
+    do
+    {
+      /* The pointer px is set to starting address of column being processed */
+      px = pSrcBT + i;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+      /* Apply loop unrolling and exchange the columns with row elements */
+      col = numColsB >> 2;
+
+      /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
+         a second loop below computes the remaining 1 to 3 samples. */
+      while (col > 0U)
+      {
+        /* Read two elements from row */
+        write_q15x2 (px, read_q15x2_ia (&pInB));
+
+        /* Update pointer px to point to next row of transposed matrix */
+        px += numRowsB * 2;
+
+        /* Read two elements from row */
+        write_q15x2 (px, read_q15x2_ia (&pInB));
+
+        /* Update pointer px to point to next row of transposed matrix */
+        px += numRowsB * 2;
+
+        /* Read two elements from row */
+        write_q15x2 (px, read_q15x2_ia (&pInB));
+
+        /* Update pointer px to point to next row of transposed matrix */
+        px += numRowsB * 2;
+
+        /* Read two elements from row */
+        write_q15x2 (px, read_q15x2_ia (&pInB));
+
+        /* Update pointer px to point to next row of transposed matrix */
+        px += numRowsB * 2;
+
+        /* Decrement column loop counter */
+        col--;
+      }
+
+      /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
+      ** No loop unrolling is used. */
+      col = numColsB % 0x4U;
+
+#else
+
+      /* Initialize blkCnt with number of samples */
+      col = numColsB;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+      while (col > 0U)
+      {
+        /* Read two elements from row */
+        write_q15x2 (px, read_q15x2_ia (&pInB));
+
+        /* Update pointer px to point to next row of transposed matrix */
+        px += numRowsB * 2;
+
+        /* Decrement column loop counter */
+        col--;
+      }
+
+      i = i + 2U;
+
+      /* Decrement row loop counter */
+      row--;
+
+    } while (row > 0U);
+
+    /* Reset variables for usage in following multiplication process */
+    row = numRowsA;
+    i = 0U;
+    px = pDst->pData;
+
+    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+    /* row loop */
+    do
+    {
+      /* For every row wise process, column loop counter is to be initiated */
+      col = numColsB;
+
+      /* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
+      pInB = pSrcBT;
+
+      /* column loop */
+      do
+      {
+        /* Set variable sum, that acts as accumulator, to zero */
+        sumReal = 0;
+        sumImag = 0;
+
+        /* Initiate pointer pInA to point to starting address of column being processed */
+        pInA = pSrcA->pData + i * 2;
+
+        /* Apply loop unrolling and compute 2 MACs simultaneously. */
+        colCnt = numColsA >> 1U;
+
+        /* matrix multiplication */
+        while (colCnt > 0U)
+        {
+          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+#if defined (ARM_MATH_DSP)
+
+          /* read real and imag values from pSrcA and pSrcB buffer */
+          pSourceA = read_q15x2_ia ((q15_t **) &pInA);
+          pSourceB = read_q15x2_ia ((q15_t **) &pInB);
+
+          /* Multiply and Accumulate: SMUSD -> ac - bd (real),
+             SMUADX -> ad + bc (imaginary); big-endian swaps halfword order,
+             hence the negation */
+#ifdef ARM_MATH_BIG_ENDIAN
+          prod1 = -__SMUSD(pSourceA, pSourceB);
+#else
+          prod1 =  __SMUSD(pSourceA, pSourceB);
+#endif
+          prod2 =  __SMUADX(pSourceA, pSourceB);
+          sumReal += (q63_t) prod1;
+          sumImag += (q63_t) prod2;
+
+          /* read real and imag values from pSrcA and pSrcB buffer */
+          pSourceA = read_q15x2_ia ((q15_t **) &pInA);
+          pSourceB = read_q15x2_ia ((q15_t **) &pInB);
+
+          /* Multiply and Accumulate */
+#ifdef ARM_MATH_BIG_ENDIAN
+          prod1 = -__SMUSD(pSourceA, pSourceB);
+#else
+          prod1 =  __SMUSD(pSourceA, pSourceB);
+#endif
+          prod2 =  __SMUADX(pSourceA, pSourceB);
+          sumReal += (q63_t) prod1;
+          sumImag += (q63_t) prod2;
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+          /* read real and imag values from pSrcA buffer */
+          a = *pInA;
+          b = *(pInA + 1U);
+          /* read real and imag values from pSrcB buffer */
+          c = *pInB;
+          d = *(pInB + 1U);
+
+          /* Multiply and Accumulate */
+          sumReal += (q31_t) a *c;
+          sumImag += (q31_t) a *d;
+          sumReal -= (q31_t) b *d;
+          sumImag += (q31_t) b *c;
+
+          /* read next real and imag values from pSrcA buffer */
+          a = *(pInA + 2U);
+          b = *(pInA + 3U);
+          /* read next real and imag values from pSrcB buffer */
+          c = *(pInB + 2U);
+          d = *(pInB + 3U);
+
+          /* update pointer */
+          pInA += 4U;
+
+          /* Multiply and Accumulate */
+          sumReal += (q31_t) a * c;
+          sumImag += (q31_t) a * d;
+          sumReal -= (q31_t) b * d;
+          sumImag += (q31_t) b * c;
+          /* update pointer */
+          pInB += 4U;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+          /* Decrement loop counter */
+          colCnt--;
+        }
+
+        /* process odd column samples */
+        if ((numColsA & 0x1U) > 0U)
+        {
+          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+#if defined (ARM_MATH_DSP)
+          /* read real and imag values from pSrcA and pSrcB buffer */
+          pSourceA = read_q15x2_ia ((q15_t **) &pInA);
+          pSourceB = read_q15x2_ia ((q15_t **) &pInB);
+
+          /* Multiply and Accumulate */
+#ifdef ARM_MATH_BIG_ENDIAN
+          prod1 = -__SMUSD(pSourceA, pSourceB);
+#else
+          prod1 =  __SMUSD(pSourceA, pSourceB);
+#endif
+          prod2 =  __SMUADX(pSourceA, pSourceB);
+          sumReal += (q63_t) prod1;
+          sumImag += (q63_t) prod2;
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+          /* read real and imag values from pSrcA and pSrcB buffer */
+          a = *pInA++;
+          b = *pInA++;
+          c = *pInB++;
+          d = *pInB++;
+
+          /* Multiply and Accumulate */
+          sumReal += (q31_t) a * c;
+          sumImag += (q31_t) a * d;
+          sumReal -= (q31_t) b * d;
+          sumImag += (q31_t) b * c;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+        }
+
+        /* Saturate and store result in destination buffer: 34.30 -> 1.15 */
+        *px++ = (q15_t) (__SSAT(sumReal >> 15, 16));
+        *px++ = (q15_t) (__SSAT(sumImag >> 15, 16));
+
+        /* Decrement column loop counter */
+        col--;
+
+      } while (col > 0U);
+
+      /* i counts complex elements; pInA offset below doubles it for q15 units */
+      i = i + numColsA;
+
+      /* Decrement row loop counter */
+      row--;
+
+    } while (row > 0U);
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+  @} end of CmplxMatrixMult group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c
new file mode 100644
index 0000000000..3462783ec6
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_mult_q31.c
@@ -0,0 +1,1061 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_cmplx_mult_q31.c
+ * Description:  Q31 complex matrix multiplication
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup CmplxMatrixMult
+ @{
+ */
+
+/**
+ @brief Q31 Complex matrix multiplication.
+ @param[in] pSrcA points to first input complex matrix structure
+ @param[in] pSrcB points to second input complex matrix structure
+ @param[out] pDst points to output complex matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate
+ multiplication results but provides only a single guard bit. There is no saturation
+ on intermediate additions. Thus, if the accumulator overflows it wraps around and
+ distorts the result. The input signals should be scaled down to avoid intermediate
+ overflows. The input is thus scaled down by log2(numColsA) bits
+ to avoid overflows, as a total of numColsA additions are performed internally.
+ The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+#define MATRIX_DIM2 2
+#define MATRIX_DIM3 3
+#define MATRIX_DIM4 4
+
+/**
+ * @brief 2x2 Q31 complex matrix multiply (Helium/MVE specialization).
+ *
+ * One gather per B column loads two complex elements as {re, im, re, im}.
+ * For every A-row / B-column pair the complex dot product is formed with:
+ *   vmlsldavq  -> real part:      sum(Re(a)*Re(b) - Im(a)*Im(b))
+ *   vmlaldavxq -> imaginary part: sum(Re(a)*Im(b) + Im(a)*Re(b))
+ * The 2.62 accumulators are narrowed to 1.31 with asrl(acc, 31).
+ */
+__STATIC_INLINE arm_status arm_mat_cmplx_mult_q31_2x2_mve(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q31_t const *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint32x4_t vecColBOffs0;
+ q31_t const *pInA0 = pInA;
+ q31_t const *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM2;
+ q63_t acc0, acc1, acc2, acc3;
+ q31x4_t vecB, vecA;
+
+ /* gather offsets of one B column: {b(0,c).re, b(0,c).im, b(1,c).re, b(1,c).im} */
+ static const uint32_t offsetB0[4] = {
+ 0, 1,
+ MATRIX_DIM2 * CMPLX_DIM, MATRIX_DIM2 * CMPLX_DIM + 1
+ };
+
+ vecColBOffs0 = vldrwq_u32(offsetB0);
+
+ pInB = (q31_t const *) pSrcB->pData;
+
+ /* first B column against both A rows */
+ vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(pInA1);
+ acc2 = vmlsldavq_s32(vecA, vecB);
+ acc3 = vmlaldavxq_s32(vecA, vecB);
+
+ pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 1] = (q31_t) asrl(acc1, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 0] = (q31_t) asrl(acc2, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 1] = (q31_t) asrl(acc3, 31);
+ /*
+ * move to next B column
+ */
+ pInB = pInB + CMPLX_DIM;
+
+ /* second B column against both A rows */
+ vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(pInA1);
+ acc2 = vmlsldavq_s32(vecA, vecB);
+ acc3 = vmlaldavxq_s32(vecA, vecB);
+
+ pOut += CMPLX_DIM;
+
+ pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[0 * CMPLX_DIM * MATRIX_DIM2 + 1] = (q31_t) asrl(acc1, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 0] = (q31_t) asrl(acc2, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM2 + 1] = (q31_t) asrl(acc3, 31);
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+/**
+ * @brief 3x3 Q31 complex matrix multiply (Helium/MVE specialization).
+ *
+ * A 3-column row is 6 q31 values: one full 128-bit vector plus a
+ * 2-lane tail.  The tail is handled with predication
+ * (p0 = vctp32q(CMPLX_DIM) keeps only the low 2 lanes of the second
+ * A load) and with offsetB1, whose upper two gather offsets are
+ * INACTIVELANE since those lanes are zeroed on the A side anyway.
+ * NOTE(review): the vecB1 gather itself is unpredicated, so this
+ * assumes INACTIVELANE is a safe in-bounds offset — confirm in
+ * arm_helium_utils.h.
+ * Real/imag parts per output element come from vmlsldav(a)q /
+ * vmlaldav(a)xq pairs; 2.62 accumulators narrow to 1.31 via asrl.
+ */
+__STATIC_INLINE arm_status arm_mat_cmplx_mult_q31_3x3_mve(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q31_t const *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint32x4_t vecColBOffs0, vecColBOffs1;
+ q31_t const *pInA0 = pInA;
+ q31_t const *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM3;
+ q31_t const *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM3;
+ q63_t acc0, acc1, acc2, acc3;
+ q31x4_t vecB, vecB1, vecA;
+ /*
+ * enable predication to disable upper half complex vector element
+ */
+ mve_pred16_t p0 = vctp32q(CMPLX_DIM);
+
+ static const uint32_t offsetB0[4] = {
+ 0, 1,
+ MATRIX_DIM3 * CMPLX_DIM, MATRIX_DIM3 * CMPLX_DIM + 1
+ };
+ static const uint32_t offsetB1[4] = {
+ 2 * MATRIX_DIM3 * CMPLX_DIM, 2 * MATRIX_DIM3 * CMPLX_DIM + 1,
+ INACTIVELANE, INACTIVELANE
+ };
+
+ vecColBOffs0 = vldrwq_u32(offsetB0);
+ vecColBOffs1 = vldrwq_u32(offsetB1);
+
+ pInB = (q31_t const *) pSrcB->pData;
+
+ /* column 0: gather the three complex B elements (2 + 1 tail) */
+ vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+ vecB1 = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
+
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(pInA1);
+ acc2 = vmlsldavq_s32(vecA, vecB);
+ acc3 = vmlaldavxq_s32(vecA, vecB);
+
+ /* predicated tail loads: only the 3rd complex element of each A row */
+ vecA = vldrwq_z_s32(&pInA0[4], p0);
+ acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
+ acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
+
+ vecA = vldrwq_z_s32(&pInA1[4], p0);
+ acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
+ acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
+
+ pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc1, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc2, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc3, 31);
+
+ vecA = vldrwq_s32(pInA2);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_z_s32(&pInA2[4], p0);
+ acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
+ acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
+
+ pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc1, 31);
+ pOut += CMPLX_DIM;
+
+ /*
+ * move to next B column
+ */
+ pInB = pInB + CMPLX_DIM;
+
+ /* column 1: same schedule as column 0 */
+ vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+ vecB1 = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
+
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(pInA1);
+ acc2 = vmlsldavq_s32(vecA, vecB);
+ acc3 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_z_s32(&pInA0[4], p0);
+ acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
+ acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
+
+ vecA = vldrwq_z_s32(&pInA1[4], p0);
+ acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
+ acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
+
+ pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc1, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc2, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc3, 31);
+
+ vecA = vldrwq_s32(pInA2);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_z_s32(&pInA2[4], p0);
+ acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
+ acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
+
+ pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc1, 31);
+ pOut += CMPLX_DIM;
+
+ /*
+ * move to next B column
+ */
+ pInB = pInB + CMPLX_DIM;
+
+ /* column 2: same schedule as column 0 */
+ vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+ vecB1 = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
+
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(pInA1);
+ acc2 = vmlsldavq_s32(vecA, vecB);
+ acc3 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_z_s32(&pInA0[4], p0);
+ acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
+ acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
+
+ vecA = vldrwq_z_s32(&pInA1[4], p0);
+ acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
+ acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
+
+ pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[0 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc1, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc2, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc3, 31);
+
+ vecA = vldrwq_s32(pInA2);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_z_s32(&pInA2[4], p0);
+ acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
+ acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
+
+ pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[2 * CMPLX_DIM * MATRIX_DIM3 + 1] = (q31_t) asrl(acc1, 31);
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+/**
+ * @brief 4x4 Q31 complex matrix multiply (Helium/MVE specialization).
+ *
+ * A 4-column row is 8 q31 values, i.e. exactly two full 128-bit
+ * vectors, so no predication is needed (contrast with the 3x3 case).
+ * Each B column is fetched with two gathers (offsetB0 = rows 0..1,
+ * offsetB1 = rows 2..3) and reduced against all four A rows with
+ * vmlsldav(a)q / vmlaldav(a)xq pairs producing the real / imaginary
+ * parts; 2.62 accumulators narrow to 1.31 via asrl(acc, 31).
+ * The same fully-unrolled schedule repeats for all four B columns.
+ */
+__STATIC_INLINE arm_status arm_mat_cmplx_mult_q31_4x4_mve(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t const *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q31_t const *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint32x4_t vecColBOffs0, vecColBOffs1;
+ q31_t const *pInA0 = pInA;
+ q31_t const *pInA1 = pInA0 + CMPLX_DIM * MATRIX_DIM4;
+ q31_t const *pInA2 = pInA1 + CMPLX_DIM * MATRIX_DIM4;
+ q31_t const *pInA3 = pInA2 + CMPLX_DIM * MATRIX_DIM4;
+ q63_t acc0, acc1, acc2, acc3;
+ q31x4_t vecB, vecB1, vecA;
+
+ static const uint32_t offsetB0[4] = {
+ 0, 1,
+ MATRIX_DIM4 * CMPLX_DIM, MATRIX_DIM4 * CMPLX_DIM + 1
+ };
+ static const uint32_t offsetB1[4] = {
+ 2 * MATRIX_DIM4 * CMPLX_DIM, 2 * MATRIX_DIM4 * CMPLX_DIM + 1,
+ 3 * MATRIX_DIM4 * CMPLX_DIM, 3 * MATRIX_DIM4 * CMPLX_DIM + 1
+ };
+
+ vecColBOffs0 = vldrwq_u32(offsetB0);
+ vecColBOffs1 = vldrwq_u32(offsetB1);
+
+ pInB = (q31_t const *) pSrcB->pData;
+
+ /* column 0 */
+ vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+ vecB1 = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
+
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(pInA1);
+ acc2 = vmlsldavq_s32(vecA, vecB);
+ acc3 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(&pInA0[4]);
+ acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
+ acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
+
+ vecA = vldrwq_s32(&pInA1[4]);
+ acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
+ acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
+
+ pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
+
+ vecA = vldrwq_s32(pInA2);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(pInA3);
+ acc2 = vmlsldavq_s32(vecA, vecB);
+ acc3 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(&pInA2[4]);
+ acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
+ acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
+
+ vecA = vldrwq_s32(&pInA3[4]);
+ acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
+ acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
+
+ pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
+ pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
+ pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
+ pOut += CMPLX_DIM;
+
+ /*
+ * move to next B column
+ */
+ pInB = pInB + CMPLX_DIM;
+
+ /* column 1 */
+ vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+ vecB1 = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
+
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(pInA1);
+ acc2 = vmlsldavq_s32(vecA, vecB);
+ acc3 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(&pInA0[4]);
+ acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
+ acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
+
+ vecA = vldrwq_s32(&pInA1[4]);
+ acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
+ acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
+
+ pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
+
+ vecA = vldrwq_s32(pInA2);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(pInA3);
+ acc2 = vmlsldavq_s32(vecA, vecB);
+ acc3 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(&pInA2[4]);
+ acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
+ acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
+
+ vecA = vldrwq_s32(&pInA3[4]);
+ acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
+ acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
+
+ pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
+ pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
+ pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
+ pOut += CMPLX_DIM;
+ /*
+ * move to next B column
+ */
+ pInB = pInB + CMPLX_DIM;
+
+ /* column 2 */
+ vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+ vecB1 = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
+
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(pInA1);
+ acc2 = vmlsldavq_s32(vecA, vecB);
+ acc3 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(&pInA0[4]);
+ acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
+ acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
+
+ vecA = vldrwq_s32(&pInA1[4]);
+ acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
+ acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
+
+ pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
+
+ vecA = vldrwq_s32(pInA2);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(pInA3);
+ acc2 = vmlsldavq_s32(vecA, vecB);
+ acc3 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(&pInA2[4]);
+ acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
+ acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
+
+ vecA = vldrwq_s32(&pInA3[4]);
+ acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
+ acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
+
+ pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
+ pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
+ pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
+ pOut += CMPLX_DIM;
+
+ /*
+ * move to next B column
+ */
+ pInB = pInB + CMPLX_DIM;
+
+ /* column 3 */
+ vecB = vldrwq_gather_shifted_offset(pInB, vecColBOffs0);
+ vecB1 = vldrwq_gather_shifted_offset(pInB, vecColBOffs1);
+
+ vecA = vldrwq_s32(pInA0);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(pInA1);
+ acc2 = vmlsldavq_s32(vecA, vecB);
+ acc3 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(&pInA0[4]);
+ acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
+ acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
+
+ vecA = vldrwq_s32(&pInA1[4]);
+ acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
+ acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
+
+ pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[0 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
+ pOut[1 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
+
+ vecA = vldrwq_s32(pInA2);
+ acc0 = vmlsldavq_s32(vecA, vecB);
+ acc1 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(pInA3);
+ acc2 = vmlsldavq_s32(vecA, vecB);
+ acc3 = vmlaldavxq_s32(vecA, vecB);
+
+ vecA = vldrwq_s32(&pInA2[4]);
+ acc0 = vmlsldavaq_s32(acc0, vecA, vecB1);
+ acc1 = vmlaldavaxq_s32(acc1, vecA, vecB1);
+
+ vecA = vldrwq_s32(&pInA3[4]);
+ acc2 = vmlsldavaq_s32(acc2, vecA, vecB1);
+ acc3 = vmlaldavaxq_s32(acc3, vecA, vecB1);
+
+ pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc0, 31);
+ pOut[2 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc1, 31);
+ pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 0] = (q31_t) asrl(acc2, 31);
+ pOut[3 * CMPLX_DIM * MATRIX_DIM4 + 1] = (q31_t) asrl(acc3, 31);
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+
+/**
+ * @brief Q31 complex matrix multiplication (Helium/MVE entry point).
+ *
+ * Dispatches 1x1 directly and 2x2/3x3/4x4 square cases to the unrolled
+ * specializations above.  The generic path processes two A rows per
+ * iteration: for each B column, vecColBOffs gathers two complex B
+ * elements (one per B row pair) and the offsets advance by two full B
+ * rows per vector iteration.  Real/imaginary parts come from the
+ * vmlsldavaq / vmlaldavaxq accumulating reductions; the column tail
+ * ((numColsA*CMPLX_DIM) & 3) is handled with a predicated gather.
+ * Any leftover odd A row is processed by a single-row copy of the
+ * same loop.  Results are narrowed 2.62 -> 1.31 with saturation.
+ */
+arm_status arm_mat_cmplx_mult_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t const *pInB = (q31_t const *) pSrcB->pData; /* input data matrix pointer B */
+ q31_t const *pInA = (q31_t const *) pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ q31_t *px; /* Temporary output data matrix pointer */
+ uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+ uint16_t col, i = 0U, row = numRowsA; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+ uint32x4_t vecOffs, vecColBOffs;
+ uint32_t blkCnt, rowCnt; /* loop counters */
+
+ #ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ {
+
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /*
+ * small squared matrix specialized routines
+ */
+ if (numRowsA == numColsB && numColsB == numColsA)
+ {
+ if (numRowsA == 1)
+ {
+ /* 1x1: scalar complex multiply (re = ac - bd, im = ad + bc) */
+ q63_t sumReal = (q63_t) pInA[0] * pInB[0];
+ sumReal -= (q63_t) pInA[1] * pInB[1];
+
+ q63_t sumImag = (q63_t) pInA[0] * pInB[1];
+ sumImag += (q63_t) pInA[1] * pInB[0];
+
+ /* Store result in destination buffer */
+ pOut[0] = (q31_t) clip_q63_to_q31(sumReal >> 31);
+ pOut[1] = (q31_t) clip_q63_to_q31(sumImag >> 31);
+ return (ARM_MATH_SUCCESS);
+ }
+ else if (numRowsA == 2)
+ return arm_mat_cmplx_mult_q31_2x2_mve(pSrcA, pSrcB, pDst);
+ else if (numRowsA == 3)
+ return arm_mat_cmplx_mult_q31_3x3_mve(pSrcA, pSrcB, pDst);
+ else if (numRowsA == 4)
+ return arm_mat_cmplx_mult_q31_4x4_mve(pSrcA, pSrcB, pDst);
+ }
+
+ /* base gather offsets: two complex B elements, one per consecutive B row */
+ vecColBOffs[0] = 0;
+ vecColBOffs[1] = 1;
+ vecColBOffs[2] = numColsB * CMPLX_DIM;
+ vecColBOffs[3] = (numColsB * CMPLX_DIM) + 1;
+
+ /*
+ * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
+ */
+
+ /*
+ * row loop
+ */
+ rowCnt = row >> 1;
+ while (rowCnt > 0u)
+ {
+ /*
+ * Output pointer is set to starting address of the row being processed
+ */
+ px = pOut + i * CMPLX_DIM;
+ i = i + 2 * numColsB;
+ /*
+ * For every row wise process, the column loop counter is to be initiated
+ */
+ col = numColsB;
+ /*
+ * For every row wise process, the pInB pointer is set
+ * to the starting address of the pSrcB data
+ */
+ pInB = (q31_t const *) pSrcB->pData;
+ /*
+ * column loop
+ */
+ while (col > 0u)
+ {
+ /*
+ * generate 4 columns elements
+ */
+ /*
+ * Matrix A columns number of MAC operations are to be performed
+ */
+
+ q31_t const *pSrcA0Vec, *pSrcA1Vec;
+ q31_t const *pInA0 = pInA;
+ q31_t const *pInA1 = pInA0 + numColsA * CMPLX_DIM;
+ q63_t acc0, acc1, acc2, acc3;
+
+ acc0 = 0LL;
+ acc1 = 0LL;
+ acc2 = 0LL;
+ acc3 = 0LL;
+
+ pSrcA0Vec = (q31_t const *) pInA0;
+ pSrcA1Vec = (q31_t const *) pInA1;
+
+
+ vecOffs = vecColBOffs;
+
+ /*
+ * process 1 x 2 block output
+ */
+ blkCnt = (numColsA * CMPLX_DIM) >> 2;
+ while (blkCnt > 0U)
+ {
+ q31x4_t vecB, vecA;
+
+ vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
+ /*
+ * move Matrix B read offsets, 2 rows down
+ */
+ vecOffs = vecOffs + (uint32_t) (numColsB * 2 * CMPLX_DIM);
+
+
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vmlsldavaq(acc0, vecA, vecB);
+ acc1 = vmlaldavaxq(acc1, vecA, vecB);
+
+
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+
+ acc2 = vmlsldavaq(acc2, vecA, vecB);
+ acc3 = vmlaldavaxq(acc3, vecA, vecB);
+
+
+ blkCnt--;
+ }
+
+
+ /*
+ * tail
+ */
+ blkCnt = (numColsA * CMPLX_DIM) & 3;
+ if (blkCnt > 0U)
+ {
+ /* predicate masks the lanes beyond the remaining elements */
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ q31x4_t vecB, vecA;
+
+ vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
+
+ /*
+ * move Matrix B read offsets, 2 rows down
+ */
+ vecOffs = vecOffs + (uint32_t) (numColsB * 2 * CMPLX_DIM);
+
+
+ vecA = vld1q(pSrcA0Vec);
+ acc0 = vmlsldavaq(acc0, vecA, vecB);
+ acc1 = vmlaldavaxq(acc1, vecA, vecB);
+ vecA = vld1q(pSrcA1Vec);
+ acc2 = vmlsldavaq(acc2, vecA, vecB);
+ acc3 = vmlaldavaxq(acc3, vecA, vecB);
+
+
+ }
+
+ /* narrow 2.62 accumulators to saturated 1.31 results, two rows at once */
+ px[0 * CMPLX_DIM * numColsB + 0] = (q31_t) clip_q63_to_q31(acc0 >> 31);
+ px[0 * CMPLX_DIM * numColsB + 1] = (q31_t) clip_q63_to_q31(acc1 >> 31);
+ px[1 * CMPLX_DIM * numColsB + 0] = (q31_t) clip_q63_to_q31(acc2 >> 31);
+ px[1 * CMPLX_DIM * numColsB + 1] = (q31_t) clip_q63_to_q31(acc3 >> 31);
+ px += CMPLX_DIM;
+ /*
+ * Decrement the column loop counter
+ */
+ col--;
+ /*
+ * Update the pointer pInB to point to the starting address of the next column
+ */
+ pInB = (q31_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
+ }
+
+ /*
+ * Update the pointer pInA to point to the starting address of the next row
+ */
+ pInA += (numColsA * 2) * CMPLX_DIM;
+ /*
+ * Decrement the row loop counter
+ */
+ rowCnt --;
+
+ }
+
+ /* leftover odd A row: same schedule, single accumulator pair */
+ rowCnt = row & 1;
+ while (rowCnt > 0u)
+ {
+ /*
+ * Output pointer is set to starting address of the row being processed
+ */
+ px = pOut + i * CMPLX_DIM;
+ i = i + numColsB;
+ /*
+ * For every row wise process, the column loop counter is to be initiated
+ */
+ col = numColsB;
+ /*
+ * For every row wise process, the pInB pointer is set
+ * to the starting address of the pSrcB data
+ */
+ pInB = (q31_t const *) pSrcB->pData;
+ /*
+ * column loop
+ */
+ while (col > 0u)
+ {
+ /*
+ * generate 4 columns elements
+ */
+ /*
+ * Matrix A columns number of MAC operations are to be performed
+ */
+
+ q31_t const *pSrcA0Vec;
+ q31_t const *pInA0 = pInA;
+ q63_t acc0,acc1;
+
+ acc0 = 0LL;
+ acc1 = 0LL;
+
+ pSrcA0Vec = (q31_t const *) pInA0;
+
+ vecOffs = vecColBOffs;
+
+ /*
+ * process 1 x 2 block output
+ */
+ blkCnt = (numColsA * CMPLX_DIM) >> 2;
+ while (blkCnt > 0U)
+ {
+ q31x4_t vecB, vecA;
+
+ vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
+ /*
+ * move Matrix B read offsets, 2 rows down
+ */
+ vecOffs = vecOffs + (uint32_t) (numColsB * 2 * CMPLX_DIM);
+
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vmlsldavaq(acc0, vecA, vecB);
+ acc1 = vmlaldavaxq(acc1, vecA, vecB);
+
+
+ blkCnt--;
+ }
+
+
+ /*
+ * tail
+ */
+ blkCnt = (numColsA * CMPLX_DIM) & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ q31x4_t vecB, vecA;
+
+ vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
+
+ /*
+ * move Matrix B read offsets, 2 rows down
+ */
+ vecOffs = vecOffs + (uint32_t) (numColsB * 2 * CMPLX_DIM);
+
+ vecA = vld1q(pSrcA0Vec);
+
+
+ acc0 = vmlsldavaq(acc0, vecA, vecB);
+ acc1 = vmlaldavaxq(acc1, vecA, vecB);
+
+
+ }
+
+ px[0] = (q31_t) clip_q63_to_q31(acc0 >> 31);
+ px[1] = (q31_t) clip_q63_to_q31(acc1 >> 31);
+
+
+ px += CMPLX_DIM;
+ /*
+ * Decrement the column loop counter
+ */
+ col--;
+ /*
+ * Update the pointer pInB to point to the starting address of the next column
+ */
+ pInB = (q31_t const *) pSrcB->pData + (numColsB - col) * CMPLX_DIM;
+ }
+
+ /*
+ * Update the pointer pInA to point to the starting address of the next row
+ */
+ pInA += numColsA * CMPLX_DIM;
+ rowCnt--;
+ }
+
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+/**
+ * @brief Q31 complex matrix multiplication (scalar fallback).
+ *
+ * For each output element, accumulates the complex dot product of an
+ * A row and a B column in 64-bit (2.62) accumulators:
+ *   re += a*c - b*d,  im += a*d + b*c
+ * then narrows to saturated 1.31 with clip_q63_to_q31(acc >> 31).
+ * With ARM_MATH_LOOPUNROLL the inner loop processes 4 complex MACs
+ * per iteration.  Interleaved storage: element k is {data[2k], data[2k+1]}.
+ *
+ * Fix vs. original: the q63_t accumulators were initialized with the
+ * floating-point literals 0.0, forcing an implicit double->int64
+ * conversion in fixed-point code; use integer 0 as done elsewhere in
+ * this file.  Comment typos ("Accumlates") corrected.
+ */
+arm_status arm_mat_cmplx_mult_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
+ q31_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
+ q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
+ q31_t *pOut = pDst->pData; /* Output data matrix pointer */
+ q31_t *px; /* Temporary output data matrix pointer */
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ q63_t sumReal, sumImag; /* Accumulator */
+ q31_t a1, b1, c1, d1;
+ uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t a0, b0, c0, d0;
+#endif
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* Output pointer is set to starting address of the row being processed */
+ px = pOut + 2 * i;
+
+ /* For every row wise process, the column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, the pIn2 pointer is set
+ ** to the starting address of the pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ j = 0U;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sumReal = 0;
+ sumImag = 0;
+
+ /* Initiate pointer pIn1 to point to starting address of column being processed */
+ pIn1 = pInA;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ colCnt = numColsA >> 2U;
+
+ /* matrix multiplication */
+ while (colCnt > 0U)
+ {
+
+ /* Reading real part of complex matrix A */
+ a0 = *pIn1;
+
+ /* Reading real part of complex matrix B */
+ c0 = *pIn2;
+
+ /* Reading imaginary part of complex matrix A */
+ b0 = *(pIn1 + 1U);
+
+ /* Reading imaginary part of complex matrix B */
+ d0 = *(pIn2 + 1U);
+
+ /* Multiply and Accumulates */
+ sumReal += (q63_t) a0 * c0;
+ sumImag += (q63_t) b0 * c0;
+
+ /* update pointers */
+ pIn1 += 2U;
+ pIn2 += 2 * numColsB;
+
+ /* Multiply and Accumulates */
+ sumReal -= (q63_t) b0 * d0;
+ sumImag += (q63_t) a0 * d0;
+
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ /* read real and imag values from pSrcA and pSrcB buffer */
+ a1 = *(pIn1 );
+ c1 = *(pIn2 );
+ b1 = *(pIn1 + 1U);
+ d1 = *(pIn2 + 1U);
+
+ /* Multiply and Accumulates */
+ sumReal += (q63_t) a1 * c1;
+ sumImag += (q63_t) b1 * c1;
+
+ /* update pointers */
+ pIn1 += 2U;
+ pIn2 += 2 * numColsB;
+
+ /* Multiply and Accumulates */
+ sumReal -= (q63_t) b1 * d1;
+ sumImag += (q63_t) a1 * d1;
+
+ a0 = *(pIn1 );
+ c0 = *(pIn2 );
+ b0 = *(pIn1 + 1U);
+ d0 = *(pIn2 + 1U);
+
+ /* Multiply and Accumulates */
+ sumReal += (q63_t) a0 * c0;
+ sumImag += (q63_t) b0 * c0;
+
+ /* update pointers */
+ pIn1 += 2U;
+ pIn2 += 2 * numColsB;
+
+ /* Multiply and Accumulates */
+ sumReal -= (q63_t) b0 * d0;
+ sumImag += (q63_t) a0 * d0;
+
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ a1 = *(pIn1 );
+ c1 = *(pIn2 );
+ b1 = *(pIn1 + 1U);
+ d1 = *(pIn2 + 1U);
+
+ /* Multiply and Accumulates */
+ sumReal += (q63_t) a1 * c1;
+ sumImag += (q63_t) b1 * c1;
+
+ /* update pointers */
+ pIn1 += 2U;
+ pIn2 += 2 * numColsB;
+
+ /* Multiply and Accumulates */
+ sumReal -= (q63_t) b1 * d1;
+ sumImag += (q63_t) a1 * d1;
+
+ /* Decrement loop count */
+ colCnt--;
+ }
+
+ /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ colCnt = numColsA % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+ a1 = *(pIn1 );
+ c1 = *(pIn2 );
+ b1 = *(pIn1 + 1U);
+ d1 = *(pIn2 + 1U);
+
+ /* Multiply and Accumulates */
+ sumReal += (q63_t) a1 * c1;
+ sumImag += (q63_t) b1 * c1;
+
+ /* update pointers */
+ pIn1 += 2U;
+ pIn2 += 2 * numColsB;
+
+ /* Multiply and Accumulates */
+ sumReal -= (q63_t) b1 * d1;
+ sumImag += (q63_t) a1 * d1;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Store result in destination buffer: 2.62 -> saturated 1.31 */
+ *px++ = (q31_t) clip_q63_to_q31(sumReal >> 31);
+ *px++ = (q31_t) clip_q63_to_q31(sumImag >> 31);
+
+ /* Update pointer pIn2 to point to starting address of next column */
+ j++;
+ pIn2 = pSrcB->pData + 2U * j;
+
+ /* Decrement column loop counter */
+ col--;
+
+ } while (col > 0U);
+
+ /* Update pointer pInA to point to starting address of next row */
+ i = i + numColsB;
+ pInA = pInA + 2 * numColsA;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U);
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of CmplxMatrixMult group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_f16.c
new file mode 100644
index 0000000000..9cc2c24062
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_f16.c
@@ -0,0 +1,131 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_cmplx_trans_f16.c
+ * Description: Floating-point complex matrix transpose
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupMatrix
+ */
+
+
+/**
+ @addtogroup MatrixComplexTrans
+ @{
+ */
+
+/**
+ @brief Floating-point complex matrix transpose.
+ @param[in] pSrc points to input matrix
+ @param[out] pDst points to output matrix
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+arm_status arm_mat_cmplx_trans_f16(const arm_matrix_instance_f16 * pSrc, arm_matrix_instance_f16 * pDst)
+{
+ return arm_mat_cmplx_trans_16bit(pSrc->numRows, pSrc->numCols, (uint16_t *) pSrc->pData,
+ pDst->numRows, pDst->numCols, (uint16_t *) pDst->pData);
+}
+
+#else
+arm_status arm_mat_cmplx_trans_f16(
+ const arm_matrix_instance_f16 * pSrc,
+ arm_matrix_instance_f16 * pDst)
+{
+ float16_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float16_t *pOut = pDst->pData; /* output data matrix pointer */
+ float16_t *px; /* Temporary output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of rows */
+ uint16_t nColumns = pSrc->numCols; /* number of columns */
+ uint16_t col, i = 0U, row = nRows; /* loop counters */
+ arm_status status; /* status of matrix transpose */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* The pointer px is set to starting address of the column being processed */
+ px = pOut + CMPLX_DIM * i;
+
+ /* Initialize column loop counter */
+ col = nColumns;
+
+ while (col > 0U)
+ {
+ /* Read and store the input element in the destination */
+ px[0] = *pIn++; // real
+ px[1] = *pIn++; // imag
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += CMPLX_DIM * nRows;
+
+ /* Decrement the column loop counter */
+ col--;
+ }
+ i++;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ } while (row > 0U); /* row loop end */
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixComplexTrans group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_f32.c
new file mode 100644
index 0000000000..222f349e44
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_f32.c
@@ -0,0 +1,133 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_cmplx_trans_f32.c
+ * Description: Floating-point complex matrix transpose
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @defgroup MatrixComplexTrans Complex Matrix Transpose
+
+ Transposes a complex matrix.
+
+ Transposing an M x N matrix flips it around the center diagonal and results in an N x M matrix.
+ \image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
+ */
+
+/**
+ @addtogroup MatrixComplexTrans
+ @{
+ */
+
+/**
+ @brief Floating-point complex matrix transpose.
+ @param[in] pSrc points to input matrix
+ @param[out] pDst points to output matrix
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+arm_status arm_mat_cmplx_trans_f32(const arm_matrix_instance_f32 * pSrc, arm_matrix_instance_f32 * pDst)
+{
+ return arm_mat_cmplx_trans_32bit(pSrc->numRows, pSrc->numCols, (uint32_t *) pSrc->pData,
+ pDst->numRows, pDst->numCols, (uint32_t *) pDst->pData);
+}
+
+#else
+arm_status arm_mat_cmplx_trans_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ float32_t *px; /* Temporary output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of rows */
+ uint16_t nColumns = pSrc->numCols; /* number of columns */
+ uint16_t col, i = 0U, row = nRows; /* loop counters */
+ arm_status status; /* status of matrix transpose */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* The pointer px is set to starting address of the column being processed */
+ px = pOut + CMPLX_DIM * i;
+
+ /* Initialize column loop counter */
+ col = nColumns;
+
+ while (col > 0U)
+ {
+ /* Read and store the input element in the destination */
+ px[0] = *pIn++; // real
+ px[1] = *pIn++; // imag
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += CMPLX_DIM * nRows;
+
+ /* Decrement the column loop counter */
+ col--;
+ }
+ i++;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ } while (row > 0U); /* row loop end */
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixComplexTrans group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_q15.c
new file mode 100644
index 0000000000..8a3724ee6e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_q15.c
@@ -0,0 +1,124 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_cmplx_trans_q15.c
+ * Description: Q15 complex matrix transpose
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixComplexTrans
+ @{
+ */
+
+/**
+ @brief Q15 complex matrix transpose.
+ @param[in] pSrc points to input matrix
+ @param[out] pDst points to output matrix
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+arm_status arm_mat_cmplx_trans_q15(const arm_matrix_instance_q15 * pSrc, arm_matrix_instance_q15 * pDst)
+{
+ return arm_mat_cmplx_trans_16bit(pSrc->numRows, pSrc->numCols, (uint16_t *) pSrc->pData,
+ pDst->numRows, pDst->numCols, (uint16_t *) pDst->pData);
+}
+
+
+#else
+arm_status arm_mat_cmplx_trans_q15(
+ const arm_matrix_instance_q15 * pSrc,
+ arm_matrix_instance_q15 * pDst)
+{
+ q15_t *pSrcA = pSrc->pData; /* input data matrix pointer */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of nRows */
+ uint16_t nColumns = pSrc->numCols; /* number of nColumns */
+ uint16_t col, row = nRows, i = 0U; /* row and column loop counters */
+ arm_status status; /* status of matrix transpose */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* The pointer pOut is set to starting address of the column being processed */
+ pOut = pDst->pData + CMPLX_DIM * i;
+
+ /* Initialize column loop counter */
+ col = nColumns;
+
+ while (col > 0U)
+ {
+ /* Read and store the input element in the destination */
+ pOut[0] = *pSrcA++; //real
+ pOut[1] = *pSrcA++; //imag
+
+ /* Update the pointer pOut to point to the next row of the transposed matrix */
+ pOut += CMPLX_DIM * nRows;
+
+ /* Decrement the column loop counter */
+ col--;
+ }
+
+ i++;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ } while (row > 0U);
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ * @} end of MatrixComplexTrans group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_q31.c
new file mode 100644
index 0000000000..a8612a30eb
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_cmplx_trans_q31.c
@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_cmplx_trans_q31.c
+ * Description: Q31 complex matrix transpose
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+
+
+/**
+ @addtogroup MatrixComplexTrans
+ @{
+ */
+
+/**
+ @brief Q31 complex matrix transpose.
+ @param[in] pSrc points to input matrix
+ @param[out] pDst points to output matrix
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+
+arm_status arm_mat_cmplx_trans_q31(const arm_matrix_instance_q31 * pSrc, arm_matrix_instance_q31 * pDst)
+{
+ return arm_mat_cmplx_trans_32bit(pSrc->numRows, pSrc->numCols, (uint32_t *) pSrc->pData,
+ pDst->numRows, pDst->numCols, (uint32_t *) pDst->pData);
+}
+
+
+#else
+arm_status arm_mat_cmplx_trans_q31(
+ const arm_matrix_instance_q31 * pSrc,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pIn = pSrc->pData; /* input data matrix pointer */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ q31_t *px; /* Temporary output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of nRows */
+ uint16_t nColumns = pSrc->numCols; /* number of nColumns */
+ uint16_t col, i = 0U, row = nRows; /* loop counters */
+ arm_status status; /* status of matrix transpose */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* The pointer px is set to starting address of the column being processed */
+ px = pOut + CMPLX_DIM * i;
+
+ /* Initialize column loop counter */
+ col = nColumns;
+
+ while (col > 0U)
+ {
+ /* Read and store the input element in the destination */
+ px[0] = *pIn++; // real
+ px[1] = *pIn++; // imag
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += CMPLX_DIM * nRows;
+
+ /* Decrement the column loop counter */
+ col--;
+ }
+
+ i++;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ }
+ while (row > 0U); /* row loop end */
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ * @} end of MatrixComplexTrans group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_f16.c
new file mode 100644
index 0000000000..306e9849dd
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_f16.c
@@ -0,0 +1,74 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_init_f16.c
+ * Description: Floating-point matrix initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupMatrix
+ */
+
+
+/**
+ @addtogroup MatrixInit
+ @{
+ */
+
+/**
+ @brief Floating-point matrix initialization.
+ @param[in,out] S points to an instance of the floating-point matrix structure
+ @param[in] nRows number of rows in the matrix
+ @param[in] nColumns number of columns in the matrix
+ @param[in] pData points to the matrix data array
+ @return none
+ */
+
+void arm_mat_init_f16(
+ arm_matrix_instance_f16 * S,
+ uint16_t nRows,
+ uint16_t nColumns,
+ float16_t * pData)
+{
+ /* Assign Number of Rows */
+ S->numRows = nRows;
+
+ /* Assign Number of Columns */
+ S->numCols = nColumns;
+
+ /* Assign Data pointer */
+ S->pData = pData;
+}
+
+/**
+ @} end of MatrixInit group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_f32.c
new file mode 100644
index 0000000000..cfd02ecca7
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_f32.c
@@ -0,0 +1,76 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_init_f32.c
+ * Description: Floating-point matrix initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @defgroup MatrixInit Matrix Initialization
+
+ Initializes the underlying matrix data structure.
+ The functions set the numRows,
+ numCols, and pData fields
+ of the matrix data structure.
+ */
+
+/**
+ @addtogroup MatrixInit
+ @{
+ */
+
+/**
+ @brief Floating-point matrix initialization.
+ @param[in,out] S points to an instance of the floating-point matrix structure
+ @param[in] nRows number of rows in the matrix
+ @param[in] nColumns number of columns in the matrix
+ @param[in] pData points to the matrix data array
+ @return none
+ */
+
+void arm_mat_init_f32(
+ arm_matrix_instance_f32 * S,
+ uint16_t nRows,
+ uint16_t nColumns,
+ float32_t * pData)
+{
+ /* Assign Number of Rows */
+ S->numRows = nRows;
+
+ /* Assign Number of Columns */
+ S->numCols = nColumns;
+
+ /* Assign Data pointer */
+ S->pData = pData;
+}
+
+/**
+ @} end of MatrixInit group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_q15.c
new file mode 100644
index 0000000000..33942b50bb
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_q15.c
@@ -0,0 +1,67 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_init_q15.c
+ * Description: Q15 matrix initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixInit
+ @{
+ */
+
+/**
+ @brief Q15 matrix initialization.
+ @param[in,out] S points to an instance of the floating-point matrix structure
+ @param[in] nRows number of rows in the matrix
+ @param[in] nColumns number of columns in the matrix
+ @param[in] pData points to the matrix data array
+ @return none
+ */
+
+void arm_mat_init_q15(
+ arm_matrix_instance_q15 * S,
+ uint16_t nRows,
+ uint16_t nColumns,
+ q15_t * pData)
+{
+ /* Assign Number of Rows */
+ S->numRows = nRows;
+
+ /* Assign Number of Columns */
+ S->numCols = nColumns;
+
+ /* Assign Data pointer */
+ S->pData = pData;
+}
+
+/**
+ @} end of MatrixInit group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_q31.c
new file mode 100644
index 0000000000..dd39910f70
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_init_q31.c
@@ -0,0 +1,72 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_init_q31.c
+ * Description: Q31 matrix initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @defgroup MatrixInit Matrix Initialization
+
+ */
+
+/**
+ @addtogroup MatrixInit
+ @{
+ */
+
+/**
+ @brief Q31 matrix initialization.
+ @param[in,out] S points to an instance of the Q31 matrix structure
+ @param[in] nRows number of rows in the matrix
+ @param[in] nColumns number of columns in the matrix
+ @param[in] pData points to the matrix data array
+ @return none
+ */
+
+void arm_mat_init_q31(
+ arm_matrix_instance_q31 * S,
+ uint16_t nRows,
+ uint16_t nColumns,
+ q31_t * pData)
+{
+ /* Assign Number of Rows */
+ S->numRows = nRows;
+
+ /* Assign Number of Columns */
+ S->numCols = nColumns;
+
+ /* Assign Data pointer */
+ S->pData = pData;
+}
+
+/**
+ @} end of MatrixInit group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f16.c
new file mode 100644
index 0000000000..4d6b9b07a0
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f16.c
@@ -0,0 +1,891 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_inverse_f16.c
+ * Description: Floating-point matrix inverse
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupMatrix
+ */
+
+
+/**
+ @addtogroup MatrixInv
+ @{
+ */
+
+/**
+ @brief Floating-point matrix inverse.
+ @param[in] pSrc points to input matrix structure. The source matrix is modified by the function.
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ - \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+arm_status arm_mat_inverse_f16(
+ const arm_matrix_instance_f16 * pSrc,
+ arm_matrix_instance_f16 * pDst)
+{
+ float16_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float16_t *pOut = pDst->pData; /* output data matrix pointer */
+ float16_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
+ float16_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
+ float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
+
+ uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
+ uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
+ float16_t *pTmpA, *pTmpB;
+
+ _Float16 in = 0.0f16; /* Temporary input values */
+ uint32_t i, rowCnt, flag = 0U, j, loopCnt, l; /* loop counters */
+ arm_status status; /* status of matrix inverse */
+ uint32_t blkCnt;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
+ || (pSrc->numRows != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+
+ /*--------------------------------------------------------------------------------------------------------------
+ * Matrix Inverse can be solved using elementary row operations.
+ *
+ * Gauss-Jordan Method:
+ *
+ * 1. First combine the identity matrix and the input matrix separated by a bar to form an
+ * augmented matrix as follows:
+ * _ _ _ _ _ _ _ _
+ * | | a11 a12 | | | 1 0 | | | X11 X12 |
+ * | | | | | | | = | |
+ * |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X22 _|
+ *
+ * 2. In our implementation, pDst Matrix is used as identity matrix.
+ *
+ * 3. Begin with the first row. Let i = 1.
+ *
+ * 4. Check to see if the pivot for row i is zero.
+ * The pivot is the element of the main diagonal that is on the current row.
+ * For instance, if working with row i, then the pivot element is aii.
+ * If the pivot is zero, exchange that row with a row below it that does not
+ * contain a zero in column i. If this is not possible, then an inverse
+ * to that matrix does not exist.
+ *
+ * 5. Divide every element of row i by the pivot.
+ *
+ * 6. For every row below and row i, replace that row with the sum of that row and
+ * a multiple of row i so that each new element in column i below row i is zero.
+ *
+ * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+ * for every element below and above the main diagonal.
+ *
+ * 8. Now an identity matrix is formed to the left of the bar(input matrix, src).
+ * Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
+ *----------------------------------------------------------------------------------------------------------------*/
+
+ /*
+ * Working pointer for destination matrix
+ */
+ pOutT1 = pOut;
+ /*
+ * Loop over the number of rows
+ */
+ rowCnt = numRows;
+ /*
+ * Making the destination matrix as identity matrix
+ */
+ while (rowCnt > 0U)
+ {
+ /*
+ * Writing all zeroes in lower triangle of the destination matrix
+ */
+ j = numRows - rowCnt;
+ while (j > 0U)
+ {
+ *pOutT1++ = 0.0f16;
+ j--;
+ }
+ /*
+ * Writing all ones in the diagonal of the destination matrix
+ */
+ *pOutT1++ = 1.0f16;
+ /*
+ * Writing all zeroes in upper triangle of the destination matrix
+ */
+ j = rowCnt - 1U;
+ while (j > 0U)
+ {
+ *pOutT1++ = 0.0f16;
+ j--;
+ }
+ /*
+ * Decrement the loop counter
+ */
+ rowCnt--;
+ }
+
+ /*
+ * Loop over the number of columns of the input matrix.
+ * All the elements in each column are processed by the row operations
+ */
+ loopCnt = numCols;
+ /*
+ * Index modifier to navigate through the columns
+ */
+ l = 0U;
+ while (loopCnt > 0U)
+ {
+ /*
+ * Check if the pivot element is zero..
+ * If it is zero then interchange the row with non zero row below.
+ * If there is no non zero element to replace in the rows below,
+ * then the matrix is Singular.
+ */
+
+ /*
+ * Working pointer for the input matrix that points
+ * * to the pivot element of the particular row
+ */
+ pInT1 = pIn + (l * numCols);
+ /*
+ * Working pointer for the destination matrix that points
+ * * to the pivot element of the particular row
+ */
+ pOutT1 = pOut + (l * numCols);
+ /*
+ * Temporary variable to hold the pivot value
+ */
+ in = *pInT1;
+
+
+ /*
+ * Check if the pivot element is zero
+ */
+ if (*pInT1 == 0.0f16)
+ {
+ /*
+ * Loop over the number of rows present below
+ */
+ for (i = 1U; i < numRows-l; i++)
+ {
+ /*
+ * Update the input and destination pointers
+ */
+ pInT2 = pInT1 + (numCols * i);
+ pOutT2 = pOutT1 + (numCols * i);
+ /*
+ * Check if there is a non zero pivot element to
+ * * replace in the rows below
+ */
+ if (*pInT2 != 0.0f16)
+ {
+ f16x8_t vecA, vecB;
+ /*
+ * Loop over number of columns
+ * * to the right of the pivot element
+ */
+ pTmpA = pInT1;
+ pTmpB = pInT2;
+ blkCnt = (numCols - l) >> 3;
+ while (blkCnt > 0U)
+ {
+
+ vecA = vldrhq_f16(pTmpA);
+ vecB = vldrhq_f16(pTmpB);
+ vstrhq_f16(pTmpB, vecA);
+ vstrhq_f16(pTmpA, vecB);
+
+ pTmpA += 8;
+ pTmpB += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = (numCols - l) & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+
+ vecA = vldrhq_f16(pTmpA);
+ vecB = vldrhq_f16(pTmpB);
+ vstrhq_p_f16(pTmpB, vecA, p0);
+ vstrhq_p_f16(pTmpA, vecB, p0);
+ }
+
+ pInT1 += numCols - l;
+ pInT2 += numCols - l;
+ pTmpA = pOutT1;
+ pTmpB = pOutT2;
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U)
+ {
+
+ vecA = vldrhq_f16(pTmpA);
+ vecB = vldrhq_f16(pTmpB);
+ vstrhq_f16(pTmpB, vecA);
+ vstrhq_f16(pTmpA, vecB);
+ pTmpA += 8;
+ pTmpB += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+
+ vecA = vldrhq_f16(pTmpA);
+ vecB = vldrhq_f16(pTmpB);
+ vstrhq_p_f16(pTmpB, vecA, p0);
+ vstrhq_p_f16(pTmpA, vecB, p0);
+ }
+
+ pOutT1 += numCols;
+ pOutT2 += numCols;
+ /*
+ * Flag to indicate whether exchange is done or not
+ */
+ flag = 1U;
+
+ /*
+ * Break after exchange is done
+ */
+ break;
+ }
+
+ }
+ }
+
+ /*
+ * Update the status if the matrix is singular
+ */
+ if ((flag != 1U) && (in == 0.0f16))
+ {
+ return ARM_MATH_SINGULAR;
+ }
+
+ /*
+ * Points to the pivot row of input and destination matrices
+ */
+ pPivotRowIn = pIn + (l * numCols);
+ pPivotRowDst = pOut + (l * numCols);
+
+ /*
+ * Temporary pointers to the pivot row pointers
+ */
+ pInT1 = pPivotRowIn;
+ pOutT1 = pPivotRowDst;
+
+ /*
+ * Pivot element of the row
+ */
+ in = *(pIn + (l * numCols));
+
+ pTmpA = pInT1;
+
+ f16x8_t invIn = vdupq_n_f16(1.0f16 / in); /* reciprocal of the pivot (non-zero here; zero pivots were handled above) */
+
+ blkCnt = (numCols - l) >> 3;
+ f16x8_t vecA;
+ while (blkCnt > 0U)
+ {
+ *(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA * invIn;
+ pTmpA += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ */
+ blkCnt = (numCols - l) & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+
+
+ vecA = vldrhq_f16(pTmpA);
+ vecA = vecA * invIn;
+ vstrhq_p_f16(pTmpA, vecA, p0);
+ }
+
+ pInT1 += numCols - l;
+ /*
+ * Loop over number of columns
+ * * to the right of the pivot element
+ */
+
+ pTmpA = pOutT1;
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U)
+ {
+ *(f16x8_t *) pTmpA = *(f16x8_t *) pTmpA *invIn;
+ pTmpA += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+
+ vecA = vldrhq_f16(pTmpA);
+ vecA = vecA * invIn;
+ vstrhq_p_f16(pTmpA, vecA, p0);
+ }
+
+ pOutT1 += numCols;
+
+ /*
+ * Replace the rows with the sum of that row and a multiple of row i
+ * * so that each new element in column i above row i is zero.
+ */
+
+ /*
+ * Temporary pointers for input and destination matrices
+ */
+ pInT1 = pIn;
+ pOutT1 = pOut;
+
+ for (i = 0U; i < numRows; i++)
+ {
+ /*
+ * Check for the pivot element
+ */
+ if (i == l)
+ {
+ /*
+ * If the processing element is the pivot element,
+ * only the columns to the right are to be processed
+ */
+ pInT1 += numCols - l;
+ pOutT1 += numCols;
+ }
+ else
+ {
+ /*
+ * Element of the reference row
+ */
+
+ /*
+ * Working pointers for input and destination pivot rows
+ */
+ pPRT_in = pPivotRowIn;
+ pPRT_pDst = pPivotRowDst;
+ /*
+ * Loop over the number of columns to the right of the pivot element,
+ * to replace the elements in the input matrix
+ */
+
+ in = *pInT1;
+ f16x8_t tmpV = vdupq_n_f16(in);
+
+ blkCnt = (numCols - l) >> 3;
+ while (blkCnt > 0U)
+ {
+ f16x8_t vec1, vec2;
+ /*
+ * Replace the element by the sum of that row
+ * and a multiple of the reference row
+ */
+ vec1 = vldrhq_f16(pInT1);
+ vec2 = vldrhq_f16(pPRT_in);
+ vec1 = vfmsq_f16(vec1, tmpV, vec2);
+ vstrhq_f16(pInT1, vec1);
+ pPRT_in += 8;
+ pInT1 += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = (numCols - l) & 7;
+ if (blkCnt > 0U)
+ {
+ f16x8_t vec1, vec2;
+ mve_pred16_t p0 = vctp16q(blkCnt);
+
+ vec1 = vldrhq_f16(pInT1);
+ vec2 = vldrhq_f16(pPRT_in);
+ vec1 = vfmsq_f16(vec1, tmpV, vec2);
+ vstrhq_p_f16(pInT1, vec1, p0);
+ pInT1 += blkCnt;
+ }
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U)
+ {
+ f16x8_t vec1, vec2;
+
+ /*
+ * Replace the element by the sum of that row
+ * and a multiple of the reference row
+ */
+ vec1 = vldrhq_f16(pOutT1);
+ vec2 = vldrhq_f16(pPRT_pDst);
+ vec1 = vfmsq_f16(vec1, tmpV, vec2);
+ vstrhq_f16(pOutT1, vec1);
+ pPRT_pDst += 8;
+ pOutT1 += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U)
+ {
+ f16x8_t vec1, vec2;
+ mve_pred16_t p0 = vctp16q(blkCnt);
+
+ vec1 = vldrhq_f16(pOutT1);
+ vec2 = vldrhq_f16(pPRT_pDst);
+ vec1 = vfmsq_f16(vec1, tmpV, vec2);
+ vstrhq_p_f16(pOutT1, vec1, p0);
+
+ pInT2 += blkCnt; /* NOTE(review): pInT2 is not read by this tail; pPRT_pDst may have been intended - confirm */
+ pOutT1 += blkCnt;
+ }
+ }
+ /*
+ * Increment the temporary input pointer
+ */
+ pInT1 = pInT1 + l;
+ }
+ /*
+ * Increment the input pointer
+ */
+ pIn++;
+ /*
+ * Decrement the loop counter
+ */
+ loopCnt--;
+ /*
+ * Increment the index modifier
+ */
+ l++;
+ }
+
+ /*
+ * Set status as ARM_MATH_SUCCESS
+ */
+ status = ARM_MATH_SUCCESS;
+
+ if ((flag != 1U) && (in == 0.0f16)) /* no row exchange happened and last pivot was zero: check for the all-zero matrix */
+ {
+ pIn = pSrc->pData;
+ for (i = 0; i < numRows * numCols; i++)
+ {
+ if (pIn[i] != 0.0f16)
+ break;
+ }
+
+ if (i == numRows * numCols)
+ status = ARM_MATH_SINGULAR;
+ }
+ }
+ /* Return to application */
+ return (status);
+}
+
+#else
+
+arm_status arm_mat_inverse_f16(
+ const arm_matrix_instance_f16 * pSrc,
+ arm_matrix_instance_f16 * pDst)
+{
+ float16_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float16_t *pOut = pDst->pData; /* output data matrix pointer */
+ float16_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
+ float16_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
+ float16_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
+ uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
+ uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
+
+ _Float16 Xchg, in = 0.0f16, in1; /* Temporary input values */
+ uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l; /* loop counters */
+ arm_status status; /* status of matrix inverse */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+
+ /*--------------------------------------------------------------------------------------------------------------
+ * Matrix Inverse can be solved using elementary row operations.
+ *
+ * Gauss-Jordan Method:
+ *
+ * 1. First combine the identity matrix and the input matrix separated by a bar to form an
+ * augmented matrix as follows:
+ * _ _ _ _
+ * | a11 a12 | 1 0 | | X11 X12 |
+ * | | | = | |
+ * |_ a21 a22 | 0 1 _| |_ X21 X22 _|
+ *
+ * 2. In our implementation, pDst Matrix is used as identity matrix.
+ *
+ * 3. Begin with the first row. Let i = 1.
+ *
+ * 4. Check to see if the pivot for row i is zero.
+ * The pivot is the element of the main diagonal that is on the current row.
+ * For instance, if working with row i, then the pivot element is aii.
+ * If the pivot is zero, exchange that row with a row below it that does not
+ * contain a zero in column i. If this is not possible, then an inverse
+ * to that matrix does not exist.
+ *
+ * 5. Divide every element of row i by the pivot.
+ *
+ * 6. For every row below and row i, replace that row with the sum of that row and
+ * a multiple of row i so that each new element in column i below row i is zero.
+ *
+ * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+ * for every element below and above the main diagonal.
+ *
+ * 8. Now an identity matrix is formed to the left of the bar(input matrix, pSrc).
+ * Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
+ *----------------------------------------------------------------------------------------------------------------*/
+
+ /* Working pointer for destination matrix */
+ pOutT1 = pOut;
+
+ /* Loop over the number of rows */
+ rowCnt = numRows;
+
+ /* Making the destination matrix as identity matrix */
+ while (rowCnt > 0U)
+ {
+ /* Writing all zeroes in lower triangle of the destination matrix */
+ j = numRows - rowCnt;
+ while (j > 0U)
+ {
+ *pOutT1++ = 0.0f16;
+ j--;
+ }
+
+ /* Writing all ones in the diagonal of the destination matrix */
+ *pOutT1++ = 1.0f16;
+
+ /* Writing all zeroes in upper triangle of the destination matrix */
+ j = rowCnt - 1U;
+ while (j > 0U)
+ {
+ *pOutT1++ = 0.0f16;
+ j--;
+ }
+
+ /* Decrement loop counter */
+ rowCnt--;
+ }
+
+ /* Loop over the number of columns of the input matrix.
+ All the elements in each column are processed by the row operations */
+ loopCnt = numCols;
+
+ /* Index modifier to navigate through the columns */
+ l = 0U;
+
+ while (loopCnt > 0U)
+ {
+ /* Check if the pivot element is zero..
+ * If it is zero then interchange the row with non zero row below.
+ * If there is no non zero element to replace in the rows below,
+ * then the matrix is Singular. */
+
+ /* Working pointer for the input matrix that points
+ * to the pivot element of the particular row */
+ pInT1 = pIn + (l * numCols);
+
+ /* Working pointer for the destination matrix that points
+ * to the pivot element of the particular row */
+ pOutT1 = pOut + (l * numCols);
+
+ /* Temporary variable to hold the pivot value */
+ in = *pInT1;
+
+
+ /* Check if the pivot element is zero */
+ if (*pInT1 == 0.0f16)
+ {
+ /* Loop over the number of rows present below */
+
+ for (i = 1U; i < numRows-l; i++)
+ {
+ /* Update the input and destination pointers */
+ pInT2 = pInT1 + (numCols * i);
+ pOutT2 = pOutT1 + (numCols * i);
+
+ /* Check if there is a non zero pivot element to
+ * replace in the rows below */
+ if (*pInT2 != 0.0f16)
+ {
+ /* Loop over number of columns
+ * to the right of the pivot element */
+ j = numCols - l;
+
+ while (j > 0U)
+ {
+ /* Exchange the row elements of the input matrix */
+ Xchg = *pInT2;
+ *pInT2++ = *pInT1;
+ *pInT1++ = Xchg;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Loop over number of columns of the destination matrix */
+ j = numCols;
+
+ while (j > 0U)
+ {
+ /* Exchange the row elements of the destination matrix */
+ Xchg = *pOutT2;
+ *pOutT2++ = *pOutT1;
+ *pOutT1++ = Xchg;
+
+ /* Decrement loop counter */
+ j--;
+ }
+
+ /* Flag to indicate whether exchange is done or not */
+ flag = 1U;
+
+ /* Break after exchange is done */
+ break;
+ }
+
+ }
+ }
+
+ /* Update the status if the matrix is singular */
+ if ((flag != 1U) && (in == 0.0f16))
+ {
+ return ARM_MATH_SINGULAR;
+ }
+
+ /* Points to the pivot row of input and destination matrices */
+ pPivotRowIn = pIn + (l * numCols);
+ pPivotRowDst = pOut + (l * numCols);
+
+ /* Temporary pointers to the pivot row pointers */
+ pInT1 = pPivotRowIn;
+ pInT2 = pPivotRowDst;
+
+ /* Pivot element of the row */
+ in = *pPivotRowIn;
+
+ /* Loop over number of columns
+ * to the right of the pivot element */
+ j = (numCols - l);
+
+ while (j > 0U)
+ {
+ /* Divide each element of the row of the input matrix
+ * by the pivot element */
+ in1 = *pInT1;
+ *pInT1++ = in1 / in;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Loop over number of columns of the destination matrix */
+ j = numCols;
+
+ while (j > 0U)
+ {
+ /* Divide each element of the row of the destination matrix
+ * by the pivot element */
+ in1 = *pInT2;
+ *pInT2++ = in1 / in;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Replace the rows with the sum of that row and a multiple of row i
+ * so that each new element in column i above row i is zero.*/
+
+ /* Temporary pointers for input and destination matrices */
+ pInT1 = pIn;
+ pInT2 = pOut;
+
+ /* index used to check for pivot element */
+ i = 0U;
+
+ /* Loop over number of rows */
+ /* to be replaced by the sum of that row and a multiple of row i */
+ k = numRows;
+
+ while (k > 0U)
+ {
+ /* Check for the pivot element */
+ if (i == l)
+ {
+ /* If the processing element is the pivot element,
+ only the columns to the right are to be processed */
+ pInT1 += numCols - l;
+
+ pInT2 += numCols;
+ }
+ else
+ {
+ /* Element of the reference row */
+ in = *pInT1;
+
+ /* Working pointers for input and destination pivot rows */
+ pPRT_in = pPivotRowIn;
+ pPRT_pDst = pPivotRowDst;
+
+ /* Loop over the number of columns to the right of the pivot element,
+ to replace the elements in the input matrix */
+ j = (numCols - l);
+
+ while (j > 0U)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ in1 = *pInT1;
+ *pInT1++ = in1 - (in * *pPRT_in++);
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Loop over the number of columns to
+ replace the elements in the destination matrix */
+ j = numCols;
+
+ while (j > 0U)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ in1 = *pInT2;
+ *pInT2++ = in1 - (in * *pPRT_pDst++);
+
+ /* Decrement loop counter */
+ j--;
+ }
+
+ }
+
+ /* Increment temporary input pointer */
+ pInT1 = pInT1 + l;
+
+ /* Decrement loop counter */
+ k--;
+
+ /* Increment pivot index */
+ i++;
+ }
+
+ /* Increment the input pointer */
+ pIn++;
+
+ /* Decrement the loop counter */
+ loopCnt--;
+
+ /* Increment the index modifier */
+ l++;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+
+ if ((flag != 1U) && (in == 0.0f16)) /* no row exchange happened and last pivot was zero: check for the all-zero matrix */
+ {
+ pIn = pSrc->pData;
+ for (i = 0; i < numRows * numCols; i++)
+ {
+ if (pIn[i] != 0.0f16)
+ break;
+ }
+
+ if (i == numRows * numCols)
+ status = ARM_MATH_SINGULAR;
+ }
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixInv group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c
new file mode 100644
index 0000000000..cc4a548b50
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f32.c
@@ -0,0 +1,1570 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_inverse_f32.c
+ * Description: Floating-point matrix inverse
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @defgroup MatrixInv Matrix Inverse
+
+ Computes the inverse of a matrix.
+
+ The inverse is defined only if the input matrix is square and non-singular (the determinant is non-zero).
+ The function checks that the input and output matrices are square and of the same size.
+
+ Matrix inversion is numerically sensitive and the CMSIS DSP library only supports matrix
+ inversion of floating-point matrices.
+
+ @par Algorithm
+ The Gauss-Jordan method is used to find the inverse.
+ The algorithm performs a sequence of elementary row-operations until it
+ reduces the input matrix to an identity matrix. Applying the same sequence
+ of elementary row-operations to an identity matrix yields the inverse matrix.
+ If the input matrix is singular, then the algorithm terminates and returns error status
+ ARM_MATH_SINGULAR.
+ \image html MatrixInverse.gif "Matrix Inverse of a 3 x 3 matrix using Gauss-Jordan Method"
+ */
+
+/**
+ @addtogroup MatrixInv
+ @{
+ */
+
+/**
+ @brief Floating-point matrix inverse.
+ @param[in] pSrc points to input matrix structure. The source matrix is modified by the function.
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ - \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+arm_status arm_mat_inverse_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ float32_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
+ float32_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
+ float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
+
+ uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
+ uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
+ float32_t *pTmpA, *pTmpB;
+
+ float32_t in = 0.0f; /* Temporary input values */
+ uint32_t i, rowCnt, flag = 0U, j, loopCnt, l; /* loop counters */
+ arm_status status; /* status of matrix inverse */
+ uint32_t blkCnt;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
+ || (pSrc->numRows != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+
+ /*--------------------------------------------------------------------------------------------------------------
+ * Matrix Inverse can be solved using elementary row operations.
+ *
+ * Gauss-Jordan Method:
+ *
+ * 1. First combine the identity matrix and the input matrix separated by a bar to form an
+ * augmented matrix as follows:
+ * _ _ _ _ _ _ _ _
+ * | | a11 a12 | | | 1 0 | | | X11 X12 |
+ * | | | | | | | = | |
+ * |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
+ *
+ * 2. In our implementation, pDst Matrix is used as identity matrix.
+ *
+ * 3. Begin with the first row. Let i = 1.
+ *
+ * 4. Check to see if the pivot for row i is zero.
+ * The pivot is the element of the main diagonal that is on the current row.
+ * For instance, if working with row i, then the pivot element is aii.
+ * If the pivot is zero, exchange that row with a row below it that does not
+ * contain a zero in column i. If this is not possible, then an inverse
+ * to that matrix does not exist.
+ *
+ * 5. Divide every element of row i by the pivot.
+ *
+ * 6. For every row below and row i, replace that row with the sum of that row and
+ * a multiple of row i so that each new element in column i below row i is zero.
+ *
+ * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+ * for every element below and above the main diagonal.
+ *
+ * 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
+ * Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
+ *----------------------------------------------------------------------------------------------------------------*/
+
+ /*
+ * Working pointer for destination matrix
+ */
+ pOutT1 = pOut;
+ /*
+ * Loop over the number of rows
+ */
+ rowCnt = numRows;
+ /*
+ * Making the destination matrix as identity matrix
+ */
+ while (rowCnt > 0U)
+ {
+ /*
+ * Writing all zeroes in lower triangle of the destination matrix
+ */
+ j = numRows - rowCnt;
+ while (j > 0U)
+ {
+ *pOutT1++ = 0.0f;
+ j--;
+ }
+ /*
+ * Writing all ones in the diagonal of the destination matrix
+ */
+ *pOutT1++ = 1.0f;
+ /*
+ * Writing all zeroes in upper triangle of the destination matrix
+ */
+ j = rowCnt - 1U;
+ while (j > 0U)
+ {
+ *pOutT1++ = 0.0f;
+ j--;
+ }
+ /*
+ * Decrement the loop counter
+ */
+ rowCnt--;
+ }
+
+ /*
+ * Loop over the number of columns of the input matrix.
+ * All the elements in each column are processed by the row operations
+ */
+ loopCnt = numCols;
+ /*
+ * Index modifier to navigate through the columns
+ */
+ l = 0U;
+ while (loopCnt > 0U)
+ {
+ /*
+ * Check if the pivot element is zero..
+ * If it is zero then interchange the row with non zero row below.
+ * If there is no non zero element to replace in the rows below,
+ * then the matrix is Singular.
+ */
+
+ /*
+ * Working pointer for the input matrix that points
+ * * to the pivot element of the particular row
+ */
+ pInT1 = pIn + (l * numCols);
+ /*
+ * Working pointer for the destination matrix that points
+ * * to the pivot element of the particular row
+ */
+ pOutT1 = pOut + (l * numCols);
+ /*
+ * Temporary variable to hold the pivot value
+ */
+ in = *pInT1;
+
+
+ /*
+ * Check if the pivot element is zero
+ */
+ if (*pInT1 == 0.0f)
+ {
+ /*
+ * Loop over the number rows present below
+ */
+ for (i = 1U; i < numRows-l; i++)
+ {
+ /*
+ * Update the input and destination pointers
+ */
+ pInT2 = pInT1 + (numCols * i);
+ pOutT2 = pOutT1 + (numCols * i);
+ /*
+ * Check if there is a non zero pivot element to
+ * * replace in the rows below
+ */
+ if (*pInT2 != 0.0f)
+ {
+ f32x4_t vecA, vecB;
+ /*
+ * Loop over number of columns
+ * * to the right of the pilot element
+ */
+ pTmpA = pInT1;
+ pTmpB = pInT2;
+ blkCnt = (numCols - l) >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vecA = vldrwq_f32(pTmpA);
+ vecB = vldrwq_f32(pTmpB);
+ vstrwq_f32(pTmpB, vecA);
+ vstrwq_f32(pTmpA, vecB);
+
+ pTmpA += 4;
+ pTmpB += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = (numCols - l) & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+
+ vecA = vldrwq_f32(pTmpA);
+ vecB = vldrwq_f32(pTmpB);
+ vstrwq_p_f32(pTmpB, vecA, p0);
+ vstrwq_p_f32(pTmpA, vecB, p0);
+ }
+
+ pInT1 += numCols - l;
+ pInT2 += numCols - l;
+ pTmpA = pOutT1;
+ pTmpB = pOutT2;
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vecA = vldrwq_f32(pTmpA);
+ vecB = vldrwq_f32(pTmpB);
+ vstrwq_f32(pTmpB, vecA);
+ vstrwq_f32(pTmpA, vecB);
+ pTmpA += 4;
+ pTmpB += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+
+ vecA = vldrwq_f32(pTmpA);
+ vecB = vldrwq_f32(pTmpB);
+ vstrwq_p_f32(pTmpB, vecA, p0);
+ vstrwq_p_f32(pTmpA, vecB, p0);
+ }
+
+ pOutT1 += numCols;
+ pOutT2 += numCols;
+ /*
+ * Flag to indicate whether exchange is done or not
+ */
+ flag = 1U;
+
+ /*
+ * Break after exchange is done
+ */
+ break;
+ }
+
+ }
+ }
+
+ /*
+ * Update the status if the matrix is singular
+ */
+ if ((flag != 1U) && (in == 0.0f))
+ {
+ return ARM_MATH_SINGULAR;
+ }
+
+ /*
+ * Points to the pivot row of input and destination matrices
+ */
+ pPivotRowIn = pIn + (l * numCols);
+ pPivotRowDst = pOut + (l * numCols);
+
+ /*
+ * Temporary pointers to the pivot row pointers
+ */
+ pInT1 = pPivotRowIn;
+ pOutT1 = pPivotRowDst;
+
+ /*
+ * Pivot element of the row
+ */
+ in = *(pIn + (l * numCols));
+
+ pTmpA = pInT1;
+
+ f32x4_t invIn = vdupq_n_f32(1.0f / in);
+
+ blkCnt = (numCols - l) >> 2;
+ f32x4_t vecA;
+ while (blkCnt > 0U)
+ {
+ *(f32x4_t *) pTmpA = *(f32x4_t *) pTmpA * invIn;
+ pTmpA += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ */
+ blkCnt = (numCols - l) & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+
+
+ vecA = vldrwq_f32(pTmpA);
+ vecA = vecA * invIn;
+ vstrwq_p_f32(pTmpA, vecA, p0);
+ }
+
+ pInT1 += numCols - l;
+ /*
+ * Loop over number of columns
+ * * to the right of the pilot element
+ */
+
+ pTmpA = pOutT1;
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U)
+ {
+ *(f32x4_t *) pTmpA = *(f32x4_t *) pTmpA *invIn;
+ pTmpA += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+
+ vecA = vldrwq_f32(pTmpA);
+ vecA = vecA * invIn;
+ vstrwq_p_f32(pTmpA, vecA, p0);
+ }
+
+ pOutT1 += numCols;
+
+ /*
+ * Replace the rows with the sum of that row and a multiple of row i
+ * * so that each new element in column i above row i is zero.
+ */
+
+ /*
+ * Temporary pointers for input and destination matrices
+ */
+ pInT1 = pIn;
+ pOutT1 = pOut;
+
+ for (i = 0U; i < numRows; i++)
+ {
+ /*
+ * Check for the pivot element
+ */
+ if (i == l)
+ {
+ /*
+ * If the processing element is the pivot element,
+ * only the columns to the right are to be processed
+ */
+ pInT1 += numCols - l;
+ pOutT1 += numCols;
+ }
+ else
+ {
+ /*
+ * Element of the reference row
+ */
+
+ /*
+ * Working pointers for input and destination pivot rows
+ */
+ pPRT_in = pPivotRowIn;
+ pPRT_pDst = pPivotRowDst;
+ /*
+ * Loop over the number of columns to the right of the pivot element,
+ * to replace the elements in the input matrix
+ */
+
+ in = *pInT1;
+ f32x4_t tmpV = vdupq_n_f32(in);
+
+ blkCnt = (numCols - l) >> 2;
+ while (blkCnt > 0U)
+ {
+ f32x4_t vec1, vec2;
+ /*
+ * Replace the element by the sum of that row
+ * and a multiple of the reference row
+ */
+ vec1 = vldrwq_f32(pInT1);
+ vec2 = vldrwq_f32(pPRT_in);
+ vec1 = vfmsq_f32(vec1, tmpV, vec2);
+ vstrwq_f32(pInT1, vec1);
+ pPRT_in += 4;
+ pInT1 += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = (numCols - l) & 3;
+ if (blkCnt > 0U)
+ {
+ f32x4_t vec1, vec2;
+ mve_pred16_t p0 = vctp32q(blkCnt);
+
+ vec1 = vldrwq_f32(pInT1);
+ vec2 = vldrwq_f32(pPRT_in);
+ vec1 = vfmsq_f32(vec1, tmpV, vec2);
+ vstrwq_p_f32(pInT1, vec1, p0);
+ pInT1 += blkCnt;
+ }
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U)
+ {
+ f32x4_t vec1, vec2;
+
+ /*
+ * Replace the element by the sum of that row
+ * and a multiple of the reference row
+ */
+ vec1 = vldrwq_f32(pOutT1);
+ vec2 = vldrwq_f32(pPRT_pDst);
+ vec1 = vfmsq_f32(vec1, tmpV, vec2);
+ vstrwq_f32(pOutT1, vec1);
+ pPRT_pDst += 4;
+ pOutT1 += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U)
+ {
+ f32x4_t vec1, vec2;
+ mve_pred16_t p0 = vctp32q(blkCnt);
+
+ vec1 = vldrwq_f32(pOutT1);
+ vec2 = vldrwq_f32(pPRT_pDst);
+ vec1 = vfmsq_f32(vec1, tmpV, vec2);
+ vstrwq_p_f32(pOutT1, vec1, p0);
+
+ pInT2 += blkCnt;
+ pOutT1 += blkCnt;
+ }
+ }
+ /*
+ * Increment the temporary input pointer
+ */
+ pInT1 = pInT1 + l;
+ }
+ /*
+ * Increment the input pointer
+ */
+ pIn++;
+ /*
+ * Decrement the loop counter
+ */
+ loopCnt--;
+ /*
+ * Increment the index modifier
+ */
+ l++;
+ }
+
+ /*
+ * Set status as ARM_MATH_SUCCESS
+ */
+ status = ARM_MATH_SUCCESS;
+
+ if ((flag != 1U) && (in == 0.0f))
+ {
+ pIn = pSrc->pData;
+ for (i = 0; i < numRows * numCols; i++)
+ {
+ if (pIn[i] != 0.0f)
+ break;
+ }
+
+ if (i == numRows * numCols)
+ status = ARM_MATH_SINGULAR;
+ }
+ }
+ /* Return to application */
+ return (status);
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+/* Gauss-Jordan matrix inversion, NEON-accelerated variant.
+ * The source matrix (pSrc) is reduced in place to the identity while pDst,
+ * initialised to the identity below, accumulates the inverse.
+ * Returns ARM_MATH_SINGULAR when no non-zero pivot exists for a column. */
+arm_status arm_mat_inverse_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ float32_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
+ float32_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
+ float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
+ uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
+ uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
+
+
+ float32_t Xchg, in = 0.0f, in1; /* Temporary input values */
+ uint32_t i, rowCnt, flag = 0U, j, loopCnt, k, l; /* loop counters */
+ arm_status status; /* status of matrix inverse */
+ float32x4_t vec1;
+ float32x4_t vec2;
+ float32x4_t tmpV;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
+ || (pSrc->numRows != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /*--------------------------------------------------------------------------------------------------------------
+ * Matrix Inverse can be solved using elementary row operations.
+ *
+ * Gauss-Jordan Method:
+ *
+ * 1. First combine the identity matrix and the input matrix separated by a bar to form an
+ * augmented matrix as follows:
+ * _ _ _ _
+ * | a11 a12 | 1 0 | | X11 X12 |
+ * | | | = | |
+ * |_ a21 a22 | 0 1 _| |_ X21 X22 _|
+ *
+ * 2. In our implementation, pDst Matrix is used as identity matrix.
+ *
+ * 3. Begin with the first row. Let i = 1.
+ *
+ * 4. Check to see if the pivot for row i is zero.
+ * The pivot is the element of the main diagonal that is on the current row.
+ * For instance, if working with row i, then the pivot element is aii.
+ * If the pivot is zero, exchange that row with a row below it that does not
+ * contain a zero in column i. If this is not possible, then an inverse
+ * to that matrix does not exist.
+ *
+ * 5. Divide every element of row i by the pivot.
+ *
+ * 6. For every row below and row i, replace that row with the sum of that row and
+ * a multiple of row i so that each new element in column i below row i is zero.
+ *
+ * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+ * for every element below and above the main diagonal.
+ *
+ * 8. Now an identity matrix is formed to the left of the bar(input matrix, pSrc).
+ * Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
+ *----------------------------------------------------------------------------------------------------------------*/
+
+ /* Working pointer for destination matrix */
+ pOutT1 = pOut;
+
+ /* Loop over the number of rows */
+ rowCnt = numRows;
+
+ /* Making the destination matrix as identity matrix */
+ while (rowCnt > 0U)
+ {
+ /* Writing all zeroes in lower triangle of the destination matrix */
+ j = numRows - rowCnt;
+ while (j > 0U)
+ {
+ *pOutT1++ = 0.0f;
+ j--;
+ }
+
+ /* Writing all ones in the diagonal of the destination matrix */
+ *pOutT1++ = 1.0f;
+
+ /* Writing all zeroes in upper triangle of the destination matrix */
+ j = rowCnt - 1U;
+
+ while (j > 0U)
+ {
+ *pOutT1++ = 0.0f;
+ j--;
+ }
+
+ /* Decrement the loop counter */
+ rowCnt--;
+ }
+
+ /* Loop over the number of columns of the input matrix.
+ All the elements in each column are processed by the row operations */
+ loopCnt = numCols;
+
+ /* Index modifier to navigate through the columns */
+ l = 0U;
+
+ while (loopCnt > 0U)
+ {
+ /* Check if the pivot element is zero..
+ * If it is zero then interchange the row with non zero row below.
+ * If there is no non zero element to replace in the rows below,
+ * then the matrix is Singular. */
+
+ /* Working pointer for the input matrix that points
+ * to the pivot element of the particular row */
+ pInT1 = pIn + (l * numCols);
+
+ /* Working pointer for the destination matrix that points
+ * to the pivot element of the particular row */
+ pOutT1 = pOut + (l * numCols);
+
+ /* Temporary variable to hold the pivot value */
+ in = *pInT1;
+
+ /* Check if the pivot element is zero */
+ if (*pInT1 == 0.0f)
+ {
+ /* Loop over the number rows present below */
+ for (i = 1U; i < numRows - l; i++)
+ {
+ /* Update the input and destination pointers */
+ pInT2 = pInT1 + (numCols * i);
+ pOutT2 = pOutT1 + (numCols * i);
+
+ /* Check if there is a non zero pivot element to
+ * replace in the rows below */
+ if (*pInT2 != 0.0f)
+ {
+ /* Loop over number of columns
+ * to the right of the pivot element */
+ j = numCols - l;
+
+ while (j > 0U)
+ {
+ /* Exchange the row elements of the input matrix */
+ Xchg = *pInT2;
+ *pInT2++ = *pInT1;
+ *pInT1++ = Xchg;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Loop over number of columns of the destination matrix */
+ j = numCols;
+
+ while (j > 0U)
+ {
+ /* Exchange the row elements of the destination matrix */
+ Xchg = *pOutT2;
+ *pOutT2++ = *pOutT1;
+ *pOutT1++ = Xchg;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Flag to indicate whether exchange is done or not */
+ flag = 1U;
+
+ /* Break after exchange is done */
+ break;
+ }
+
+
+ }
+ }
+
+ /* Update the status if the matrix is singular */
+ if ((flag != 1U) && (in == 0.0f))
+ {
+ return ARM_MATH_SINGULAR;
+ }
+
+ /* Points to the pivot row of input and destination matrices */
+ pPivotRowIn = pIn + (l * numCols);
+ pPivotRowDst = pOut + (l * numCols);
+
+ /* Temporary pointers to the pivot row pointers */
+ pInT1 = pPivotRowIn;
+ pInT2 = pPivotRowDst;
+
+ /* Pivot element of the row */
+ in = *pPivotRowIn;
+ tmpV = vdupq_n_f32(1.0f/in);
+
+ /* Loop over number of columns
+ * to the right of the pivot element */
+ j = (numCols - l) >> 2;
+
+ while (j > 0U)
+ {
+ /* Divide each element of the row of the input matrix
+ * by the pivot element */
+ vec1 = vld1q_f32(pInT1);
+
+ vec1 = vmulq_f32(vec1, tmpV);
+ vst1q_f32(pInT1, vec1);
+ pInT1 += 4;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Tail */
+ j = (numCols - l) & 3;
+
+ while (j > 0U)
+ {
+ /* Divide each element of the row of the input matrix
+ * by the pivot element */
+ in1 = *pInT1;
+ *pInT1++ = in1 / in;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Loop over number of columns of the destination matrix */
+ j = numCols >> 2;
+
+ while (j > 0U)
+ {
+ /* Divide each element of the row of the destination matrix
+ * by the pivot element */
+ vec1 = vld1q_f32(pInT2);
+
+ vec1 = vmulq_f32(vec1, tmpV);
+ vst1q_f32(pInT2, vec1);
+ pInT2 += 4;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Tail */
+ j = numCols & 3;
+
+ while (j > 0U)
+ {
+ /* Divide each element of the row of the destination matrix
+ * by the pivot element */
+ in1 = *pInT2;
+ *pInT2++ = in1 / in;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Replace the rows with the sum of that row and a multiple of row i
+ * so that each new element in column i above row i is zero.*/
+
+ /* Temporary pointers for input and destination matrices */
+ pInT1 = pIn;
+ pInT2 = pOut;
+
+ /* index used to check for pivot element */
+ i = 0U;
+
+ /* Loop over number of rows */
+ /* to be replaced by the sum of that row and a multiple of row i */
+ k = numRows;
+
+ while (k > 0U)
+ {
+ /* Check for the pivot element */
+ if (i == l)
+ {
+ /* If the processing element is the pivot element,
+ only the columns to the right are to be processed */
+ pInT1 += numCols - l;
+
+ pInT2 += numCols;
+ }
+ else
+ {
+ /* Element of the reference row */
+ in = *pInT1;
+ tmpV = vdupq_n_f32(in);
+
+ /* Working pointers for input and destination pivot rows */
+ pPRT_in = pPivotRowIn;
+ pPRT_pDst = pPivotRowDst;
+
+ /* Loop over the number of columns to the right of the pivot element,
+ to replace the elements in the input matrix */
+ j = (numCols - l) >> 2;
+
+ while (j > 0U)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ vec1 = vld1q_f32(pInT1);
+ vec2 = vld1q_f32(pPRT_in);
+ vec1 = vmlsq_f32(vec1, tmpV, vec2);
+ vst1q_f32(pInT1, vec1);
+ pPRT_in += 4;
+ pInT1 += 4;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Tail */
+ j = (numCols - l) & 3;
+
+ while (j > 0U)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ in1 = *pInT1;
+ *pInT1++ = in1 - (in * *pPRT_in++);
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Loop over the number of columns to
+ replace the elements in the destination matrix */
+ j = numCols >> 2;
+
+ while (j > 0U)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ vec1 = vld1q_f32(pInT2);
+ vec2 = vld1q_f32(pPRT_pDst);
+ vec1 = vmlsq_f32(vec1, tmpV, vec2);
+ vst1q_f32(pInT2, vec1);
+ pPRT_pDst += 4;
+ pInT2 += 4;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Tail */
+ j = numCols & 3;
+
+ while (j > 0U)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ in1 = *pInT2;
+ *pInT2++ = in1 - (in * *pPRT_pDst++);
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ }
+
+ /* Increment the temporary input pointer */
+ pInT1 = pInT1 + l;
+
+ /* Decrement the loop counter */
+ k--;
+
+ /* Increment the pivot index */
+ i++;
+ }
+
+ /* Increment the input pointer */
+ pIn++;
+
+ /* Decrement the loop counter */
+ loopCnt--;
+
+ /* Increment the index modifier */
+ l++;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+
+ /* If no pivot exchange ever happened and the last pivot was zero,
+ re-scan the (now reduced) source: an all-zero matrix confirms singularity */
+ if ((flag != 1U) && (in == 0.0f))
+ {
+ pIn = pSrc->pData;
+ for (i = 0; i < numRows * numCols; i++)
+ {
+ if (pIn[i] != 0.0f)
+ break;
+ }
+
+ if (i == numRows * numCols)
+ status = ARM_MATH_SINGULAR;
+ }
+ }
+ /* Return to application */
+ return (status);
+}
+#else
+/* Gauss-Jordan matrix inversion, scalar variant.
+ * Two builds share this function: an ARM_MATH_DSP path with manually
+ * structured while-loops, and a generic for-loop fallback. In both, pSrc is
+ * reduced in place to the identity while pDst accumulates the inverse. */
+arm_status arm_mat_inverse_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ float32_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
+ float32_t *pOutT1, *pOutT2; /* Temporary output data matrix pointer */
+ float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
+ uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
+ uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
+
+#if defined (ARM_MATH_DSP)
+
+ float32_t Xchg, in = 0.0f, in1; /* Temporary input values */
+ uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l; /* loop counters */
+ arm_status status; /* status of matrix inverse */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+
+ /*--------------------------------------------------------------------------------------------------------------
+ * Matrix Inverse can be solved using elementary row operations.
+ *
+ * Gauss-Jordan Method:
+ *
+ * 1. First combine the identity matrix and the input matrix separated by a bar to form an
+ * augmented matrix as follows:
+ * _ _ _ _
+ * | a11 a12 | 1 0 | | X11 X12 |
+ * | | | = | |
+ * |_ a21 a22 | 0 1 _| |_ X21 X22 _|
+ *
+ * 2. In our implementation, pDst Matrix is used as identity matrix.
+ *
+ * 3. Begin with the first row. Let i = 1.
+ *
+ * 4. Check to see if the pivot for row i is zero.
+ * The pivot is the element of the main diagonal that is on the current row.
+ * For instance, if working with row i, then the pivot element is aii.
+ * If the pivot is zero, exchange that row with a row below it that does not
+ * contain a zero in column i. If this is not possible, then an inverse
+ * to that matrix does not exist.
+ *
+ * 5. Divide every element of row i by the pivot.
+ *
+ * 6. For every row below and row i, replace that row with the sum of that row and
+ * a multiple of row i so that each new element in column i below row i is zero.
+ *
+ * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+ * for every element below and above the main diagonal.
+ *
+ * 8. Now an identity matrix is formed to the left of the bar(input matrix, pSrc).
+ * Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
+ *----------------------------------------------------------------------------------------------------------------*/
+
+ /* Working pointer for destination matrix */
+ pOutT1 = pOut;
+
+ /* Loop over the number of rows */
+ rowCnt = numRows;
+
+ /* Making the destination matrix as identity matrix */
+ while (rowCnt > 0U)
+ {
+ /* Writing all zeroes in lower triangle of the destination matrix */
+ j = numRows - rowCnt;
+ while (j > 0U)
+ {
+ *pOutT1++ = 0.0f;
+ j--;
+ }
+
+ /* Writing all ones in the diagonal of the destination matrix */
+ *pOutT1++ = 1.0f;
+
+ /* Writing all zeroes in upper triangle of the destination matrix */
+ j = rowCnt - 1U;
+ while (j > 0U)
+ {
+ *pOutT1++ = 0.0f;
+ j--;
+ }
+
+ /* Decrement loop counter */
+ rowCnt--;
+ }
+
+ /* Loop over the number of columns of the input matrix.
+ All the elements in each column are processed by the row operations */
+ loopCnt = numCols;
+
+ /* Index modifier to navigate through the columns */
+ l = 0U;
+
+ while (loopCnt > 0U)
+ {
+ /* Check if the pivot element is zero..
+ * If it is zero then interchange the row with non zero row below.
+ * If there is no non zero element to replace in the rows below,
+ * then the matrix is Singular. */
+
+ /* Working pointer for the input matrix that points
+ * to the pivot element of the particular row */
+ pInT1 = pIn + (l * numCols);
+
+ /* Working pointer for the destination matrix that points
+ * to the pivot element of the particular row */
+ pOutT1 = pOut + (l * numCols);
+
+ /* Temporary variable to hold the pivot value */
+ in = *pInT1;
+
+
+
+ /* Check if the pivot element is zero */
+ if (*pInT1 == 0.0f)
+ {
+ /* Loop over the number rows present below */
+
+ for (i = 1U; i < numRows - l; i++)
+ {
+ /* Update the input and destination pointers */
+ pInT2 = pInT1 + (numCols * i);
+ pOutT2 = pOutT1 + (numCols * i);
+
+ /* Check if there is a non zero pivot element to
+ * replace in the rows below */
+ if (*pInT2 != 0.0f)
+ {
+ /* Loop over number of columns
+ * to the right of the pivot element */
+ j = numCols - l;
+
+ while (j > 0U)
+ {
+ /* Exchange the row elements of the input matrix */
+ Xchg = *pInT2;
+ *pInT2++ = *pInT1;
+ *pInT1++ = Xchg;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Loop over number of columns of the destination matrix */
+ j = numCols;
+
+ while (j > 0U)
+ {
+ /* Exchange the row elements of the destination matrix */
+ Xchg = *pOutT2;
+ *pOutT2++ = *pOutT1;
+ *pOutT1++ = Xchg;
+
+ /* Decrement loop counter */
+ j--;
+ }
+
+ /* Flag to indicate whether exchange is done or not */
+ flag = 1U;
+
+ /* Break after exchange is done */
+ break;
+ }
+
+
+ }
+ }
+
+ /* Update the status if the matrix is singular */
+ if ((flag != 1U) && (in == 0.0f))
+ {
+ return ARM_MATH_SINGULAR;
+ }
+
+ /* Points to the pivot row of input and destination matrices */
+ pPivotRowIn = pIn + (l * numCols);
+ pPivotRowDst = pOut + (l * numCols);
+
+ /* Temporary pointers to the pivot row pointers */
+ pInT1 = pPivotRowIn;
+ pInT2 = pPivotRowDst;
+
+ /* Pivot element of the row */
+ in = *pPivotRowIn;
+
+ /* Loop over number of columns
+ * to the right of the pivot element */
+ j = (numCols - l);
+
+ while (j > 0U)
+ {
+ /* Divide each element of the row of the input matrix
+ * by the pivot element */
+ in1 = *pInT1;
+ *pInT1++ = in1 / in;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Loop over number of columns of the destination matrix */
+ j = numCols;
+
+ while (j > 0U)
+ {
+ /* Divide each element of the row of the destination matrix
+ * by the pivot element */
+ in1 = *pInT2;
+ *pInT2++ = in1 / in;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Replace the rows with the sum of that row and a multiple of row i
+ * so that each new element in column i above row i is zero.*/
+
+ /* Temporary pointers for input and destination matrices */
+ pInT1 = pIn;
+ pInT2 = pOut;
+
+ /* index used to check for pivot element */
+ i = 0U;
+
+ /* Loop over number of rows */
+ /* to be replaced by the sum of that row and a multiple of row i */
+ k = numRows;
+
+ while (k > 0U)
+ {
+ /* Check for the pivot element */
+ if (i == l)
+ {
+ /* If the processing element is the pivot element,
+ only the columns to the right are to be processed */
+ pInT1 += numCols - l;
+
+ pInT2 += numCols;
+ }
+ else
+ {
+ /* Element of the reference row */
+ in = *pInT1;
+
+ /* Working pointers for input and destination pivot rows */
+ pPRT_in = pPivotRowIn;
+ pPRT_pDst = pPivotRowDst;
+
+ /* Loop over the number of columns to the right of the pivot element,
+ to replace the elements in the input matrix */
+ j = (numCols - l);
+
+ while (j > 0U)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ in1 = *pInT1;
+ *pInT1++ = in1 - (in * *pPRT_in++);
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Loop over the number of columns to
+ replace the elements in the destination matrix */
+ j = numCols;
+
+ while (j > 0U)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ in1 = *pInT2;
+ *pInT2++ = in1 - (in * *pPRT_pDst++);
+
+ /* Decrement loop counter */
+ j--;
+ }
+
+ }
+
+ /* Increment temporary input pointer */
+ pInT1 = pInT1 + l;
+
+ /* Decrement loop counter */
+ k--;
+
+ /* Increment pivot index */
+ i++;
+ }
+
+ /* Increment the input pointer */
+ pIn++;
+
+ /* Decrement the loop counter */
+ loopCnt--;
+
+ /* Increment the index modifier */
+ l++;
+ }
+
+
+#else
+
+ float32_t Xchg, in = 0.0f; /* Temporary input values */
+ uint32_t i, rowCnt, flag = 0U, j, loopCnt, l; /* loop counters */
+ arm_status status; /* status of matrix inverse */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pSrc->numCols) ||
+ (pDst->numRows != pDst->numCols) ||
+ (pSrc->numRows != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+
+ /*--------------------------------------------------------------------------------------------------------------
+ * Matrix Inverse can be solved using elementary row operations.
+ *
+ * Gauss-Jordan Method:
+ *
+ * 1. First combine the identity matrix and the input matrix separated by a bar to form an
+ * augmented matrix as follows:
+ * _ _ _ _
+ * | a11 a12 | 1 0 | | X11 X12 |
+ * | | | = | |
+ * |_ a21 a22 | 0 1 _| |_ X21 X22 _|
+ *
+ * 2. In our implementation, pDst Matrix is used as identity matrix.
+ *
+ * 3. Begin with the first row. Let i = 1.
+ *
+ * 4. Check to see if the pivot for row i is zero.
+ * The pivot is the element of the main diagonal that is on the current row.
+ * For instance, if working with row i, then the pivot element is aii.
+ * If the pivot is zero, exchange that row with a row below it that does not
+ * contain a zero in column i. If this is not possible, then an inverse
+ * to that matrix does not exist.
+ *
+ * 5. Divide every element of row i by the pivot.
+ *
+ * 6. For every row below and row i, replace that row with the sum of that row and
+ * a multiple of row i so that each new element in column i below row i is zero.
+ *
+ * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+ * for every element below and above the main diagonal.
+ *
+ * 8. Now an identity matrix is formed to the left of the bar(input matrix, pSrc).
+ * Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
+ *----------------------------------------------------------------------------------------------------------------*/
+
+ /* Working pointer for destination matrix */
+ pOutT1 = pOut;
+
+ /* Loop over the number of rows */
+ rowCnt = numRows;
+
+ /* Making the destination matrix as identity matrix */
+ while (rowCnt > 0U)
+ {
+ /* Writing all zeroes in lower triangle of the destination matrix */
+ j = numRows - rowCnt;
+ while (j > 0U)
+ {
+ *pOutT1++ = 0.0f;
+ j--;
+ }
+
+ /* Writing all ones in the diagonal of the destination matrix */
+ *pOutT1++ = 1.0f;
+
+ /* Writing all zeroes in upper triangle of the destination matrix */
+ j = rowCnt - 1U;
+ while (j > 0U)
+ {
+ *pOutT1++ = 0.0f;
+ j--;
+ }
+
+ /* Decrement loop counter */
+ rowCnt--;
+ }
+
+ /* Loop over the number of columns of the input matrix.
+ All the elements in each column are processed by the row operations */
+ loopCnt = numCols;
+
+ /* Index modifier to navigate through the columns */
+ l = 0U;
+
+ while (loopCnt > 0U)
+ {
+ /* Check if the pivot element is zero..
+ * If it is zero then interchange the row with non zero row below.
+ * If there is no non zero element to replace in the rows below,
+ * then the matrix is Singular. */
+
+ /* Working pointer for the input matrix that points
+ * to the pivot element of the particular row */
+ pInT1 = pIn + (l * numCols);
+
+ /* Working pointer for the destination matrix that points
+ * to the pivot element of the particular row */
+ pOutT1 = pOut + (l * numCols);
+
+ /* Temporary variable to hold the pivot value */
+ in = *pInT1;
+
+ /* Check if the pivot element is zero */
+ if (*pInT1 == 0.0f)
+ {
+ /* Loop over the number rows present below */
+ for (i = 1U; i < numRows-l; i++)
+ {
+ /* Update the input and destination pointers */
+ pInT2 = pInT1 + (numCols * i);
+ pOutT2 = pOutT1 + (numCols * i);
+
+ /* Check if there is a non zero pivot element to
+ * replace in the rows below */
+ if (*pInT2 != 0.0f)
+ {
+ /* Loop over number of columns
+ * to the right of the pivot element */
+ for (j = 0U; j < (numCols - l); j++)
+ {
+ /* Exchange the row elements of the input matrix */
+ Xchg = *pInT2;
+ *pInT2++ = *pInT1;
+ *pInT1++ = Xchg;
+ }
+
+ /* Exchange the full rows of the destination matrix */
+ for (j = 0U; j < numCols; j++)
+ {
+ Xchg = *pOutT2;
+ *pOutT2++ = *pOutT1;
+ *pOutT1++ = Xchg;
+ }
+
+ /* Flag to indicate whether exchange is done or not */
+ flag = 1U;
+
+ /* Break after exchange is done */
+ break;
+ }
+ }
+ }
+
+
+ /* Update the status if the matrix is singular */
+ if ((flag != 1U) && (in == 0.0f))
+ {
+ return ARM_MATH_SINGULAR;
+ }
+
+ /* Points to the pivot row of input and destination matrices */
+ pPivotRowIn = pIn + (l * numCols);
+ pPivotRowDst = pOut + (l * numCols);
+
+ /* Temporary pointers to the pivot row pointers */
+ pInT1 = pPivotRowIn;
+ pOutT1 = pPivotRowDst;
+
+ /* Pivot element of the row */
+ in = *(pIn + (l * numCols));
+
+ /* Loop over number of columns
+ * to the right of the pivot element */
+ for (j = 0U; j < (numCols - l); j++)
+ {
+ /* Divide each element of the row of the input matrix
+ * by the pivot element */
+ *pInT1 = *pInT1 / in;
+ pInT1++;
+ }
+ for (j = 0U; j < numCols; j++)
+ {
+ /* Divide each element of the row of the destination matrix
+ * by the pivot element */
+ *pOutT1 = *pOutT1 / in;
+ pOutT1++;
+ }
+
+ /* Replace the rows with the sum of that row and a multiple of row i
+ * so that each new element in column i above row i is zero.*/
+
+ /* Temporary pointers for input and destination matrices */
+ pInT1 = pIn;
+ pOutT1 = pOut;
+
+ for (i = 0U; i < numRows; i++)
+ {
+ /* Check for the pivot element */
+ if (i == l)
+ {
+ /* If the processing element is the pivot element,
+ only the columns to the right are to be processed */
+ pInT1 += numCols - l;
+ pOutT1 += numCols;
+ }
+ else
+ {
+ /* Element of the reference row */
+ in = *pInT1;
+
+ /* Working pointers for input and destination pivot rows */
+ pPRT_in = pPivotRowIn;
+ pPRT_pDst = pPivotRowDst;
+
+ /* Loop over the number of columns to the right of the pivot element,
+ to replace the elements in the input matrix */
+ for (j = 0U; j < (numCols - l); j++)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ *pInT1 = *pInT1 - (in * *pPRT_in++);
+ pInT1++;
+ }
+
+ /* Loop over the number of columns to
+ replace the elements in the destination matrix */
+ for (j = 0U; j < numCols; j++)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ *pOutT1 = *pOutT1 - (in * *pPRT_pDst++);
+ pOutT1++;
+ }
+
+ }
+
+ /* Increment temporary input pointer */
+ pInT1 = pInT1 + l;
+ }
+
+ /* Increment the input pointer */
+ pIn++;
+
+ /* Decrement the loop counter */
+ loopCnt--;
+
+ /* Increment the index modifier */
+ l++;
+ }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+
+ /* If no pivot exchange ever happened and the last pivot was zero,
+ re-scan the (now reduced) source: an all-zero matrix confirms singularity */
+ if ((flag != 1U) && (in == 0.0f))
+ {
+ pIn = pSrc->pData;
+ for (i = 0; i < numRows * numCols; i++)
+ {
+ if (pIn[i] != 0.0f)
+ break;
+ }
+
+ if (i == numRows * numCols)
+ status = ARM_MATH_SINGULAR;
+ }
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixInv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f64.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f64.c
new file mode 100644
index 0000000000..3d350d982d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_inverse_f64.c
@@ -0,0 +1,644 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_inverse_f64.c
+ * Description: Floating-point matrix inverse
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+
+/**
+ @addtogroup MatrixInv
+ @{
+ */
+
+/**
+ @brief Floating-point (64 bit) matrix inverse.
+ @param[in] pSrc points to input matrix structure. The source matrix is modified by the function.
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ - \ref ARM_MATH_SINGULAR : Input matrix is found to be singular (non-invertible)
+ */
+
+arm_status arm_mat_inverse_f64(
+  const arm_matrix_instance_f64 * pSrc,
+  arm_matrix_instance_f64 * pDst)
+{
+  float64_t *pIn = pSrc->pData;                  /* input data matrix pointer */
+  float64_t *pOut = pDst->pData;                 /* output data matrix pointer */
+  float64_t *pInT1, *pInT2;                      /* Temporary input data matrix pointer */
+  float64_t *pOutT1, *pOutT2;                    /* Temporary output data matrix pointer */
+  float64_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst;  /* Temporary input and output data matrix pointer */
+  uint32_t numRows = pSrc->numRows;              /* Number of rows in the matrix */
+  uint32_t numCols = pSrc->numCols;              /* Number of Cols in the matrix */
+
+#if defined (ARM_MATH_DSP)
+
+  float64_t Xchg, in = 0.0, in1;                 /* Temporary input values */
+  uint32_t i, rowCnt, flag = 0U, j, loopCnt, k,l;  /* loop counters */
+  arm_status status;                             /* status of matrix inverse */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pDst->numRows != pDst->numCols) ||
+      (pSrc->numRows != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+
+    /*--------------------------------------------------------------------------------------------------------------
+     * Matrix Inverse can be solved using elementary row operations.
+     *
+     *  Gauss-Jordan Method:
+     *
+     *     1. First combine the identity matrix and the input matrix separated by a bar to form an
+     *        augmented matrix as follows:
+     *                      _                  _         _          _
+     *                     |  a11  a12 | 1   0  |       |  X11 X12   |
+     *                     |           |        |   =   |            |
+     *                     |_ a21  a22 | 0   1 _|       |_ X21 X21  _|
+     *
+     *     2. In our implementation, pDst Matrix is used as identity matrix.
+     *
+     *     3. Begin with the first row. Let i = 1.
+     *
+     *     4. Check to see if the pivot for row i is zero.
+     *        The pivot is the element of the main diagonal that is on the current row.
+     *        For instance, if working with row i, then the pivot element is aii.
+     *        If the pivot is zero, exchange that row with a row below it that does not
+     *        contain a zero in column i. If this is not possible, then an inverse
+     *        to that matrix does not exist.
+     *
+     *     5. Divide every element of row i by the pivot.
+     *
+     *     6. For every row below and  row i, replace that row with the sum of that row and
+     *        a multiple of row i so that each new element in column i below row i is zero.
+     *
+     *     7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+     *        for every element below and above the main diagonal.
+     *
+     *     8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
+     *        Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
+     *----------------------------------------------------------------------------------------------------------------*/
+
+    /* Working pointer for destination matrix */
+    pOutT1 = pOut;
+
+    /* Loop over the number of rows */
+    rowCnt = numRows;
+
+    /* Making the destination matrix as identity matrix */
+    while (rowCnt > 0U)
+    {
+      /* Writing all zeroes in lower triangle of the destination matrix */
+      j = numRows - rowCnt;
+      while (j > 0U)
+      {
+        *pOutT1++ = 0.0;
+        j--;
+      }
+
+      /* Writing all ones in the diagonal of the destination matrix */
+      *pOutT1++ = 1.0;
+
+      /* Writing all zeroes in upper triangle of the destination matrix */
+      j = rowCnt - 1U;
+      while (j > 0U)
+      {
+        *pOutT1++ = 0.0;
+        j--;
+      }
+
+      /* Decrement loop counter */
+      rowCnt--;
+    }
+
+    /* Loop over the number of columns of the input matrix.
+       All the elements in each column are processed by the row operations */
+    loopCnt = numCols;
+
+    /* Index modifier to navigate through the columns */
+    l = 0U;
+
+    while (loopCnt > 0U)
+    {
+      /* Check if the pivot element is zero..
+       * If it is zero then interchange the row with non zero row below.
+       * If there is no non zero element to replace in the rows below,
+       * then the matrix is Singular. */
+
+      /* Working pointer for the input matrix that points
+       * to the pivot element of the particular row  */
+      pInT1 = pIn + (l * numCols);
+
+      /* Working pointer for the destination matrix that points
+       * to the pivot element of the particular row  */
+      pOutT1 = pOut + (l * numCols);
+
+      /* Temporary variable to hold the pivot value
+         (re-read from pPivotRowIn below after any row exchange) */
+      in = *pInT1;
+
+
+
+      /* Check if the pivot element is zero */
+      if (*pInT1 == 0.0)
+      {
+        /* Loop over the number rows present below */
+
+        for (i = 1U; i < numRows - l; i++)
+        {
+          /* Update the input and destination pointers */
+          pInT2 = pInT1 + (numCols * i);
+          pOutT2 = pOutT1 + (numCols * i);
+
+          /* Check if there is a non zero pivot element to
+           * replace in the rows below */
+          if (*pInT2 != 0.0)
+          {
+            /* Loop over number of columns
+             * to the right of the pilot element */
+            j = numCols - l;
+
+            while (j > 0U)
+            {
+              /* Exchange the row elements of the input matrix */
+              Xchg = *pInT2;
+              *pInT2++ = *pInT1;
+              *pInT1++ = Xchg;
+
+              /* Decrement the loop counter */
+              j--;
+            }
+
+            /* Loop over number of columns of the destination matrix */
+            j = numCols;
+
+            while (j > 0U)
+            {
+              /* Exchange the row elements of the destination matrix */
+              Xchg = *pOutT2;
+              *pOutT2++ = *pOutT1;
+              *pOutT1++ = Xchg;
+
+              /* Decrement loop counter */
+              j--;
+            }
+
+            /* Flag to indicate whether exchange is done or not */
+            flag = 1U;
+
+            /* Break after exchange is done */
+            break;
+          }
+
+
+          /* Decrement loop counter */
+        }
+      }
+
+      /* Update the status if the matrix is singular.
+         NOTE(review): 'flag' is set on the first successful exchange and is
+         never cleared, so this early-out only fires while no exchange has
+         happened yet (matches upstream CMSIS-DSP behavior). */
+      if ((flag != 1U) && (in == 0.0))
+      {
+        return ARM_MATH_SINGULAR;
+      }
+
+      /* Points to the pivot row of input and destination matrices */
+      pPivotRowIn = pIn + (l * numCols);
+      pPivotRowDst = pOut + (l * numCols);
+
+      /* Temporary pointers to the pivot row pointers */
+      pInT1 = pPivotRowIn;
+      pInT2 = pPivotRowDst;
+
+      /* Pivot element of the row */
+      in = *pPivotRowIn;
+
+      /* Loop over number of columns
+       * to the right of the pilot element */
+      j = (numCols - l);
+
+      while (j > 0U)
+      {
+        /* Divide each element of the row of the input matrix
+         * by the pivot element */
+        in1 = *pInT1;
+        *pInT1++ = in1 / in;
+
+        /* Decrement the loop counter */
+        j--;
+      }
+
+      /* Loop over number of columns of the destination matrix */
+      j = numCols;
+
+      while (j > 0U)
+      {
+        /* Divide each element of the row of the destination matrix
+         * by the pivot element */
+        in1 = *pInT2;
+        *pInT2++ = in1 / in;
+
+        /* Decrement the loop counter */
+        j--;
+      }
+
+      /* Replace the rows with the sum of that row and a multiple of row i
+       * so that each new element in column i above row i is zero.*/
+
+      /* Temporary pointers for input and destination matrices */
+      pInT1 = pIn;
+      pInT2 = pOut;
+
+      /* index used to check for pivot element */
+      i = 0U;
+
+      /* Loop over number of rows */
+      /*  to be replaced by the sum of that row and a multiple of row i */
+      k = numRows;
+
+      while (k > 0U)
+      {
+        /* Check for the pivot element */
+        if (i == l)
+        {
+          /* If the processing element is the pivot element,
+             only the columns to the right are to be processed */
+          pInT1 += numCols - l;
+
+          pInT2 += numCols;
+        }
+        else
+        {
+          /* Element of the reference row */
+          in = *pInT1;
+
+          /* Working pointers for input and destination pivot rows */
+          pPRT_in = pPivotRowIn;
+          pPRT_pDst = pPivotRowDst;
+
+          /* Loop over the number of columns to the right of the pivot element,
+             to replace the elements in the input matrix */
+          j = (numCols - l);
+
+          while (j > 0U)
+          {
+            /* Replace the element by the sum of that row
+               and a multiple of the reference row  */
+            in1 = *pInT1;
+            *pInT1++ = in1 - (in * *pPRT_in++);
+
+            /* Decrement the loop counter */
+            j--;
+          }
+
+          /* Loop over the number of columns to
+             replace the elements in the destination matrix */
+          j = numCols;
+
+          while (j > 0U)
+          {
+            /* Replace the element by the sum of that row
+               and a multiple of the reference row */
+            in1 = *pInT2;
+            *pInT2++ = in1 - (in * *pPRT_pDst++);
+
+            /* Decrement loop counter */
+            j--;
+          }
+
+        }
+
+        /* Increment temporary input pointer */
+        pInT1 = pInT1 + l;
+
+        /* Decrement loop counter */
+        k--;
+
+        /* Increment pivot index */
+        i++;
+      }
+
+      /* Increment the input pointer */
+      pIn++;
+
+      /* Decrement the loop counter */
+      loopCnt--;
+
+      /* Increment the index modifier */
+      l++;
+    }
+
+
+#else
+
+  float64_t Xchg, in = 0.0;                      /* Temporary input values */
+  uint32_t i, rowCnt, flag = 0U, j, loopCnt, l;  /* loop counters */
+  arm_status status;                             /* status of matrix inverse */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pDst->numRows != pDst->numCols) ||
+      (pSrc->numRows != pDst->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+
+    /*--------------------------------------------------------------------------------------------------------------
+     * Matrix Inverse can be solved using elementary row operations.
+     *
+     *  Gauss-Jordan Method:
+     *
+     *     1. First combine the identity matrix and the input matrix separated by a bar to form an
+     *        augmented matrix as follows:
+     *                      _  _          _     _      _   _         _         _
+     *                     | |  a11  a12  | |  | 1   0  | |   =   |  X11 X12  |
+     *                     | |            | |  |        | |       |           |
+     *                     |_ |_ a21  a22 _| | |_0   1 _| _|      |_ X21 X21 _|
+     *
+     *     2. In our implementation, pDst Matrix is used as identity matrix.
+     *
+     *     3. Begin with the first row. Let i = 1.
+     *
+     *     4. Check to see if the pivot for row i is zero.
+     *        The pivot is the element of the main diagonal that is on the current row.
+     *        For instance, if working with row i, then the pivot element is aii.
+     *        If the pivot is zero, exchange that row with a row below it that does not
+     *        contain a zero in column i. If this is not possible, then an inverse
+     *        to that matrix does not exist.
+     *
+     *     5. Divide every element of row i by the pivot.
+     *
+     *     6. For every row below and  row i, replace that row with the sum of that row and
+     *        a multiple of row i so that each new element in column i below row i is zero.
+     *
+     *     7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+     *        for every element below and above the main diagonal.
+     *
+     *     8. Now an identical matrix is formed to the left of the bar(input matrix, src).
+     *        Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
+     *----------------------------------------------------------------------------------------------------------------*/
+
+    /* Working pointer for destination matrix */
+    pOutT1 = pOut;
+
+    /* Loop over the number of rows */
+    rowCnt = numRows;
+
+    /* Making the destination matrix as identity matrix */
+    while (rowCnt > 0U)
+    {
+      /* Writing all zeroes in lower triangle of the destination matrix */
+      j = numRows - rowCnt;
+      while (j > 0U)
+      {
+        *pOutT1++ = 0.0;
+        j--;
+      }
+
+      /* Writing all ones in the diagonal of the destination matrix */
+      *pOutT1++ = 1.0;
+
+      /* Writing all zeroes in upper triangle of the destination matrix */
+      j = rowCnt - 1U;
+      while (j > 0U)
+      {
+        *pOutT1++ = 0.0;
+        j--;
+      }
+
+      /* Decrement loop counter */
+      rowCnt--;
+    }
+
+    /* Loop over the number of columns of the input matrix.
+       All the elements in each column are processed by the row operations */
+    loopCnt = numCols;
+
+    /* Index modifier to navigate through the columns */
+    l = 0U;
+
+    while (loopCnt > 0U)
+    {
+      /* Check if the pivot element is zero..
+       * If it is zero then interchange the row with non zero row below.
+       * If there is no non zero element to replace in the rows below,
+       * then the matrix is Singular. */
+
+      /* Working pointer for the input matrix that points
+       * to the pivot element of the particular row  */
+      pInT1 = pIn + (l * numCols);
+
+      /* Working pointer for the destination matrix that points
+       * to the pivot element of the particular row  */
+      pOutT1 = pOut + (l * numCols);
+
+      /* Temporary variable to hold the pivot value */
+      in = *pInT1;
+
+      /* Check if the pivot element is zero */
+      if (*pInT1 == 0.0)
+      {
+        /* Loop over the number rows present below */
+        for (i = 1U; i < numRows-l; i++)
+        {
+          /* Update the input and destination pointers */
+          pInT2 = pInT1 + (numCols * i);
+          pOutT2 = pOutT1 + (numCols * i);
+
+          /* Check if there is a non zero pivot element to
+           * replace in the rows below */
+          if (*pInT2 != 0.0)
+          {
+            /* Loop over number of columns
+             * to the right of the pilot element */
+            for (j = 0U; j < (numCols - l); j++)
+            {
+              /* Exchange the row elements of the input matrix */
+              Xchg = *pInT2;
+              *pInT2++ = *pInT1;
+              *pInT1++ = Xchg;
+            }
+
+            /* Exchange the full rows of the destination (identity) matrix */
+            for (j = 0U; j < numCols; j++)
+            {
+              Xchg = *pOutT2;
+              *pOutT2++ = *pOutT1;
+              *pOutT1++ = Xchg;
+            }
+
+            /* Flag to indicate whether exchange is done or not */
+            flag = 1U;
+
+            /* Break after exchange is done */
+            break;
+          }
+        }
+      }
+
+
+      /* Update the status if the matrix is singular */
+      if ((flag != 1U) && (in == 0.0))
+      {
+        return ARM_MATH_SINGULAR;
+      }
+
+      /* Points to the pivot row of input and destination matrices */
+      pPivotRowIn = pIn + (l * numCols);
+      pPivotRowDst = pOut + (l * numCols);
+
+      /* Temporary pointers to the pivot row pointers */
+      pInT1 = pPivotRowIn;
+      pOutT1 = pPivotRowDst;
+
+      /* Pivot element of the row */
+      in = *(pIn + (l * numCols));
+
+      /* Loop over number of columns
+       * to the right of the pilot element */
+      for (j = 0U; j < (numCols - l); j++)
+      {
+        /* Divide each element of the row of the input matrix
+         * by the pivot element */
+        *pInT1 = *pInT1 / in;
+        pInT1++;
+      }
+      for (j = 0U; j < numCols; j++)
+      {
+        /* Divide each element of the row of the destination matrix
+         * by the pivot element */
+        *pOutT1 = *pOutT1 / in;
+        pOutT1++;
+      }
+
+      /* Replace the rows with the sum of that row and a multiple of row i
+       * so that each new element in column i above row i is zero.*/
+
+      /* Temporary pointers for input and destination matrices */
+      pInT1 = pIn;
+      pOutT1 = pOut;
+
+      for (i = 0U; i < numRows; i++)
+      {
+        /* Check for the pivot element */
+        if (i == l)
+        {
+          /* If the processing element is the pivot element,
+             only the columns to the right are to be processed */
+          pInT1 += numCols - l;
+          pOutT1 += numCols;
+        }
+        else
+        {
+          /* Element of the reference row */
+          in = *pInT1;
+
+          /* Working pointers for input and destination pivot rows */
+          pPRT_in = pPivotRowIn;
+          pPRT_pDst = pPivotRowDst;
+
+          /* Loop over the number of columns to the right of the pivot element,
+             to replace the elements in the input matrix */
+          for (j = 0U; j < (numCols - l); j++)
+          {
+            /* Replace the element by the sum of that row
+               and a multiple of the reference row  */
+            *pInT1 = *pInT1 - (in * *pPRT_in++);
+            pInT1++;
+          }
+
+          /* Loop over the number of columns to
+             replace the elements in the destination matrix */
+          for (j = 0U; j < numCols; j++)
+          {
+            /* Replace the element by the sum of that row
+               and a multiple of the reference row */
+            *pOutT1 = *pOutT1 - (in * *pPRT_pDst++);
+            pOutT1++;
+          }
+
+        }
+
+        /* Increment temporary input pointer */
+        pInT1 = pInT1 + l;
+      }
+
+      /* Increment the input pointer */
+      pIn++;
+
+      /* Decrement the loop counter */
+      loopCnt--;
+
+      /* Increment the index modifier */
+      l++;
+    }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+
+    /* NOTE(review): fallback check — 'pIn' is reset to the (in-place
+       transformed) source buffer; only if every element is zero is the
+       matrix reported singular here. */
+    if ((flag != 1U) && (in == 0.0))
+    {
+      pIn = pSrc->pData;
+      for (i = 0; i < numRows * numCols; i++)
+      {
+        if (pIn[i] != 0.0)
+          break;
+      }
+
+      if (i == numRows * numCols)
+        status = ARM_MATH_SINGULAR;
+    }
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+/**
+ @} end of MatrixInv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f32.c
new file mode 100644
index 0000000000..dfc854c3e9
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f32.c
@@ -0,0 +1,500 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_ldl_f32.c
+ * Description: Floating-point LDL decomposition
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+
+
+
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+
+/// @private
+/// Exchange rows i and j of the n x n matrix A, four float32 lanes at a
+/// time, using tail-predicated MVE loads/stores (vctp32q masks the lanes
+/// beyond the remaining count). 'n' must be in scope at the expansion site.
+#define SWAP_ROWS_F32(A,i,j)              \
+  {                                       \
+    int cnt = n;                          \
+                                          \
+    for(int w=0;w < n; w+=4)              \
+    {                                     \
+      f32x4_t tmpa,tmpb;                  \
+      mve_pred16_t p0 = vctp32q(cnt);     \
+                                          \
+      tmpa=vldrwq_z_f32(&A[i*n + w],p0);  \
+      tmpb=vldrwq_z_f32(&A[j*n + w],p0);  \
+                                          \
+      vstrwq_p(&A[i*n + w], tmpb, p0);    \
+      vstrwq_p(&A[j*n + w], tmpa, p0);    \
+                                          \
+      cnt -= 4;                           \
+    }                                     \
+  }
+
+/// @private
+/// Exchange columns i and j of the n x n matrix A (scalar, strided
+/// element-wise swap; column access is not contiguous so it is not vectorized).
+#define SWAP_COLS_F32(A,i,j)    \
+  for(int w=0;w < n; w++)       \
+  {                             \
+    float32_t tmp;              \
+    tmp = A[w*n + i];           \
+    A[w*n + i] = A[w*n + j];    \
+    A[w*n + j] = tmp;           \
+  }
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixChol
+ @{
+ */
+
+/**
+ * @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
+ * @param[in] pSrc points to the instance of the input floating-point matrix structure.
+ * @param[out] pl points to the instance of the output floating-point triangular matrix structure.
+ * @param[out] pd points to the instance of the output floating-point diagonal matrix structure.
+ * @param[out] pp points to the instance of the output floating-point permutation vector.
+ * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+ * @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ - \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
+ * @par
+ * Computes the LDL^t decomposition of a matrix A such that P A P^t = L D L^t.
+ */
+arm_status arm_mat_ldlt_f32(
+  const arm_matrix_instance_f32 * pSrc,
+  arm_matrix_instance_f32 * pl,
+  arm_matrix_instance_f32 * pd,
+  uint16_t * pp)
+{
+  /* LDL^t decomposition with symmetric pivoting: P A P^t = L D L^t.
+     NOTE(review): the vectorized update section of this vendored copy was
+     corrupted during import (text between '<' and '>' was stripped). The
+     trailing-submatrix update below is restored in plain C; re-sync with
+     upstream CMSIS-DSP arm_mat_ldlt_f32.c to recover the MVE kernel. */
+
+  arm_status status;                             /* status of the decomposition */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pl->numRows != pl->numCols) ||
+      (pd->numRows != pd->numCols) ||
+      (pl->numRows != pd->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+
+    const int n=pSrc->numRows;
+    int fullRank = 1, diag,k;
+    float32_t *pA;
+
+    /* The decomposition is computed in place on a copy of A held in pl */
+    memcpy(pl->pData,pSrc->pData,n*n*sizeof(float32_t));
+    pA = pl->pData;
+
+    /* Initialize the permutation vector pp = {0,1,2,...}
+       with tail-predicated MVE stores */
+    int cnt = n;
+    uint16x8_t vecP;
+
+    for(int k=0;k < n; k+=8)
+    {
+      mve_pred16_t p0;
+      p0 = vctp16q(cnt);
+
+      vecP = vidupq_u16((uint16_t)k, 1);
+
+      vstrhq_p(&pp[k], vecP, p0);
+
+      cnt -= 8;
+    }
+
+
+    for(k=0;k < n; k++)
+    {
+      /* Find pivot: largest remaining diagonal element */
+      float32_t m=F32_MIN,a;
+      int j=k;
+
+      for(int r=k;r < n;r++)
+      {
+        if (pA[r*n+r] > m)
+        {
+          m = pA[r*n+r];
+          j = r;
+        }
+      }
+
+      if(j != k)
+      {
+        SWAP_ROWS_F32(pA,k,j);
+        SWAP_COLS_F32(pA,k,j);
+      }
+
+
+      pp[k] = j;
+
+      a = pA[k*n+k];
+
+      /* A (near-)zero pivot means the matrix is rank deficient: stop here */
+      if (fabs(a) < 1.0e-8)
+      {
+
+        fullRank = 0;
+        break;
+      }
+
+      float32_t invA;
+
+      invA = 1.0f / a;
+
+      /* Trailing submatrix update: A[w][x] -= A[w][k] * A[x][k] / a
+         (restored scalar form of the destroyed vectorized loop) */
+      for(int w=k+1; w < n; w++)
+      {
+        for(int x=k+1; x < n; x++)
+        {
+          pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * pA[x*n+k] * invA;
+        }
+      }
+
+      /* Scale the pivot column to form L */
+      for(int w=k+1; w < n; w++)
+      {
+        pA[w*n+k] = pA[w*n+k] * invA;
+      }
+    }
+
+    /* 'diag' = number of valid pivots; on early exit zero out the
+       unprocessed columns of L with predicated MVE stores */
+    diag=k;
+    if (!fullRank)
+    {
+      diag--;
+      for(int row=0; row < n;row++)
+      {
+        mve_pred16_t p0;
+        int remaining = n - k;
+        f32x4_t zero=vdupq_n_f32(0.0f);
+
+        for(int col=k; col < n;col+=4)
+        {
+          p0 = vctp32q(remaining);
+
+          vstrwq_p(&pl->pData[row*n+col], zero, p0);
+
+          remaining -= 4;
+        }
+      }
+    }
+
+    /* Zero the strict upper triangle of L (predicated MVE stores) */
+    for(int row=0; row < n;row++)
+    {
+      mve_pred16_t p0;
+      int cnt= n-row-1;
+      f32x4_t zero=vdupq_n_f32(0.0f);
+
+      for(int col=row+1; col < n;col+=4)
+      {
+        p0 = vctp32q(cnt);
+
+        vstrwq_p(&pl->pData[row*n+col], zero, p0);
+
+        cnt -= 4;
+      }
+    }
+
+    /* Move the pivots into D and set a unit diagonal in L */
+    for(int d=0; d < diag;d++)
+    {
+      pd->pData[d*n+d] = pl->pData[d*n+d];
+      pl->pData[d*n+d] = 1.0;
+    }
+
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+
+  /* Return to application */
+  return (status);
+}
+#else
+
+/// @private
+/// Exchange rows i and j of the n x n matrix A (scalar element-wise swap;
+/// 'n' must be in scope at the expansion site).
+#define SWAP_ROWS_F32(A,i,j)    \
+  for(int w=0;w < n; w++)       \
+  {                             \
+    float32_t tmp;              \
+    tmp = A[i*n + w];           \
+    A[i*n + w] = A[j*n + w];    \
+    A[j*n + w] = tmp;           \
+  }
+
+/// @private
+/// Exchange columns i and j of the n x n matrix A (scalar element-wise swap).
+#define SWAP_COLS_F32(A,i,j)    \
+  for(int w=0;w < n; w++)       \
+  {                             \
+    float32_t tmp;              \
+    tmp = A[w*n + i];           \
+    A[w*n + i] = A[w*n + j];    \
+    A[w*n + j] = tmp;           \
+  }
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixChol
+ @{
+ */
+
+/**
+ * @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
+ * @param[in] pSrc points to the instance of the input floating-point matrix structure.
+ * @param[out] pl points to the instance of the output floating-point triangular matrix structure.
+ * @param[out] pd points to the instance of the output floating-point diagonal matrix structure.
+ * @param[out] pp points to the instance of the output floating-point permutation vector.
+ * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+ * @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ - \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
+ * @par
+ * Computes the LDL^t decomposition of a matrix A such that P A P^t = L D L^t.
+ */
+arm_status arm_mat_ldlt_f32(
+  const arm_matrix_instance_f32 * pSrc,
+  arm_matrix_instance_f32 * pl,
+  arm_matrix_instance_f32 * pd,
+  uint16_t * pp)
+{
+  /* LDL^t decomposition with symmetric pivoting: P A P^t = L D L^t.
+     NOTE(review): the pivot-search and elimination loops of this vendored
+     copy were corrupted during import (text between '<' and '>' was
+     stripped); they are restored below per upstream CMSIS-DSP. */
+
+  arm_status status;                             /* status of the decomposition */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pl->numRows != pl->numCols) ||
+      (pd->numRows != pd->numCols) ||
+      (pl->numRows != pd->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+
+    const int n=pSrc->numRows;
+    int fullRank = 1, diag,k;
+    float32_t *pA;
+
+    /* The decomposition is computed in place on a copy of A held in pl */
+    memcpy(pl->pData,pSrc->pData,n*n*sizeof(float32_t));
+    pA = pl->pData;
+
+    /* Identity permutation as the starting point */
+    for(int k=0;k < n; k++)
+    {
+      pp[k] = k;
+    }
+
+
+    for(k=0;k < n; k++)
+    {
+      /* Find pivot: largest remaining diagonal element */
+      float32_t m=F32_MIN,a;
+      int j=k;
+
+      for(int r=k;r < n;r++)
+      {
+        if (pA[r*n+r] > m)
+        {
+          m = pA[r*n+r];
+          j = r;
+        }
+      }
+
+      if(j != k)
+      {
+        SWAP_ROWS_F32(pA,k,j);
+        SWAP_COLS_F32(pA,k,j);
+      }
+
+
+      pp[k] = j;
+
+      a = pA[k*n+k];
+
+      /* A (near-)zero pivot means the matrix is rank deficient: stop here */
+      if (fabs(a) < 1.0e-8)
+      {
+
+        fullRank = 0;
+        break;
+      }
+
+      /* Trailing submatrix update: A[w][x] -= A[w][k] * A[x][k] / a */
+      for(int w=k+1;w < n;w++)
+      {
+        for(int x=k+1;x < n;x++)
+        {
+          pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * pA[x*n+k] / a;
+        }
+      }
+
+      /* Scale the pivot column to form L */
+      for(int w=k+1;w < n;w++)
+      {
+        pA[w*n+k] = pA[w*n+k] / a;
+      }
+    }
+
+    /* 'diag' = number of valid pivots; on early exit zero out the
+       unprocessed columns of L */
+    diag=k;
+    if (!fullRank)
+    {
+      diag--;
+      for(int row=0; row < n;row++)
+      {
+        for(int col=k; col < n;col++)
+        {
+          pl->pData[row*n+col]=0.0;
+        }
+      }
+    }
+
+    /* Zero the strict upper triangle of L */
+    for(int row=0; row < n;row++)
+    {
+      for(int col=row+1; col < n;col++)
+      {
+        pl->pData[row*n+col] = 0.0;
+      }
+    }
+
+    /* Move the pivots into D and set a unit diagonal in L */
+    for(int d=0; d < diag;d++)
+    {
+      pd->pData[d*n+d] = pl->pData[d*n+d];
+      pl->pData[d*n+d] = 1.0;
+    }
+
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixChol group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f64.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f64.c
new file mode 100644
index 0000000000..1daf35993f
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_ldlt_f64.c
@@ -0,0 +1,208 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_ldl_f64.c
+ * Description: Floating-point LDL decomposition
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+#include <math.h>
+
+
+
+/// @private
+/// Exchange rows i and j of the n x n matrix A (scalar element-wise swap;
+/// 'n' must be in scope at the expansion site).
+#define SWAP_ROWS_F64(A,i,j)    \
+  for(int w=0;w < n; w++)       \
+  {                             \
+    float64_t tmp;              \
+    tmp = A[i*n + w];           \
+    A[i*n + w] = A[j*n + w];    \
+    A[j*n + w] = tmp;           \
+  }
+/// @private
+/// Exchange columns i and j of the n x n matrix A (scalar element-wise swap).
+#define SWAP_COLS_F64(A,i,j)    \
+  for(int w=0;w < n; w++)       \
+  {                             \
+    float64_t tmp;              \
+    tmp = A[w*n + i];           \
+    A[w*n + i] = A[w*n + j];    \
+    A[w*n + j] = tmp;           \
+  }
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixChol
+ @{
+ */
+
+/**
+ * @brief Floating-point LDL^t decomposition of positive semi-definite matrix.
+ * @param[in] pSrc points to the instance of the input floating-point matrix structure.
+ * @param[out] pl points to the instance of the output floating-point triangular matrix structure.
+ * @param[out] pd points to the instance of the output floating-point diagonal matrix structure.
+ * @param[out] pp points to the instance of the output floating-point permutation vector.
+ * @return The function returns ARM_MATH_SIZE_MISMATCH, if the dimensions do not match.
+ * @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ - \ref ARM_MATH_DECOMPOSITION_FAILURE : Input matrix cannot be decomposed
+ * @par
+ * Computes the LDL^t decomposition of a matrix A such that P A P^t = L D L^t.
+ */
+
+arm_status arm_mat_ldlt_f64(
+  const arm_matrix_instance_f64 * pSrc,
+  arm_matrix_instance_f64 * pl,
+  arm_matrix_instance_f64 * pd,
+  uint16_t * pp)
+{
+  /* LDL^t decomposition with symmetric pivoting: P A P^t = L D L^t.
+     NOTE(review): the pivot-search and elimination loops of this vendored
+     copy were corrupted during import (text between '<' and '>' was
+     stripped); they are restored below per upstream CMSIS-DSP. */
+
+  arm_status status;                             /* status of the decomposition */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pSrc->numCols) ||
+      (pl->numRows != pl->numCols) ||
+      (pd->numRows != pd->numCols) ||
+      (pl->numRows != pd->numRows)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+
+    const int n=pSrc->numRows;
+    int fullRank = 1, diag,k;
+    float64_t *pA;
+
+    /* The decomposition is computed in place on a copy of A held in pl */
+    memcpy(pl->pData,pSrc->pData,n*n*sizeof(float64_t));
+    pA = pl->pData;
+
+    /* Identity permutation as the starting point */
+    for(int k=0;k < n; k++)
+    {
+      pp[k] = k;
+    }
+
+
+    for(k=0;k < n; k++)
+    {
+      /* Find pivot: largest remaining diagonal element */
+      float64_t m=F64_MIN,a;
+      int j=k;
+
+      for(int r=k;r < n;r++)
+      {
+        if (pA[r*n+r] > m)
+        {
+          m = pA[r*n+r];
+          j = r;
+        }
+      }
+
+      if(j != k)
+      {
+        SWAP_ROWS_F64(pA,k,j);
+        SWAP_COLS_F64(pA,k,j);
+      }
+
+
+      pp[k] = j;
+
+      a = pA[k*n+k];
+
+      /* A (near-)zero pivot means the matrix is rank deficient: stop here */
+      if (fabs(a) < 1.0e-18)
+      {
+
+        fullRank = 0;
+        break;
+      }
+
+      /* Trailing submatrix update: A[w][x] -= A[w][k] * A[x][k] / a */
+      for(int w=k+1;w < n;w++)
+      {
+        for(int x=k+1;x < n;x++)
+        {
+          pA[w*n+x] = pA[w*n+x] - pA[w*n+k] * pA[x*n+k] / a;
+        }
+      }
+
+      /* Scale the pivot column to form L */
+      for(int w=k+1;w < n;w++)
+      {
+        pA[w*n+k] = pA[w*n+k] / a;
+      }
+    }
+
+    /* 'diag' = number of valid pivots; on early exit zero out the
+       unprocessed columns of L */
+    diag=k;
+    if (!fullRank)
+    {
+      diag--;
+      for(int row=0; row < n;row++)
+      {
+        for(int col=k; col < n;col++)
+        {
+          pl->pData[row*n+col]=0.0;
+        }
+      }
+    }
+
+    /* Zero the strict upper triangle of L */
+    for(int row=0; row < n;row++)
+    {
+      for(int col=row+1; col < n;col++)
+      {
+        pl->pData[row*n+col] = 0.0;
+      }
+    }
+
+    /* Move the pivots into D and set a unit diagonal in L */
+    for(int d=0; d < diag;d++)
+    {
+      pd->pData[d*n+d] = pl->pData[d*n+d];
+      pl->pData[d*n+d] = 1.0;
+    }
+
+    status = ARM_MATH_SUCCESS;
+
+  }
+
+
+  /* Return to application */
+  return (status);
+}
+
+/**
+ @} end of MatrixChol group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f16.c
new file mode 100644
index 0000000000..08da932979
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f16.c
@@ -0,0 +1,763 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_mult_f16.c
+ * Description: Floating-point matrix multiplication
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ * @ingroup groupMatrix
+ */
+
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/**
+ * @brief 2x2 specialization of the f16 matrix multiply.
+ *        Gather loads broadcast the A column elements and duplicate the
+ *        B rows so both output rows are computed in one vector.
+ * @param[in]  pSrcA points to the first input matrix (2x2)
+ * @param[in]  pSrcB points to the second input matrix (2x2)
+ * @param[out] pDst  points to the output matrix (2x2)
+ * @return ARM_MATH_SUCCESS (dispatched only for square 2x2 operands)
+ */
+__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_2x2_mve(
+ const arm_matrix_instance_f16 *pSrcA,
+ const arm_matrix_instance_f16 *pSrcB,
+ arm_matrix_instance_f16 *pDst)
+{
+ /* offsetA duplicates each element of column 0 of A: {a00 a00 a10 a10 ...} */
+ static const uint16_t offsetA[8] = { 0, 0, 2, 2, 0, 0, 2, 2 };
+ /* offsetB allows to read and duplicate 1 row of B */
+ static const uint16_t offsetB[8] = { 0, 1, 0, 1, 0, 1, 0, 1 };
+ uint16x8_t vecOffsA, vecOffsB;
+ f16x8_t vecInA, vecInB, vecDst;
+ float16_t *pOut = pDst->pData; /* output data matrix pointer */
+
+ /*
+ * load initial offsets
+ */
+ vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
+ vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
+ /*
+ * load {a00 a00 a10 a10 x x x x }
+ */
+ vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+ /*
+ * load {b00 b01 b00 b01 x x x x }
+ */
+ vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+ /*
+ * { a00 b00 a00 b01
+ * a10 b00 a10 b01
+ * x x
+ * x x }
+ */
+ vecDst = vmulq(vecInA, vecInB);
+ /*
+ * move to 2nd column of matrix A
+ */
+ vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
+ /*
+ * load {a01 a01 a11 a11 x x x x}
+ */
+ vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+ /*
+ * move to next B row
+ */
+ vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 2);
+ /*
+ * load {b10, b11, b10, b11, x x x x }
+ */
+ vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+ /*
+ * { a00 b00 + a01 b10 a00 b01 + a01 b11
+ * a10 b00 + a11 b10 a10 b01 + a11 b11
+ * x x
+ * x x }
+ */
+ vecDst = vfmaq(vecDst, vecInA, vecInB);
+
+ /* predicate the store to the 4 valid result lanes only */
+ mve_pred16_t p0 = vctp16q(2*2);
+ /*
+ * Store the result in the destination buffer
+ * (lower half of the vector)
+ */
+ vstrhq_p(pOut, vecDst, p0);
+
+ return (ARM_MATH_SUCCESS);
+}
+
+
+
+
+/**
+ * @brief 3x3 specialization of the f16 matrix multiply.
+ *        8 of the 9 results are computed with vector FMAs (the offset
+ *        tables wrap so lanes 6-7 carry c20/c21); the final element c22
+ *        is finished in scalar code below.
+ * @param[in]  pSrcA points to the first input matrix (3x3)
+ * @param[in]  pSrcB points to the second input matrix (3x3)
+ * @param[out] pDst  points to the output matrix (3x3)
+ * @return ARM_MATH_SUCCESS (dispatched only for square 3x3 operands)
+ */
+__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_3x3_mve(
+ const arm_matrix_instance_f16 *pSrcA,
+ const arm_matrix_instance_f16 *pSrcB,
+ arm_matrix_instance_f16 *pDst)
+{
+ static const uint16_t offsetA[8] = { 0, 0, 0, 3, 3, 3, 6, 6 };
+ /* offsetB allows to read and duplicate 1 row of B */
+ static const uint16_t offsetB[8] = { 0, 1, 2, 0, 1, 2, 0, 1 };
+ uint16x8_t vecOffsA, vecOffsB;
+ f16x8_t vecInA, vecInB, vecDst;
+ float16_t *pOut = pDst->pData; /* output data matrix pointer */
+
+ /*
+ * load initial offsets
+ */
+ vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
+ vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
+
+ /*
+ * load {a00 a00 a00 a10 a10 a10 a20 a20}
+ */
+ vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+ /*
+ * load {b00 b01 b02 b00 b01 b02 b00 b01}
+ */
+ vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+ /*
+ * { a00 b00 a00 b01 a00 b02
+ * a10 b00 a10 b01 a10 b02
+ * a20 b00 a20 b01}
+ */
+ vecDst = vmulq(vecInA, vecInB);
+
+ /*
+ * move to 2nd column of matrix A
+ */
+ vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
+ /*
+ * load {a01 a01 a01 a11 a11 a11 a21 a21}
+ */
+ vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+ /*
+ * move to next B row
+ */
+ vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
+ /*
+ * load {b10, b11, b12, b10, b11, b12, b10, b11}
+ */
+ vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+ /*
+ * { a00 b00 + a01 b10 a00 b01 + a01 b11 a00 b02 + a01 b12
+ * a10 b00 + a11 b10 a10 b01 + a11 b11 a10 b02 + a11 b12
+ * a20 b00 + a21 b10 a20 b01 + a21 b11 }
+ */
+ vecDst = vfmaq(vecDst, vecInA, vecInB);
+ /*
+ * move to 3rd column of matrix A
+ */
+ vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 1);
+ /*
+ * load {a02 a02 a02 a12 a12 a12 a22 a22}
+ */
+ vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+ /*
+ * move to next B row
+ */
+ vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 3);
+ /*
+ * load {b20, b21, b22, b20, b21, b22, b20, b21}
+ */
+ vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+ /*
+ * {a00 b00 + a01 b10 + a02 b20 a00 b01 + a01 b11 + a02 b21 a00 b02 + a01 b12 + a02 b22},
+ * a10 b00 + a11 b10 + a12 b20 a10 b01 + a11 b11 + a12 b21 a10 b02 + a11 b12 + a12 b22},
+ * a20 b00 + a21 b10 + a22 b20 a20 b01 + a21 b11 + a22 b21 }
+ */
+ vecDst = vfmaq(vecDst, vecInA, vecInB);
+
+ /*
+ * Store the result in the destination buffer
+ * (first 8 of the 9 outputs)
+ */
+ vst1q(pOut, vecDst); pOut += 8;
+
+ /* last element computed in scalar mode
+ * a20 b02 + a21 b12 + a22 b22
+ */
+ _Float16 * pA = (_Float16 *)pSrcA->pData;
+ _Float16 * pB = (_Float16 *)pSrcB->pData;
+ *pOut = pA[2*3] * pB[2] + pA[2*3+1] * pB[3+2] + pA[2*3+2] * pB[2*3+2];
+
+ return (ARM_MATH_SUCCESS);
+}
+
+
+
+
+
+/**
+ * @brief 4x4 specialization of the f16 matrix multiply.
+ *        Two accumulator vectors hold the top two and bottom two output
+ *        rows; each iteration broadcasts one A column pair and one
+ *        duplicated B row via gather loads.
+ * @param[in]  pSrcA points to the first input matrix (4x4)
+ * @param[in]  pSrcB points to the second input matrix (4x4)
+ * @param[out] pDst  points to the output matrix (4x4)
+ * @return ARM_MATH_SUCCESS (dispatched only for square 4x4 operands)
+ */
+__STATIC_FORCEINLINE arm_status arm_mat_mult_f16_4x4_mve(
+ const arm_matrix_instance_f16 *pSrcA,
+ const arm_matrix_instance_f16 *pSrcB,
+ arm_matrix_instance_f16 *pDst)
+{
+ /* offsetA allows to read and duplicate 2 successive column elements of A */
+ static const uint16_t offsetA[8] = { 0, 0, 0, 0, 4, 4, 4, 4 };
+ /* offsetB allows to read and duplicate 1 row of B */
+ static const uint16_t offsetB[8] = { 0, 1, 2, 3, 0, 1, 2, 3 };
+ uint16x8_t vecOffsA, vecOffsB;
+ f16x8_t vecInA, vecInB, vecDst0, vecDst1;
+ float16_t *pOut = pDst->pData; /* output data matrix pointer */
+
+ /*
+ * load initial offsets
+ */
+ vecOffsA = vldrhq_u16((uint16_t const *) offsetA);
+ vecOffsB = vldrhq_u16((uint16_t const *) offsetB);
+
+ /*
+ * load {a00 a00 a00 a00 a10 a10 a10 a10}
+ */
+ vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+ /*
+ * load {b00 b01 b02 b03 b00 b01 b02 b03}
+ */
+ vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+ /*
+ * { a00 b00 a00 b01 a00 b02 a00 b03
+ * a10 b00 a10 b01 a10 b02 a10 b03 }
+ */
+ vecDst0 = vmulq(vecInA, vecInB);
+ /*
+ * jump 2 x A rows (2nd half of matrix)
+ */
+ vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
+ /*
+ * load {a20 a20 a20 a20 a30 a30 a30 a30}
+ */
+ vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+ /*
+ * { a20 b00 a20 b01 a20 b02 a20 b03
+ * a30 b00 a30 b01 a30 b02 a30 b03 }
+ */
+ vecDst1 = vmulq(vecInA, vecInB);
+ /*
+ * rewind back to top half of the A matrix (2nd column)
+ */
+ vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
+ /*
+ * load {a01 a01 a01 a01 a11 a11 a11 a11}
+ */
+ vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+ /*
+ * move to next B row
+ */
+ vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
+ /*
+ * load {b10, b11, b12, b13, b10, b11, b12, b13}
+ */
+ vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+ /*
+ * { a00 b00 + a01 b10 a00 b01 + a01 b11 a00 b02 + a01 b12 a00 b03 + a01 b13
+ * a10 b00 + a11 b10 a10 b01 + a11 b11 a10 b02 + a11 b12 a10 b03 + a11 b13 }
+ */
+ vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
+ /*
+ * jump 2 x A rows (2nd half of matrix)
+ */
+ vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
+ /*
+ * load {a21 a21 a21 a21 a31 a31 a31 a31}
+ */
+ vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+ /*
+ * {a20 b00 + a21 b10 a20 b01 + a21 b11 a20 b02 + a21 b12 a20 b03 + a21 b13
+ * a30 b00 + a31 b10 a30 b01 + a31 b11 a30 b02 + a31 b12 a30 b03 + a31 b13 }
+ */
+ vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
+
+ /*
+ * rewind back to top half of the A matrix (3rd column)
+ */
+ vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
+ /*
+ * load {a02 a02 a02 a02 a12 a12 a12 a12}
+ */
+ vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+ /*
+ * move to next B row
+ */
+ vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
+ /*
+ * load {b20, b21, b22, b23, b20, b21, b22, b23}
+ */
+ vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+ /*
+ * { a00 b00 + a01 b10 + a02 b20 a00 b01 + a01 b11 + a02 b21 a00 b02 + a01 b12 + a02 b22 a00 b03 + a01 b13 + a02 b23
+ * a10 b00 + a11 b10 + a12 b20 a10 b01 + a11 b11 + a12 b21 a10 b02 + a11 b12 + a12 b22 a10 b03 + a11 b13 + a12 b23 }
+ */
+ vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
+ /*
+ * jump 2 x A rows
+ */
+ vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
+
+ /*
+ * load {a22 a22 a22 a22 a32 a32 a32 a32}
+ */
+ vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+ /*
+ * {a20 b00 + a21 b10 + a22 b20 a20 b01 + a21 b11 + a22 b21 a20 b02 + a21 b12 + a22 b22 a20 b03 + a21 b13 + a22 b23
+ * a30 b00 + a31 b10 + a32 b20 a30 b01 + a31 b11 + a32 b21 a30 b02 + a31 b12 + a32 b22 a30 b03 + a31 b13 + a32 b23 }
+ */
+ vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
+
+ /*
+ * rewind back to top half of the A matrix (4th column)
+ */
+ vecOffsA = vsubq(vecOffsA, (uint16_t) 7);
+ /*
+ * load {a03 a03 a03 a03 a13 a13 a13 a13}
+ */
+ vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+ /*
+ * move to next B row
+ */
+ vecOffsB = vaddq_n_u16(vecOffsB, (uint16_t) 4);
+ /*
+ * load {b30, b31, b32, b33, b30, b31, b32, b33}
+ */
+ vecInB = vldrhq_gather_shifted_offset((float16_t const *) pSrcB->pData, vecOffsB);
+ /*
+ * { a00 b00 +...+ a03 b30, a00 b01 +...+ a03 b31, a00 b02 +...+ a03 b32, a00 b03 +...+ a03 b33
+ * a10 b00 +...+ a13 b30, a10 b01 +...+ a13 b31, a10 b02 +...+ a13 b32, a10 b03 +...+ a13 b33 }
+ */
+ vecDst0 = vfmaq(vecDst0, vecInA, vecInB);
+ /*
+ * jump 2 x A rows
+ */
+ vecOffsA = vaddq_n_u16(vecOffsA, (uint16_t) 8);
+ /*
+ * load {a23 a23 a23 a23 a33 a33 a33 a33}
+ */
+ vecInA = vldrhq_gather_shifted_offset((float16_t const *) pSrcA->pData, vecOffsA);
+ /*
+ * {a20 b00 +...+ a23 b30, a20 b01 +...+ a23 b31, a20 b02 +...+ a23 b32, a20 b03 +...+ a23 b33
+ * a30 b00 +...+ a33 b30, a30 b01 +...+ a33 b31, a30 b02 +...+ a33 b32, a30 b03 +...+ a33 b33 }
+ */
+ vecDst1 = vfmaq(vecDst1, vecInA, vecInB);
+
+ /*
+ * Store the result in the destination buffer
+ */
+ vst1q(pOut, vecDst0); pOut += 8;
+ vst1q(pOut, vecDst1);
+
+ return (ARM_MATH_SUCCESS);
+}
+
+
+/**
+ * @brief MVE implementation of the floating-point f16 matrix multiply.
+ *        Small square matrices (2x2/3x3/4x4) are dispatched to dedicated
+ *        kernels; the general case processes 4 output rows at a time in
+ *        8-column vector strips, with predicated tails for both dimensions.
+ * @param[in]  pSrcA points to the first input matrix structure
+ * @param[in]  pSrcB points to the second input matrix structure
+ * @param[out] pDst  points to output matrix structure
+ * @return ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS
+ */
+arm_status arm_mat_mult_f16(
+ const arm_matrix_instance_f16 * pSrcA,
+ const arm_matrix_instance_f16 * pSrcB,
+ arm_matrix_instance_f16 * pDst)
+{
+ float16_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float16_t *pOut = pDst->pData; /* output data matrix pointer */
+ int numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ int numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ int numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+ uint32_t blkCnt; /* loop counters */
+ int i;
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ return(ARM_MATH_SIZE_MISMATCH);
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+{
+ /* small squared matrix specialized routines */
+ if(numRowsA == numColsB && numColsB == numColsA) {
+ if(numRowsA == 2)
+ return arm_mat_mult_f16_2x2_mve(pSrcA, pSrcB, pDst);
+ else if(numRowsA == 3)
+ return arm_mat_mult_f16_3x3_mve(pSrcA, pSrcB, pDst);
+ else if(numRowsA == 4)
+ return arm_mat_mult_f16_4x4_mve(pSrcA, pSrcB, pDst);
+ }
+
+ /* main loop process 4 rows */
+ i = numRowsA / 4;
+ while(i > 0)
+ {
+ float16_t *pInA0, *pInA1, *pInA2, *pInA3;
+ float16_t *pInB0;
+ float16_t *pOut0, *pOut1, *pOut2, *pOut3;
+ f16x8_t vecMac0, vecMac1, vecMac2, vecMac3;
+ f16x8_t vecInB;
+
+ /* pointers to 4 consecutive output rows */
+ pOut0 = pOut;
+ pOut1 = pOut0 + numColsB;
+ pOut2 = pOut1 + numColsB;
+ pOut3 = pOut2 + numColsB;
+ pInB0 = pInB;
+
+ /* full 8-column strips of B */
+ int k = numColsB >> 3;
+ while(k > 0)
+ {
+ /* pointers to 4 consecutive Matrix A rows */
+ pInA0 = pInA;
+ pInA1 = pInA0 + numColsA;
+ pInA2 = pInA1 + numColsA;
+ pInA3 = pInA2 + numColsA;
+
+ vecMac0 = vdupq_n_f16(0.0f16);
+ vecMac1 = vdupq_n_f16(0.0f16);
+ vecMac2 = vdupq_n_f16(0.0f16);
+ vecMac3 = vdupq_n_f16(0.0f16);
+
+ blkCnt = numColsA;
+
+ while (blkCnt > 0U)
+ {
+ /*
+ * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3..., bi,4n+7}
+ */
+ vecInB = *(f16x8_t *)pInB0; /* vldrhq_f16(pInB0, 0); */
+
+ vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+ vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+ vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+ vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
+
+ /* step down to the same strip of the next B row */
+ pInB0 = pInB0 + numColsB;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ /* Store the results (4 x 8 block) in the destination buffer */
+ vst1q(pOut0, vecMac0); pOut0 += 8;
+ vst1q(pOut1, vecMac1); pOut1 += 8;
+ vst1q(pOut2, vecMac2); pOut2 += 8;
+ vst1q(pOut3, vecMac3); pOut3 += 8;
+ /*
+ * rewind to the top of B, advanced to the next 8-column strip
+ */
+ pInB0 -= (numColsB * numColsA) - 8;
+ k--;
+ }
+
+ /* predicated tail for the remaining B columns (numColsB % 8) */
+ int colBLeft = numColsB & 7;
+ if (colBLeft)
+ {
+ pInA0 = pInA;
+ pInA1 = pInA0 + numColsA;
+ pInA2 = pInA1 + numColsA;
+ pInA3 = pInA2 + numColsA;
+ mve_pred16_t p0 = vctp16q(colBLeft);
+
+ vecMac0 = vdupq_n_f16(0.0f16);
+ vecMac1 = vdupq_n_f16(0.0f16);
+ vecMac2 = vdupq_n_f16(0.0f16);
+ vecMac3 = vdupq_n_f16(0.0f16);
+
+ blkCnt = numColsA;
+
+ while (blkCnt > 0U)
+ {
+ /*
+ * load {bi,4n+0, bi,4n+1, bi,4n+2, ..bi,4n+colBLeft-1, 0, ..}
+ */
+ vecInB = vldrhq_z_f16(pInB0, p0);
+
+ vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+ vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+ vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+ vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
+
+ pInB0 = pInB0 + numColsB;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ /* Store the results (4 x colBLeft block) in the destination buffer */
+ vstrhq_p_f16(pOut0, vecMac0, p0);
+ vstrhq_p_f16(pOut1, vecMac1, p0);
+ vstrhq_p_f16(pOut2, vecMac2, p0);
+ vstrhq_p_f16(pOut3, vecMac3, p0);
+ }
+
+ /* move to the next block of 4 A rows / 4 output rows */
+ pInA += 4 * numColsA;
+ pOut += 4 * numColsB;
+ i--;
+ }
+
+ /*
+ * non multiple of 4 rows for Matrix A
+ * process single row
+ */
+ if (numRowsA & 3)
+ {
+ i = numRowsA & 3;
+ do
+ {
+ float16_t *pInA0;
+ float16_t *pInB0;
+ float16_t *pOut0;
+ f16x8_t vecInB;
+ f16x8_t vecMac0;
+
+ pOut0 = pOut;
+ pInB0 = pInB;
+
+ int k = numColsB >> 3;
+ while(k > 0)
+ {
+ pInA0 = pInA;
+
+ vecMac0 = vdupq_n_f16(0.0f16);
+ blkCnt = numColsA;
+
+ while (blkCnt > 0U)
+ {
+ /*
+ * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3, ...bi,4n+7}
+ */
+ vecInB = *(f16x8_t *)pInB0; /* vldrhq_f16(pInB0, 0); */
+
+ vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+
+ pInB0 = pInB0 + numColsB;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /* Store the results (1 x 8 block) in the destination buffer */
+ vst1q(pOut0, vecMac0); pOut0 += 8;
+ /*
+ * rewind
+ */
+ pInB0 -= (numColsB * numColsA) - 8;
+ k--;
+ }
+
+ int colBLeft = numColsB & 7;
+ if (colBLeft)
+ {
+ pInA0 = pInA;
+ mve_pred16_t p0 = vctp16q(colBLeft);
+
+ vecMac0 = vdupq_n_f16(0.0f16);
+ blkCnt = numColsA;
+
+ while (blkCnt > 0U)
+ {
+ /*
+ * load {bi,4n+0, bi,4n+1, bi,4n+2, ..., bi,4n+colBLeft, 0, ...}
+ */
+ vecInB = vldrhq_z_f16(pInB0, p0);
+
+ vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+
+ pInB0 = pInB0 + numColsB;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /* Store the results (1 x colBLeft block) in the destination buffer */
+ vstrhq_p_f16(pOut0, vecMac0, p0);
+ }
+
+ pInA += 1 * numColsA;
+ pOut += 1 * numColsB;
+ }
+ while (--i);
+ }
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+ }
+}
+#else
+
+
+/**
+ * @brief Scalar (non-MVE) implementation of the f16 matrix multiply.
+ *        Classic triple loop: for each output element, a dot product of
+ *        one A row with one B column, with optional 4-way MAC unrolling.
+ * @param[in]  pSrcA points to the first input matrix structure
+ * @param[in]  pSrcB points to the second input matrix structure
+ * @param[out] pDst  points to output matrix structure
+ * @return ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS
+ */
+arm_status arm_mat_mult_f16(
+ const arm_matrix_instance_f16 * pSrcA,
+ const arm_matrix_instance_f16 * pSrcB,
+ arm_matrix_instance_f16 * pDst)
+{
+ float16_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
+ float16_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
+ float16_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
+ float16_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
+ float16_t *pOut = pDst->pData; /* Output data matrix pointer */
+ float16_t *px; /* Temporary output data matrix pointer */
+ _Float16 sum; /* Accumulator */
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
+ arm_status status; /* Status of matrix multiplication */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* Output pointer is set to starting address of row being processed */
+ px = pOut + i;
+
+ /* For every row wise process, column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum = 0.0f16;
+
+ /* Initialize pointer pIn1 to point to starting address of column being processed */
+ pIn1 = pInA;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 MACs at a time. */
+ colCnt = numColsA >> 2U;
+
+ /* matrix multiplication */
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ /* Perform the multiply-accumulates; pIn2 walks down column of B */
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining MACs */
+ colCnt = numColsA % 0x4U;
+
+#else
+
+ /* Initialize cntCnt with number of columns */
+ colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ /* Perform the multiply-accumulates */
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Store result in destination buffer */
+ *px++ = sum;
+
+ /* Decrement column loop counter */
+ col--;
+
+ /* Update pointer pIn2 to point to starting address of next column */
+ pIn2 = pInB + (numColsB - col);
+
+ } while (col > 0U);
+
+ /* Update pointer pInA to point to starting address of next row */
+ i = i + numColsB;
+ pInA = pInA + numColsA;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U);
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixMult group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f32.c
new file mode 100644
index 0000000000..a2bb15c61e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f32.c
@@ -0,0 +1,983 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_mult_f32.c
+ * Description: Floating-point matrix multiplication
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @defgroup MatrixMult Matrix Multiplication
+ *
+ * Multiplies two matrices.
+ *
+ * \image html MatrixMultiplication.gif "Multiplication of two 3 x 3 matrices"
+
+ * Matrix multiplication is only defined if the number of columns of the
+ * first matrix equals the number of rows of the second matrix.
+ * Multiplying an M x N matrix with an N x P matrix results
+ * in an M x P matrix.
+ * When matrix size checking is enabled, the functions check: (1) that the inner dimensions of
+ * pSrcA and pSrcB are equal; and (2) that the size of the output
+ * matrix equals the outer dimensions of pSrcA and pSrcB.
+ */
+
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define MATRIX_DIM3 3
+#define MATRIX_DIM4 4
+
+/**
+ * @brief 2x2 specialization of the f32 matrix multiply.
+ *        One gather per A column (elements duplicated) and one per B row
+ *        (row duplicated) produce all four results in a single vector.
+ * @param[in]  pSrcA points to the first input matrix (2x2)
+ * @param[in]  pSrcB points to the second input matrix (2x2)
+ * @param[out] pDst  points to the output matrix (2x2)
+ * @return ARM_MATH_SUCCESS (dispatched only for square 2x2 operands)
+ */
+__STATIC_INLINE arm_status arm_mat_mult_f32_2x2_mve(
+ const arm_matrix_instance_f32 *pSrcA,
+ const arm_matrix_instance_f32 *pSrcB,
+ arm_matrix_instance_f32 *pDst)
+{
+ /* {a00, a00, a10, a10} */
+ static const uint32_t offsetA0[4] = { 0, 0, 2, 2 };
+ /* {b00, b01, b00, b01} */
+ static const uint32_t offsetB0[4] = { 0, 1, 0, 1 };
+ /* {a01, a01, a11, a11} */
+ static const uint32_t offsetA1[4] = { 1, 1, 3, 3 };
+ /* {b10, b11, b10, b11} */
+ static const uint32_t offsetB1[4] = { 2, 3, 2, 3 };
+
+ uint32x4_t vecOffsA, vecOffsB;
+ f32x4_t vecInA, vecInB, vecDst;
+
+ vecOffsA = vldrwq_u32((uint32_t const *) offsetA0);
+ vecOffsB = vldrwq_u32((uint32_t const *) offsetB0);
+
+ vecInA = vldrwq_gather_shifted_offset((float32_t const *) pSrcA->pData, vecOffsA);
+ vecInB = vldrwq_gather_shifted_offset((float32_t const *) pSrcB->pData, vecOffsB);
+
+ /* {a00 b00, a00 b01, a10 b00, a10 b01} */
+ vecDst = vmulq(vecInA, vecInB);
+
+ vecOffsA = vldrwq_u32((uint32_t const *) offsetA1);
+ vecOffsB = vldrwq_u32((uint32_t const *) offsetB1);
+
+ vecInA = vldrwq_gather_shifted_offset((float32_t const *) pSrcA->pData, vecOffsA);
+ vecInB = vldrwq_gather_shifted_offset((float32_t const *) pSrcB->pData, vecOffsB);
+
+ /* accumulate the second-column / second-row products */
+ vecDst = vfmaq(vecDst, vecInA, vecInB);
+
+ vstrwq_f32(pDst->pData, vecDst);
+
+ return (ARM_MATH_SUCCESS);
+
+}
+
+
+/*
+ * A = {{a00, a01, a02},
+ * {a10, a11, a12},
+ * {a20, a21, a22}}
+ * B = {{b00, b01, b02},
+ * {b10, b11, b12},
+ * {b20, b21, b22}}
+ *
+ * Dst = {{a00 b00 + a01 b10 + a02 b20, a00 b01 + a01 b11 + a02 b21, a00 b02 + a01 b12 + a02 b22},
+ * {a10 b00 + a11 b10 + a12 b20, a10 b01 + a11 b11 + a12 b21, a10 b02 + a11 b12 + a12 b22},
+ * {a20 b00 + a21 b10 + a22 b20, a20 b01 + a21 b11 + a22 b21, a20 b02 + a21 b12 + a22 b22}}
+ */
+/**
+ * @brief 3x3 specialization of the f32 matrix multiply.
+ *        Each B row is loaded as a predicated 3-lane vector and scaled by
+ *        the matching scalar A elements of all three output rows.
+ * @param[in]  pSrcA points to the first input matrix (3x3)
+ * @param[in]  pSrcB points to the second input matrix (3x3)
+ * @param[out] pDst  points to the output matrix (3x3)
+ * @return ARM_MATH_SUCCESS (dispatched only for square 3x3 operands)
+ */
+__STATIC_INLINE arm_status arm_mat_mult_f32_3x3_mve(
+ const arm_matrix_instance_f32 *pSrcA,
+ const arm_matrix_instance_f32 *pSrcB,
+ arm_matrix_instance_f32 *pDst)
+{
+ float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ float32_t *pInA0, *pInA1, *pInA2;
+ f32x4_t vecMac0, vecMac1, vecMac2;
+ f32x4_t vecInB;
+ float32_t const *pSrBVec;
+
+ pSrBVec = (float32_t const *) pInB;
+
+ pInA0 = pInA;
+ pInA1 = pInA0 + MATRIX_DIM3;
+ pInA2 = pInA1 + MATRIX_DIM3;
+ /* enable predication to disable last (4th) vector element */
+ mve_pred16_t p0 = vctp32q(MATRIX_DIM3);
+
+ /*
+ * load {b0,0, b0,1, b0,2, 0}
+ */
+ vecInB = vldrwq_z_f32(pSrBVec, p0);
+ pSrBVec += MATRIX_DIM3;
+
+ vecMac0 = vmulq(vecInB, *pInA0++);
+ vecMac1 = vmulq(vecInB, *pInA1++);
+ vecMac2 = vmulq(vecInB, *pInA2++);
+ /*
+ * load {b1,0, b1,1, b1,2, 0}
+ */
+ vecInB = vldrwq_z_f32(pSrBVec, p0);
+ pSrBVec += MATRIX_DIM3;
+
+ vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+ vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+ vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+ /*
+ * load {b2,0, b2,1 , b2,2, 0}
+ */
+ vecInB = vldrwq_z_f32(pSrBVec, p0);
+ pSrBVec += MATRIX_DIM3;
+
+ vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+ vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+ vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+
+ /* partial vector stores */
+ vstrwq_p_f32(pOut, vecMac0, p0);
+ pOut += MATRIX_DIM3;
+ vstrwq_p_f32(pOut, vecMac1, p0);
+ pOut += MATRIX_DIM3;
+ vstrwq_p_f32(pOut, vecMac2, p0);
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+
+
+
+/**
+ * @brief 4x4 specialization of the f32 matrix multiply.
+ *        Fully unrolled: each of the four B rows is loaded once and
+ *        folded into four row accumulators via scalar-by-vector FMAs.
+ * @param[in]  pSrcA points to the first input matrix (4x4)
+ * @param[in]  pSrcB points to the second input matrix (4x4)
+ * @param[out] pDst  points to the output matrix (4x4)
+ * @return ARM_MATH_SUCCESS (dispatched only for square 4x4 operands)
+ */
+__STATIC_INLINE arm_status arm_mat_mult_f32_4x4_mve(
+ const arm_matrix_instance_f32 *pSrcA,
+ const arm_matrix_instance_f32 *pSrcB,
+ arm_matrix_instance_f32 *pDst)
+{
+ float32_t const *pSrBVec;
+ float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ float32_t *pInA0, *pInA1, *pInA2, *pInA3;
+ f32x4_t vecMac0, vecMac1, vecMac2, vecMac3;
+ f32x4_t vecInB;
+
+ pSrBVec = (float32_t const *) pInB;
+
+ /* one pointer per A row, each walking along its row's columns */
+ pInA0 = pInA;
+ pInA1 = pInA0 + MATRIX_DIM4;
+ pInA2 = pInA1 + MATRIX_DIM4;
+ pInA3 = pInA2 + MATRIX_DIM4;
+ /*
+ * load {b0,0, b0,1, b0,2, b0,3}
+ */
+ vecInB = vld1q(pSrBVec);
+ pSrBVec += MATRIX_DIM4;
+
+ vecMac0 = vmulq(vecInB, *pInA0++);
+ vecMac1 = vmulq(vecInB, *pInA1++);
+ vecMac2 = vmulq(vecInB, *pInA2++);
+ vecMac3 = vmulq(vecInB, *pInA3++);
+ /*
+ * load {b1,0, b1,1, b1,2, b1,3}
+ */
+ vecInB = vld1q(pSrBVec);
+ pSrBVec += MATRIX_DIM4;
+
+ vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+ vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+ vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+ vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
+ /*
+ * load {b2,0, b2,1, b2,2, b2,3}
+ */
+ vecInB = vld1q(pSrBVec);
+ pSrBVec += MATRIX_DIM4;
+
+ vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+ vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+ vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+ vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
+ /*
+ * load {b3,0, b3,1, b3,2, b3,3}
+ */
+ vecInB = vld1q(pSrBVec);
+ pSrBVec += MATRIX_DIM4;
+
+ vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+ vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+ vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+ vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
+
+ vst1q(pOut, vecMac0);
+ pOut += MATRIX_DIM4;
+ vst1q(pOut, vecMac1);
+ pOut += MATRIX_DIM4;
+ vst1q(pOut, vecMac2);
+ pOut += MATRIX_DIM4;
+ vst1q(pOut, vecMac3);
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+
+arm_status arm_mat_mult_f32(
+ const arm_matrix_instance_f32 * pSrcA,
+ const arm_matrix_instance_f32 * pSrcB,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ int numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ int numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ int numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+ uint32_t blkCnt; /* loop counters */
+ uint32_t i;
+ arm_status status;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+ /* small squared matrix specialized routines */
+ if(numRowsA == numColsB && numColsB == numColsA) {
+ if (numRowsA == 1)
+ {
+ pOut[0] = pInA[0] * pInB[0];
+ return(ARM_MATH_SUCCESS);
+ }
+ else if(numRowsA == 2)
+ return arm_mat_mult_f32_2x2_mve(pSrcA, pSrcB, pDst);
+ else if(numRowsA == 3)
+ return arm_mat_mult_f32_3x3_mve(pSrcA, pSrcB, pDst);
+ else if(numRowsA == 4)
+ return arm_mat_mult_f32_4x4_mve(pSrcA, pSrcB, pDst);
+ }
+
+ /* main loop process 4 rows */
+ i = numRowsA >> 2;
+ while (i > 0U)
+ {
+ float32_t *pInA0, *pInA1, *pInA2, *pInA3;
+ float32_t *pInB0;
+ float32_t *pOut0, *pOut1, *pOut2, *pOut3;
+ f32x4_t vecMac0, vecMac1, vecMac2, vecMac3;
+ f32x4_t vecInB;
+
+ /* pointers to 4 consecutive output rows */
+ pOut0 = pOut;
+ pOut1 = pOut0 + numColsB;
+ pOut2 = pOut1 + numColsB;
+ pOut3 = pOut2 + numColsB;
+ pInB0 = pInB;
+
+ uint32_t k = numColsB >> 2;
+ while (k > 0U)
+ {
+ /* pointers to 4 consecutive Matrix A rows */
+ pInA0 = pInA;
+ pInA1 = pInA0 + numColsA;
+ pInA2 = pInA1 + numColsA;
+ pInA3 = pInA2 + numColsA;
+
+ vecMac0 = vdupq_n_f32(0.0f);
+ vecMac1 = vdupq_n_f32(0.0f);
+ vecMac2 = vdupq_n_f32(0.0f);
+ vecMac3 = vdupq_n_f32(0.0f);
+
+ blkCnt = numColsA;
+
+ while (blkCnt > 0U)
+ {
+ /*
+ * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
+ */
+ vecInB = *(f32x4_t *)pInB0; /* vldrwq_f32(pInB0, 0); */
+
+ vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+ vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+ vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+ vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
+
+ pInB0 = pInB0 + numColsB;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ /* Store the results (4 x 4 block) in the destination buffer */
+ vst1q(pOut0, vecMac0);
+ pOut0 += 4;
+ vst1q(pOut1, vecMac1);
+ pOut1 += 4;
+ vst1q(pOut2, vecMac2);
+ pOut2 += 4;
+ vst1q(pOut3, vecMac3);
+ pOut3 += 4;
+
+ /*
+ * rewind
+ */
+ pInB0 -= (numColsB * numColsA) - 4;
+ k--;
+ }
+
+ int colBLeft = numColsB & 3;
+ if (colBLeft)
+ {
+ pInA0 = pInA;
+ pInA1 = pInA0 + numColsA;
+ pInA2 = pInA1 + numColsA;
+ pInA3 = pInA2 + numColsA;
+ mve_pred16_t p0 = vctp32q(colBLeft);
+
+ vecMac0 = vdupq_n_f32(0.0f);
+ vecMac1 = vdupq_n_f32(0.0f);
+ vecMac2 = vdupq_n_f32(0.0f);
+ vecMac3 = vdupq_n_f32(0.0f);
+
+ blkCnt = numColsA;
+
+ while (blkCnt > 0U)
+ {
+ /*
+ * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
+ */
+ vecInB = vldrwq_z_f32(pInB0, p0);
+
+ vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+ vecMac1 = vfmaq(vecMac1, vecInB, *pInA1++);
+ vecMac2 = vfmaq(vecMac2, vecInB, *pInA2++);
+ vecMac3 = vfmaq(vecMac3, vecInB, *pInA3++);
+
+ pInB0 = pInB0 + numColsB;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ /* Store the results (4 x colBLeft block) in the destination buffer */
+ vstrwq_p_f32(pOut0, vecMac0, p0);
+ vstrwq_p_f32(pOut1, vecMac1, p0);
+ vstrwq_p_f32(pOut2, vecMac2, p0);
+ vstrwq_p_f32(pOut3, vecMac3, p0);
+ }
+
+ /* move to next rows */
+ pInA += 4 * numColsA;
+ pOut += 4 * numColsB;
+ i--;
+ }
+
+ /*
+ * non multiple of 4 rows for Matrix A
+ * process single row
+ */
+ if (numRowsA & 3)
+ {
+ i = numRowsA & 3;
+ while (i > 0U)
+ {
+ float32_t *pInA0;
+ float32_t *pInB0;
+ float32_t *pOut0;
+ f32x4_t vecInB;
+ f32x4_t vecMac0;
+
+ pOut0 = pOut;
+ pInB0 = pInB;
+
+ uint32_t k = numColsB >> 2;
+ while (k > 0U)
+ {
+ pInA0 = pInA;
+
+ vecMac0 = vdupq_n_f32(0.0f);
+ blkCnt = numColsA;
+ while (blkCnt > 0U)
+ {
+ /*
+ * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
+ */
+ vecInB = *(f32x4_t *)pInB0; /* vldrwq_f32(pInB0, 0); */
+
+ vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+
+ pInB0 = pInB0 + numColsB;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ /* Store the results (1 x 4 block) in the destination buffer */
+ vst1q(pOut0, vecMac0);
+ pOut0 += 4;
+
+ /*
+ * rewind
+ */
+ pInB0 -= (numColsB * numColsA) - 4;
+ k--;
+ }
+
+ int colBLeft = numColsB & 3;
+ if (colBLeft)
+ {
+ pInA0 = pInA;
+ mve_pred16_t p0 = vctp32q(colBLeft);
+
+ vecMac0 = vdupq_n_f32(0.0f);
+ blkCnt = numColsA;
+ while (blkCnt > 0U)
+ {
+ /*
+ * load {bi,4n+0, bi,4n+1, bi,4n+2, bi,4n+3}
+ */
+ vecInB = vldrwq_z_f32(pInB0, p0);
+
+ vecMac0 = vfmaq(vecMac0, vecInB, *pInA0++);
+
+ pInB0 = pInB0 + numColsB;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /* Store the results (1 x colBLeft block) in the destination buffer */
+ vstrwq_p_f32(pOut0, vecMac0, p0);
+ }
+
+ /* move to next row */
+ pInA += 1 * numColsA;
+ pOut += 1 * numColsB;
+ i--;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#else
+
+#if defined(ARM_MATH_NEON)
+
+#define GROUPOFROWS 8
+
/*
 * NEON implementation: processes GROUPOFROWS (8) rows of A per outer pass,
 * computing eight dot products per output column simultaneously, then falls
 * back to a row-at-a-time loop for the remaining (numRowsA % 8) rows.
 */
arm_status arm_mat_mult_f32(
  const arm_matrix_instance_f32 * pSrcA,
  const arm_matrix_instance_f32 * pSrcB,
  arm_matrix_instance_f32 * pDst)
{
  float32_t *pIn1 = pSrcA->pData;                /* input data matrix pointer A */
  float32_t *pIn2 = pSrcB->pData;                /* input data matrix pointer B */
  float32_t *pInA = pSrcA->pData;                /* input data matrix pointer A */
  float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
  float32_t *px;                                 /* Temporary output data matrix pointer */
  float32_t sum;                                 /* Accumulator */
  uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A */
  uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
  uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */


  uint16_t col, i = 0U, j, row = numRowsA, rowCnt, colCnt;  /* loop counters */
  arm_status status;                             /* status of matrix multiplication */

  float32x4_t a0V, a1V, a2V, a3V, a4V, a5V, a6V, a7V;
  float32x4_t acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,temp;
  float32x2_t accum = vdup_n_f32(0);
  /* Row pointers for the 8 rows of A processed together in the grouped loop */
  float32_t *pIn1B = pSrcA->pData;
  float32_t *pIn1C = pSrcA->pData;
  float32_t *pIn1D = pSrcA->pData;
  float32_t *pIn1E = pSrcA->pData;
  float32_t *pIn1F = pSrcA->pData;
  float32_t *pIn1G = pSrcA->pData;
  float32_t *pIn1H = pSrcA->pData;

  float32_t *pxB,*pxC, *pxD, *pxE, *pxF, *pxG, *pxH;  /* Temporary output data matrix pointer */
  float32_t sum0,sum1, sum2,sum3, sum4, sum5 , sum6, sum7;

#ifdef ARM_MATH_MATRIX_CHECK

  /* Check for matrix mismatch condition */
  if ((pSrcA->numCols != pSrcB->numRows) ||
      (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else
#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
  {
    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
    /* Row loop */
    rowCnt = row >> 3;

    while(rowCnt > 0)
    {
      /* Output pointer is set to starting address of the row being processed */
      px = pOut + GROUPOFROWS*i;
      pxB = px + numColsB;
      pxC = px + 2*numColsB;
      pxD = px + 3*numColsB;
      pxE = px + 4*numColsB;
      pxF = px + 5*numColsB;
      pxG = px + 6*numColsB;
      pxH = px + 7*numColsB;

      /* For every row wise process, the column loop counter is to be initiated */
      col = numColsB;

      /* For every row wise process, the pIn2 pointer is set
      ** to the starting address of the pSrcB data */
      pIn2 = pSrcB->pData;

      j = 0U;

      /* Column loop */
      do
      {
        /* Set the variable sum, that acts as accumulator, to zero */
        sum0 = 0.0f;
        sum1 = 0.0f;
        sum2 = 0.0f;
        sum3 = 0.0f;
        sum4 = 0.0f;
        sum5 = 0.0f;
        sum6 = 0.0f;
        sum7 = 0.0f;

        /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
        pIn1 = pInA;
        pIn1B = pIn1 + numColsA;
        pIn1C = pIn1 + 2*numColsA;
        pIn1D = pIn1 + 3*numColsA;
        pIn1E = pIn1 + 4*numColsA;
        pIn1F = pIn1 + 5*numColsA;
        pIn1G = pIn1 + 6*numColsA;
        pIn1H = pIn1 + 7*numColsA;

        acc0 = vdupq_n_f32(0.0);
        acc1 = vdupq_n_f32(0.0);
        acc2 = vdupq_n_f32(0.0);
        acc3 = vdupq_n_f32(0.0);
        acc4 = vdupq_n_f32(0.0);
        acc5 = vdupq_n_f32(0.0);
        acc6 = vdupq_n_f32(0.0);
        acc7 = vdupq_n_f32(0.0);

        /* Compute 4 MACs simultaneously. */
        colCnt = numColsA >> 2U;

        /* Matrix multiplication */
        while (colCnt > 0U)
        {
          /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
          a0V = vld1q_f32(pIn1);
          a1V = vld1q_f32(pIn1B);
          a2V = vld1q_f32(pIn1C);
          a3V = vld1q_f32(pIn1D);
          a4V = vld1q_f32(pIn1E);
          a5V = vld1q_f32(pIn1F);
          a6V = vld1q_f32(pIn1G);
          a7V = vld1q_f32(pIn1H);

          pIn1 += 4;
          pIn1B += 4;
          pIn1C += 4;
          pIn1D += 4;
          pIn1E += 4;
          pIn1F += 4;
          pIn1G += 4;
          pIn1H += 4;

          /* Gather 4 consecutive column-j elements of B (stride numColsB) into one vector.
             NOTE(review): 'temp' is lane-written here before any full initialization;
             all four lanes are set before the vector is consumed, but an explicit
             vdupq_n_f32(0) init would silence uninitialized-use warnings — confirm. */
          temp = vsetq_lane_f32(*pIn2,temp,0);
          pIn2 += numColsB;
          temp = vsetq_lane_f32(*pIn2,temp,1);
          pIn2 += numColsB;
          temp = vsetq_lane_f32(*pIn2,temp,2);
          pIn2 += numColsB;
          temp = vsetq_lane_f32(*pIn2,temp,3);
          pIn2 += numColsB;

          acc0 = vmlaq_f32(acc0,a0V,temp);
          acc1 = vmlaq_f32(acc1,a1V,temp);
          acc2 = vmlaq_f32(acc2,a2V,temp);
          acc3 = vmlaq_f32(acc3,a3V,temp);
          acc4 = vmlaq_f32(acc4,a4V,temp);
          acc5 = vmlaq_f32(acc5,a5V,temp);
          acc6 = vmlaq_f32(acc6,a6V,temp);
          acc7 = vmlaq_f32(acc7,a7V,temp);

          /* Decrement the loop count */
          colCnt--;
        }

        /* Horizontal reduction: pairwise-add each accumulator down to a scalar */
        accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
        sum0 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);

        accum = vpadd_f32(vget_low_f32(acc1), vget_high_f32(acc1));
        sum1 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);

        accum = vpadd_f32(vget_low_f32(acc2), vget_high_f32(acc2));
        sum2 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);

        accum = vpadd_f32(vget_low_f32(acc3), vget_high_f32(acc3));
        sum3 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);

        accum = vpadd_f32(vget_low_f32(acc4), vget_high_f32(acc4));
        sum4 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);

        accum = vpadd_f32(vget_low_f32(acc5), vget_high_f32(acc5));
        sum5 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);

        accum = vpadd_f32(vget_low_f32(acc6), vget_high_f32(acc6));
        sum6 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);

        accum = vpadd_f32(vget_low_f32(acc7), vget_high_f32(acc7));
        sum7 += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);

        /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
        ** No loop unrolling is used. */
        colCnt = numColsA & 3;

        while (colCnt > 0U)
        {
          /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
          sum0 += *pIn1++ * (*pIn2);
          sum1 += *pIn1B++ * (*pIn2);
          sum2 += *pIn1C++ * (*pIn2);
          sum3 += *pIn1D++ * (*pIn2);
          sum4 += *pIn1E++ * (*pIn2);
          sum5 += *pIn1F++ * (*pIn2);
          sum6 += *pIn1G++ * (*pIn2);
          sum7 += *pIn1H++ * (*pIn2);
          pIn2 += numColsB;

          /* Decrement the loop counter */
          colCnt--;
        }

        /* Store the result in the destination buffer */
        *px++ = sum0;
        *pxB++ = sum1;
        *pxC++ = sum2;
        *pxD++ = sum3;
        *pxE++ = sum4;
        *pxF++ = sum5;
        *pxG++ = sum6;
        *pxH++ = sum7;

        /* Update the pointer pIn2 to point to the starting address of the next column */
        j++;
        pIn2 = pSrcB->pData + j;

        /* Decrement the column loop counter */
        col--;

      } while (col > 0U);

      /* Update the pointer pInA to point to the starting address of the next row.
         'i' counts in units of one output row (numColsB elements); the grouped
         loop above multiplies it by GROUPOFROWS when forming px. */
      i = i + numColsB;
      pInA = pInA + GROUPOFROWS*numColsA;

      /* Decrement the row loop counter */
      rowCnt--;
    }

    /*

    i was the index of a group of rows computed by previous loop.
    Now i is the index of a row since below code is computing row per row
    and no more group of row per group of rows.

    */

    i = GROUPOFROWS*i;
    rowCnt = row & 7;

    while(rowCnt > 0)
    {
      /* Output pointer is set to starting address of the row being processed */
      px = pOut + i;

      /* For every row wise process, the column loop counter is to be initiated */
      col = numColsB;

      /* For every row wise process, the pIn2 pointer is set
      ** to the starting address of the pSrcB data */
      pIn2 = pSrcB->pData;

      j = 0U;

      /* Column loop */
      do
      {
        /* Set the variable sum, that acts as accumulator, to zero */
        sum = 0.0f;

        /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
        pIn1 = pInA;

        acc0 = vdupq_n_f32(0.0);

        /* Compute 4 MACs simultaneously. */
        colCnt = numColsA >> 2U;

        /* Matrix multiplication */
        while (colCnt > 0U)
        {
          /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
          a0V = vld1q_f32(pIn1); // load & separate real/imag pSrcA (de-interleave 2)
          pIn1 += 4;

          temp = vsetq_lane_f32(*pIn2,temp,0);
          pIn2 += numColsB;
          temp = vsetq_lane_f32(*pIn2,temp,1);
          pIn2 += numColsB;
          temp = vsetq_lane_f32(*pIn2,temp,2);
          pIn2 += numColsB;
          temp = vsetq_lane_f32(*pIn2,temp,3);
          pIn2 += numColsB;

          acc0 = vmlaq_f32(acc0,a0V,temp);

          /* Decrement the loop count */
          colCnt--;
        }

        accum = vpadd_f32(vget_low_f32(acc0), vget_high_f32(acc0));
        sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);

        /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
        ** No loop unrolling is used. */
        colCnt = numColsA % 0x4U;

        while (colCnt > 0U)
        {
          /* c(m,n) = a(1,1)*b(1,1) + a(1,2)*b(2,1) + ... + a(m,p)*b(p,n) */
          sum += *pIn1++ * (*pIn2);
          pIn2 += numColsB;

          /* Decrement the loop counter */
          colCnt--;
        }

        /* Store the result in the destination buffer */
        *px++ = sum;

        /* Update the pointer pIn2 to point to the starting address of the next column */
        j++;
        pIn2 = pSrcB->pData + j;

        /* Decrement the column loop counter */
        col--;

      } while (col > 0U);


      /* Update the pointer pInA to point to the starting address of the next row */
      i = i + numColsB;
      pInA = pInA + numColsA;

      /* Decrement the row loop counter */
      rowCnt--;

    }
    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
+#else
+arm_status arm_mat_mult_f32(
+ const arm_matrix_instance_f32 * pSrcA,
+ const arm_matrix_instance_f32 * pSrcB,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
+ float32_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
+ float32_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
+ float32_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
+ float32_t *pOut = pDst->pData; /* Output data matrix pointer */
+ float32_t *px; /* Temporary output data matrix pointer */
+ float32_t sum; /* Accumulator */
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
+ arm_status status; /* Status of matrix multiplication */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* Output pointer is set to starting address of row being processed */
+ px = pOut + i;
+
+ /* For every row wise process, column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum = 0.0f;
+
+ /* Initialize pointer pIn1 to point to starting address of column being processed */
+ pIn1 = pInA;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 MACs at a time. */
+ colCnt = numColsA >> 2U;
+
+ /* matrix multiplication */
+ while (colCnt > 0U)
+ {
+ /* c(m,p) = a(m,1) * b(1,p) + a(m,2) * b(2,p) + .... + a(m,n) * b(n,p) */
+
+ /* Perform the multiply-accumulates */
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining MACs */
+ colCnt = numColsA % 0x4U;
+
+#else
+
+ /* Initialize cntCnt with number of columns */
+ colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (colCnt > 0U)
+ {
+ /* c(m,p) = a(m,1) * b(1,p) + a(m,2) * b(2,p) + .... + a(m,n) * b(n,p) */
+
+ /* Perform the multiply-accumulates */
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Store result in destination buffer */
+ *px++ = sum;
+
+ /* Decrement column loop counter */
+ col--;
+
+ /* Update pointer pIn2 to point to starting address of next column */
+ pIn2 = pInB + (numColsB - col);
+
+ } while (col > 0U);
+
+ /* Update pointer pInA to point to starting address of next row */
+ i = i + numColsB;
+ pInA = pInA + numColsA;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U);
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixMult group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f64.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f64.c
new file mode 100644
index 0000000000..d9d9fa184d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_f64.c
@@ -0,0 +1,202 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_mult_f64.c
+ * Description: Floating-point matrix multiplication
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @defgroup MatrixMult Matrix Multiplication
+ *
+ * Multiplies two matrices.
+ *
+ * \image html MatrixMultiplication.gif "Multiplication of two 3 x 3 matrices"
+
+ * Matrix multiplication is only defined if the number of columns of the
+ * first matrix equals the number of rows of the second matrix.
+ * Multiplying an M x N matrix with an N x P matrix results
+ * in an M x P matrix.
+ * When matrix size checking is enabled, the functions check: (1) that the inner dimensions of
+ * pSrcA and pSrcB are equal; and (2) that the size of the output
+ * matrix equals the outer dimensions of pSrcA and pSrcB.
+ */
+
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ */
+
+
+arm_status arm_mat_mult_f64(
+ const arm_matrix_instance_f64 * pSrcA,
+ const arm_matrix_instance_f64 * pSrcB,
+ arm_matrix_instance_f64 * pDst)
+{
+ float64_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
+ float64_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
+ float64_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
+ float64_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
+ float64_t *pOut = pDst->pData; /* Output data matrix pointer */
+ float64_t *px; /* Temporary output data matrix pointer */
+ float64_t sum; /* Accumulator */
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ uint64_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
+ arm_status status; /* Status of matrix multiplication */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* Output pointer is set to starting address of row being processed */
+ px = pOut + i;
+
+ /* For every row wise process, column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum = 0.0f;
+
+ /* Initialize pointer pIn1 to point to starting address of column being processed */
+ pIn1 = pInA;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 MACs at a time. */
+ colCnt = numColsA >> 2U;
+
+ /* matrix multiplication */
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ /* Perform the multiply-accumulates */
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining MACs */
+ colCnt = numColsA % 0x4U;
+
+#else
+
+ /* Initialize cntCnt with number of columns */
+ colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ /* Perform the multiply-accumulates */
+ sum += *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Store result in destination buffer */
+ *px++ = sum;
+
+ /* Decrement column loop counter */
+ col--;
+
+ /* Update pointer pIn2 to point to starting address of next column */
+ pIn2 = pInB + (numColsB - col);
+
+ } while (col > 0U);
+
+ /* Update pointer pInA to point to starting address of next row */
+ i = i + numColsB;
+ pInA = pInA + numColsA;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U);
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+
+/**
+ * @} end of MatrixMult group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q15.c
new file mode 100644
index 0000000000..62ddcaf8fc
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q15.c
@@ -0,0 +1,483 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_mult_fast_q15.c
+ * Description: Q15 matrix multiplication (fast variant)
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixMult
+ @{
+ */
+
+/**
+ @brief Q15 matrix multiplication (fast variant).
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @param[in] pState points to the array for storing intermediate results
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The difference between the function \ref arm_mat_mult_q15() and this fast variant is that
+                   the fast variant uses a 32-bit rather than a 64-bit accumulator.
+ The result of each 1.15 x 1.15 multiplication is truncated to
+ 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
+ format. Finally, the accumulator is saturated and converted to a 1.15 result.
+ @par
+ The fast version has the same overflow behavior as the standard version but provides
+ less precision since it discards the low 16 bits of each multiplication result.
+ In order to avoid overflows completely the input signals must be scaled down.
+ Scale down one of the input matrices by log2(numColsA) bits to avoid overflows,
+ as a total of numColsA additions are computed internally for each output element.
+ @remark
+ Refer to \ref arm_mat_mult_q15() for a slower implementation of this function
+ which uses 64-bit accumulation to provide higher precision.
+ */
+
/*
 * Strategy: B is first transposed into the caller-supplied scratch buffer
 * pState (the transpose loops below write numRowsB * numColsB q15 elements
 * into it, so pState must be at least that large — confirm against the API
 * contract). The multiplication then reads both operands row-wise, which is
 * cache/bus friendly. With ARM_MATH_DSP the main loop produces a 2x2 output
 * tile per pass using dual-MAC __SMLAD; leftover odd row/column are handled
 * by the cleanup sections at the end.
 */
arm_status arm_mat_mult_fast_q15(
  const arm_matrix_instance_q15 * pSrcA,
  const arm_matrix_instance_q15 * pSrcB,
        arm_matrix_instance_q15 * pDst,
        q15_t                   * pState)
{
        q31_t sum;                                     /* Accumulator */
        q15_t *pSrcBT = pState;                        /* Input data matrix pointer for transpose */
        q15_t *pInA = pSrcA->pData;                    /* Input data matrix pointer A of Q15 type */
        q15_t *pInB = pSrcB->pData;                    /* Input data matrix pointer B of Q15 type */
        q15_t *px;                                     /* Temporary output data matrix pointer */
        uint16_t numRowsA = pSrcA->numRows;            /* Number of rows of input matrix A */
        uint16_t numColsB = pSrcB->numCols;            /* Number of columns of input matrix B */
        uint16_t numColsA = pSrcA->numCols;            /* Number of columns of input matrix A */
        uint16_t numRowsB = pSrcB->numRows;            /* Number of rows of input matrix B */
        uint32_t col, i = 0U, row = numRowsB, colCnt;  /* Loop counters */
        arm_status status;                             /* Status of matrix multiplication */

#if defined (ARM_MATH_DSP)
        q31_t in;                                      /* Temporary variable to hold the input value */
        q31_t inA1, inB1, inA2, inB2;
        q31_t sum2, sum3, sum4;                        /* Accumulators for the other 3 tile elements */
        q15_t *pInA2, *pInB2, *px2;                    /* Second-row / second-column pointers */
        uint32_t j = 0;                                /* Offset of current transposed B column pair */
#else
        q15_t in;                                      /* Temporary variable to hold the input value */
        q15_t inA1, inB1, inA2, inB2;
#endif /* #if defined (ARM_MATH_DSP) */

#ifdef ARM_MATH_MATRIX_CHECK

  /* Check for matrix mismatch condition */
  if ((pSrcA->numCols != pSrcB->numRows) ||
      (pSrcA->numRows != pDst->numRows)  ||
      (pSrcB->numCols != pDst->numCols)    )
  {
    /* Set status as ARM_MATH_SIZE_MISMATCH */
    status = ARM_MATH_SIZE_MISMATCH;
  }
  else

#endif /* #ifdef ARM_MATH_MATRIX_CHECK */

  {
    /* Matrix transpose: scatter each row of B into a column of pSrcBT */
    do
    {
      /* The pointer px is set to starting address of column being processed */
      px = pSrcBT + i;

      /* Apply loop unrolling and exchange columns with row elements */
      col = numColsB >> 2U;

      /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
      ** a second loop below computes the remaining 1 to 3 samples. */
      while (col > 0U)
      {

#if defined (ARM_MATH_DSP)

        /* Read two elements from row (one 32-bit packed load = two q15 values) */
        in = read_q15x2_ia ((q15_t **) &pInB);

        /* Unpack and store one element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
        *px = (q15_t) in;
#else
        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* Update pointer px to point to next row of transposed matrix */
        px += numRowsB;

        /* Unpack and store second element in destination */
#ifndef ARM_MATH_BIG_ENDIAN
        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
        *px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */

        /* Update pointer px to point to next row of transposed matrix */
        px += numRowsB;

        in = read_q15x2_ia ((q15_t **) &pInB);
#ifndef ARM_MATH_BIG_ENDIAN
        *px = (q15_t) in;
#else
        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
        px += numRowsB;

#ifndef ARM_MATH_BIG_ENDIAN
        *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
#else
        *px = (q15_t) in;
#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
        px += numRowsB;

#else /* #if defined (ARM_MATH_DSP) */

        /* Read one element from row */
        in = *pInB++;

        /* Store one element in destination */
        *px = in;

        /* Update pointer px to point to next row of transposed matrix */
        px += numRowsB;

        in = *pInB++;
        *px = in;
        px += numRowsB;

        in = *pInB++;
        *px = in;
        px += numRowsB;

        in = *pInB++;
        *px = in;
        px += numRowsB;

#endif /* #if defined (ARM_MATH_DSP) */

        /* Decrement column loop counter */
        col--;
      }

      /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
      ** No loop unrolling is used. */
      col = numColsB % 0x4U;

      while (col > 0U)
      {
        /* Read and store input element in destination */
        *px = *pInB++;

        /* Update pointer px to point to next row of transposed matrix */
        px += numRowsB;

        /* Decrement column loop counter */
        col--;
      }

      i++;

      /* Decrement row loop counter */
      row--;

    } while (row > 0U);

    /* Reset variables for usage in following multiplication process */
    row = numRowsA;
    i = 0U;
    px = pDst->pData;

#if defined (ARM_MATH_DSP)
    /* Process two rows from matrix A at a time and output two rows at a time */
    row = row >> 1U;
    px2 = px + numColsB;
#endif

    /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
    /* row loop */
    while (row > 0U)
    {
      /* For every row wise process, column loop counter is to be initiated */
      col = numColsB;

      /* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
      pInB = pSrcBT;

#if defined (ARM_MATH_DSP)
      /* Process two (transposed) columns from matrix B at a time */
      col = col >> 1U;
      j = 0;
#endif

      /* column loop */
      while (col > 0U)
      {
        /* Set variable sum, that acts as accumulator, to zero */
        sum = 0;

        /* Initiate pointer pInA to point to starting address of column being processed */
        pInA = pSrcA->pData + i;

#if defined (ARM_MATH_DSP)
        sum2 = 0;
        sum3 = 0;
        sum4 = 0;
        pInB  = pSrcBT + j;          /* first of the two transposed B columns */
        pInA2 = pInA + numColsA;     /* second row of A */
        pInB2 = pInB + numRowsB;     /* second transposed B column */

        /* Read in two elements at once - allows dual MAC instruction */
        colCnt = numColsA >> 1U;
#else
        colCnt = numColsA >> 2U;
#endif

        /* matrix multiplication */
        while (colCnt > 0U)
        {
          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */

#if defined (ARM_MATH_DSP)
          /* read real and imag values from pSrcA and pSrcB buffer */
          inA1 = read_q15x2_ia ((q15_t **) &pInA);
          inB1 = read_q15x2_ia ((q15_t **) &pInB);

          inA2 = read_q15x2_ia ((q15_t **) &pInA2);
          inB2 = read_q15x2_ia ((q15_t **) &pInB2);

          /* Multiply and Accumulates: 2x2 cross products of the two A rows
             and two B columns, two MACs per __SMLAD */
          sum  = __SMLAD(inA1, inB1, sum);
          sum2 = __SMLAD(inA1, inB2, sum2);
          sum3 = __SMLAD(inA2, inB1, sum3);
          sum4 = __SMLAD(inA2, inB2, sum4);
#else
          /* read real and imag values from pSrcA and pSrcB buffer */
          inA1 = *pInA++;
          inB1 = *pInB++;
          /* Multiply and Accumulates */
          sum += inA1 * inB1;

          inA2 = *pInA++;
          inB2 = *pInB++;
          sum += inA2 * inB2;

          inA1 = *pInA++;
          inB1 = *pInB++;
          sum += inA1 * inB1;

          inA2 = *pInA++;
          inB2 = *pInB++;
          sum += inA2 * inB2;
#endif /* #if defined (ARM_MATH_DSP) */

          /* Decrement loop counter */
          colCnt--;
        }

        /* process odd column samples */
#if defined (ARM_MATH_DSP)
        if (numColsA & 1U) {
          inA1 = *pInA++;
          inB1 = *pInB++;
          inA2 = *pInA2++;
          inB2 = *pInB2++;
          sum  += inA1 * inB1;
          sum2 += inA1 * inB2;
          sum3 += inA2 * inB1;
          sum4 += inA2 * inB2;
        }
#else
        colCnt = numColsA % 0x4U;

        while (colCnt > 0U)
        {
          /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
          sum += (q31_t) *pInA++ * *pInB++;

          /* Decrement loop counter */
          colCnt--;
        }
#endif /* #if defined (ARM_MATH_DSP) */

        /* Saturate and store result in destination buffer.
           Fast variant: 2.30 accumulator shifted down to 1.15 (truncation). */
        *px++  = (q15_t) (sum >> 15);

#if defined (ARM_MATH_DSP)
        *px++  = (q15_t) (sum2 >> 15);
        *px2++ = (q15_t) (sum3 >> 15);
        *px2++ = (q15_t) (sum4 >> 15);
        j += numRowsB * 2;    /* advance past the two consumed transposed columns */
#endif

        /* Decrement column loop counter */
        col--;

      }

      i = i + numColsA;

#if defined (ARM_MATH_DSP)
      i = i + numColsA;                 /* two A rows consumed per pass */
      px = px2 + (numColsB & 1U);       /* skip odd column slot (filled below) */
      px2 = px + numColsB;
#endif

      /* Decrement row loop counter */
      row--;

    }

    /* Compute any remaining odd row/column below */

#if defined (ARM_MATH_DSP)

    /* Compute remaining output column */
    if (numColsB & 1U) {

      /* Avoid redundant computation of last element */
      row = numRowsA & (~0x1);

      /* Point to remaining unfilled column in output matrix */
      px = pDst->pData + numColsB-1;
      pInA = pSrcA->pData;

      /* row loop */
      while (row > 0)
      {

        /* point to last column in matrix B */
        pInB = pSrcBT + numRowsB * (numColsB-1);

        /* Set variable sum, that acts as accumulator, to zero */
        sum = 0;

        /* Compute 4 columns at once */
        colCnt = numColsA >> 2U;

        /* matrix multiplication */
        while (colCnt > 0U)
        {
          inA1 = read_q15x2_ia ((q15_t **) &pInA);
          inA2 = read_q15x2_ia ((q15_t **) &pInA);
          inB1 = read_q15x2_ia ((q15_t **) &pInB);
          inB2 = read_q15x2_ia ((q15_t **) &pInB);

          sum = __SMLAD(inA1, inB1, sum);
          sum = __SMLAD(inA2, inB2, sum);

          /* Decrement loop counter */
          colCnt--;
        }

        colCnt = numColsA & 3U;
        while (colCnt > 0U) {
          sum += (q31_t) (*pInA++) * (*pInB++);
          colCnt--;
        }

        /* Store result in destination buffer */
        *px = (q15_t) (sum >> 15);
        px += numColsB;

        /* Decrement row loop counter */
        row--;
      }
    }

    /* Compute remaining output row */
    if (numRowsA & 1U) {

      /* point to last row in output matrix */
      px = pDst->pData + (numColsB) * (numRowsA-1);

      pInB = pSrcBT;
      col = numColsB;
      i = 0U;

      /* col loop */
      while (col > 0)
      {
        /* point to last row in matrix A */
        pInA = pSrcA->pData + (numRowsA-1) * numColsA;

        /* Set variable sum, that acts as accumulator, to zero */
        sum = 0;

        /* Compute 4 columns at once */
        colCnt = numColsA >> 2U;

        /* matrix multiplication */
        while (colCnt > 0U)
        {
          inA1 = read_q15x2_ia ((q15_t **) &pInA);
          inA2 = read_q15x2_ia ((q15_t **) &pInA);
          inB1 = read_q15x2_ia ((q15_t **) &pInB);
          inB2 = read_q15x2_ia ((q15_t **) &pInB);

          sum = __SMLAD(inA1, inB1, sum);
          sum = __SMLAD(inA2, inB2, sum);

          /* Decrement loop counter */
          colCnt--;
        }

        colCnt = numColsA % 4U;
        while (colCnt > 0U) {
          sum += (q31_t) (*pInA++) * (*pInB++);

          colCnt--;
        }

        /* Store result in destination buffer */
        *px++ = (q15_t) (sum >> 15);

        /* Decrement column loop counter */
        col--;
      }
    }

#endif /* #if defined (ARM_MATH_DSP) */

    /* Set status as ARM_MATH_SUCCESS */
    status = ARM_MATH_SUCCESS;
  }

  /* Return to application */
  return (status);
}
+
+/**
+ @} end of MatrixMult group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q31.c
new file mode 100644
index 0000000000..90841e0d56
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_fast_q31.c
@@ -0,0 +1,374 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_mult_fast_q31.c
+ * Description: Q31 matrix multiplication (fast variant)
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixMult
+ @{
+ */
+
+/**
+ @brief Q31 matrix multiplication (fast variant).
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The difference between the function \ref arm_mat_mult_q31() and this fast variant is that
+ the fast variant uses a 32-bit rather than a 64-bit accumulator.
+ The result of each 1.31 x 1.31 multiplication is truncated to
+ 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
+ format. Finally, the accumulator is saturated and converted to a 1.31 result.
+ @par
+ The fast version has the same overflow behavior as the standard version but provides
+ less precision since it discards the low 32 bits of each multiplication result.
+ In order to avoid overflows completely the input signals must be scaled down.
+ Scale down one of the input matrices by log2(numColsA) bits to avoid overflows,
+ as a total of numColsA additions are computed internally for each output element.
+ @remark
+ Refer to \ref arm_mat_mult_q31() for a slower implementation of this function
+ which uses 64-bit accumulation to provide higher precision.
+ */
+
+arm_status arm_mat_mult_fast_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
+ q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
+ q31_t *pInA2; /* Pointer to the second row of the current A row pair */
+ q31_t *px; /* Temporary output data matrix pointer */
+ q31_t *px2; /* Output pointer for the second row of the pair */
+ q31_t sum1, sum2, sum3, sum4; /* Accumulators for the 2x2 output tile */
+ q31_t inA1, inA2, inB1, inB2;
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ uint32_t col, i = 0U, j, row = numRowsA, colCnt; /* Loop counters */
+ arm_status status; /* Status of matrix multiplication */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ px = pDst->pData;
+
+ /* Rows of A are processed in pairs; px2 tracks the second output row */
+ row = row >> 1U;
+ px2 = px + numColsB;
+
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ while (row > 0U)
+ {
+ /* For every row wise process, column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
+ pInB = pSrcB->pData;
+
+ j = 0U;
+
+ /* Columns of B are likewise processed in pairs, yielding a 2x2 output tile */
+ col = col >> 1U;
+
+ /* column loop */
+ while (col > 0U)
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum1 = 0;
+ sum2 = 0;
+ sum3 = 0;
+ sum4 = 0;
+
+ /* Initiate data pointers */
+ pInA = pSrcA->pData + i;
+ pInB = pSrcB->pData + j;
+ pInA2 = pInA + numColsA;
+
+ colCnt = numColsA;
+
+ /* matrix multiplication */
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ inA1 = *pInA++;
+ inB1 = pInB[0];
+ inA2 = *pInA2++;
+ inB2 = pInB[1];
+ pInB += numColsB;
+
+#if defined (ARM_MATH_DSP)
+ /* __SMMLA accumulates the high 32 bits of the 64-bit product (2.30 result) */
+ sum1 = __SMMLA(inA1, inB1, sum1);
+ sum2 = __SMMLA(inA1, inB2, sum2);
+ sum3 = __SMMLA(inA2, inB1, sum3);
+ sum4 = __SMMLA(inA2, inB2, sum4);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
+ sum2 = (q31_t) ((((q63_t) sum2 << 32) + ((q63_t) inA1 * inB2)) >> 32);
+ sum3 = (q31_t) ((((q63_t) sum3 << 32) + ((q63_t) inA2 * inB1)) >> 32);
+ sum4 = (q31_t) ((((q63_t) sum4 << 32) + ((q63_t) inA2 * inB2)) >> 32);
+#endif
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Convert the result from 2.30 to 1.31 format and store in destination buffer */
+ *px++ = sum1 << 1;
+ *px++ = sum2 << 1;
+ *px2++ = sum3 << 1;
+ *px2++ = sum4 << 1;
+
+ j += 2;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ i = i + (numColsA << 1U);
+ /* Skip the odd last-column slot (if any); it is filled by the remainder pass below */
+ px = px2 + (numColsB & 1U);
+ px2 = px + numColsB;
+
+ /* Decrement row loop counter */
+ row--;
+ }
+
+ /* Compute any remaining odd row/column below */
+
+ /* Compute remaining output column */
+ if (numColsB & 1U) {
+
+ /* Avoid redundant computation of last element */
+ row = numRowsA & (~1U);
+
+ /* Point to remaining unfilled column in output matrix */
+ px = pDst->pData + numColsB-1;
+ pInA = pSrcA->pData;
+
+ /* row loop */
+ while (row > 0)
+ {
+
+ /* point to last column in matrix B */
+ pInB = pSrcB->pData + numColsB-1;
+
+ /* Set variable sum1, that acts as accumulator, to zero */
+ sum1 = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 columns at a time. */
+ colCnt = numColsA >> 2U;
+
+ /* matrix multiplication */
+ while (colCnt > 0U)
+ {
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(*pInA++, *pInB, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
+#endif
+ pInB += numColsB;
+
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(*pInA++, *pInB, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
+#endif
+ pInB += numColsB;
+
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(*pInA++, *pInB, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
+#endif
+ pInB += numColsB;
+
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(*pInA++, *pInB, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
+#endif
+ pInB += numColsB;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining column */
+ colCnt = numColsA % 4U;
+
+#else
+
+ /* Initialize colCnt with number of columns */
+ colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (colCnt > 0U) {
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(*pInA++, *pInB, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
+#endif
+ pInB += numColsB;
+
+ colCnt--;
+ }
+
+ /* Convert the result from 2.30 to 1.31 format and store in destination buffer */
+ *px = sum1 << 1;
+ px += numColsB;
+
+ /* Decrement row loop counter */
+ row--;
+ }
+ }
+
+ /* Compute remaining output row */
+ if (numRowsA & 1U) {
+
+ /* point to last row in output matrix */
+ px = pDst->pData + (numColsB) * (numRowsA-1);
+
+ col = numColsB;
+ i = 0U;
+
+ /* col loop */
+ while (col > 0)
+ {
+
+ /* point to last row in matrix A */
+ pInA = pSrcA->pData + (numRowsA-1) * numColsA;
+ pInB = pSrcB->pData + i;
+
+ /* Set variable sum1, that acts as accumulator, to zero */
+ sum1 = 0;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 columns at a time. */
+ colCnt = numColsA >> 2U;
+
+ /* matrix multiplication */
+ while (colCnt > 0U)
+ {
+ inA1 = *pInA++;
+ inA2 = *pInA++;
+ inB1 = *pInB;
+ pInB += numColsB;
+ inB2 = *pInB;
+ pInB += numColsB;
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(inA1, inB1, sum1);
+ sum1 = __SMMLA(inA2, inB2, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA2 * inB2)) >> 32);
+#endif
+
+ inA1 = *pInA++;
+ inA2 = *pInA++;
+ inB1 = *pInB;
+ pInB += numColsB;
+ inB2 = *pInB;
+ pInB += numColsB;
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(inA1, inB1, sum1);
+ sum1 = __SMMLA(inA2, inB2, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA1 * inB1)) >> 32);
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) inA2 * inB2)) >> 32);
+#endif
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining column */
+ colCnt = numColsA % 4U;
+
+#else
+
+ /* Initialize colCnt with number of columns */
+ colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (colCnt > 0U) {
+#if defined (ARM_MATH_DSP)
+ sum1 = __SMMLA(*pInA++, *pInB, sum1);
+#else
+ sum1 = (q31_t) ((((q63_t) sum1 << 32) + ((q63_t) *pInA++ * *pInB)) >> 32);
+#endif
+ pInB += numColsB;
+
+ colCnt--;
+ }
+
+ /* Convert the result from 2.30 to 1.31 format and store in the destination
+ buffer (the fast variant does not saturate here) */
+ *px++ = sum1 << 1;
+ i++;
+
+ /* Decrement col loop counter */
+ col--;
+ }
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ @} end of MatrixMult group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q15.c
new file mode 100644
index 0000000000..8eed6ee5d6
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q15.c
@@ -0,0 +1,890 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_mult_q15.c
+ * Description: Q15 matrix multiplication
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixMult
+ @{
+ */
+
+/**
+ @brief Q15 matrix multiplication.
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @param[in]     pState     points to the array for storing intermediate results (used as scratch for the transpose of pSrcB by the DSP variant; unused by other variants)
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator. The inputs to the
+ multiplications are in 1.15 format and multiplications yield a 2.30 result.
+ The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ This approach provides 33 guard bits and there is no risk of overflow.
+ The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits
+ and then saturated to 1.15 format.
+ @par
+ Refer to \ref arm_mat_mult_fast_q15() for a faster but less precise version of this function.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/* Narrow a 64-bit MVE accumulator: saturating shift via sqrshrl_sat48, then
+ * extract the 32-bit result from the upper word (standard CMSIS-DSP helper). */
+#define MVE_ASRL_SAT16(acc, shift) ((sqrshrl_sat48(acc, -(32-shift)) >> 32) & 0xffffffff)
+
+/* Square matrix dimensions handled by the specialized MVE kernels below */
+#define MATRIX_DIM2 2
+#define MATRIX_DIM3 3
+#define MATRIX_DIM4 4
+
+/* Specialized 2x2 Q15 multiply for Helium/MVE: each column of B is fetched
+ * with a gather load (offsets {0, 2} = row stride of a 2x2 row-major matrix,
+ * predicated to 2 active lanes), multiplied against both rows of A, and the
+ * 64-bit accumulators are shifted back to Q15 and saturated. */
+__STATIC_INLINE arm_status arm_mat_mult_q15_2x2_mve(
+ const arm_matrix_instance_q15 * pSrcA,
+ const arm_matrix_instance_q15 * pSrcB,
+ arm_matrix_instance_q15 * pDst)
+{
+ q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint16x8_t vecColBOffs;
+ q15_t *pInA0 = pInA;
+ q15_t *pInA1 = pInA0 + MATRIX_DIM2;
+ q63_t acc0, acc1;
+ q15x8_t vecB, vecA0, vecA1;
+ /* predicate: only MATRIX_DIM2 lanes participate in the gather/MAC */
+ mve_pred16_t p0 = vctp16q(MATRIX_DIM2);
+
+ vecColBOffs = vidupq_u16((uint32_t)0, 2); /* MATRIX_DIM2 */
+
+ pInB = pSrcB->pData;
+
+ vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
+
+ vecA0 = vldrhq_s16(pInA0);
+ vecA1 = vldrhq_s16(pInA1);
+
+ acc0 = vmlaldavq(vecA0, vecB);
+ acc1 = vmlaldavq(vecA1, vecB);
+
+ /* convert 34.30 accumulators back to Q15 */
+ acc0 = asrl(acc0, 15);
+ acc1 = asrl(acc1, 15);
+
+ pOut[0 * MATRIX_DIM2] = (q15_t) __SSAT(acc0, 16);
+ pOut[1 * MATRIX_DIM2] = (q15_t) __SSAT(acc1, 16);
+ pOut++;
+
+ /* move to next B column */
+ pInB = pInB + 1;
+
+ vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
+
+ acc0 = vmlaldavq(vecA0, vecB);
+ acc1 = vmlaldavq(vecA1, vecB);
+
+ acc0 = asrl(acc0, 15);
+ acc1 = asrl(acc1, 15);
+
+ pOut[0 * MATRIX_DIM2] = (q15_t) __SSAT(acc0, 16);
+ pOut[1 * MATRIX_DIM2] = (q15_t) __SSAT(acc1, 16);
+
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+
+
<doc_update>
+/* Specialized 3x3 Q15 multiply for Helium/MVE: one gather-loaded column of B
+ * (offsets {0, 3, 6}, predicated to 3 lanes) per pass, multiplied against the
+ * three rows of A; accumulators are narrowed to Q15 with saturation. */
+__STATIC_INLINE arm_status arm_mat_mult_q15_3x3_mve(
+ const arm_matrix_instance_q15 * pSrcA,
+ const arm_matrix_instance_q15 * pSrcB,
+ arm_matrix_instance_q15 * pDst)
+{
+ q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint16x8_t vecColBOffs;
+ q15_t *pInA0 = pInA;
+ q15_t *pInA1 = pInA0 + MATRIX_DIM3;
+ q15_t *pInA2 = pInA1 + MATRIX_DIM3;
+ q63_t acc0, acc1, acc2;
+ q15x8_t vecB, vecA0, vecA1, vecA2;
+ /* predicate: only MATRIX_DIM3 lanes participate */
+ mve_pred16_t p0 = vctp16q(MATRIX_DIM3);
+
+ /* column offsets {0, 1, 2, ...} scaled by the row stride of B */
+ vecColBOffs = vidupq_u16((uint32_t)0, 1);
+ vecColBOffs = vecColBOffs * MATRIX_DIM3;
+
+ pInB = pSrcB->pData;
+
+ vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
+
+ vecA0 = vldrhq_s16(pInA0);
+ vecA1 = vldrhq_s16(pInA1);
+ vecA2 = vldrhq_s16(pInA2);
+
+ acc0 = vmlaldavq(vecA0, vecB);
+ acc1 = vmlaldavq(vecA1, vecB);
+ acc2 = vmlaldavq(vecA2, vecB);
+
+ /* convert 34.30 accumulators back to Q15 */
+ acc0 = asrl(acc0, 15);
+ acc1 = asrl(acc1, 15);
+ acc2 = asrl(acc2, 15);
+
+ pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
+ pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
+ pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
+ pOut++;
+
+ /* move to next B column */
+ pInB = pInB + 1;
+
+ vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
+
+ acc0 = vmlaldavq(vecA0, vecB);
+ acc1 = vmlaldavq(vecA1, vecB);
+ acc2 = vmlaldavq(vecA2, vecB);
+
+ acc0 = asrl(acc0, 15);
+ acc1 = asrl(acc1, 15);
+ acc2 = asrl(acc2, 15);
+
+ pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
+ pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
+ pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
+ pOut++;
+
+ /* move to next B column */
+ pInB = pInB + 1;
+
+ vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
+
+ acc0 = vmlaldavq(vecA0, vecB);
+ acc1 = vmlaldavq(vecA1, vecB);
+ acc2 = vmlaldavq(vecA2, vecB);
+
+ acc0 = asrl(acc0, 15);
+ acc1 = asrl(acc1, 15);
+ acc2 = asrl(acc2, 15);
+
+ pOut[0 * MATRIX_DIM3] = (q15_t) __SSAT(acc0, 16);
+ pOut[1 * MATRIX_DIM3] = (q15_t) __SSAT(acc1, 16);
+ pOut[2 * MATRIX_DIM3] = (q15_t) __SSAT(acc2, 16);
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+
+/* Specialized 4x4 Q15 multiply for Helium/MVE: one gather-loaded column of B
+ * (offsets {0, 4, 8, 12}, predicated to 4 lanes) per pass, multiplied against
+ * the four rows of A; accumulators are narrowed to Q15 with saturation. */
+__STATIC_INLINE arm_status arm_mat_mult_q15_4x4_mve(
+ const arm_matrix_instance_q15 * pSrcA,
+ const arm_matrix_instance_q15 * pSrcB,
+ arm_matrix_instance_q15 * pDst)
+{
+ q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint16x8_t vecColBOffs;
+ q15_t *pInA0 = pInA;
+ q15_t *pInA1 = pInA0 + MATRIX_DIM4;
+ q15_t *pInA2 = pInA1 + MATRIX_DIM4;
+ q15_t *pInA3 = pInA2 + MATRIX_DIM4;
+ q63_t acc0, acc1, acc2, acc3;
+ q15x8_t vecB, vecA0, vecA1, vecA2, vecA3;
+ /* predicate: only MATRIX_DIM4 lanes participate */
+ mve_pred16_t p0 = vctp16q(MATRIX_DIM4);
+
+ vecColBOffs = vidupq_u16((uint32_t)0, 4);
+
+ pInB = pSrcB->pData;
+
+ vecB = vldrhq_gather_shifted_offset_z_s16((q15_t const *)pInB, vecColBOffs, p0);
+
+ vecA0 = vldrhq_s16(pInA0);
+ vecA1 = vldrhq_s16(pInA1);
+ vecA2 = vldrhq_s16(pInA2);
+ vecA3 = vldrhq_s16(pInA3);
+
+ acc0 = vmlaldavq(vecA0, vecB);
+ acc1 = vmlaldavq(vecA1, vecB);
+ acc2 = vmlaldavq(vecA2, vecB);
+ acc3 = vmlaldavq(vecA3, vecB);
+
+ /* convert 34.30 accumulators back to Q15 */
+ acc0 = asrl(acc0, 15);
+ acc1 = asrl(acc1, 15);
+ acc2 = asrl(acc2, 15);
+ acc3 = asrl(acc3, 15);
+
+ pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
+ pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
+ pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
+ pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
+ pOut++;
+
+ /* move to next B column */
+ pInB = pInB + 1;
+
+ vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
+
+ acc0 = vmlaldavq(vecA0, vecB);
+ acc1 = vmlaldavq(vecA1, vecB);
+ acc2 = vmlaldavq(vecA2, vecB);
+ acc3 = vmlaldavq(vecA3, vecB);
+
+ acc0 = asrl(acc0, 15);
+ acc1 = asrl(acc1, 15);
+ acc2 = asrl(acc2, 15);
+ acc3 = asrl(acc3, 15);
+
+ pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
+ pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
+ pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
+ pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
+
+ pOut++;
+
+ /* move to next B column */
+ pInB = pInB + 1;
+
+ vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
+
+ acc0 = vmlaldavq(vecA0, vecB);
+ acc1 = vmlaldavq(vecA1, vecB);
+ acc2 = vmlaldavq(vecA2, vecB);
+ acc3 = vmlaldavq(vecA3, vecB);
+
+ acc0 = asrl(acc0, 15);
+ acc1 = asrl(acc1, 15);
+ acc2 = asrl(acc2, 15);
+ acc3 = asrl(acc3, 15);
+
+ pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
+ pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
+ pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
+ pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
+
+ pOut++;
+
+ /* move to next B column */
+ pInB = pInB + 1;
+
+ vecB = vldrhq_gather_shifted_offset_z_s16(pInB, vecColBOffs, p0);
+
+ acc0 = vmlaldavq(vecA0, vecB);
+ acc1 = vmlaldavq(vecA1, vecB);
+ acc2 = vmlaldavq(vecA2, vecB);
+ acc3 = vmlaldavq(vecA3, vecB);
+
+ acc0 = asrl(acc0, 15);
+ acc1 = asrl(acc1, 15);
+ acc2 = asrl(acc2, 15);
+ acc3 = asrl(acc3, 15);
+
+ pOut[0 * MATRIX_DIM4] = (q15_t) __SSAT(acc0, 16);
+ pOut[1 * MATRIX_DIM4] = (q15_t) __SSAT(acc1, 16);
+ pOut[2 * MATRIX_DIM4] = (q15_t) __SSAT(acc2, 16);
+ pOut[3 * MATRIX_DIM4] = (q15_t) __SSAT(acc3, 16);
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+arm_status arm_mat_mult_q15(
+ const arm_matrix_instance_q15 * pSrcA,
+ const arm_matrix_instance_q15 * pSrcB,
+ arm_matrix_instance_q15 * pDst,
+ q15_t * pState)
+{
+ q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+ q15_t *px; /* Temporary output data matrix pointer */
+ uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+ uint16_t col, i = 0U, row = numRowsA; /* loop counters */
+ uint16x8_t vecOffs, vecColBOffs;
+ uint32_t blkCnt,rowCnt; /* loop counters */
+ arm_status status; /* Status of matrix multiplication */
+ (void)pState; /* scratch buffer not needed by the MVE implementation */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif
+ {
+ /* small squared matrix specialized routines */
+ if(numRowsA == numColsB && numColsB == numColsA) {
+
+ if (numRowsA == 1)
+ {
+ q63_t sum;
+ sum = pInA[0] * pInB[0];
+ pOut[0] = (q15_t) __SSAT((sum >> 15), 16);
+ return (ARM_MATH_SUCCESS);
+ }
+ else if(numRowsA == 2)
+ return arm_mat_mult_q15_2x2_mve(pSrcA, pSrcB, pDst);
+ else if(numRowsA == 3)
+ return arm_mat_mult_q15_3x3_mve(pSrcA, pSrcB, pDst);
+ else if (numRowsA == 4)
+ return arm_mat_mult_q15_4x4_mve(pSrcA, pSrcB, pDst);
+ }
+
+ /* gather offsets {0, numColsB, 2*numColsB, ...}: walk one column of B */
+ vecColBOffs = vidupq_u16((uint32_t)0, 1);
+ vecColBOffs = vecColBOffs * (uint16_t) (numColsB);
+
+ /*
+ * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
+ */
+
+ /*
+ * row loop: process 4 rows of A per iteration; remainder rows handled below.
+ * NOTE(review): vecOffs accumulates in 16-bit lanes, so numColsA * numColsB
+ * must fit in 16 bits -- confirm against documented matrix size limits.
+ */
+ rowCnt = row >> 2;
+ while (rowCnt > 0U)
+ {
+ /*
+ * Output pointer is set to starting address of the row being processed
+ */
+ px = pOut + i;
+ i = i + 4 * numColsB;
+ /*
+ * For every row wise process, the column loop counter is to be initiated
+ */
+ col = numColsB;
+ /*
+ * For every row wise process, the pInB pointer is set
+ * to the starting address of the pSrcB data
+ */
+ pInB = pSrcB->pData;
+ /*
+ * column loop
+ */
+ while (col > 0U)
+ {
+ /*
+ * generate 4 columns elements
+ */
+ /*
+ * Matrix A columns number of MAC operations are to be performed
+ */
+
+ q15_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
+ q15_t *pInA0 = pInA;
+ q15_t *pInA1 = pInA0 + numColsA;
+ q15_t *pInA2 = pInA1 + numColsA;
+ q15_t *pInA3 = pInA2 + numColsA;
+ q63_t acc0, acc1, acc2, acc3;
+
+ acc0 = 0LL;
+ acc1 = 0LL;
+ acc2 = 0LL;
+ acc3 = 0LL;
+
+ pSrcA0Vec = (q15_t const *) pInA0;
+ pSrcA1Vec = (q15_t const *) pInA1;
+ pSrcA2Vec = (q15_t const *) pInA2;
+ pSrcA3Vec = (q15_t const *) pInA3;
+
+ vecOffs = vecColBOffs;
+
+ /* 8 Q15 elements per vector iteration */
+ blkCnt = (numColsA) >> 3;
+ while (blkCnt > 0U)
+ {
+ q15x8_t vecB, vecA;
+
+ vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
+ vecOffs = vecOffs + (uint16_t) (numColsB * 8);
+
+ vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 8;
+ acc0 = vmlaldavaq(acc0, vecA, vecB);
+ vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 8;
+ acc1 = vmlaldavaq(acc1, vecA, vecB);
+ vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 8;
+ acc2 = vmlaldavaq(acc2, vecA, vecB);
+ vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 8;
+ acc3 = vmlaldavaq(acc3, vecA, vecB);
+ blkCnt--;
+
+ }
+ /*
+ * tail (predicated to the remaining numColsA & 7 elements)
+ */
+ blkCnt = numColsA & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ q15x8_t vecB, vecA;
+
+ vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
+ vecOffs = vecOffs + (uint16_t) (numColsB * 8);
+
+ vecA = vld1q(pSrcA0Vec);
+ acc0 = vmlaldavaq_p(acc0, vecA, vecB, p0);
+ vecA = vld1q(pSrcA1Vec);
+ acc1 = vmlaldavaq_p(acc1, vecA, vecB, p0);
+ vecA = vld1q(pSrcA2Vec);
+ acc2 = vmlaldavaq_p(acc2, vecA, vecB, p0);
+ vecA = vld1q(pSrcA3Vec);
+ acc3 = vmlaldavaq_p(acc3, vecA, vecB, p0);
+ }
+
+ /* Narrow each 64-bit accumulator to Q15 with saturation */
+ px[0] = (q15_t)MVE_ASRL_SAT16(acc0, 15);
+ px[1 * numColsB] = (q15_t)MVE_ASRL_SAT16(acc1, 15);
+ px[2 * numColsB] = (q15_t)MVE_ASRL_SAT16(acc2, 15);
+ px[3 * numColsB] = (q15_t)MVE_ASRL_SAT16(acc3, 15);
+ px++;
+ /*
+ * Decrement the column loop counter
+ */
+ col--;
+ /*
+ * Update the pointer pInB to point to the starting address of the next column
+ */
+ pInB = pSrcB->pData + (numColsB - col);
+ }
+
+ /*
+ * Update the pointer pInA to point to the starting address of the next row
+ */
+ pInA += (numColsA * 4);
+ /*
+ * Decrement the row loop counter
+ */
+ rowCnt --;
+
+ }
+
+ /* remainder rows (0..3), processed one row at a time */
+ rowCnt = row & 3;
+ while (rowCnt > 0U)
+ {
+ /*
+ * Output pointer is set to starting address of the row being processed
+ */
+ px = pOut + i;
+ i = i + numColsB;
+ /*
+ * For every row wise process, the column loop counter is to be initiated
+ */
+ col = numColsB;
+ /*
+ * For every row wise process, the pInB pointer is set
+ * to the starting address of the pSrcB data
+ */
+ pInB = pSrcB->pData;
+ /*
+ * column loop
+ */
+ while (col > 0U)
+ {
+ /*
+ * generate 4 columns elements
+ */
+ /*
+ * Matrix A columns number of MAC operations are to be performed
+ */
+
+ q15_t const *pSrcA0Vec;
+ q15_t *pInA0 = pInA;
+ q63_t acc0;
+
+ acc0 = 0LL;
+
+ pSrcA0Vec = (q15_t const *) pInA0;
+
+ vecOffs = vecColBOffs;
+
+ blkCnt = (numColsA) >> 3;
+ while (blkCnt > 0U)
+ {
+ q15x8_t vecB, vecA;
+
+ vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
+ vecOffs = vecOffs + (uint16_t) (numColsB * 8);
+
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vmlaldavaq(acc0, vecA, vecB);
+
+ blkCnt--;
+
+ }
+ /*
+ * tail
+ */
+ blkCnt = numColsA & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ q15x8_t vecB, vecA;
+
+ vecB = vldrhq_gather_shifted_offset((int16_t const *)pInB, vecOffs);
+ vecOffs = vecOffs + (uint16_t) (numColsB * 8);
+
+ vecA = vld1q(pSrcA0Vec);
+ acc0 = vmlaldavaq_p(acc0, vecA, vecB, p0);
+
+ }
+
+ px[0] = (q15_t)MVE_ASRL_SAT16(acc0, 15);
+
+ px++;
+ /*
+ * Decrement the column loop counter
+ */
+ col--;
+ /*
+ * Update the pointer pInB to point to the starting address of the next column
+ */
+ pInB = pSrcB->pData + (numColsB - col);
+ }
+
+ /*
+ * Update the pointer pInA to point to the starting address of the next row
+ */
+ pInA += (numColsA );
+ rowCnt--;
+ }
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+}
+#else
+arm_status arm_mat_mult_q15(
+ const arm_matrix_instance_q15 * pSrcA,
+ const arm_matrix_instance_q15 * pSrcB,
+ arm_matrix_instance_q15 * pDst,
+ q15_t * pState)
+{
+ q63_t sum; /* Accumulator */
+
+#if defined (ARM_MATH_DSP) /* != CM0 */
+
+ /* NOTE(review): the DSP variant uses pState as scratch for the transpose of
+ B; the transpose loop below fills numRowsB * numColsB elements, so pState
+ must be at least that large -- confirm against the API contract. */
+ q15_t *pSrcBT = pState; /* Input data matrix pointer for transpose */
+ q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
+ q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
+ q15_t *px; /* Temporary output data matrix pointer */
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ uint16_t numRowsB = pSrcB->numRows; /* Number of rows of input matrix B */
+ uint32_t col, i = 0U, row = numRowsB, colCnt; /* Loop counters */
+ arm_status status; /* Status of matrix multiplication */
+
+ q31_t in; /* Temporary variable to hold the input value */
+ q31_t inA1, inB1, inA2, inB2;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose: store B column-major in pState so the multiply loop
+ reads both operands sequentially */
+ do
+ {
+ /* The pointer px is set to starting address of column being processed */
+ px = pSrcBT + i;
+
+ /* Apply loop unrolling and exchange columns with row elements */
+ col = numColsB >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (col > 0U)
+ {
+ /* Read two elements from row */
+ in = read_q15x2_ia ((q15_t **) &pInB);
+
+ /* Unpack and store one element in destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+ *px = (q15_t) in;
+#else
+ *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update pointer px to point to next row of transposed matrix */
+ px += numRowsB;
+
+ /* Unpack and store second element in destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+ *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+#else
+ *px = (q15_t) in;
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update pointer px to point to next row of transposed matrix */
+ px += numRowsB;
+
+ /* Read two elements from row */
+ in = read_q15x2_ia ((q15_t **) &pInB);
+
+ /* Unpack and store one element in destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+ *px = (q15_t) in;
+#else
+ *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ px += numRowsB;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+#else
+ *px = (q15_t) in;
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ px += numRowsB;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ col = numColsB % 0x4U;
+
+ while (col > 0U)
+ {
+ /* Read and store input element in destination */
+ *px = *pInB++;
+
+ /* Update pointer px to point to next row of transposed matrix */
+ px += numRowsB;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ i++;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U);
+
+ /* Reset variables for usage in following multiplication process */
+ row = numRowsA;
+ i = 0U;
+ px = pDst->pData;
+
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* For every row wise process, column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, pIn2 pointer is set to starting address of transposed pSrcB data */
+ pInB = pSrcBT;
+
+ /* column loop */
+ do
+ {
+ /* Set variable sum, that acts as accumulator, to zero */
+ sum = 0;
+
+ /* Initiate pointer pInA to point to starting address of column being processed */
+ pInA = pSrcA->pData + i;
+
+ /* Apply loop unrolling and compute 2 MACs simultaneously. */
+ colCnt = numColsA >> 2U;
+
+ /* matrix multiplication */
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ /* read real and imag values from pSrcA and pSrcB buffer */
+ inA1 = read_q15x2_ia ((q15_t **) &pInA);
+ inB1 = read_q15x2_ia ((q15_t **) &pInB);
+
+ inA2 = read_q15x2_ia ((q15_t **) &pInA);
+ inB2 = read_q15x2_ia ((q15_t **) &pInB);
+
+ /* Multiply and Accumulates: dual 16x16 MAC into the 64-bit sum */
+ sum = __SMLALD(inA1, inB1, sum);
+ sum = __SMLALD(inA2, inB2, sum);
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* process remaining column samples */
+ colCnt = numColsA % 0x4U;
+
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+ sum += *pInA++ * *pInB++;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Saturate and store result in destination buffer */
+ *px = (q15_t) (__SSAT((sum >> 15), 16));
+ px++;
+
+ /* Decrement column loop counter */
+ col--;
+
+ } while (col > 0U);
+
+ i = i + numColsA;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U);
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+ q15_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
+ q15_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
+ q15_t *pInA = pSrcA->pData; /* Input data matrix pointer A of Q15 type */
+ q15_t *pInB = pSrcB->pData; /* Input data matrix pointer B of Q15 type */
+ q15_t *pOut = pDst->pData; /* Output data matrix pointer */
+ q15_t *px; /* Temporary output data matrix pointer */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
+ arm_status status; /* Status of matrix multiplication */
+ (void)pState; /* scratch buffer not needed without the DSP extension */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* Output pointer is set to starting address of the row being processed */
+ px = pOut + i;
+
+ /* For every row wise process, column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum = 0;
+
+ /* Initiate pointer pIn1 to point to starting address of pSrcA */
+ pIn1 = pInA;
+
+ /* Matrix A columns number of MAC operations are to be performed */
+ colCnt = numColsA;
+
+ /* matrix multiplication */
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ /* Perform multiply-accumulates; pIn2 strides down a column of B */
+ sum += (q31_t) * pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Convert result from 34.30 to 1.15 format and store saturated value in destination buffer */
+
+ /* Saturate and store result in destination buffer */
+ *px++ = (q15_t) __SSAT((sum >> 15), 16);
+
+ /* Decrement column loop counter */
+ col--;
+
+ /* Update pointer pIn2 to point to starting address of next column */
+ pIn2 = pInB + (numColsB - col);
+
+ } while (col > 0U);
+
+ /* Update pointer pSrcA to point to starting address of next row */
+ i = i + numColsB;
+ pInA = pInA + numColsA;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of MatrixMult group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q31.c
new file mode 100644
index 0000000000..67249deb1d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q31.c
@@ -0,0 +1,761 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_mult_q31.c
+ * Description: Q31 matrix multiplication
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixMult
+ @{
+ */
+
+/**
+ @brief Q31 matrix multiplication.
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The accumulator has a 2.62 format and maintains full precision of the intermediate
+ multiplication results but provides only a single guard bit. There is no saturation
+ on intermediate additions. Thus, if the accumulator overflows it wraps around and
+ distorts the result. The input signals should be scaled down to avoid intermediate
+ overflows. The input is thus scaled down by log2(numColsA) bits
+ to avoid overflows, as a total of numColsA additions are performed internally.
+ The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+ @remark
+ Refer to \ref arm_mat_mult_fast_q31() for a faster but less precise implementation of this function.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#define MATRIX_DIM2 2
+#define MATRIX_DIM3 3
+#define MATRIX_DIM4 4
+
+/**
+ * Specialized 2x2 Q31 matrix multiply using MVE gather loads.
+ * Each B column is gathered with word offsets {0, 2} (row stride of a 2x2
+ * matrix); each A row fits in one (partially used) 128-bit vector.
+ */
+__STATIC_INLINE arm_status arm_mat_mult_q31_2x2_mve(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint32x4_t vecColBOffs;
+ q31_t *pInA0 = pInA;
+ q31_t *pInA1 = pInA0 + MATRIX_DIM2;
+ q63_t acc0, acc1;
+ q31x4_t vecB, vecA0, vecA1;
+ /* enable predication to disable half of vector elements */
+ mve_pred16_t p0 = vctp32q(MATRIX_DIM2);
+
+ /* gather offsets {0, 1, ...} scaled to {0, 2, ...}: one element per B row */
+ vecColBOffs = vidupq_u32((uint32_t)0, 1);
+ vecColBOffs = vecColBOffs * MATRIX_DIM2;
+
+ pInB = pSrcB->pData;
+
+ /* load 1st B column (partial load) */
+ vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
+
+ /* load A rows */
+ vecA0 = vldrwq_s32(pInA0);
+ vecA1 = vldrwq_s32(pInA1);
+
+ acc0 = vrmlaldavhq(vecA0, vecB);
+ acc1 = vrmlaldavhq(vecA1, vecB);
+
+ /* vrmlaldavh keeps the high part of the dot product; shifting by a further
+ 23 bits completes the scaling back to 1.31 (23 + 8 = 31 — see CMSIS docs) */
+ acc0 = asrl(acc0, 23);
+ acc1 = asrl(acc1, 23);
+
+ /* store first output column (stride = row length of destination) */
+ pOut[0 * MATRIX_DIM2] = (q31_t) acc0;
+ pOut[1 * MATRIX_DIM2] = (q31_t) acc1;
+ pOut++;
+
+ /* move to next B column */
+ pInB = pInB + 1;
+
+ vecB = vldrwq_gather_shifted_offset_z_s32(pInB, vecColBOffs, p0);
+
+ acc0 = vrmlaldavhq(vecA0, vecB);
+ acc1 = vrmlaldavhq(vecA1, vecB);
+
+ acc0 = asrl(acc0, 23);
+ acc1 = asrl(acc1, 23);
+
+ pOut[0 * MATRIX_DIM2] = (q31_t) acc0;
+ pOut[1 * MATRIX_DIM2] = (q31_t) acc1;
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+
+
+/**
+ * Specialized 3x3 Q31 matrix multiply using MVE gather loads.
+ * One full B column is gathered per iteration (word offsets {0, 3, 6},
+ * 4th lane predicated off) and dotted against each of the three A rows.
+ */
+__STATIC_INLINE arm_status arm_mat_mult_q31_3x3_mve(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pMatB = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pMatA = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pDstBase = pDst->pData; /* output data matrix pointer */
+ q31_t *pRow0 = pMatA;
+ q31_t *pRow1 = pRow0 + MATRIX_DIM3;
+ q31_t *pRow2 = pRow1 + MATRIX_DIM3;
+ q63_t dot0, dot1, dot2;
+ q31x4_t vecColB, vecRow;
+ uint32_t colIdx;
+ /* mask off the unused 4th lane of each 128-bit vector */
+ mve_pred16_t lanePred = vctp32q(MATRIX_DIM3);
+ /* gather offsets {0, 3, 6, x}: one element per row of B */
+ uint32x4_t vecBOffs = vidupq_u32((uint32_t)0, 1) * MATRIX_DIM3;
+
+ for (colIdx = 0; colIdx < MATRIX_DIM3; colIdx++)
+ {
+ /* fetch one full column of B (predicated gather) */
+ vecColB = vldrwq_gather_shifted_offset_z_s32(&pMatB[colIdx], vecBOffs, lanePred);
+
+ vecRow = vldrwq_s32(pRow0);
+ dot0 = vrmlaldavhq(vecRow, vecColB);
+ vecRow = vldrwq_s32(pRow1);
+ dot1 = vrmlaldavhq(vecRow, vecColB);
+ vecRow = vldrwq_s32(pRow2);
+ dot2 = vrmlaldavhq(vecRow, vecColB);
+
+ /* shift by 23 completes the 31-bit scaling back to 1.31 on top of the
+ high-part accumulation performed by vrmlaldavh */
+ pDstBase[colIdx + 0 * MATRIX_DIM3] = (q31_t) asrl(dot0, 23);
+ pDstBase[colIdx + 1 * MATRIX_DIM3] = (q31_t) asrl(dot1, 23);
+ pDstBase[colIdx + 2 * MATRIX_DIM3] = (q31_t) asrl(dot2, 23);
+ }
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+/**
+ * Specialized 4x4 Q31 matrix multiply using MVE gather loads.
+ * A full B column fits one 128-bit vector (word offsets {0, 4, 8, 12},
+ * no predication needed); it is dotted against each of the four A rows.
+ */
+__STATIC_INLINE arm_status arm_mat_mult_q31_4x4_mve(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pMatB = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pMatA = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pDstBase = pDst->pData; /* output data matrix pointer */
+ q31_t *pRow0 = pMatA;
+ q31_t *pRow1 = pRow0 + MATRIX_DIM4;
+ q31_t *pRow2 = pRow1 + MATRIX_DIM4;
+ q31_t *pRow3 = pRow2 + MATRIX_DIM4;
+ q63_t dot0, dot1, dot2, dot3;
+ q31x4_t vecColB, vecRow;
+ uint32_t colIdx;
+ /* gather offsets {0, 4, 8, 12}: one element per row of B */
+ uint32x4_t vecBOffs = vidupq_u32((uint32_t)0, 4);
+
+ for (colIdx = 0; colIdx < MATRIX_DIM4; colIdx++)
+ {
+ /* fetch one full column of B */
+ vecColB = vldrwq_gather_shifted_offset_s32(&pMatB[colIdx], vecBOffs);
+
+ vecRow = vldrwq_s32(pRow0);
+ dot0 = vrmlaldavhq(vecRow, vecColB);
+ vecRow = vldrwq_s32(pRow1);
+ dot1 = vrmlaldavhq(vecRow, vecColB);
+ vecRow = vldrwq_s32(pRow2);
+ dot2 = vrmlaldavhq(vecRow, vecColB);
+ vecRow = vldrwq_s32(pRow3);
+ dot3 = vrmlaldavhq(vecRow, vecColB);
+
+ /* shift by 23 completes the 31-bit scaling back to 1.31 on top of the
+ high-part accumulation performed by vrmlaldavh */
+ pDstBase[colIdx + 0 * MATRIX_DIM4] = (q31_t) asrl(dot0, 23);
+ pDstBase[colIdx + 1 * MATRIX_DIM4] = (q31_t) asrl(dot1, 23);
+ pDstBase[colIdx + 2 * MATRIX_DIM4] = (q31_t) asrl(dot2, 23);
+ pDstBase[colIdx + 3 * MATRIX_DIM4] = (q31_t) asrl(dot3, 23);
+ }
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+/**
+ * General Q31 matrix multiply, MVE variant.
+ * Dispatches 1x1..4x4 square cases to specialized routines, then processes
+ * the general case in 4-row bands (tail rows handled one at a time).
+ * B columns are read through a gather whose offset vector advances 4 rows
+ * per inner-loop iteration; the tail of each dot product is merged via
+ * lane predication.
+ */
+arm_status arm_mat_mult_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t const *pInB = (q31_t const *)pSrcB->pData; /* input data matrix pointer B */
+ q31_t const *pInA = (q31_t const *)pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ q31_t *px; /* Temporary output data matrix pointer */
+ uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+ uint16_t col, i = 0U, row = numRowsA; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+ uint32x4_t vecOffs, vecColBOffs;
+ uint32_t blkCnt, rowCnt; /* loop counters */
+
+ #ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* small squared matrix specialized routines */
+ if(numRowsA == numColsB && numColsB == numColsA) {
+ if (numRowsA == 1)
+ {
+ /* 1x1 case: single product, rescaled from 2.62 to 1.31 */
+ q63_t sum = (q63_t) *pInA * *pInB;
+ pOut[0] = (q31_t)(sum >> 31);
+ return (ARM_MATH_SUCCESS);
+ }
+ else if(numRowsA == 2)
+ return arm_mat_mult_q31_2x2_mve(pSrcA, pSrcB, pDst);
+ else if(numRowsA == 3)
+ return arm_mat_mult_q31_3x3_mve(pSrcA, pSrcB, pDst);
+ else if (numRowsA == 4)
+ return arm_mat_mult_q31_4x4_mve(pSrcA, pSrcB, pDst);
+ }
+
+ /* gather offsets {0, numColsB, 2*numColsB, 3*numColsB}:
+ four consecutive rows of one B column */
+ vecColBOffs = vidupq_u32((uint32_t)0, 1);
+ vecColBOffs = vecColBOffs * (uint32_t) (numColsB);
+
+ /*
+ * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
+ */
+
+ /*
+ * row loop: process 4 output rows per iteration
+ */
+ rowCnt = row >> 2;
+ while (rowCnt > 0U)
+ {
+ /*
+ * Output pointer is set to starting address of the row being processed
+ */
+ px = pOut + i;
+ i = i + 4 * numColsB;
+ /*
+ * For every row wise process, the column loop counter is to be initiated
+ */
+ col = numColsB;
+ /*
+ * For every row wise process, the pInB pointer is set
+ * to the starting address of the pSrcB data
+ */
+ pInB = (q31_t const *)pSrcB->pData;
+ /*
+ * column loop
+ */
+ while (col > 0U)
+ {
+ /*
+ * generate 4 columns elements
+ */
+ /*
+ * Matrix A columns number of MAC operations are to be performed
+ */
+
+ q31_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec;
+ q31_t const *pInA0 = pInA;
+ q31_t const *pInA1 = pInA0 + numColsA;
+ q31_t const *pInA2 = pInA1 + numColsA;
+ q31_t const *pInA3 = pInA2 + numColsA;
+ q63_t acc0, acc1, acc2, acc3;
+
+ acc0 = 0LL;
+ acc1 = 0LL;
+ acc2 = 0LL;
+ acc3 = 0LL;
+
+ pSrcA0Vec = (q31_t const *) pInA0;
+ pSrcA1Vec = (q31_t const *) pInA1;
+ pSrcA2Vec = (q31_t const *) pInA2;
+ pSrcA3Vec = (q31_t const *) pInA3;
+
+ /* restart the B-column gather offsets for this output column */
+ vecOffs = vecColBOffs;
+
+ /* process 1 x 4 block output */
+ blkCnt = numColsA >> 2;
+ while (blkCnt > 0U)
+ {
+ q31x4_t vecB, vecA;
+
+ vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
+ /* move Matrix B read offsets, 4 rows down */
+ vecOffs = vecOffs + (uint32_t) (numColsB * 4);
+
+ vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
+ acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+ vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 4;
+ acc1 = vrmlaldavhaq(acc1, vecA, vecB);
+ vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 4;
+ acc2 = vrmlaldavhaq(acc2, vecA, vecB);
+ vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 4;
+ acc3 = vrmlaldavhaq(acc3, vecA, vecB);
+ blkCnt--;
+ }
+
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numColsA & 3;
+ if (blkCnt > 0U)
+ {
+ /* inactive lanes are masked out of the gather; the A loads are
+ full-width but only the predicated B lanes contribute */
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ q31x4_t vecB, vecA;
+
+ vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
+ //vecOffs = vecOffs + (uint32_t) (numColsB * 4);
+
+ vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
+ acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+ vecA = vld1q(pSrcA1Vec); pSrcA1Vec += 4;
+ acc1 = vrmlaldavhaq(acc1, vecA, vecB);
+ vecA = vld1q(pSrcA2Vec); pSrcA2Vec += 4;
+ acc2 = vrmlaldavhaq(acc2, vecA, vecB);
+ vecA = vld1q(pSrcA3Vec); pSrcA3Vec += 4;
+ acc3 = vrmlaldavhaq(acc3, vecA, vecB);
+ }
+
+ /* shift by 23 completes the 31-bit scaling back to 1.31 on top of
+ the high-part accumulation performed by vrmlaldavha */
+ acc0 = asrl(acc0, 23);
+ acc1 = asrl(acc1, 23);
+ acc2 = asrl(acc2, 23);
+ acc3 = asrl(acc3, 23);
+
+ /* one element of each of the 4 output rows */
+ px[0] = (q31_t) acc0;
+ px[1 * numColsB] = (q31_t) acc1;
+ px[2 * numColsB] = (q31_t) acc2;
+ px[3 * numColsB] = (q31_t) acc3;
+ px++;
+ /*
+ * Decrement the column loop counter
+ */
+ col--;
+ /*
+ * Update the pointer pInB to point to the starting address of the next column
+ */
+ pInB = (q31_t const *)pSrcB->pData + (numColsB - col);
+ }
+
+ /*
+ * Update the pointer pInA to point to the starting address of the next row
+ */
+ pInA += (numColsA * 4);
+ /*
+ * Decrement the row loop counter
+ */
+ rowCnt --;
+
+ }
+ /* remaining 1..3 rows, processed one at a time with the same scheme */
+ rowCnt = row & 3;
+ while (rowCnt > 0U)
+ {
+ /*
+ * Output pointer is set to starting address of the row being processed
+ */
+ px = pOut + i;
+ i = i + numColsB;
+ /*
+ * For every row wise process, the column loop counter is to be initiated
+ */
+ col = numColsB;
+ /*
+ * For every row wise process, the pInB pointer is set
+ * to the starting address of the pSrcB data
+ */
+ pInB = (q31_t const *)pSrcB->pData;
+ /*
+ * column loop
+ */
+ while (col > 0U)
+ {
+ /*
+ * generate 4 columns elements
+ */
+ /*
+ * Matrix A columns number of MAC operations are to be performed
+ */
+
+ q31_t const *pSrcA0Vec;
+ q31_t const *pInA0 = pInA;
+ q63_t acc0;
+
+ acc0 = 0LL;
+
+
+ pSrcA0Vec = (q31_t const *) pInA0;
+
+ vecOffs = vecColBOffs;
+
+ /* process 1 x 4 block output */
+ blkCnt = numColsA >> 2;
+ while (blkCnt > 0U)
+ {
+ q31x4_t vecB, vecA;
+
+ vecB = vldrwq_gather_shifted_offset(pInB, vecOffs);
+ /* move Matrix B read offsets, 4 rows down */
+ vecOffs = vecOffs + (uint32_t) (numColsB * 4);
+
+ vecA = vld1q(pSrcA0Vec); pSrcA0Vec += 4;
+ acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+
+ blkCnt--;
+ }
+
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numColsA & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ q31x4_t vecB, vecA;
+
+ vecB = vldrwq_gather_shifted_offset_z(pInB, vecOffs, p0);
+ //vecOffs = vecOffs + (uint32_t) (numColsB * 4);
+
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vrmlaldavhaq(acc0, vecA, vecB);
+
+ }
+
+ /* convert accumulator back to 1.31 (23 + 8 = 31) */
+ acc0 = asrl(acc0, 23);
+
+
+ px[0] = (q31_t) acc0;
+ px++;
+ /*
+ * Decrement the column loop counter
+ */
+ col--;
+ /*
+ * Update the pointer pInB to point to the starting address of the next column
+ */
+ pInB = (q31_t const *)pSrcB->pData + (numColsB - col);
+ }
+
+ /*
+ * Update the pointer pInA to point to the starting address of the next row
+ */
+ pInA += numColsA;
+ /*
+ * Decrement the row loop counter
+ */
+ rowCnt--;
+ }
+
+ /*
+ * set status as ARM_MATH_SUCCESS
+ */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+/**
+ * General Q31 matrix multiply, scalar (non-MVE) variant.
+ * Triple loop over rows of A, columns of B, and the shared inner dimension.
+ * Accumulation is in a 64-bit (2.62) accumulator with no intermediate
+ * saturation; callers are expected to pre-scale inputs (see file header).
+ * NOTE(review): the outer do/while executes once even if numRowsA is 0 —
+ * presumably callers guarantee non-empty matrices; confirm before relying
+ * on empty-input behavior.
+ */
+arm_status arm_mat_mult_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pIn1 = pSrcA->pData; /* Input data matrix pointer A */
+ q31_t *pIn2 = pSrcB->pData; /* Input data matrix pointer B */
+ q31_t *pInA = pSrcA->pData; /* Input data matrix pointer A */
+ q31_t *pInB = pSrcB->pData; /* Input data matrix pointer B */
+ q31_t *pOut = pDst->pData; /* Output data matrix pointer */
+ q31_t *px; /* Temporary output data matrix pointer */
+ q63_t sum; /* Accumulator */
+ uint16_t numRowsA = pSrcA->numRows; /* Number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* Number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* Number of columns of input matrix A */
+ uint32_t col, i = 0U, row = numRowsA, colCnt; /* Loop counters */
+ arm_status status; /* Status of matrix multiplication */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* Output pointer is set to starting address of row being processed */
+ px = pOut + i;
+
+ /* For every row wise process, column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, pIn2 pointer is set to starting address of pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum = 0;
+
+ /* Initialize pointer pIn1 to point to starting address of column being processed */
+ pIn1 = pInA;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 MACs at a time. */
+ colCnt = numColsA >> 2U;
+
+ /* matrix multiplication */
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ /* Perform the multiply-accumulates; pIn2 walks down a B column
+ (stride numColsB), pIn1 walks along an A row */
+ sum += (q63_t) *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += (q63_t) *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += (q63_t) *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ sum += (q63_t) *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining MACs */
+ colCnt = numColsA % 0x4U;
+
+#else
+
+ /* Initialize colCnt with number of columns */
+ colCnt = numColsA;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (colCnt > 0U)
+ {
+ /* c(m,n) = a(1,1) * b(1,1) + a(1,2) * b(2,1) + .... + a(m,p) * b(p,n) */
+
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) *pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement loop counter */
+ colCnt--;
+ }
+
+ /* Convert result from 2.62 to 1.31 format and store in destination buffer */
+ *px++ = (q31_t) (sum >> 31);
+
+ /* Decrement column loop counter */
+ col--;
+
+ /* Update pointer pIn2 to point to starting address of next column */
+ pIn2 = pInB + (numColsB - col);
+
+ } while (col > 0U);
+
+ /* Update pointer pInA to point to starting address of next row */
+ i = i + numColsB;
+ pInA = pInA + numColsA;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U);
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of MatrixMult group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q7.c
new file mode 100644
index 0000000000..55f9a8e99f
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_mult_q7.c
@@ -0,0 +1,678 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_mult_q7.c
+ * Description:  Q7 matrix multiplication
+ *
+ * $Date: 23 April 2021
+ *
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixMult
+ @{
+ */
+
+/**
+ * @brief Q7 matrix multiplication
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @param[in] *pState points to the array for storing intermediate results (Unused in some versions)
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 32-bit internal accumulator saturated to 1.7 format.
+ *
+ *
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+/**
+ * Specialized 2x2 Q7 matrix multiply using MVE gather loads.
+ * Each B column is gathered with byte offsets {0, 2} (row stride of a 2x2
+ * matrix); only 2 of the 16 byte lanes are active.
+ */
+__STATIC_FORCEINLINE arm_status arm_mat_mult_q7_2x2_mve(
+ const arm_matrix_instance_q7 * pSrcA,
+ const arm_matrix_instance_q7 * pSrcB,
+ arm_matrix_instance_q7 * pDst)
+{
+ const uint32_t MATRIX_DIM = 2;
+ q7_t const *pInB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
+ q7_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q7_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint8x16_t vecColBOffs;
+ q7_t *pInA0 = pInA;
+ q7_t *pInA1 = pInA0 + MATRIX_DIM;
+ q31_t acc0, acc1;
+ q7x16_t vecB, vecA0, vecA1;
+ /* enable only the first MATRIX_DIM byte lanes */
+ mve_pred16_t p0 = vctp8q(MATRIX_DIM);
+
+ vecColBOffs = vidupq_u8((uint32_t)0, 2); /* MATRIX_DIM */
+
+ pInB = pSrcB->pData;
+
+ /* load 1st B column (predicated gather) */
+ vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
+
+ /* load A rows; reused for both output columns */
+ vecA0 = vldrbq_s8(pInA0);
+ vecA1 = vldrbq_s8(pInA1);
+
+ acc0 = vmladavq_s8(vecA0, vecB);
+ acc1 = vmladavq_s8(vecA1, vecB);
+
+ /* rescale from 2.14 accumulator to 1.7 with saturation */
+ pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
+ pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
+ pOut++;
+
+ /* move to next B column */
+ pInB = pInB + 1;
+
+ vecB = vldrbq_gather_offset_z(pInB, vecColBOffs, p0);
+
+ acc0 = vmladavq_s8(vecA0, vecB);
+ acc1 = vmladavq_s8(vecA1, vecB);
+
+ pOut[0 * MATRIX_DIM] = (q7_t) __SSAT(acc0 >> 7, 8);
+ pOut[1 * MATRIX_DIM] = (q7_t) __SSAT(acc1 >> 7, 8);
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+
+/**
+ * Specialized 3x3 Q7 matrix multiply using MVE gather loads.
+ * Each B column is gathered with byte offsets {0, 3, 6}; the three A rows
+ * are loaded once and reused for every output column.
+ */
+__STATIC_FORCEINLINE arm_status arm_mat_mult_q7_3x3_mve(
+ const arm_matrix_instance_q7 * pSrcA,
+ const arm_matrix_instance_q7 * pSrcB,
+ arm_matrix_instance_q7 * pDst)
+{
+ const uint8_t MATRIX_DIM = 3;
+ q7_t const *pMatB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
+ q7_t *pMatA = pSrcA->pData; /* input data matrix pointer A */
+ q7_t *pDstBase = pDst->pData; /* output data matrix pointer */
+ q7_t *pRow0 = pMatA;
+ q7_t *pRow1 = pRow0 + MATRIX_DIM;
+ q7_t *pRow2 = pRow1 + MATRIX_DIM;
+ q31_t dot0, dot1, dot2;
+ q7x16_t vecColB, vecRow0, vecRow1, vecRow2;
+ uint32_t colIdx;
+ /* enable only the first MATRIX_DIM byte lanes */
+ mve_pred16_t lanePred = vctp8q(MATRIX_DIM);
+ /* gather offsets {0, 3, 6, ...}: one element per row of B */
+ uint8x16_t vecBOffs = vidupq_u8((uint32_t)0, 1) * MATRIX_DIM;
+
+ /* the three A rows are reused for every output column */
+ vecRow0 = vldrbq_s8(pRow0);
+ vecRow1 = vldrbq_s8(pRow1);
+ vecRow2 = vldrbq_s8(pRow2);
+
+ for (colIdx = 0; colIdx < MATRIX_DIM; colIdx++)
+ {
+ /* fetch one full column of B (predicated gather) */
+ vecColB = vldrbq_gather_offset_z(&pMatB[colIdx], vecBOffs, lanePred);
+
+ dot0 = vmladavq_s8(vecRow0, vecColB);
+ dot1 = vmladavq_s8(vecRow1, vecColB);
+ dot2 = vmladavq_s8(vecRow2, vecColB);
+
+ /* rescale from 2.14 accumulator to 1.7 with saturation */
+ pDstBase[colIdx + 0 * MATRIX_DIM] = (q7_t) __SSAT(dot0 >> 7, 8);
+ pDstBase[colIdx + 1 * MATRIX_DIM] = (q7_t) __SSAT(dot1 >> 7, 8);
+ pDstBase[colIdx + 2 * MATRIX_DIM] = (q7_t) __SSAT(dot2 >> 7, 8);
+ }
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+
+/**
+ * Specialized 4x4 Q7 matrix multiply using MVE gather loads.
+ * Each B column is gathered with byte offsets {0, 4, 8, 12}; the four A
+ * rows are loaded once and reused for every output column.
+ */
+__STATIC_FORCEINLINE arm_status arm_mat_mult_q7_4x4_mve(
+ const arm_matrix_instance_q7 * pSrcA,
+ const arm_matrix_instance_q7 * pSrcB,
+ arm_matrix_instance_q7 * pDst)
+{
+ const uint32_t MATRIX_DIM = 4;
+ q7_t const *pMatB = (q7_t const *)pSrcB->pData; /* input data matrix pointer B */
+ q7_t *pMatA = pSrcA->pData; /* input data matrix pointer A */
+ q7_t *pDstBase = pDst->pData; /* output data matrix pointer */
+ q7_t *pRow0 = pMatA;
+ q7_t *pRow1 = pRow0 + MATRIX_DIM;
+ q7_t *pRow2 = pRow1 + MATRIX_DIM;
+ q7_t *pRow3 = pRow2 + MATRIX_DIM;
+ q31_t dot0, dot1, dot2, dot3;
+ q7x16_t vecColB, vecRow0, vecRow1, vecRow2, vecRow3;
+ uint32_t colIdx;
+ /* enable only the first MATRIX_DIM byte lanes */
+ mve_pred16_t lanePred = vctp8q(MATRIX_DIM);
+ /* gather offsets {0, 4, 8, 12, ...}: one element per row of B */
+ uint8x16_t vecBOffs = vidupq_u8((uint32_t)0, 4);
+
+ /* the four A rows are reused for every output column */
+ vecRow0 = vldrbq_s8(pRow0);
+ vecRow1 = vldrbq_s8(pRow1);
+ vecRow2 = vldrbq_s8(pRow2);
+ vecRow3 = vldrbq_s8(pRow3);
+
+ for (colIdx = 0; colIdx < MATRIX_DIM; colIdx++)
+ {
+ /* fetch one full column of B (predicated gather) */
+ vecColB = vldrbq_gather_offset_z(&pMatB[colIdx], vecBOffs, lanePred);
+
+ dot0 = vmladavq_s8(vecRow0, vecColB);
+ dot1 = vmladavq_s8(vecRow1, vecColB);
+ dot2 = vmladavq_s8(vecRow2, vecColB);
+ dot3 = vmladavq_s8(vecRow3, vecColB);
+
+ /* rescale from 2.14 accumulator to 1.7 with saturation */
+ pDstBase[colIdx + 0 * MATRIX_DIM] = (q7_t) __SSAT(dot0 >> 7, 8);
+ pDstBase[colIdx + 1 * MATRIX_DIM] = (q7_t) __SSAT(dot1 >> 7, 8);
+ pDstBase[colIdx + 2 * MATRIX_DIM] = (q7_t) __SSAT(dot2 >> 7, 8);
+ pDstBase[colIdx + 3 * MATRIX_DIM] = (q7_t) __SSAT(dot3 >> 7, 8);
+ }
+ /*
+ * Return to application
+ */
+ return (ARM_MATH_SUCCESS);
+}
+
+arm_status arm_mat_mult_q7(
+ const arm_matrix_instance_q7 * pSrcA,
+ const arm_matrix_instance_q7 * pSrcB,
+ arm_matrix_instance_q7 * pDst,
+ q7_t * pState)
+{
+ q7_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q7 type */
+ q7_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q7 type */
+ q7_t *pInA2;
+ q7_t *pInB2;
+ q7_t *px; /* Temporary output data matrix pointer */
+ q7_t *px2; /* Temporary output data matrix pointer */
+ uint32_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ uint32_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ uint32_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+ uint32_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
+ uint32_t col, i = 0u, j, row = numRowsB; /* loop counters */
+ q7_t *pSrcBT = pState; /* input data matrix pointer for transpose */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+ arm_matrix_instance_q7 BT;
+
+
+ #ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcB->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+ /* small squared matrix specialized routines */
+ if(numRowsA == numColsB && numColsB == numColsA) {
+ if(numRowsA == 2)
+ return arm_mat_mult_q7_2x2_mve(pSrcA, pSrcB, pDst);
+ else if(numRowsA == 3)
+ return arm_mat_mult_q7_3x3_mve(pSrcA, pSrcB, pDst);
+ else if (numRowsA == 4)
+ return arm_mat_mult_q7_4x4_mve(pSrcA, pSrcB, pDst);
+ }
+ /*
+ * Matrix transpose
+ */
+
+ BT.numRows = numColsB;
+ BT.numCols = numRowsB;
+ BT.pData = pSrcBT;
+
+ arm_mat_trans_q7(pSrcB, &BT);
+
+ /*
+ * Reset the variables for the usage in the following multiplication process
+ */
+ i = 0;
+ row = numRowsA >> 1;
+ px = pDst->pData;
+ px2 = px + numColsB;
+
+ /*
+ * The following loop performs the dot-product of each row in pSrcA with each column in pSrcB
+ */
+
+ /*
+ * row loop
+ */
+ while (row > 0u)
+ {
+ /*
+ * For every row wise process, the column loop counter is to be initiated
+ */
+ col = numColsB >> 1;
+ /*
+ * For every row wise process, the pIn2 pointer is set
+ * to the starting address of the transposed pSrcB data
+ */
+ pInB = pSrcBT;
+ pInB2 = pInB + numRowsB;
+ j = 0;
+
+ /*
+ * column loop
+ */
+ while (col > 0u)
+ {
+ q7_t const *pSrcAVec, *pSrcBVec, *pSrcA2Vec, *pSrcB2Vec;
+ q7x16_t vecA, vecA2, vecB, vecB2;
+ q31_t acc0, acc1, acc2, acc3;
+
+ /*
+ * Initiate the pointer pIn1 to point to the starting address of the column being processed
+ */
+ pInA = pSrcA->pData + i;
+ pInA2 = pInA + numColsA;
+ pInB = pSrcBT + j;
+ pInB2 = pInB + numRowsB;
+
+ pSrcAVec = (q7_t const *) pInA;
+ pSrcA2Vec = (q7_t const *)pInA2;
+ pSrcBVec = (q7_t const *) pInB;
+ pSrcB2Vec = (q7_t const *)pInB2;
+
+ acc0 = 0L;
+ acc1 = 0L;
+ acc2 = 0L;
+ acc3 = 0L;
+
+ vecA = vld1q(pSrcAVec);
+ pSrcAVec += 16;
+
+ blkCnt = numColsA >> 4;
+ while (blkCnt > 0U)
+ {
+ vecB = vld1q(pSrcBVec);
+ pSrcBVec += 16;
+ acc0 = vmladavaq_s8(acc0, vecA, vecB);
+ vecA2 = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 16;
+ acc1 = vmladavaq_s8(acc1, vecA2, vecB);
+ vecB2 = vld1q(pSrcB2Vec);
+ pSrcB2Vec += 16;
+ acc2 = vmladavaq_s8(acc2, vecA, vecB2);
+ vecA = vld1q(pSrcAVec);
+ pSrcAVec += 16;
+ acc3 = vmladavaq_s8(acc3, vecA2, vecB2);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numColsA & 0xF;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp8q(blkCnt);
+ vecB = vld1q(pSrcBVec);
+ acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
+ vecA2 = vld1q(pSrcA2Vec);
+ acc1 = vmladavaq_p_s8(acc1, vecA2, vecB, p0);
+ vecB2 = vld1q(pSrcB2Vec);
+ acc2 = vmladavaq_p_s8(acc2, vecA, vecB2, p0);
+ vecA = vld1q(pSrcAVec);
+ acc3 = vmladavaq_p_s8(acc3, vecA2, vecB2, p0);
+ }
+
+ *px++ = (q7_t) __SSAT(acc0 >> 7, 8);
+ *px++ = (q7_t) __SSAT(acc2 >> 7, 8);
+ *px2++ = (q7_t) __SSAT(acc1 >> 7, 8);
+ *px2++ = (q7_t) __SSAT(acc3 >> 7, 8);
+ j += numRowsB * 2;
+ /*
+ * Decrement the column loop counter
+ */
+ col--;
+
+ }
+
+ i = i + numColsA * 2;
+ px = px2 + (numColsB & 1u);
+ px2 = px + numColsB;
+ /*
+ * Decrement the row loop counter
+ */
+ row--;
+ }
+
+ /*
+ * Compute remaining row and/or column below
+ */
+
+ if (numColsB & 1u)
+ {
+ row = numRowsA & (~0x1); //avoid redundant computation
+ px = pDst->pData + numColsB - 1;
+ i = 0;
+
+ /*
+ * row loop
+ */
+ while (row > 0)
+ {
+ q7_t const *pSrcAVec, *pSrcBVec;
+ q7x16_t vecA, vecB;
+ q63_t acc0;
+
+ /*
+ * point to last column in matrix B
+ */
+ pInB = pSrcBT + numRowsB * (numColsB - 1);
+ pInA = pSrcA->pData + i;
+
+ pSrcAVec = (q7_t const *) pInA;
+ pSrcBVec = (q7_t const *) pInB;
+
+ acc0 = 0LL;
+ blkCnt = (numColsA) >> 4;
+ while (blkCnt > 0U)
+ {
+ vecA = vld1q(pSrcAVec);
+ pSrcAVec += 16;
+ vecB = vld1q(pSrcBVec);
+ pSrcBVec += 16;
+ acc0 = vmladavaq_s8(acc0, vecA, vecB);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numColsA & 0xF;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp8q(blkCnt);
+ vecA = vld1q(pSrcAVec);
+ vecB = vld1q(pSrcBVec);
+ acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
+ }
+
+ *px = (q7_t) __SSAT(acc0 >> 7, 8);
+
+ px += numColsB;
+
+ i += numColsA;
+ /*
+ * Decrement the row loop counter
+ */
+ row--;
+ }
+ }
+
+ if (numRowsA & 1u)
+ {
+ col = numColsB;
+ i = 0u;
+ /*
+ * point to last row in output matrix
+ */
+ px = pDst->pData + (numColsB) * (numRowsA - 1);
+ /*
+ * col loop
+ */
+ while (col > 0)
+ {
+ q7_t const *pSrcAVec, *pSrcBVec;
+ q7x16_t vecA, vecB;
+ q63_t acc0;
+
+ /*
+ * point to last row in matrix A
+ */
+ pInA = pSrcA->pData + (numRowsA - 1) * numColsA;
+ pInB = pSrcBT + i;
+
+ /*
+ * Set the variable sum, that acts as accumulator, to zero
+ */
+ pSrcAVec = (q7_t const *) pInA;
+ pSrcBVec = (q7_t const *) pInB;
+ acc0 = 0LL;
+
+ blkCnt = (numColsA) >> 4;
+ while (blkCnt > 0U)
+ {
+ vecA = vld1q(pSrcAVec);
+ pSrcAVec += 16;
+ vecB = vld1q(pSrcBVec);
+ pSrcBVec += 16;
+ acc0 = vmladavaq_s8(acc0, vecA, vecB);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numColsA & 0xF;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp8q(blkCnt);
+ vecA = vld1q(pSrcAVec);
+ vecB = vld1q(pSrcBVec);
+ acc0 = vmladavaq_p_s8(acc0, vecA, vecB, p0);
+ }
+
+ *px++ = (q7_t) __SSAT(acc0 >> 7, 8);
+
+ i += numColsA;
+
+ /*
+ * Decrement the col loop counter
+ */
+ col--;
+ }
+ }
+ /*
+ * Return to application
+ */
+ status = ARM_MATH_SUCCESS;
+ }
+ return(status);
+}
+#else
+arm_status arm_mat_mult_q7(const arm_matrix_instance_q7 *pSrcA, const arm_matrix_instance_q7 *pSrcB, arm_matrix_instance_q7 *pDst, q7_t *pState)
+{
+    q31_t sum;                                     /* accumulator */
+    q7_t *pIn1 = pSrcA->pData;                     /* input data matrix pointer A */
+    q7_t *pIn2 = pSrcB->pData;                     /* input data matrix pointer B */
+    q7_t *pInA = pSrcA->pData;                     /* input data matrix pointer A of Q7 type */
+    q7_t *pInB = pSrcB->pData;                     /* input data matrix pointer B of Q7 type */
+    q7_t *pOut = pDst->pData;                      /* output data matrix pointer */
+    q7_t *px;                                      /* Temporary output data matrix pointer */
+    uint16_t numColsB = pSrcB->numCols;            /* number of columns of input matrix B */
+    uint16_t numColsA = pSrcA->numCols;            /* number of columns of input matrix A */
+    uint16_t numRowsA = pSrcA->numRows;            /* number of rows of input matrix A */
+    uint16_t col, i = 0U, row = numRowsA, colCnt;  /* loop counters */
+    arm_status status;                             /* status of matrix multiplication */
+
+    (void)pState;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+    /* Check for matrix mismatch condition */
+    if ((pSrcA->numCols != pSrcB->numRows) ||
+        (pSrcA->numRows != pDst->numRows) ||
+        (pSrcB->numCols != pDst->numCols) )
+    {
+        /* Set status as ARM_MATH_SIZE_MISMATCH */
+        status = ARM_MATH_SIZE_MISMATCH;
+    }
+    else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+    {
+        /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+        /* row loop */
+        do {
+            /* Output pointer is set to starting address of the row being processed */
+            px = pOut + i;
+
+            /* For every row wise process, the column loop counter is to be initiated */
+            col = numColsB;
+
+            /* For every row wise process, the pIn2 pointer is set
+            ** to the starting address of the pSrcB data */
+            pIn2 = pSrcB->pData;
+
+            /* column loop */
+            do {
+                /* Set the variable sum, that acts as accumulator, to zero */
+                sum = 0;
+
+                /* Initiate the pointer pIn1 to point to the starting address of pSrcA */
+                pIn1 = pInA;
+
+                /* Matrix A columns number of MAC operations are to be performed */
+                colCnt = numColsA;
+
+                /* matrix multiplication */
+                while (colCnt > 0U) {
+                    /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+                    /* Perform the multiply-accumulates */
+                    sum += (q31_t)*pIn1++ * *pIn2;
+                    pIn2 += numColsB;
+
+                    /* Decrement the loop counter */
+                    colCnt--;
+                }
+
+                /* Convert the Q7*Q7 accumulator result down to 1.7 format (arithmetic shift right by 7) */
+                /* Saturate and store the result in the destination buffer */
+                *px++ = (q7_t)__SSAT((sum >> 7), 8);
+
+                /* Decrement the column loop counter */
+                col--;
+
+                /* Update the pointer pIn2 to point to the starting address of the next column */
+                pIn2 = pInB + (numColsB - col);
+
+            } while (col > 0U);
+
+            /* Update the pointer pSrcA to point to the starting address of the next row */
+            i = i + numColsB;
+            pInA = pInA + numColsA;
+
+            /* Decrement the row loop counter */
+            row--;
+
+        } while (row > 0U);
+
+        /* set status as ARM_MATH_SUCCESS */
+        status = ARM_MATH_SUCCESS;
+    }
+
+    /* Return to application */
+    return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of MatrixMult group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_f16.c
new file mode 100644
index 0000000000..1e29715a1a
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_f16.c
@@ -0,0 +1,208 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_scale_f16.c
+ * Description: Multiplies a floating-point matrix by a scalar
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupMatrix
+ */
+
+
+/**
+ @addtogroup MatrixScale
+ @{
+ */
+
+/**
+ @brief Floating-point matrix scaling.
+ @param[in] pSrc points to input matrix
+ @param[in] scale scale factor to be applied
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+arm_status arm_mat_scale_f16(
+  const arm_matrix_instance_f16 * pSrc,
+  float16_t scale,
+  arm_matrix_instance_f16 * pDst)
+{
+  arm_status status;                             /* status of matrix scaling */
+ #ifdef ARM_MATH_MATRIX_CHECK
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+  {
+    float16_t *pIn = pSrc->pData;   /* input data matrix pointer */
+    float16_t *pOut = pDst->pData;  /* output data matrix pointer */
+    uint32_t  numSamples;           /* total number of elements in the matrix */
+    uint32_t  blkCnt;               /* loop counter */
+    f16x8_t vecIn, vecOut, vecScale;
+    float16_t const *pInVec;
+
+    pInVec = (float16_t const *) pIn;
+
+    vecScale = vdupq_n_f16(scale);
+    /*
+     * Total number of samples in the input matrix
+     */
+    numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
+    blkCnt = numSamples >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C(m,n) = A(m,n) * scale
+         * Scaling and results are stored in the destination buffer.
+         */
+        vecIn = vld1q(pInVec);
+        pInVec += 8;
+
+        vecOut = vmulq_f16(vecIn, vecScale);
+
+        vst1q(pOut, vecOut);
+        pOut += 8;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail: the final 1..7 elements are written with a predicated store
+     */
+    blkCnt = numSamples & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecIn = vld1q(pInVec);
+        vecOut = vecIn * scale;   /* vector * scalar multiply for the predicated tail */
+
+        vstrhq_p(pOut, vecOut, p0);
+    }
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+
+}
+#else
+
+arm_status arm_mat_scale_f16(
+  const arm_matrix_instance_f16 * pSrc,
+  float16_t scale,
+  arm_matrix_instance_f16 * pDst)
+{
+  float16_t *pIn = pSrc->pData;                  /* Input data matrix pointer */
+  float16_t *pOut = pDst->pData;                 /* Output data matrix pointer */
+  uint32_t numSamples;                           /* Total number of elements in the matrix */
+  uint32_t blkCnt;                               /* Loop counter */
+  arm_status status;                             /* Status of matrix scaling */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pDst->numRows) ||
+      (pSrc->numCols != pDst->numCols)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Total number of samples in input matrix */
+    numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+    blkCnt = numSamples >> 2U;
+
+    while (blkCnt > 0U)
+    {
+      /* C(m,n) = A(m,n) * scale, element-wise */
+
+      /* Scale and store result in destination buffer. */
+      *pOut++ = (*pIn++) * scale;
+      *pOut++ = (*pIn++) * scale;
+      *pOut++ = (*pIn++) * scale;
+      *pOut++ = (*pIn++) * scale;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining outputs */
+    blkCnt = numSamples % 0x4U;
+
+#else
+
+    /* Initialize blkCnt with number of samples */
+    blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (blkCnt > 0U)
+    {
+      /* C(m,n) = A(m,n) * scale */
+
+      /* Scale and store result in destination buffer. */
+      *pOut++ = (*pIn++) * scale;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixScale group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_f32.c
new file mode 100644
index 0000000000..f9649b6df6
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_f32.c
@@ -0,0 +1,287 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_scale_f32.c
+ * Description: Multiplies a floating-point matrix by a scalar
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @defgroup MatrixScale Matrix Scale
+
+ Multiplies a matrix by a scalar. This is accomplished by multiplying each element in the
+ matrix by the scalar. For example:
+ \image html MatrixScale.gif "Matrix Scaling of a 3 x 3 matrix"
+
+ The function checks to make sure that the input and output matrices are of the same size.
+
+ In the fixed-point Q15 and Q31 functions, scale is represented by
+ a fractional multiplication scaleFract and an arithmetic shift shift.
+ The shift allows the gain of the scaling operation to exceed 1.0.
+ The overall scale factor applied to the fixed-point data is
+
+ scale = scaleFract * 2^shift.
+
+ */
+
+/**
+ @addtogroup MatrixScale
+ @{
+ */
+
+/**
+ @brief Floating-point matrix scaling.
+ @param[in] pSrc points to input matrix
+ @param[in] scale scale factor to be applied
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+arm_status arm_mat_scale_f32(
+  const arm_matrix_instance_f32 * pSrc,
+  float32_t scale,
+  arm_matrix_instance_f32 * pDst)
+{
+  arm_status status;                             /* status of matrix scaling */
+ #ifdef ARM_MATH_MATRIX_CHECK
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+  {
+    float32_t *pIn = pSrc->pData;   /* input data matrix pointer */
+    float32_t *pOut = pDst->pData;  /* output data matrix pointer */
+    uint32_t  numSamples;           /* total number of elements in the matrix */
+    uint32_t  blkCnt;               /* loop counter */
+    f32x4_t vecIn, vecOut;
+    float32_t const *pInVec;
+
+    pInVec = (float32_t const *) pIn;
+    /*
+     * Total number of samples in the input matrix
+     */
+    numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
+    blkCnt = numSamples >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C(m,n) = A(m,n) * scale
+         * Scaling and results are stored in the destination buffer.
+         */
+        vecIn = vld1q(pInVec);
+        pInVec += 4;
+
+        vecOut = vecIn * scale;
+
+        vst1q(pOut, vecOut);
+        pOut += 4;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail: the final 1..3 elements are written with a predicated store
+     */
+    blkCnt = numSamples & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecIn = vld1q(pInVec);
+        vecOut = vecIn * scale;
+
+        vstrwq_p(pOut, vecOut, p0);
+    }
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+
+}
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+arm_status arm_mat_scale_f32(
+  const arm_matrix_instance_f32 * pSrc,
+  float32_t scale,
+  arm_matrix_instance_f32 * pDst)
+{
+  float32_t *pIn = pSrc->pData;                  /* input data matrix pointer */
+  float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
+  uint32_t numSamples;                           /* total number of elements in the matrix */
+  uint32_t blkCnt;                               /* loop counter */
+  arm_status status;                             /* status of matrix scaling */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+  {
+    float32x4_t vec1;
+    float32x4_t res;
+
+    /* Total number of samples in the input matrix */
+    numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
+
+    blkCnt = numSamples >> 2;
+
+    /* Compute 4 outputs at a time.
+    ** a second loop below computes the remaining 1 to 3 samples. */
+    while (blkCnt > 0U)
+    {
+      /* C(m,n) = A(m,n) * scale */
+      /* Scaling and results are stored in the destination buffer. */
+      vec1 = vld1q_f32(pIn);
+      res = vmulq_f32(vec1, vdupq_n_f32(scale));
+      vst1q_f32(pOut, res);
+
+      /* update pointers to process next samples */
+      pIn += 4U;
+      pOut += 4U;
+
+      /* Decrement the numSamples loop counter */
+      blkCnt--;
+    }
+
+    /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
+    ** No loop unrolling is used. */
+    blkCnt = numSamples % 0x4U;
+
+    while (blkCnt > 0U)
+    {
+      /* C(m,n) = A(m,n) * scale */
+      /* The results are stored in the destination buffer. */
+      *pOut++ = (*pIn++) * scale;
+
+      /* Decrement the loop counter */
+      blkCnt--;
+    }
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#else
+arm_status arm_mat_scale_f32(
+  const arm_matrix_instance_f32 * pSrc,
+  float32_t scale,
+  arm_matrix_instance_f32 * pDst)
+{
+  float32_t *pIn = pSrc->pData;                  /* Input data matrix pointer */
+  float32_t *pOut = pDst->pData;                 /* Output data matrix pointer */
+  uint32_t numSamples;                           /* Total number of elements in the matrix */
+  uint32_t blkCnt;                               /* Loop counter */
+  arm_status status;                             /* Status of matrix scaling */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pDst->numRows) ||
+      (pSrc->numCols != pDst->numCols)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Total number of samples in input matrix */
+    numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+    blkCnt = numSamples >> 2U;
+
+    while (blkCnt > 0U)
+    {
+      /* C(m,n) = A(m,n) * scale, element-wise */
+
+      /* Scale and store result in destination buffer. */
+      *pOut++ = (*pIn++) * scale;
+      *pOut++ = (*pIn++) * scale;
+      *pOut++ = (*pIn++) * scale;
+      *pOut++ = (*pIn++) * scale;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining outputs */
+    blkCnt = numSamples % 0x4U;
+
+#else
+
+    /* Initialize blkCnt with number of samples */
+    blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (blkCnt > 0U)
+    {
+      /* C(m,n) = A(m,n) * scale */
+
+      /* Scale and store result in destination buffer. */
+      *pOut++ = (*pIn++) * scale;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixScale group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_q15.c
new file mode 100644
index 0000000000..5520dd9fd1
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_q15.c
@@ -0,0 +1,249 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_scale_q15.c
+ * Description: Multiplies a Q15 matrix by a scalar
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixScale
+ @{
+ */
+
+/**
+ @brief Q15 matrix scaling.
+ @param[in] pSrc points to input matrix
+ @param[in] scaleFract fractional portion of the scale factor
+ @param[in] shift number of bits to shift the result by
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The input data *pSrc and scaleFract are in 1.15 format.
+ These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+arm_status arm_mat_scale_q15(
+  const arm_matrix_instance_q15 * pSrc,
+  q15_t scaleFract,
+  int32_t shift,
+  arm_matrix_instance_q15 * pDst)
+{
+  arm_status status;                            /* Status of matrix scaling */
+  q15_t *pIn = pSrc->pData;                     /* input data matrix pointer */
+  q15_t *pOut = pDst->pData;                    /* output data matrix pointer */
+  uint32_t  numSamples;                         /* total number of elements in the matrix */
+  uint32_t  blkCnt;                             /* loop counter */
+  q15x8_t vecIn, vecOut;
+  q15_t const *pInVec;
+  int32_t totShift = shift + 1;                 /* vmulhq drops one fractional bit of the Q15xQ15 product, hence shift + 1 */
+
+  pInVec = (q15_t const *) pIn;
+
+ #ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pDst->numRows) ||
+      (pSrc->numCols != pDst->numCols)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /*
+     * Total number of samples in the input matrix
+     */
+    numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
+    blkCnt = numSamples >> 3;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C(m,n) = A(m,n) * scale
+         * Scaling and results are stored in the destination buffer.
+         */
+        vecIn = vld1q(pInVec); pInVec += 8;
+
+        /* multiply input with scaler value */
+        vecOut = vmulhq(vecIn, vdupq_n_s16(scaleFract));
+        /* apply shifting (saturating) */
+        vecOut = vqshlq_r(vecOut, totShift);
+
+        vst1q(pOut, vecOut); pOut += 8;
+
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail
+     * (will be merged thru tail predication)
+     */
+    blkCnt = numSamples & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+        vecIn = vld1q(pInVec); pInVec += 8;
+        vecOut = vmulhq(vecIn, vdupq_n_s16(scaleFract));
+        vecOut = vqshlq_r(vecOut, totShift);
+        vstrhq_p(pOut, vecOut, p0);
+    }
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+#else
+arm_status arm_mat_scale_q15(
+  const arm_matrix_instance_q15 * pSrc,
+  q15_t scaleFract,
+  int32_t shift,
+  arm_matrix_instance_q15 * pDst)
+{
+  q15_t *pIn = pSrc->pData;                      /* Input data matrix pointer */
+  q15_t *pOut = pDst->pData;                     /* Output data matrix pointer */
+  uint32_t numSamples;                           /* Total number of elements in the matrix */
+  uint32_t blkCnt;                               /* Loop counter */
+  arm_status status;                             /* Status of matrix scaling */
+  int32_t kShift = 15 - shift;                   /* Q15*Q15 product is 2.30; >> (15 - shift) returns 1.15 with 2^shift gain */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+  q31_t inA1, inA2;
+  q31_t out1, out2, out3, out4;                  /* Temporary output variables */
+  q15_t in1, in2, in3, in4;                      /* Temporary input variables */
+#endif
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pDst->numRows) ||
+      (pSrc->numCols != pDst->numCols)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Total number of samples in input matrix */
+    numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+    blkCnt = numSamples >> 2U;
+
+    while (blkCnt > 0U)
+    {
+      /* C(m,n) = A(m,n) * k */
+
+#if defined (ARM_MATH_DSP)
+      /* read 2 times 2 samples at a time from source */
+      inA1 = read_q15x2_ia ((q15_t **) &pIn);
+      inA2 = read_q15x2_ia ((q15_t **) &pIn);
+
+      /* Scale inputs and store result in temporary variables
+       * in single cycle by packing the outputs */
+      out1 = (q31_t) ((q15_t) (inA1 >> 16) * scaleFract);
+      out2 = (q31_t) ((q15_t) (inA1      ) * scaleFract);
+      out3 = (q31_t) ((q15_t) (inA2 >> 16) * scaleFract);
+      out4 = (q31_t) ((q15_t) (inA2      ) * scaleFract);
+
+      /* apply shifting */
+      out1 = out1 >> kShift;
+      out2 = out2 >> kShift;
+      out3 = out3 >> kShift;
+      out4 = out4 >> kShift;
+
+      /* saturate the output to 16 bits */
+      in1 = (q15_t) (__SSAT(out1, 16));
+      in2 = (q15_t) (__SSAT(out2, 16));
+      in3 = (q15_t) (__SSAT(out3, 16));
+      in4 = (q15_t) (__SSAT(out4, 16));
+
+      /* store result to destination */
+      write_q15x2_ia (&pOut, __PKHBT(in2, in1, 16));
+      write_q15x2_ia (&pOut, __PKHBT(in4, in3, 16));
+
+#else
+      *pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
+      *pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
+      *pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
+      *pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
+#endif
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining outputs */
+    blkCnt = numSamples % 0x4U;
+
+#else
+
+    /* Initialize blkCnt with number of samples */
+    blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (blkCnt > 0U)
+    {
+      /* C(m,n) = A(m,n) * k */
+
+      /* Scale, saturate and store result in destination buffer. */
+      *pOut++ = (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> kShift, 16));
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixScale group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_q31.c
new file mode 100644
index 0000000000..0344a47289
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_scale_q31.c
@@ -0,0 +1,242 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_scale_q31.c
+ * Description: Multiplies a Q31 matrix by a scalar
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixScale
+ @{
+ */
+
+/**
+ @brief Q31 matrix scaling.
+ @param[in] pSrc points to input matrix
+ @param[in] scaleFract fractional portion of the scale factor
+ @param[in] shift number of bits to shift the result by
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The input data *pSrc and scaleFract are in 1.31 format.
+ These are multiplied to yield a 2.62 intermediate result which is shifted with saturation to 1.31 format.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+arm_status arm_mat_scale_q31(
+  const arm_matrix_instance_q31 * pSrc,
+  q31_t scaleFract,
+  int32_t shift,
+  arm_matrix_instance_q31 * pDst)
+{
+  q31_t *pIn = pSrc->pData;                     /* input data matrix pointer */
+  q31_t *pOut = pDst->pData;                    /* output data matrix pointer */
+  uint32_t  numSamples;                         /* total number of elements in the matrix */
+  uint32_t  blkCnt;                             /* loop counter */
+  q31x4_t vecIn, vecOut;
+  q31_t const *pInVec;
+  int32_t totShift = shift + 1;                 /* vmulhq drops one fractional bit of the Q31xQ31 product, hence shift + 1 */
+  arm_status status;                            /* Status of matrix scaling */
+
+  pInVec = (q31_t const *) pIn;
+ #ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pDst->numRows) ||
+      (pSrc->numCols != pDst->numCols)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+  {
+
+    /*
+     * Total number of samples in the input matrix
+     */
+    numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
+    blkCnt = numSamples >> 2;
+    while (blkCnt > 0U)
+    {
+        /*
+         * C(m,n) = A(m,n) * scale
+         * Scaling and results are stored in the destination buffer.
+         */
+        vecIn = vld1q(pInVec);
+        pInVec += 4;
+        /* multiply input with scaler value */
+        vecOut = vmulhq(vecIn, vdupq_n_s32(scaleFract));
+        /* apply shifting (saturating) */
+        vecOut = vqshlq_r(vecOut, totShift);
+
+        vst1q(pOut, vecOut);
+        pOut += 4;
+        /*
+         * Decrement the blockSize loop counter
+         */
+        blkCnt--;
+    }
+    /*
+     * tail: the final 1..3 elements are written with a predicated store
+     */
+    blkCnt = numSamples & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+        vecIn = vld1q(pInVec);
+        pInVec += 4;
+        vecOut = vmulhq(vecIn, vdupq_n_s32(scaleFract));
+        vecOut = vqshlq_r(vecOut, totShift);
+        vstrwq_p(pOut, vecOut, p0);
+    }
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+
+#else
+arm_status arm_mat_scale_q31(
+  const arm_matrix_instance_q31 * pSrc,
+  q31_t scaleFract,
+  int32_t shift,
+  arm_matrix_instance_q31 * pDst)
+{
+  q31_t *pIn = pSrc->pData;                      /* Input data matrix pointer */
+  q31_t *pOut = pDst->pData;                     /* Output data matrix pointer */
+  uint32_t numSamples;                           /* Total number of elements in the matrix */
+  uint32_t blkCnt;                               /* Loop counter */
+  arm_status status;                             /* Status of matrix scaling */
+  int32_t kShift = shift + 1;                    /* Shift to apply after scaling */
+  q31_t in, out;                                 /* Temporary variables */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+  /* Check for matrix mismatch condition */
+  if ((pSrc->numRows != pDst->numRows) ||
+      (pSrc->numCols != pDst->numCols)   )
+  {
+    /* Set status as ARM_MATH_SIZE_MISMATCH */
+    status = ARM_MATH_SIZE_MISMATCH;
+  }
+  else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+  {
+    /* Total number of samples in input matrix */
+    numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+    blkCnt = numSamples >> 2U;
+
+    while (blkCnt > 0U)
+    {
+      /* C(m,n) = A(m,n) * k */
+
+      /* Scale, saturate and store result in destination buffer. */
+      in = *pIn++;                                /* read one input from source */
+      in = ((q63_t) in * scaleFract) >> 32;       /* multiply input with scaler value */
+      out = in << kShift;                         /* apply shifting */
+      if (in != (out >> kShift))                  /* saturate the results. */
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pOut++ = out;                              /* Store result destination */
+
+      in = *pIn++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pOut++ = out;
+
+      in = *pIn++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pOut++ = out;
+
+      in = *pIn++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pOut++ = out;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining outputs */
+    blkCnt = numSamples % 0x4U;
+
+#else
+
+    /* Initialize blkCnt with number of samples */
+    blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (blkCnt > 0U)
+    {
+      /* C(m,n) = A(m,n) * k */
+
+      /* Scale, saturate and store result in destination buffer. */
+      in = *pIn++;
+      in = ((q63_t) in * scaleFract) >> 32;
+      out = in << kShift;
+      if (in != (out >> kShift))
+        out = 0x7FFFFFFF ^ (in >> 31);
+      *pOut++ = out;
+
+      /* Decrement loop counter */
+      blkCnt--;
+    }
+
+    /* Set status as ARM_MATH_SUCCESS */
+    status = ARM_MATH_SUCCESS;
+  }
+
+  /* Return to application */
+  return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixScale group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f16.c
new file mode 100644
index 0000000000..f5755c0c2e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f16.c
@@ -0,0 +1,234 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_solve_lower_triangular_f16.c
+ * Description: Solve linear system LT X = A with LT lower triangular matrix
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+/**
+ @ingroup groupMatrix
+ */
+
+
+/**
+ @addtogroup MatrixInv
+ @{
+ */
+
+
+ /**
+ * @brief Solve LT . X = A where LT is a lower triangular matrix
+ * @param[in] lt The lower triangular matrix
+ * @param[in] a The matrix a
+ * @param[out] dst The solution X of LT . X = A
+ * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+ arm_status arm_mat_solve_lower_triangular_f16(
+ const arm_matrix_instance_f16 * lt,
+ const arm_matrix_instance_f16 * a,
+ arm_matrix_instance_f16 * dst)
+ {
+ arm_status status; /* Returned status: SUCCESS, SINGULAR or (with checks on) SIZE_MISMATCH */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((lt->numRows != lt->numCols) ||
+ (a->numRows != a->numCols) ||
+ (lt->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Forward substitution, column by column:
+ x1 = a1 / l11
+ x2 = (a2 - l21 x1) / l22
+ ...
+
+ A zero element on the diagonal (pivot)
+ makes the system singular.
+ */
+ int i,j,k,n;
+
+ n = dst->numRows;
+
+ float16_t *pX = dst->pData;
+ float16_t *pLT = lt->pData;
+ float16_t *pA = a->pData;
+
+ float16_t *lt_row;
+ float16_t *a_col;
+
+ _Float16 invLT;
+
+ f16x8_t vecA;
+ f16x8_t vecX;
+
+ for(i=0; i < n ; i++)
+ {
+
+ for(j=0; j+7 < n; j += 8) /* solve 8 columns of X per MVE vector */
+ {
+ vecA = vld1q_f16(&pA[i * n + j]);
+
+ for(k=0; k < i; k++) /* subtract contributions of rows already solved */
+ {
+ vecX = vld1q_f16(&pX[n*k+j]);
+ vecA = vfmsq(vecA,vdupq_n_f16(pLT[n*i + k]),vecX);
+ }
+
+ if (pLT[n*i + i]==0.0f16) /* zero pivot: system is singular */
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+
+ invLT = 1.0f16 / (_Float16)pLT[n*i + i];
+ vecA = vmulq(vecA,vdupq_n_f16(invLT));
+ vst1q(&pX[i*n+j],vecA);
+
+ }
+
+ for(; j < n; j ++) /* scalar tail for the remaining columns */
+ {
+ a_col = &pA[j];
+ lt_row = &pLT[n*i];
+
+ _Float16 tmp=a_col[i * n];
+
+ for(k=0; k < i; k++)
+ {
+ tmp -= (_Float16)lt_row[k] * (_Float16)pX[n*k+j];
+ }
+
+ if (lt_row[i]==0.0f16)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / (_Float16)lt_row[i];
+ pX[i*n+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+ arm_status arm_mat_solve_lower_triangular_f16(
+ const arm_matrix_instance_f16 * lt,
+ const arm_matrix_instance_f16 * a,
+ arm_matrix_instance_f16 * dst)
+ {
+ arm_status status; /* status of matrix inverse */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((lt->numRows != lt->numCols) ||
+ (a->numRows != a->numCols) ||
+ (lt->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* a1 b1 c1 x1 = a1
+ b2 c2 x2 a2
+ c3 x3 a3
+
+ x3 = a3 / c3
+ x2 = (a2 - c2 x3) / b2
+
+ */
+ int i,j,k,n;
+
+ n = dst->numRows;
+
+ float16_t *pX = dst->pData;
+ float16_t *pLT = lt->pData;
+ float16_t *pA = a->pData;
+
+ float16_t *lt_row;
+ float16_t *a_col;
+
+ for(j=0; j < n; j ++)
+ {
+ a_col = &pA[j];
+
+ for(i=0; i < n ; i++)
+ {
+ lt_row = &pLT[n*i];
+
+ float16_t tmp=a_col[i * n];
+
+ for(k=0; k < i; k++)
+ {
+ tmp -= lt_row[k] * pX[n*k+j];
+ }
+
+ if (lt_row[i]==0.0f)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / lt_row[i];
+ pX[i*n+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixInv group
+ */
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f32.c
new file mode 100644
index 0000000000..fddd657641
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f32.c
@@ -0,0 +1,332 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_solve_lower_triangular_f32.c
+ * Description: Solve linear system LT X = A with LT lower triangular matrix
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+
+/**
+ @addtogroup MatrixInv
+ @{
+ */
+
+
+ /**
+ * @brief Solve LT . X = A where LT is a lower triangular matrix
+ * @param[in] lt The lower triangular matrix
+ * @param[in] a The matrix a
+ * @param[out] dst The solution X of LT . X = A
+ * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+ arm_status arm_mat_solve_lower_triangular_f32(
+ const arm_matrix_instance_f32 * lt,
+ const arm_matrix_instance_f32 * a,
+ arm_matrix_instance_f32 * dst)
+ {
+ arm_status status; /* Returned status: SUCCESS, SINGULAR or (with checks on) SIZE_MISMATCH */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((lt->numRows != lt->numCols) ||
+ (a->numRows != a->numCols) ||
+ (lt->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Forward substitution, column by column:
+ x1 = a1 / l11
+ x2 = (a2 - l21 x1) / l22
+ ...
+
+ A zero element on the diagonal (pivot)
+ makes the system singular.
+ */
+ int i,j,k,n;
+
+ n = dst->numRows;
+
+ float32_t *pX = dst->pData;
+ float32_t *pLT = lt->pData;
+ float32_t *pA = a->pData;
+
+ float32_t *lt_row;
+ float32_t *a_col;
+
+ float32_t invLT;
+
+ f32x4_t vecA;
+ f32x4_t vecX;
+
+ for(i=0; i < n ; i++)
+ {
+
+ for(j=0; j+3 < n; j += 4) /* solve 4 columns of X per MVE vector */
+ {
+ vecA = vld1q_f32(&pA[i * n + j]);
+
+ for(k=0; k < i; k++) /* subtract contributions of rows already solved */
+ {
+ vecX = vld1q_f32(&pX[n*k+j]);
+ vecA = vfmsq(vecA,vdupq_n_f32(pLT[n*i + k]),vecX);
+ }
+
+ if (pLT[n*i + i]==0.0f) /* zero pivot: system is singular */
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+
+ invLT = 1.0f / pLT[n*i + i];
+ vecA = vmulq(vecA,vdupq_n_f32(invLT));
+ vst1q(&pX[i*n+j],vecA);
+
+ }
+
+ for(; j < n; j ++) /* scalar tail for the remaining columns */
+ {
+ a_col = &pA[j];
+ lt_row = &pLT[n*i];
+
+ float32_t tmp=a_col[i * n];
+
+ for(k=0; k < i; k++)
+ {
+ tmp -= lt_row[k] * pX[n*k+j];
+ }
+
+ if (lt_row[i]==0.0f)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / lt_row[i];
+ pX[i*n+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+ /* Return to application */
+ return (status);
+}
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+ arm_status arm_mat_solve_lower_triangular_f32(
+ const arm_matrix_instance_f32 * lt,
+ const arm_matrix_instance_f32 * a,
+ arm_matrix_instance_f32 * dst)
+ {
+ arm_status status; /* Returned status: SUCCESS, SINGULAR or (with checks on) SIZE_MISMATCH */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((lt->numRows != lt->numCols) ||
+ (a->numRows != a->numCols) ||
+ (lt->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Forward substitution, column by column:
+ x1 = a1 / l11
+ x2 = (a2 - l21 x1) / l22
+ ...
+
+ A zero element on the diagonal (pivot)
+ makes the system singular.
+ */
+ int i,j,k,n;
+
+ n = dst->numRows;
+
+ float32_t *pX = dst->pData;
+ float32_t *pLT = lt->pData;
+ float32_t *pA = a->pData;
+
+ float32_t *lt_row;
+ float32_t *a_col;
+
+ float32_t invLT;
+
+ f32x4_t vecA;
+ f32x4_t vecX;
+
+ for(i=0; i < n ; i++)
+ {
+
+ for(j=0; j+3 < n; j += 4) /* solve 4 columns of X per NEON vector */
+ {
+ vecA = vld1q_f32(&pA[i * n + j]);
+
+ for(k=0; k < i; k++) /* subtract contributions of rows already solved */
+ {
+ vecX = vld1q_f32(&pX[n*k+j]);
+ vecA = vfmsq_f32(vecA,vdupq_n_f32(pLT[n*i + k]),vecX);
+ }
+
+ if (pLT[n*i + i]==0.0f) /* zero pivot: system is singular */
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+
+ invLT = 1.0f / pLT[n*i + i];
+ vecA = vmulq_f32(vecA,vdupq_n_f32(invLT));
+ vst1q_f32(&pX[i*n+j],vecA);
+
+ }
+
+ for(; j < n; j ++) /* scalar tail for the remaining columns */
+ {
+ a_col = &pA[j];
+ lt_row = &pLT[n*i];
+
+ float32_t tmp=a_col[i * n];
+
+ for(k=0; k < i; k++)
+ {
+ tmp -= lt_row[k] * pX[n*k+j];
+ }
+
+ if (lt_row[i]==0.0f)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / lt_row[i];
+ pX[i*n+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+ /* Return to application */
+ return (status);
+}
+#else
+ arm_status arm_mat_solve_lower_triangular_f32(
+ const arm_matrix_instance_f32 * lt,
+ const arm_matrix_instance_f32 * a,
+ arm_matrix_instance_f32 * dst)
+ {
+ arm_status status; /* Returned status: SUCCESS, SINGULAR or (with checks on) SIZE_MISMATCH */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if ((lt->numRows != lt->numCols) ||
+ (a->numRows != a->numCols) ||
+ (lt->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Forward substitution, column by column:
+ x1 = a1 / l11
+ x2 = (a2 - l21 x1) / l22
+ ...
+
+ A zero element on the diagonal (pivot)
+ makes the system singular.
+ */
+ int i,j,k,n;
+
+ n = dst->numRows;
+
+ float32_t *pX = dst->pData;
+ float32_t *pLT = lt->pData;
+ float32_t *pA = a->pData;
+
+ float32_t *lt_row;
+ float32_t *a_col;
+
+ for(j=0; j < n; j ++)
+ {
+ a_col = &pA[j]; /* column j of A (and of the solution X) */
+
+ for(i=0; i < n ; i++)
+ {
+ lt_row = &pLT[n*i];
+
+ float32_t tmp=a_col[i * n];
+
+ for(k=0; k < i; k++) /* subtract contributions of rows already solved */
+ {
+ tmp -= lt_row[k] * pX[n*k+j];
+ }
+
+ if (lt_row[i]==0.0f) /* zero pivot: system is singular */
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / lt_row[i];
+ pX[i*n+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixInv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
new file mode 100644
index 0000000000..cb80137066
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_lower_triangular_f64.c
@@ -0,0 +1,124 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_solve_lower_triangular_f64.c
+ * Description: Solve linear system LT X = A with LT lower triangular matrix
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+
+/**
+ @addtogroup MatrixInv
+ @{
+ */
+
+
+ /**
+ * @brief Solve LT . X = A where LT is a lower triangular matrix
+ * @param[in] lt The lower triangular matrix
+ * @param[in] a The matrix a
+ * @param[out] dst The solution X of LT . X = A
+ * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+ */
+ arm_status arm_mat_solve_lower_triangular_f64(
+ const arm_matrix_instance_f64 * lt,
+ const arm_matrix_instance_f64 * a,
+ arm_matrix_instance_f64 * dst)
+ {
+ arm_status status; /* Returned status: SUCCESS, SINGULAR or (with checks on) SIZE_MISMATCH */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((lt->numRows != lt->numCols) ||
+ (a->numRows != a->numCols) ||
+ (lt->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Forward substitution, column by column:
+ x1 = a1 / l11
+ x2 = (a2 - l21 x1) / l22
+ ...
+
+ A zero element on the diagonal (pivot)
+ makes the system singular.
+ */
+ int i,j,k,n;
+
+ n = dst->numRows;
+
+ float64_t *pX = dst->pData;
+ float64_t *pLT = lt->pData;
+ float64_t *pA = a->pData;
+
+ float64_t *lt_row;
+ float64_t *a_col;
+
+ for(j=0; j < n; j ++)
+ {
+ a_col = &pA[j]; /* column j of A (and of the solution X) */
+
+ for(i=0; i < n ; i++)
+ {
+ lt_row = &pLT[n*i];
+
+ float64_t tmp=a_col[i * n];
+
+ for(k=0; k < i; k++) /* subtract contributions of rows already solved */
+ {
+ tmp -= lt_row[k] * pX[n*k+j];
+ }
+
+ if (lt_row[i]==0.0) /* zero pivot: singular; double literal for double compare */
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / lt_row[i];
+ pX[i*n+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+ /* Return to application */
+ return (status);
+}
+/**
+ @} end of MatrixInv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f16.c
new file mode 100644
index 0000000000..d3852256b6
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f16.c
@@ -0,0 +1,226 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_solve_upper_triangular_f16.c
+ * Description: Solve linear system UT X = A with UT upper triangular matrix
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupMatrix
+ */
+
+
+/**
+ @addtogroup MatrixInv
+ @{
+ */
+
+/**
+ * @brief Solve UT . X = A where UT is an upper triangular matrix
+ * @param[in] ut The upper triangular matrix
+ * @param[in] a The matrix a
+ * @param[out] dst The solution X of UT . X = A
+ * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+ arm_status arm_mat_solve_upper_triangular_f16(
+ const arm_matrix_instance_f16 * ut,
+ const arm_matrix_instance_f16 * a,
+ arm_matrix_instance_f16 * dst)
+ {
+arm_status status; /* Returned status: SUCCESS, SINGULAR or (with checks on) SIZE_MISMATCH */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((ut->numRows != ut->numCols) ||
+ (a->numRows != a->numCols) ||
+ (ut->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+
+ int i,j,k,n;
+
+ n = dst->numRows;
+
+ float16_t *pX = dst->pData;
+ float16_t *pUT = ut->pData;
+ float16_t *pA = a->pData;
+
+ float16_t *ut_row;
+ float16_t *a_col;
+
+ _Float16 invUT;
+
+ f16x8_t vecA;
+ f16x8_t vecX;
+
+ for(i=n-1; i >= 0 ; i--) /* back substitution: rows from bottom to top */
+ {
+ for(j=0; j+7 < n; j +=8) /* solve 8 columns of X per MVE vector */
+ {
+ vecA = vld1q_f16(&pA[i * n + j]);
+
+ for(k=n-1; k > i; k--) /* subtract contributions of rows already solved below */
+ {
+ vecX = vld1q_f16(&pX[n*k+j]);
+ vecA = vfmsq(vecA,vdupq_n_f16(pUT[n*i + k]),vecX);
+ }
+
+ if (pUT[n*i + i]==0.0f16) /* zero pivot: system is singular */
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+
+ invUT = 1.0f16 / (_Float16)pUT[n*i + i];
+ vecA = vmulq(vecA,vdupq_n_f16(invUT));
+
+
+ vst1q(&pX[i*n+j],vecA);
+ }
+
+ for(; j < n; j ++) /* scalar tail for the remaining columns */
+ {
+ a_col = &pA[j];
+
+ ut_row = &pUT[n*i];
+
+ _Float16 tmp=a_col[i * n];
+
+ for(k=n-1; k > i; k--)
+ {
+ tmp -= (_Float16)ut_row[k] * (_Float16)pX[n*k+j];
+ }
+
+ if (ut_row[i]==0.0f16)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / (_Float16)ut_row[i];
+ pX[i*n+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+ arm_status arm_mat_solve_upper_triangular_f16(
+ const arm_matrix_instance_f16 * ut,
+ const arm_matrix_instance_f16 * a,
+ arm_matrix_instance_f16 * dst)
+ {
+arm_status status; /* status of matrix inverse */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((ut->numRows != ut->numCols) ||
+ (a->numRows != a->numCols) ||
+ (ut->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+
+ int i,j,k,n;
+
+ n = dst->numRows;
+
+ float16_t *pX = dst->pData;
+ float16_t *pUT = ut->pData;
+ float16_t *pA = a->pData;
+
+ float16_t *ut_row;
+ float16_t *a_col;
+
+ for(j=0; j < n; j ++)
+ {
+ a_col = &pA[j];
+
+ for(i=n-1; i >= 0 ; i--)
+ {
+ ut_row = &pUT[n*i];
+
+ float16_t tmp=a_col[i * n];
+
+ for(k=n-1; k > i; k--)
+ {
+ tmp -= ut_row[k] * pX[n*k+j];
+ }
+
+ if (ut_row[i]==0.0f)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / ut_row[i];
+ pX[i*n+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixInv group
+ */
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f32.c
new file mode 100644
index 0000000000..ce55ea3a7a
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f32.c
@@ -0,0 +1,319 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_solve_upper_triangular_f32.c
+ * Description: Solve linear system UT X = A with UT upper triangular matrix
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+
+/**
+ @ingroup groupMatrix
+ */
+
+
+/**
+ @addtogroup MatrixInv
+ @{
+ */
+
+/**
+ * @brief Solve UT . X = A where UT is an upper triangular matrix
+ * @param[in] ut The upper triangular matrix
+ * @param[in] a The matrix a
+ * @param[out] dst The solution X of UT . X = A
+ * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+ arm_status arm_mat_solve_upper_triangular_f32(
+ const arm_matrix_instance_f32 * ut,
+ const arm_matrix_instance_f32 * a,
+ arm_matrix_instance_f32 * dst)
+ {
+arm_status status; /* Returned status: SUCCESS, SINGULAR or (with checks on) SIZE_MISMATCH */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((ut->numRows != ut->numCols) ||
+ (a->numRows != a->numCols) ||
+ (ut->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+
+ int i,j,k,n;
+
+ n = dst->numRows;
+
+ float32_t *pX = dst->pData;
+ float32_t *pUT = ut->pData;
+ float32_t *pA = a->pData;
+
+ float32_t *ut_row;
+ float32_t *a_col;
+
+ float32_t invUT;
+
+ f32x4_t vecA;
+ f32x4_t vecX;
+
+ for(i=n-1; i >= 0 ; i--) /* back substitution: rows from bottom to top */
+ {
+ for(j=0; j+3 < n; j +=4) /* solve 4 columns of X per MVE vector */
+ {
+ vecA = vld1q_f32(&pA[i * n + j]);
+
+ for(k=n-1; k > i; k--) /* subtract contributions of rows already solved below */
+ {
+ vecX = vld1q_f32(&pX[n*k+j]);
+ vecA = vfmsq(vecA,vdupq_n_f32(pUT[n*i + k]),vecX);
+ }
+
+ if (pUT[n*i + i]==0.0f) /* zero pivot: system is singular */
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+
+ invUT = 1.0f / pUT[n*i + i];
+ vecA = vmulq(vecA,vdupq_n_f32(invUT));
+
+
+ vst1q(&pX[i*n+j],vecA);
+ }
+
+ for(; j < n; j ++) /* scalar tail for the remaining columns */
+ {
+ a_col = &pA[j];
+
+ ut_row = &pUT[n*i];
+
+ float32_t tmp=a_col[i * n];
+
+ for(k=n-1; k > i; k--)
+ {
+ tmp -= ut_row[k] * pX[n*k+j];
+ }
+
+ if (ut_row[i]==0.0f)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / ut_row[i];
+ pX[i*n+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+ arm_status arm_mat_solve_upper_triangular_f32(
+ const arm_matrix_instance_f32 * ut,
+ const arm_matrix_instance_f32 * a,
+ arm_matrix_instance_f32 * dst)
+ {
+arm_status status; /* Returned status: SUCCESS, SINGULAR or (with checks on) SIZE_MISMATCH */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((ut->numRows != ut->numCols) ||
+ (a->numRows != a->numCols) ||
+ (ut->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+
+ int i,j,k,n;
+
+ n = dst->numRows;
+
+ float32_t *pX = dst->pData;
+ float32_t *pUT = ut->pData;
+ float32_t *pA = a->pData;
+
+ float32_t *ut_row;
+ float32_t *a_col;
+
+ float32_t invUT;
+
+ f32x4_t vecA;
+ f32x4_t vecX;
+
+ for(i=n-1; i >= 0 ; i--) /* back substitution: rows from bottom to top */
+ {
+ for(j=0; j+3 < n; j +=4) /* solve 4 columns of X per NEON vector */
+ {
+ vecA = vld1q_f32(&pA[i * n + j]);
+
+ for(k=n-1; k > i; k--) /* subtract contributions of rows already solved below */
+ {
+ vecX = vld1q_f32(&pX[n*k+j]);
+ vecA = vfmsq_f32(vecA,vdupq_n_f32(pUT[n*i + k]),vecX);
+ }
+
+ if (pUT[n*i + i]==0.0f) /* zero pivot: system is singular */
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+
+ invUT = 1.0f / pUT[n*i + i];
+ vecA = vmulq_f32(vecA,vdupq_n_f32(invUT));
+
+
+ vst1q_f32(&pX[i*n+j],vecA);
+ }
+
+ for(; j < n; j ++) /* scalar tail for the remaining columns */
+ {
+ a_col = &pA[j];
+
+ ut_row = &pUT[n*i];
+
+ float32_t tmp=a_col[i * n];
+
+ for(k=n-1; k > i; k--)
+ {
+ tmp -= ut_row[k] * pX[n*k+j];
+ }
+
+ if (ut_row[i]==0.0f)
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / ut_row[i];
+ pX[i*n+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+ arm_status arm_mat_solve_upper_triangular_f32(
+ const arm_matrix_instance_f32 * ut,
+ const arm_matrix_instance_f32 * a,
+ arm_matrix_instance_f32 * dst)
+ {
+arm_status status; /* Returned status: SUCCESS, SINGULAR or (with checks on) SIZE_MISMATCH */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((ut->numRows != ut->numCols) ||
+ (a->numRows != a->numCols) ||
+ (ut->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+
+ int i,j,k,n;
+
+ n = dst->numRows;
+
+ float32_t *pX = dst->pData;
+ float32_t *pUT = ut->pData;
+ float32_t *pA = a->pData;
+
+ float32_t *ut_row;
+ float32_t *a_col;
+
+ for(j=0; j < n; j ++)
+ {
+ a_col = &pA[j]; /* column j of A (and of the solution X) */
+
+ for(i=n-1; i >= 0 ; i--) /* back substitution: rows from bottom to top */
+ {
+ ut_row = &pUT[n*i];
+
+ float32_t tmp=a_col[i * n];
+
+ for(k=n-1; k > i; k--) /* subtract contributions of rows already solved below */
+ {
+ tmp -= ut_row[k] * pX[n*k+j];
+ }
+
+ if (ut_row[i]==0.0f) /* zero pivot: system is singular */
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / ut_row[i];
+ pX[i*n+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixInv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
new file mode 100644
index 0000000000..4525d7ee7c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_solve_upper_triangular_f64.c
@@ -0,0 +1,120 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_solve_upper_triangular_f64.c
+ * Description: Solve linear system UT X = A with UT upper triangular matrix
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+
+/**
+ @ingroup groupMatrix
+ */
+
+
+/**
+ @addtogroup MatrixInv
+ @{
+ */
+
+/**
+ * @brief Solve UT . X = A where UT is an upper triangular matrix
+ * @param[in] ut The upper triangular matrix
+ * @param[in] a The matrix a
+ * @param[out] dst The solution X of UT . X = A
+ * @return The function returns ARM_MATH_SINGULAR, if the system can't be solved.
+ */
+ arm_status arm_mat_solve_upper_triangular_f64(
+ const arm_matrix_instance_f64 * ut,
+ const arm_matrix_instance_f64 * a,
+ arm_matrix_instance_f64 * dst)
+ {
+arm_status status; /* Returned status: SUCCESS, SINGULAR or (with checks on) SIZE_MISMATCH */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((ut->numRows != ut->numCols) ||
+ (a->numRows != a->numCols) ||
+ (ut->numRows != a->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+
+ int i,j,k,n;
+
+ n = dst->numRows;
+
+ float64_t *pX = dst->pData;
+ float64_t *pUT = ut->pData;
+ float64_t *pA = a->pData;
+
+ float64_t *ut_row;
+ float64_t *a_col;
+
+ for(j=0; j < n; j ++)
+ {
+ a_col = &pA[j]; /* column j of A (and of the solution X) */
+
+ for(i=n-1; i >= 0 ; i--) /* back substitution: rows from bottom to top */
+ {
+ ut_row = &pUT[n*i];
+
+ float64_t tmp=a_col[i * n];
+
+ for(k=n-1; k > i; k--) /* subtract contributions of rows already solved below */
+ {
+ tmp -= ut_row[k] * pX[n*k+j];
+ }
+
+ if (ut_row[i]==0.0) /* zero pivot: singular; double literal for double compare */
+ {
+ return(ARM_MATH_SINGULAR);
+ }
+ tmp = tmp / ut_row[i];
+ pX[i*n+j] = tmp;
+ }
+
+ }
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+
+ /* Return to application */
+ return (status);
+}
+
+
+/**
+ @} end of MatrixInv group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f16.c
new file mode 100644
index 0000000000..fee53c728e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f16.c
@@ -0,0 +1,215 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_sub_f16.c
+ * Description: Floating-point matrix subtraction
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupMatrix
+ */
+
+
+/**
+ @addtogroup MatrixSub
+ @{
+ */
+
+/**
+ @brief Floating-point matrix subtraction.
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/* Helium/MVE implementation: processes 8 f16 lanes per iteration with a
+ predicated tail store for the remaining 1..7 elements. */
+arm_status arm_mat_sub_f16(
+ const arm_matrix_instance_f16 * pSrcA,
+ const arm_matrix_instance_f16 * pSrcB,
+ arm_matrix_instance_f16 * pDst)
+{
+ arm_status status; /* status of matrix subtraction */
+ uint32_t numSamples; /* total number of elements in the matrix */
+ float16_t *pDataA, *pDataB, *pDataDst;
+ f16x8_t vecA, vecB, vecDst;
+ float16_t const *pSrcAVec;
+ float16_t const *pSrcBVec;
+ uint32_t blkCnt; /* loop counters */
+
+ pDataA = pSrcA->pData;
+ pDataB = pSrcB->pData;
+ pDataDst = pDst->pData;
+ pSrcAVec = (float16_t const *) pDataA;
+ pSrcBVec = (float16_t const *) pDataB;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+ /*
+ * Total number of samples in the input matrix
+ */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+ blkCnt = numSamples >> 3;
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+ /* sub and then store the results in the destination buffer. */
+ vecA = vld1q(pSrcAVec);
+ pSrcAVec += 8;
+ vecB = vld1q(pSrcBVec);
+ pSrcBVec += 8;
+ vecDst = vsubq(vecA, vecB);
+ vst1q(pDataDst, vecDst);
+ pDataDst += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numSamples & 7;
+ /* NOTE(review): if numSamples < 8 the main loop never runs and vecDst is
+ uninitialized here; vsubq_m reads it for the inactive lanes, but only
+ the p0-predicated lanes are stored, so the output is still correct. */
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ vecA = vld1q(pSrcAVec);
+ vecB = vld1q(pSrcBVec);
+ vecDst = vsubq_m(vecDst, vecA, vecB, p0);
+ vstrhq_p(pDataDst, vecDst, p0);
+ }
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+
+/* Plain C implementation used when the MVE float16 path is not compiled in. */
+arm_status arm_mat_sub_f16(
+ const arm_matrix_instance_f16 * pSrcA,
+ const arm_matrix_instance_f16 * pSrcB,
+ arm_matrix_instance_f16 * pDst)
+{
+ float16_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float16_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ float16_t *pOut = pDst->pData; /* output data matrix pointer */
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix subtraction */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Total number of samples in input matrix; matrices are processed as a
+ flat array since subtraction is element-wise. */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+
+ /* Subtract and store result in destination buffer. */
+ *pOut++ = (*pInA++) - (*pInB++);
+ *pOut++ = (*pInA++) - (*pInB++);
+ *pOut++ = (*pInA++) - (*pInB++);
+ *pOut++ = (*pInA++) - (*pInB++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+
+ /* Subtract and store result in destination buffer. */
+ *pOut++ = (*pInA++) - (*pInB++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixSub group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f32.c
new file mode 100644
index 0000000000..22c257c42e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f32.c
@@ -0,0 +1,298 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_sub_f32.c
+ * Description: Floating-point matrix subtraction
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @defgroup MatrixSub Matrix Subtraction
+
+ Subtract two matrices.
+ \image html MatrixSubtraction.gif "Subtraction of two 3 x 3 matrices"
+
+ The functions check to make sure that
+ pSrcA, pSrcB, and pDst have the same
+ number of rows and columns.
+ */
+
+/**
+ @addtogroup MatrixSub
+ @{
+ */
+
+/**
+ @brief Floating-point matrix subtraction.
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* Helium/MVE implementation: processes 4 f32 lanes per iteration with a
+ predicated tail store for the remaining 1..3 elements. */
+arm_status arm_mat_sub_f32(
+ const arm_matrix_instance_f32 * pSrcA,
+ const arm_matrix_instance_f32 * pSrcB,
+ arm_matrix_instance_f32 * pDst)
+{
+ arm_status status; /* status of matrix subtraction */
+ uint32_t numSamples; /* total number of elements in the matrix */
+ float32_t *pDataA, *pDataB, *pDataDst;
+ f32x4_t vecA, vecB, vecDst;
+ float32_t const *pSrcAVec;
+ float32_t const *pSrcBVec;
+ uint32_t blkCnt; /* loop counters */
+
+ pDataA = pSrcA->pData;
+ pDataB = pSrcB->pData;
+ pDataDst = pDst->pData;
+ pSrcAVec = (float32_t const *) pDataA;
+ pSrcBVec = (float32_t const *) pDataB;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+ /*
+ * Total number of samples in the input matrix
+ */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+ blkCnt = numSamples >> 2;
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+ /* sub and then store the results in the destination buffer. */
+ vecA = vld1q(pSrcAVec);
+ pSrcAVec += 4;
+ vecB = vld1q(pSrcBVec);
+ pSrcBVec += 4;
+ vecDst = vsubq(vecA, vecB);
+ vst1q(pDataDst, vecDst);
+ pDataDst += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numSamples & 3;
+ /* NOTE(review): if numSamples < 4 the main loop never runs and vecDst is
+ uninitialized here; vsubq_m reads it for the inactive lanes, but only
+ the p0-predicated lanes are stored, so the output is still correct. */
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ vecA = vld1q(pSrcAVec);
+ vecB = vld1q(pSrcBVec);
+ vecDst = vsubq_m(vecDst, vecA, vecB, p0);
+ vstrwq_p(pDataDst, vecDst, p0);
+ }
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+/* NEON implementation: 4 f32 lanes per iteration, scalar loop for the
+ remaining 1..3 elements (NEON has no predicated tail stores). */
+arm_status arm_mat_sub_f32(
+ const arm_matrix_instance_f32 * pSrcA,
+ const arm_matrix_instance_f32 * pSrcB,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
+ float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix subtraction */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+ float32x4_t vec1;
+ float32x4_t vec2;
+ float32x4_t res;
+
+ /* Total number of samples in the input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+ blkCnt = numSamples >> 2U;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+ /* Subtract and then store the results in the destination buffer. */
+ /* Read values from source A */
+ vec1 = vld1q_f32(pIn1);
+ vec2 = vld1q_f32(pIn2);
+ res = vsubq_f32(vec1, vec2);
+ vst1q_f32(pOut, res);
+
+ /* Update pointers to process next samples */
+ pIn1 += 4U;
+ pIn2 += 4U;
+ pOut += 4U;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = numSamples % 0x4U;
+
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+ /* Subtract and then store the results in the destination buffer. */
+ *pOut++ = (*pIn1++) - (*pIn2++);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#else
+/* Plain C implementation used when neither MVE nor NEON is compiled in. */
+arm_status arm_mat_sub_f32(
+ const arm_matrix_instance_f32 * pSrcA,
+ const arm_matrix_instance_f32 * pSrcB,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix subtraction */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Total number of samples in input matrix; matrices are processed as a
+ flat array since subtraction is element-wise. */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+
+ /* Subtract and store result in destination buffer. */
+ *pOut++ = (*pInA++) - (*pInB++);
+ *pOut++ = (*pInA++) - (*pInB++);
+ *pOut++ = (*pInA++) - (*pInB++);
+ *pOut++ = (*pInA++) - (*pInB++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+
+ /* Subtract and store result in destination buffer. */
+ *pOut++ = (*pInA++) - (*pInB++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of MatrixSub group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f64.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f64.c
new file mode 100644
index 0000000000..245a76d075
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_f64.c
@@ -0,0 +1,143 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_sub_f64.c
+ * Description: Floating-point matrix subtraction
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @defgroup MatrixSub Matrix Subtraction
+
+ Subtract two matrices.
+ \image html MatrixSubtraction.gif "Subtraction of two 3 x 3 matrices"
+
+ The functions check to make sure that
+ pSrcA, pSrcB, and pDst have the same
+ number of rows and columns.
+ */
+
+/**
+ @addtogroup MatrixSub
+ @{
+ */
+
+/**
+ @brief Floating-point matrix subtraction.
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+
+/* Double-precision element-wise subtraction; plain C only (no vector path).
+ Note the counters are uint64_t here, unlike the uint32_t used by the
+ narrower-precision variants. */
+arm_status arm_mat_sub_f64(
+ const arm_matrix_instance_f64 * pSrcA,
+ const arm_matrix_instance_f64 * pSrcB,
+ arm_matrix_instance_f64 * pDst)
+{
+ float64_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float64_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ float64_t *pOut = pDst->pData; /* output data matrix pointer */
+
+ uint64_t numSamples; /* total number of elements in the matrix */
+ uint64_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix subtraction */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Total number of samples in input matrix */
+ numSamples = (uint64_t) pSrcA->numRows * pSrcA->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+
+ /* Subtract and store result in destination buffer. */
+ *pOut++ = (*pInA++) - (*pInB++);
+ *pOut++ = (*pInA++) - (*pInB++);
+ *pOut++ = (*pInA++) - (*pInB++);
+ *pOut++ = (*pInA++) - (*pInB++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+
+ /* Subtract and store result in destination buffer. */
+ *pOut++ = (*pInA++) - (*pInB++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ @} end of MatrixSub group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_q15.c
new file mode 100644
index 0000000000..c9f9c83059
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_q15.c
@@ -0,0 +1,219 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_sub_q15.c
+ * Description: Q15 Matrix subtraction
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixSub
+ @{
+ */
+
+/**
+ @brief Q15 matrix subtraction.
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/* Helium/MVE implementation: 8 q15 lanes per iteration using saturating
+ vqsubq, with a predicated tail store for the remaining 1..7 elements. */
+arm_status arm_mat_sub_q15(
+ const arm_matrix_instance_q15 * pSrcA,
+ const arm_matrix_instance_q15 * pSrcB,
+ arm_matrix_instance_q15 * pDst)
+{
+ uint32_t numSamples; /* total number of elements in the matrix */
+ q15_t *pDataA, *pDataB, *pDataDst;
+ q15x8_t vecA, vecB, vecDst;
+ q15_t const *pSrcAVec;
+ q15_t const *pSrcBVec;
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix subtraction */
+
+
+ pDataA = pSrcA->pData;
+ pDataB = pSrcB->pData;
+ pDataDst = pDst->pData;
+ pSrcAVec = (q15_t const *) pDataA;
+ pSrcBVec = (q15_t const *) pDataB;
+
+ #ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /*
+ * Total number of samples in the input matrix
+ */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+ blkCnt = numSamples >> 3;
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+ /* saturating sub and then store the results in the destination buffer. */
+ vecA = vld1q(pSrcAVec); pSrcAVec += 8;
+ vecB = vld1q(pSrcBVec); pSrcBVec += 8;
+ vecDst = vqsubq(vecA, vecB);
+ vst1q(pDataDst, vecDst); pDataDst += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ */
+ blkCnt = numSamples & 7;
+ /* NOTE(review): if numSamples < 8 the main loop never runs and vecDst is
+ uninitialized here; vqsubq_m reads it for the inactive lanes, but only
+ the p0-predicated lanes are stored, so the output is still correct. */
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ vecA = vld1q(pSrcAVec); pSrcAVec += 8;
+ vecB = vld1q(pSrcBVec); pSrcBVec += 8;
+ vecDst = vqsubq_m(vecDst, vecA, vecB, p0);
+ vstrhq_p(pDataDst, vecDst, p0);
+ }
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+/* Scalar implementation. When ARM_MATH_DSP is available, uses the SIMD
+ __QSUB16 (two saturating q15 subtractions per 32-bit word); otherwise
+ widens to q31, subtracts, and saturates back with __SSAT. */
+arm_status arm_mat_sub_q15(
+ const arm_matrix_instance_q15 * pSrcA,
+ const arm_matrix_instance_q15 * pSrcB,
+ arm_matrix_instance_q15 * pDst)
+{
+ q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix subtraction */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Total number of samples in input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+
+ /* Subtract, Saturate and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+ /* Each call reads/writes two packed q15 values at once. */
+ write_q15x2_ia (&pOut, __QSUB16(read_q15x2_ia ((q15_t **) &pInA), read_q15x2_ia ((q15_t **) &pInB)));
+ write_q15x2_ia (&pOut, __QSUB16(read_q15x2_ia ((q15_t **) &pInA), read_q15x2_ia ((q15_t **) &pInB)));
+#else
+ *pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
+ *pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
+ *pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
+ *pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
+#endif
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+
+ /* Subtract and store result in destination buffer. */
+#if defined (ARM_MATH_DSP)
+ /* Only the low halfword of the __QSUB16 result is kept by the cast. */
+ *pOut++ = (q15_t) __QSUB16(*pInA++, *pInB++);
+#else
+ *pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
+#endif
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of MatrixSub group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_q31.c
new file mode 100644
index 0000000000..a788d68ee3
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_sub_q31.c
@@ -0,0 +1,218 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_sub_q31.c
+ * Description: Q31 matrix subtraction
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixSub
+ @{
+ */
+
+/**
+ @brief Q31 matrix subtraction.
+ @param[in] pSrcA points to the first input matrix structure
+ @param[in] pSrcB points to the second input matrix structure
+ @param[out] pDst points to output matrix structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] are saturated.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* Helium/MVE implementation: 4 q31 lanes per iteration using saturating
+ vqsubq, with a predicated tail store for the remaining 1..3 elements. */
+arm_status arm_mat_sub_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ uint32_t numSamples; /* total number of elements in the matrix */
+ q31_t *pDataA, *pDataB, *pDataDst;
+ q31x4_t vecA, vecB, vecDst;
+ q31_t const *pSrcAVec;
+ q31_t const *pSrcBVec;
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix subtraction */
+
+ pDataA = pSrcA->pData;
+ pDataB = pSrcB->pData;
+ pDataDst = pDst->pData;
+ pSrcAVec = (q31_t const *) pDataA;
+ pSrcBVec = (q31_t const *) pDataB;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+
+ /*
+ * Total number of samples in the input matrix
+ */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+ blkCnt = numSamples >> 2;
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+ /* saturating sub and then store the results in the destination buffer. */
+ vecA = vld1q(pSrcAVec);
+ pSrcAVec += 4;
+ vecB = vld1q(pSrcBVec);
+ pSrcBVec += 4;
+ vecDst = vqsubq(vecA, vecB);
+ vst1q(pDataDst, vecDst);
+ pDataDst += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ */
+ blkCnt = numSamples & 3;
+ /* NOTE(review): if numSamples < 4 the main loop never runs and vecDst is
+ uninitialized here; vqsubq_m reads it for the inactive lanes, but only
+ the p0-predicated lanes are stored, so the output is still correct. */
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ vecA = vld1q(pSrcAVec);
+ pSrcAVec += 4;
+ vecB = vld1q(pSrcBVec);
+ pSrcBVec += 4;
+ vecDst = vqsubq_m(vecDst, vecA, vecB, p0);
+ vstrwq_p(pDataDst, vecDst, p0);
+ }
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+/* Scalar implementation using the saturating __QSUB intrinsic per element. */
+arm_status arm_mat_sub_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix subtraction */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) ||
+ (pSrcA->numCols != pDst->numCols) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Total number of samples in input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = numSamples >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+
+ /* Subtract, saturate and then store the results in the destination buffer. */
+ *pOut++ = __QSUB(*pInA++, *pInB++);
+
+ *pOut++ = __QSUB(*pInA++, *pInB++);
+
+ *pOut++ = __QSUB(*pInA++, *pInB++);
+
+ *pOut++ = __QSUB(*pInA++, *pInB++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = numSamples % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+
+ /* Subtract, saturate and store result in destination buffer. */
+ *pOut++ = __QSUB(*pInA++, *pInB++);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of MatrixSub group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f16.c
new file mode 100644
index 0000000000..04a7c59cf8
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f16.c
@@ -0,0 +1,202 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_trans_f16.c
+ * Description: Floating-point matrix transpose
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixTrans
+ @{
+ */
+
+/**
+ @brief Floating-point matrix transpose.
+ @param[in] pSrc points to input matrix
+ @param[out] pDst points to output matrix
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+arm_status arm_mat_trans_f16(
+ const arm_matrix_instance_f16 * pSrc,
+ arm_matrix_instance_f16 * pDst)
+{
+ arm_status status; /* status of matrix transpose */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) ||
+ (pSrc->numCols != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ if (pDst->numRows == pDst->numCols)
+ {
+ if (pDst->numCols == 1)
+ {
+ pDst->pData[0] = pSrc->pData[0];
+ return(ARM_MATH_SUCCESS);
+ }
+ if (pDst->numCols == 2)
+ return arm_mat_trans_16bit_2x2((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
+ if (pDst->numCols == 3)
+ return arm_mat_trans_16bit_3x3_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
+ if (pDst->numCols == 4)
+ return arm_mat_trans_16bit_4x4_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
+ }
+
+ arm_mat_trans_16bit_generic(pSrc->numRows, pSrc->numCols, (uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+
+arm_status arm_mat_trans_f16(
+ const arm_matrix_instance_f16 * pSrc,
+ arm_matrix_instance_f16 * pDst)
+{
+ float16_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float16_t *pOut = pDst->pData; /* output data matrix pointer */
+ float16_t *px; /* Temporary output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of rows */
+ uint16_t nCols = pSrc->numCols; /* number of columns */
+ uint32_t col, row = nRows, i = 0U; /* Loop counters */
+ arm_status status; /* status of matrix transpose */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) ||
+ (pSrc->numCols != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* Pointer px is set to starting address of column being processed */
+ px = pOut + i;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ col = nCols >> 2U;
+
+ while (col > 0U) /* column loop */
+ {
+ /* Read and store input element in destination */
+ *px = *pIn++;
+ /* Update pointer px to point to next row of transposed matrix */
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ col = nCols % 0x4U;
+
+#else
+
+ /* Initialize col with number of samples */
+ col = nCols;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (col > 0U)
+ {
+ /* Read and store input element in destination */
+ *px = *pIn++;
+
+ /* Update pointer px to point to next row of transposed matrix */
+ px += nRows;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ i++;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U); /* row loop end */
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixTrans group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f32.c
new file mode 100644
index 0000000000..cb24fe027d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f32.c
@@ -0,0 +1,325 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_trans_f32.c
+ * Description: Floating-point matrix transpose
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @defgroup MatrixTrans Matrix Transpose
+
+  Transposes a matrix.
+
+ Transposing an M x N matrix flips it around the center diagonal and results in an N x M matrix.
+ \image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
+ */
+
+/**
+ @addtogroup MatrixTrans
+ @{
+ */
+
+/**
+ @brief Floating-point matrix transpose.
+ @param[in] pSrc points to input matrix
+ @param[out] pDst points to output matrix
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+arm_status arm_mat_trans_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ arm_matrix_instance_f32 * pDst)
+{
+ arm_status status; /* status of matrix transpose */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+ if (pDst->numRows == pDst->numCols)
+ {
+ if (pDst->numCols == 2)
+ return arm_mat_trans_32bit_2x2_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
+ if (pDst->numCols == 3)
+ return arm_mat_trans_32bit_3x3_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
+ if (pDst->numCols == 4)
+ return arm_mat_trans_32bit_4x4_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
+ }
+
+ arm_mat_trans_32bit_generic_mve(pSrc->numRows, pSrc->numCols, (uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+
+arm_status arm_mat_trans_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ float32_t *px; /* Temporary output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of rows */
+ uint16_t nColumns = pSrc->numCols; /* number of columns */
+
+ uint16_t blkCnt, rowCnt, i = 0U, row = nRows; /* loop counters */
+ arm_status status; /* status of matrix transpose */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* Row loop */
+ rowCnt = row >> 2;
+ while (rowCnt > 0U)
+ {
+ float32x4_t row0V,row1V,row2V,row3V;
+ float32x4x2_t ra0,ra1,rb0,rb1;
+
+ blkCnt = nColumns >> 2;
+
+ /* The pointer px is set to starting address of the column being processed */
+ px = pOut + i;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U) /* Column loop */
+ {
+ row0V = vld1q_f32(pIn);
+ row1V = vld1q_f32(pIn + 1 * nColumns);
+ row2V = vld1q_f32(pIn + 2 * nColumns);
+ row3V = vld1q_f32(pIn + 3 * nColumns);
+ pIn += 4;
+
+ ra0 = vzipq_f32(row0V,row2V);
+ ra1 = vzipq_f32(row1V,row3V);
+
+ rb0 = vzipq_f32(ra0.val[0],ra1.val[0]);
+ rb1 = vzipq_f32(ra0.val[1],ra1.val[1]);
+
+ vst1q_f32(px,rb0.val[0]);
+ px += nRows;
+
+ vst1q_f32(px,rb0.val[1]);
+ px += nRows;
+
+ vst1q_f32(px,rb1.val[0]);
+ px += nRows;
+
+ vst1q_f32(px,rb1.val[1]);
+ px += nRows;
+
+ /* Decrement the column loop counter */
+ blkCnt--;
+ }
+
+ /* Perform matrix transpose for last 3 samples here. */
+ blkCnt = nColumns % 0x4U;
+
+ while (blkCnt > 0U)
+ {
+ /* Read and store the input element in the destination */
+ *px++ = *pIn;
+ *px++ = *(pIn + 1 * nColumns);
+ *px++ = *(pIn + 2 * nColumns);
+ *px++ = *(pIn + 3 * nColumns);
+
+ px += (nRows - 4);
+ pIn++;
+
+ /* Decrement the column loop counter */
+ blkCnt--;
+ }
+
+ i += 4;
+ pIn += 3 * nColumns;
+
+ /* Decrement the row loop counter */
+ rowCnt--;
+
+ } /* Row loop end */
+
+ rowCnt = row & 3;
+ while (rowCnt > 0U)
+ {
+ blkCnt = nColumns ;
+ /* The pointer px is set to starting address of the column being processed */
+ px = pOut + i;
+
+ while (blkCnt > 0U)
+ {
+ /* Read and store the input element in the destination */
+ *px = *pIn++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += nRows;
+
+ /* Decrement the column loop counter */
+ blkCnt--;
+ }
+ i++;
+ rowCnt -- ;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#else
+arm_status arm_mat_trans_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ float32_t *px; /* Temporary output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of rows */
+ uint16_t nCols = pSrc->numCols; /* number of columns */
+ uint32_t col, row = nRows, i = 0U; /* Loop counters */
+ arm_status status; /* status of matrix transpose */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) ||
+ (pSrc->numCols != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* Pointer px is set to starting address of column being processed */
+ px = pOut + i;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ col = nCols >> 2U;
+
+ while (col > 0U) /* column loop */
+ {
+ /* Read and store input element in destination */
+ *px = *pIn++;
+ /* Update pointer px to point to next row of transposed matrix */
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ col = nCols % 0x4U;
+
+#else
+
+ /* Initialize col with number of samples */
+ col = nCols;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (col > 0U)
+ {
+ /* Read and store input element in destination */
+ *px = *pIn++;
+
+ /* Update pointer px to point to next row of transposed matrix */
+ px += nRows;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ i++;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U); /* row loop end */
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixTrans group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f64.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f64.c
new file mode 100644
index 0000000000..57b5043fc5
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_f64.c
@@ -0,0 +1,155 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_trans_f64.c
+ * Description: Floating-point matrix transpose
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @defgroup MatrixTrans Matrix Transpose
+
+  Transposes a matrix.
+
+ Transposing an M x N matrix flips it around the center diagonal and results in an N x M matrix.
+ \image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
+ */
+
+/**
+ @addtogroup MatrixTrans
+ @{
+ */
+
+/**
+ @brief Floating-point matrix transpose.
+ @param[in] pSrc points to input matrix
+ @param[out] pDst points to output matrix
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+
+arm_status arm_mat_trans_f64(
+ const arm_matrix_instance_f64 * pSrc,
+ arm_matrix_instance_f64 * pDst)
+{
+ float64_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float64_t *pOut = pDst->pData; /* output data matrix pointer */
+ float64_t *px; /* Temporary output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of rows */
+ uint16_t nCols = pSrc->numCols; /* number of columns */
+ uint64_t col, row = nRows, i = 0U; /* Loop counters */
+ arm_status status; /* status of matrix transpose */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) ||
+ (pSrc->numCols != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* Pointer px is set to starting address of column being processed */
+ px = pOut + i;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ col = nCols >> 2U;
+
+ while (col > 0U) /* column loop */
+ {
+ /* Read and store input element in destination */
+ *px = *pIn++;
+ /* Update pointer px to point to next row of transposed matrix */
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ col = nCols % 0x4U;
+
+#else
+
+ /* Initialize col with number of samples */
+ col = nCols;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (col > 0U)
+ {
+ /* Read and store input element in destination */
+ *px = *pIn++;
+
+ /* Update pointer px to point to next row of transposed matrix */
+ px += nRows;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ i++;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U); /* row loop end */
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixTrans group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q15.c
new file mode 100644
index 0000000000..a29b1c4af6
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q15.c
@@ -0,0 +1,233 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_trans_q15.c
+ * Description: Q15 matrix transpose
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixTrans
+ @{
+ */
+
+/**
+ @brief Q15 matrix transpose.
+ @param[in] pSrc points to input matrix
+ @param[out] pDst points to output matrix
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+
+
+arm_status arm_mat_trans_q15(
+ const arm_matrix_instance_q15 * pSrc,
+ arm_matrix_instance_q15 * pDst)
+{
+ arm_status status; /* status of matrix transpose */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) ||
+ (pSrc->numCols != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ if (pDst->numRows == pDst->numCols)
+ {
+ if (pDst->numCols == 1)
+ {
+ pDst->pData[0] = pSrc->pData[0];
+ return(ARM_MATH_SUCCESS);
+ }
+ if (pDst->numCols == 2)
+ return arm_mat_trans_16bit_2x2((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
+ if (pDst->numCols == 3)
+ return arm_mat_trans_16bit_3x3_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
+ if (pDst->numCols == 4)
+ return arm_mat_trans_16bit_4x4_mve((uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
+ }
+
+ arm_mat_trans_16bit_generic(pSrc->numRows, pSrc->numCols, (uint16_t *)pSrc->pData, (uint16_t *)pDst->pData);
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#else
+arm_status arm_mat_trans_q15(
+ const arm_matrix_instance_q15 * pSrc,
+ arm_matrix_instance_q15 * pDst)
+{
+ q15_t *pIn = pSrc->pData; /* input data matrix pointer */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of rows */
+ uint16_t nCols = pSrc->numCols; /* number of columns */
+ uint32_t col, row = nRows, i = 0U; /* Loop counters */
+ arm_status status; /* status of matrix transpose */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t in; /* variable to hold temporary output */
+#endif
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) ||
+ (pSrc->numCols != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* Pointer pOut is set to starting address of column being processed */
+ pOut = pDst->pData + i;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ col = nCols >> 2U;
+
+ while (col > 0U) /* column loop */
+ {
+ /* Read two elements from row */
+ in = read_q15x2_ia ((q15_t **) &pIn);
+
+ /* Unpack and store one element in destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+ *pOut = (q15_t) in;
+#else
+ *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update pointer pOut to point to next row of transposed matrix */
+ pOut += nRows;
+
+ /* Unpack and store second element in destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+ *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+#else
+ *pOut = (q15_t) in;
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update pointer pOut to point to next row of transposed matrix */
+ pOut += nRows;
+
+ /* Read two elements from row */
+ in = read_q15x2_ia ((q15_t **) &pIn);
+
+ /* Unpack and store one element in destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+ *pOut = (q15_t) in;
+#else
+ *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update pointer pOut to point to next row of transposed matrix */
+ pOut += nRows;
+
+ /* Unpack and store second element in destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+ *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+#else
+ *pOut = (q15_t) in;
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update pointer pOut to point to next row of transposed matrix */
+ pOut += nRows;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ col = nCols % 0x4U;
+
+#else
+
+ /* Initialize col with number of samples */
+ col = nCols;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (col > 0U)
+ {
+ /* Read and store input element in destination */
+ *pOut = *pIn++;
+
+ /* Update pointer pOut to point to next row of transposed matrix */
+ pOut += nRows;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ i++;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U); /* row loop end */
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of MatrixTrans group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q31.c
new file mode 100644
index 0000000000..2c77254704
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q31.c
@@ -0,0 +1,191 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_trans_q31.c
+ * Description: Q31 matrix transpose
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixTrans
+ @{
+ */
+
+/**
+ @brief Q31 matrix transpose.
+ @param[in] pSrc points to input matrix
+ @param[out] pDst points to output matrix
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+arm_status arm_mat_trans_q31(
+ const arm_matrix_instance_q31 * pSrc,
+ arm_matrix_instance_q31 * pDst)
+{
+ arm_status status; /* status of matrix transpose */
+ #ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) ||
+ (pSrc->numCols != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ if (pDst->numRows == pDst->numCols)
+ {
+ if (pDst->numCols == 2)
+ return arm_mat_trans_32bit_2x2_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
+ if (pDst->numCols == 3)
+ return arm_mat_trans_32bit_3x3_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
+ if (pDst->numCols == 4)
+ return arm_mat_trans_32bit_4x4_mve((uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
+ }
+
+ arm_mat_trans_32bit_generic_mve(pSrc->numRows, pSrc->numCols, (uint32_t *)pSrc->pData, (uint32_t *)pDst->pData);
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#else
+arm_status arm_mat_trans_q31(
+ const arm_matrix_instance_q31 * pSrc,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pIn = pSrc->pData; /* input data matrix pointer */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ q31_t *px; /* Temporary output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of rows */
+ uint16_t nCols = pSrc->numCols; /* number of columns */
+ uint32_t col, row = nRows, i = 0U; /* Loop counters */
+ arm_status status; /* status of matrix transpose */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) ||
+ (pSrc->numCols != pDst->numRows) )
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* Pointer px is set to starting address of column being processed */
+ px = pOut + i;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ col = nCols >> 2U;
+
+ while (col > 0U) /* column loop */
+ {
+ /* Read and store input element in destination */
+ *px = *pIn++;
+ /* Update pointer px to point to next row of transposed matrix */
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ *px = *pIn++;
+ px += nRows;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ col = nCols % 0x4U;
+
+#else
+
+ /* Initialize col with number of samples */
+ col = nCols;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (col > 0U)
+ {
+ /* Read and store input element in destination */
+ *px = *pIn++;
+
+ /* Update pointer px to point to next row of transposed matrix */
+ px += nRows;
+
+ /* Decrement column loop counter */
+ col--;
+ }
+
+ i++;
+
+ /* Decrement row loop counter */
+ row--;
+
+ } while (row > 0U); /* row loop end */
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of MatrixTrans group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q7.c
new file mode 100644
index 0000000000..766ca2a816
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_trans_q7.c
@@ -0,0 +1,171 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_trans_q7.c
+ * Description: Q7 matrix transpose
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ @ingroup groupMatrix
+ */
+
+/**
+ @addtogroup MatrixTrans
+ @{
+ */
+
+/**
+ @brief Q7 matrix transpose.
+ @param[in] pSrc points to input matrix
+ @param[out] pDst points to output matrix
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+arm_status arm_mat_trans_q7(const arm_matrix_instance_q7 *pSrc, arm_matrix_instance_q7 *pDst)
+{
+
+ uint16x8_t vecOffs;
+ uint32_t i;
+ uint32_t blkCnt;
+ uint8_t const *pDataC;
+ uint8_t *pDataDestR;
+ uint16x8_t vecIn;
+
+ const uint8_t * pDataSrc=(const uint8_t *)pSrc->pData;
+ uint8_t * pDataDst=(uint8_t *)pDst->pData;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ return ARM_MATH_SIZE_MISMATCH;
+ }
+#endif
+
+ vecOffs = vidupq_u16((uint32_t)0, 1);
+ vecOffs = vecOffs * pSrc->numCols;
+
+ i = pSrc->numCols;
+ do
+ {
+ pDataC = (uint8_t const *) pDataSrc;
+ pDataDestR = (uint8_t*)pDataDst;
+
+ blkCnt = pSrc->numRows >> 3;
+ while (blkCnt > 0U)
+ {
+ /* widened loads */
+ vecIn = vldrbq_gather_offset_u16(pDataC, vecOffs);
+ vstrbq_u16(pDataDestR, vecIn);
+ pDataDestR += 8;
+ pDataC = pDataC + pSrc->numCols * 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = pSrc->numRows & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ vecIn = vldrbq_gather_offset_u16(pDataC, vecOffs);
+ vstrbq_p_u16(pDataDestR, vecIn, p0);
+ }
+ pDataSrc += 1;
+ pDataDst += pSrc->numRows;
+ }
+ while (--i);
+
+ return (ARM_MATH_SUCCESS);
+}
+#else
+arm_status arm_mat_trans_q7(const arm_matrix_instance_q7 *pSrc, arm_matrix_instance_q7 *pDst)
+{
+ q7_t *pSrcA = pSrc->pData; /* input data matrix pointer */
+ q7_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of nRows */
+ uint16_t nColumns = pSrc->numCols; /* number of nColumns */
+ uint16_t col, row = nRows, i = 0U; /* row and column loop counters */
+ arm_status status; /* status of matrix transpose */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if ((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows)) {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ } else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do {
+ /* The pointer pOut is set to starting address of the column being processed */
+ pOut = pDst->pData + i;
+
+ /* Initialize column loop counter */
+ col = nColumns;
+
+
+ while (col > 0U) {
+ /* Read and store the input element in the destination */
+ *pOut = *pSrcA++;
+
+ /* Update the pointer pOut to point to the next row of the transposed matrix */
+ pOut += nRows;
+
+ /* Decrement the column loop counter */
+ col--;
+ }
+
+ i++;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ } while (row > 0U);
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+ /* Return to application */
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+
+/**
+ @} end of MatrixTrans group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_f16.c
new file mode 100644
index 0000000000..fe5243b8b4
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_f16.c
@@ -0,0 +1,396 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_vec_mult_f16.c
+ * Description: Floating-point matrix and vector multiplication
+ *
+ * $Date: 23 April 2021
+ *
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ * @ingroup groupMatrix
+ */
+
+
+/**
+ * @addtogroup MatrixVectMult
+ * @{
+ */
+
+/**
+ * @brief Floating-point matrix and vector multiplication.
+ * @param[in] *pSrcMat points to the input matrix structure
+ * @param[in] *pVec points to input vector
+ * @param[out] *pDst points to output vector
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_mat_vec_mult_f16(
+ const arm_matrix_instance_f16 *pSrcMat,
+ const float16_t *pSrcVec,
+ float16_t *pDstVec)
+{
+ uint32_t numRows = pSrcMat->numRows;
+ uint32_t numCols = pSrcMat->numCols;
+ const float16_t *pSrcA = pSrcMat->pData;
+ const float16_t *pInA0;
+ const float16_t *pInA1;
+ float16_t *px;
+ int32_t row;
+ uint32_t blkCnt; /* loop counters */
+
+ row = numRows;
+ px = pDstVec;
+
+ /*
+ * compute 4 rows in parallel
+ */
+ while (row >= 4)
+ {
+ const float16_t *pInA2, *pInA3;
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1, acc2, acc3;
+ float16_t const *pSrcVecPtr = pSrcVec;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ acc2 = vdupq_n_f16(0.0f);
+ acc3 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U)
+ {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 8;
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 8;
+ acc3 = vfmaq(acc3, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vld1q(pSrcA0Vec);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ acc3 = vfmaq(acc3, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ *px++ = vecAddAcrossF16Mve(acc0);
+ *px++ = vecAddAcrossF16Mve(acc1);
+ *px++ = vecAddAcrossF16Mve(acc2);
+ *px++ = vecAddAcrossF16Mve(acc3);
+
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parallel
+ */
+ if (row >= 2)
+ {
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1;
+ float16_t const *pSrcVecPtr = pSrcVec;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U)
+ {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vld1q(pSrcA0Vec);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ *px++ = vecAddAcrossF16Mve(acc0);
+ *px++ = vecAddAcrossF16Mve(acc1);
+
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1)
+ {
+ f16x8_t vecIn, acc0;
+ float16_t const *pSrcA0Vec, *pInVec;
+ float16_t const *pSrcVecPtr = pSrcVec;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U)
+ {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vld1q(pSrcA0Vec);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ *px++ = vecAddAcrossF16Mve(acc0);
+ }
+}
+#else
+void arm_mat_vec_mult_f16(const arm_matrix_instance_f16 *pSrcMat, const float16_t *pVec, float16_t *pDst)
+{
+ uint32_t numRows = pSrcMat->numRows;
+ uint32_t numCols = pSrcMat->numCols;
+ const float16_t *pSrcA = pSrcMat->pData;
+ const float16_t *pInA1; /* input data matrix pointer A of Q31 type */
+ const float16_t *pInA2; /* input data matrix pointer A of Q31 type */
+ const float16_t *pInA3; /* input data matrix pointer A of Q31 type */
+ const float16_t *pInA4; /* input data matrix pointer A of Q31 type */
+ const float16_t *pInVec; /* input data matrix pointer B of Q31 type */
+ float16_t *px; /* Temporary output data matrix pointer */
+ uint16_t i, row, colCnt; /* loop counters */
+ float16_t matData, matData2, vecData, vecData2;
+
+
+ /* Process 4 rows at a time */
+ row = numRows >> 2;
+ i = 0u;
+ px = pDst;
+
+ /* The following loop performs the dot-product of each row in pSrcA with the vector */
+ /* row loop */
+ while (row > 0) {
+ /* For every row wise process, the pInVec pointer is set
+ ** to the starting address of the vector */
+ pInVec = pVec;
+
+ /* Initialize accumulators */
+ float16_t sum1 = 0.0f;
+ float16_t sum2 = 0.0f;
+ float16_t sum3 = 0.0f;
+ float16_t sum4 = 0.0f;
+
+ /* Column loop: process one column per iteration */
+ colCnt = numCols;
+
+ /* Initialize pointers to the starting address of the column being processed */
+ pInA1 = pSrcA + i;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ pInA4 = pInA3 + numCols;
+
+
+ // Main loop: matrix-vector multiplication
+ while (colCnt > 0u) {
+ // Read one value from the vector
+ vecData = *(pInVec)++;
+ // Read one value from each of the 4 matrix rows, and multiply-accumulate
+ matData = *(pInA1)++;
+ sum1 += matData * vecData;
+ matData = *(pInA2)++;
+ sum2 += matData * vecData;
+ matData = *(pInA3)++;
+ sum3 += matData * vecData;
+ matData = *(pInA4)++;
+ sum4 += matData * vecData;
+
+ // Decrement the loop counter
+ colCnt--;
+ }
+
+ /* Store the result in the destination buffer (no saturation needed in floating point) */
+ *px++ = sum1;
+ *px++ = sum2;
+ *px++ = sum3;
+ *px++ = sum4;
+
+ i = i + numCols * 4;
+
+ /* Decrement the row loop counter */
+ row--;
+ }
+
+ /* process any remaining rows */
+ row = numRows & 3u;
+ while (row > 0) {
+
+ float16_t sum = 0.0f;
+ pInVec = pVec;
+ pInA1 = pSrcA + i;
+
+ colCnt = numCols >> 1;
+
+ while (colCnt > 0) {
+ vecData = *(pInVec)++;
+ vecData2 = *(pInVec)++;
+ matData = *(pInA1)++;
+ matData2 = *(pInA1)++;
+ sum += matData * vecData;
+ sum += matData2 * vecData2;
+ colCnt--;
+ }
+ // process remainder of row
+ colCnt = numCols & 1u;
+ while (colCnt > 0) {
+ sum += *pInA1++ * *pInVec++;
+ colCnt--;
+ }
+
+ *px++ = sum;
+ i = i + numCols;
+ row--;
+ }
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixVectMult group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_f32.c
new file mode 100644
index 0000000000..012005b24c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_f32.c
@@ -0,0 +1,399 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_vec_mult_f32.c
+ * Description: Floating-point matrix and vector multiplication
+ *
+ * $Date: 23 April 2021
+ *
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @defgroup MatrixVectMult Matrix Vector Multiplication
+ *
+ * Multiplies a matrix and a vector.
+ *
+ */
+
+/**
+ * @addtogroup MatrixVectMult
+ * @{
+ */
+
+/**
+ * @brief Floating-point matrix and vector multiplication.
+ * @param[in] *pSrcMat points to the input matrix structure
+ * @param[in] *pVec points to input vector
+ * @param[out] *pDst points to output vector
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_mat_vec_mult_f32(
+ const arm_matrix_instance_f32 *pSrcMat,
+ const float32_t *pSrcVec,
+ float32_t *pDstVec)
+{
+ uint32_t numRows = pSrcMat->numRows;
+ uint32_t numCols = pSrcMat->numCols;
+ const float32_t *pSrcA = pSrcMat->pData;
+ const float32_t *pInA0;
+ const float32_t *pInA1;
+ float32_t *px;
+ int32_t row;
+ uint32_t blkCnt; /* loop counters */
+
+ row = numRows;
+ px = pDstVec;
+
+ /*
+ * compute 4 rows in parallel
+ */
+ while (row >= 4)
+ {
+ const float32_t *pInA2, *pInA3;
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1, acc2, acc3;
+ float32_t const *pSrcVecPtr = pSrcVec;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ acc2 = vdupq_n_f32(0.0f);
+ acc3 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U)
+ {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 4;
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 4;
+ acc3 = vfmaq(acc3, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vld1q(pSrcA0Vec);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ acc3 = vfmaq(acc3, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ *px++ = vecAddAcrossF32Mve(acc0);
+ *px++ = vecAddAcrossF32Mve(acc1);
+ *px++ = vecAddAcrossF32Mve(acc2);
+ *px++ = vecAddAcrossF32Mve(acc3);
+
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parallel
+ */
+ if (row >= 2)
+ {
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1;
+ float32_t const *pSrcVecPtr = pSrcVec;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U)
+ {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vld1q(pSrcA0Vec);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ *px++ = vecAddAcrossF32Mve(acc0);
+ *px++ = vecAddAcrossF32Mve(acc1);
+
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1)
+ {
+ f32x4_t vecIn, acc0;
+ float32_t const *pSrcA0Vec, *pInVec;
+ float32_t const *pSrcVecPtr = pSrcVec;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U)
+ {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vld1q(pSrcA0Vec);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ *px++ = vecAddAcrossF32Mve(acc0);
+ }
+}
+#else
+
+void arm_mat_vec_mult_f32(const arm_matrix_instance_f32 *pSrcMat, const float32_t *pVec, float32_t *pDst)
+{
+ uint32_t numRows = pSrcMat->numRows;
+ uint32_t numCols = pSrcMat->numCols;
+ const float32_t *pSrcA = pSrcMat->pData;
+ const float32_t *pInA1; /* input data matrix pointer A of Q31 type */
+ const float32_t *pInA2; /* input data matrix pointer A of Q31 type */
+ const float32_t *pInA3; /* input data matrix pointer A of Q31 type */
+ const float32_t *pInA4; /* input data matrix pointer A of Q31 type */
+ const float32_t *pInVec; /* input data matrix pointer B of Q31 type */
+ float32_t *px; /* Temporary output data matrix pointer */
+ uint16_t i, row, colCnt; /* loop counters */
+ float32_t matData, matData2, vecData, vecData2;
+
+
+ /* Process 4 rows at a time */
+ row = numRows >> 2;
+ i = 0u;
+ px = pDst;
+
+ /* The following loop performs the dot-product of each row in pSrcA with the vector */
+ /* row loop */
+ while (row > 0) {
+ /* For every row wise process, the pInVec pointer is set
+ ** to the starting address of the vector */
+ pInVec = pVec;
+
+ /* Initialize accumulators */
+ float32_t sum1 = 0.0f;
+ float32_t sum2 = 0.0f;
+ float32_t sum3 = 0.0f;
+ float32_t sum4 = 0.0f;
+
+ /* Column loop: process one column per iteration */
+ colCnt = numCols;
+
+ /* Initialize pointers to the starting address of the column being processed */
+ pInA1 = pSrcA + i;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ pInA4 = pInA3 + numCols;
+
+
+ // Main loop: matrix-vector multiplication
+ while (colCnt > 0u) {
+ // Read one value from the vector
+ vecData = *(pInVec)++;
+ // Read one value from each of the 4 matrix rows, and multiply-accumulate
+ matData = *(pInA1)++;
+ sum1 += matData * vecData;
+ matData = *(pInA2)++;
+ sum2 += matData * vecData;
+ matData = *(pInA3)++;
+ sum3 += matData * vecData;
+ matData = *(pInA4)++;
+ sum4 += matData * vecData;
+
+ // Decrement the loop counter
+ colCnt--;
+ }
+
+ /* Store the result in the destination buffer (no saturation needed in floating point) */
+ *px++ = sum1;
+ *px++ = sum2;
+ *px++ = sum3;
+ *px++ = sum4;
+
+ i = i + numCols * 4;
+
+ /* Decrement the row loop counter */
+ row--;
+ }
+
+ /* process any remaining rows */
+ row = numRows & 3u;
+ while (row > 0) {
+
+ float32_t sum = 0.0f;
+ pInVec = pVec;
+ pInA1 = pSrcA + i;
+
+ colCnt = numCols >> 1;
+ while (colCnt > 0) {
+ vecData = *(pInVec)++;
+ vecData2 = *(pInVec)++;
+ matData = *(pInA1)++;
+ matData2 = *(pInA1)++;
+ sum += matData * vecData;
+ sum += matData2 * vecData2;
+ colCnt--;
+ }
+ // process remainder of row
+ colCnt = numCols & 1u;
+
+
+ while (colCnt > 0) {
+ sum += *pInA1++ * *pInVec++;
+ colCnt--;
+ }
+
+ *px++ = sum;
+ i = i + numCols;
+ row--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of MatrixVectMult group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q15.c
new file mode 100644
index 0000000000..1ea8c29599
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q15.c
@@ -0,0 +1,388 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_vec_mult_q15.c
+ * Description: Q15 matrix and vector multiplication
+ *
+ * $Date: 23 April 2021
+ *
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+
+
+/**
+ * @addtogroup MatrixVectMult
+ * @{
+ */
+
+/**
+ * @brief Q15 matrix and vector multiplication.
+ * @param[in] *pSrcMat points to the input matrix structure
+ * @param[in] *pVec points to input vector
+ * @param[out] *pDst points to output vector
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_mat_vec_mult_q15(
+ const arm_matrix_instance_q15 * pSrcMat,
+ const q15_t *pSrcVec,
+ q15_t *pDstVec)
+{
+ const q15_t *pMatSrc = pSrcMat->pData;
+ const q15_t *pMat0, *pMat1;
+ uint32_t numRows = pSrcMat->numRows;
+ uint32_t numCols = pSrcMat->numCols;
+ q15_t *px;
+ int32_t row;
+ uint16_t blkCnt; /* loop counters */
+
+ row = numRows;
+ px = pDstVec;
+
+ /*
+ * compute 3x64-bit accumulators per loop
+ */
+ while (row >= 3)
+ {
+ q15_t const *pMat0Vec, *pMat1Vec, *pMat2Vec, *pVec;
+ const q15_t *pMat2;
+ q15_t const *pSrcVecPtr = pSrcVec;
+ q63_t acc0, acc1, acc2;
+ q15x8_t vecMatA0, vecMatA1, vecMatA2, vecIn;
+
+
+ pVec = pSrcVec;
+ /*
+ * Initialize the pointer pIn1 to point to the starting address of the column being processed
+ */
+ pMat0 = pMatSrc;
+ pMat1 = pMat0 + numCols;
+ pMat2 = pMat1 + numCols;
+
+ acc0 = 0LL;
+ acc1 = 0LL;
+ acc2 = 0LL;
+
+ pMat0Vec = pMat0;
+ pMat1Vec = pMat1;
+ pMat2Vec = pMat2;
+ pVec = pSrcVecPtr;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U)
+ {
+ vecMatA0 = vld1q(pMat0Vec);
+ pMat0Vec += 8;
+ vecMatA1 = vld1q(pMat1Vec);
+ pMat1Vec += 8;
+ vecMatA2 = vld1q(pMat2Vec);
+ pMat2Vec += 8;
+ vecIn = vld1q(pVec);
+ pVec += 8;
+
+ acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+ acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+ acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+
+ vecMatA0 = vld1q(pMat0Vec);
+ vecMatA1 = vld1q(pMat1Vec);
+ vecMatA2 = vld1q(pMat2Vec);
+ vecIn = vldrhq_z_s16(pVec, p0);
+
+ acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+ acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+ acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
+ }
+
+ *px++ = MVE_ASRL_SAT16(acc0, 15);
+ *px++ = MVE_ASRL_SAT16(acc1, 15);
+ *px++ = MVE_ASRL_SAT16(acc2, 15);
+
+ pMatSrc += numCols * 3;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 3;
+ }
+
+ /*
+ * process any remaining rows pair
+ */
+ if (row >= 2)
+ {
+ q15_t const *pMat0Vec, *pMat1Vec, *pVec;
+ q15_t const *pSrcVecPtr = pSrcVec;
+ q63_t acc0, acc1;
+ q15x8_t vecMatA0, vecMatA1, vecIn;
+
+ /*
+ * For every row wise process, the pInVec pointer is set
+ * to the starting address of the vector
+ */
+ pVec = pSrcVec;
+
+ /*
+ * Initialize the pointer pIn1 to point to the starting address of the column being processed
+ */
+ pMat0 = pMatSrc;
+ pMat1 = pMat0 + numCols;
+
+ acc0 = 0LL;
+ acc1 = 0LL;
+
+ pMat0Vec = pMat0;
+ pMat1Vec = pMat1;
+ pVec = pSrcVecPtr;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U)
+ {
+ vecMatA0 = vld1q(pMat0Vec);
+ pMat0Vec += 8;
+ vecMatA1 = vld1q(pMat1Vec);
+ pMat1Vec += 8;
+ vecIn = vld1q(pVec);
+ pVec += 8;
+
+ acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+ acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+
+ blkCnt--;
+ }
+
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+
+ vecMatA0 = vld1q(pMat0Vec);
+ vecMatA1 = vld1q(pMat1Vec);
+ vecIn = vldrhq_z_s16(pVec, p0);
+
+ acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+ acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+ }
+
+ *px++ = MVE_ASRL_SAT16(acc0, 15);
+ *px++ = MVE_ASRL_SAT16(acc1, 15);
+
+ pMatSrc += numCols * 2;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 2;
+ }
+
+ if (row >= 1)
+ {
+ q15_t const *pMat0Vec, *pVec;
+ q15_t const *pSrcVecPtr = pSrcVec;
+ q63_t acc0;
+ q15x8_t vecMatA0, vecIn;
+
+ /*
+ * For every row wise process, the pInVec pointer is set
+ * to the starting address of the vector
+ */
+ pVec = pSrcVec;
+
+ /*
+ * Initialize the pointer pIn1 to point to the starting address of the column being processed
+ */
+ pMat0 = pMatSrc;
+
+ acc0 = 0LL;
+
+ pMat0Vec = pMat0;
+ pVec = pSrcVecPtr;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U)
+ {
+ vecMatA0 = vld1q(pMat0Vec);
+ pMat0Vec += 8;
+ vecIn = vld1q(pVec);
+ pVec += 8;
+ acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+
+ vecMatA0 = vld1q(pMat0Vec);
+ vecIn = vldrhq_z_s16(pVec, p0);
+ acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+ }
+ *px++ = MVE_ASRL_SAT16(acc0, 15);
+ }
+}
+
+#else
+void arm_mat_vec_mult_q15(const arm_matrix_instance_q15 *pSrcMat, const q15_t *pVec, q15_t *pDst)
+{
+ uint32_t numRows = pSrcMat->numRows;
+ uint32_t numCols = pSrcMat->numCols;
+ const q15_t *pSrcA = pSrcMat->pData;
+ const q15_t *pInA1; /* input data matrix pointer A of Q15 type */
+ const q15_t *pInA2; /* input data matrix pointer A of Q15 type */
+ const q15_t *pInA3; /* input data matrix pointer A of Q15 type */
+ const q15_t *pInA4; /* input data matrix pointer A of Q15 type */
+ const q15_t *pInVec; /* input data matrix pointer B of Q15 type */
+ q15_t *px; /* Temporary output data matrix pointer */
+ uint16_t i, row, colCnt; /* loop counters */
+ q31_t matData, matData2, vecData, vecData2;
+
+
+ /* Process 4 rows at a time */
+ row = numRows >> 2;
+ i = 0u;
+ px = pDst;
+
+ /* The following loop performs the dot-product of each row in pSrcA with the vector */
+ /* row loop */
+ while (row > 0) {
+ /* For every row wise process, the pInVec pointer is set
+ ** to the starting address of the vector */
+ pInVec = pVec;
+
+ /* Initialize accumulators */
+ q63_t sum1 = 0;
+ q63_t sum2 = 0;
+ q63_t sum3 = 0;
+ q63_t sum4 = 0;
+
+ /* Loop unrolling: process 2 columns per iteration */
+ colCnt = numCols >> 1;
+
+ /* Initialize pointers to the starting address of the column being processed */
+ pInA1 = pSrcA + i;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ pInA4 = pInA3 + numCols;
+
+ // Main loop: matrix-vector multiplication
+ while (colCnt > 0u) {
+ // Read 2 values from vector
+ vecData = read_q15x2_ia ((q15_t **) &pInVec);
+
+ // Read 8 values from the matrix - 2 values from each of 4 rows, and do multiply accumulate
+ matData = read_q15x2_ia ((q15_t **) &pInA1);
+ sum1 = __SMLALD(matData, vecData, sum1);
+ matData = read_q15x2_ia ((q15_t **) &pInA2);
+ sum2 = __SMLALD(matData, vecData, sum2);
+ matData = read_q15x2_ia ((q15_t **) &pInA3);
+ sum3 = __SMLALD(matData, vecData, sum3);
+ matData = read_q15x2_ia ((q15_t **) &pInA4);
+ sum4 = __SMLALD(matData, vecData, sum4);
+
+ // Decrement the loop counter
+ colCnt--;
+ }
+
+ /* process any remaining columns */
+ colCnt = numCols & 1u;
+ if (numCols & 1u) {
+ vecData = *pInVec++;
+ sum1 += (q63_t)*pInA1++ * vecData;
+ sum2 += (q63_t)*pInA2++ * vecData;
+ sum3 += (q63_t)*pInA3++ * vecData;
+ sum4 += (q63_t)*pInA4++ * vecData;
+ }
+
+ /* Saturate and store the result in the destination buffer */
+ *px++ = (q15_t)(__SSAT((sum1 >> 15), 16));
+ *px++ = (q15_t)(__SSAT((sum2 >> 15), 16));
+ *px++ = (q15_t)(__SSAT((sum3 >> 15), 16));
+ *px++ = (q15_t)(__SSAT((sum4 >> 15), 16));
+
+ i = i + numCols * 4;
+
+ /* Decrement the row loop counter */
+ row--;
+ }
+
+ /* process any remaining rows */
+ row = numRows & 3u;
+ while (row > 0) {
+
+ q63_t sum = 0;
+ pInVec = pVec;
+ pInA1 = pSrcA + i;
+
+ // loop unrolling - process 4 elements at a time
+ colCnt = numCols >> 2;
+
+ while (colCnt > 0) {
+ vecData = read_q15x2_ia ((q15_t **) &pInVec);
+ vecData2 = read_q15x2_ia ((q15_t **) &pInVec);
+ matData = read_q15x2_ia ((q15_t **) &pInA1);
+ matData2 = read_q15x2_ia ((q15_t **) &pInA1);
+ sum = __SMLALD(matData, vecData, sum);
+ sum = __SMLALD(matData2, vecData2, sum);
+ colCnt--;
+ }
+
+ // process remainder of row
+ colCnt = numCols & 3u;
+ while (colCnt > 0) {
+ sum += (q63_t)*pInA1++ * *pInVec++;
+ colCnt--;
+ }
+ *px++ = (q15_t)(__SSAT((sum >> 15), 16));
+ i = i + numCols;
+ row--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ * @} end of MatrixVectMult group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q31.c
new file mode 100644
index 0000000000..a6efe06f11
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q31.c
@@ -0,0 +1,376 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_vec_mult_q31.c
+ * Description: Q31 matrix and vector multiplication
+ *
+ * $Date: 23 April 2021
+ *
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+
+
+/**
+ * @addtogroup MatrixVectMult
+ * @{
+ */
+
+/**
+ * @brief Q31 matrix and vector multiplication.
+ * @param[in] *pSrcMat points to the input matrix structure
+ * @param[in] *pVec points to the input vector
+ * @param[out] *pDst points to the output vector
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+/*
+ * Helium/MVE implementation: rows are processed 3 at a time, each row/vector
+ * dot product accumulated in a 64-bit accumulator (vmlaldavaq). Column tails
+ * are handled with predicated loads (vctp32q / vldrwq_z_s32) so the unused
+ * lanes contribute zero to the accumulation.
+ */
+void arm_mat_vec_mult_q31(
+ const arm_matrix_instance_q31 * pSrcMat,
+ const q31_t *pSrcVec,
+ q31_t *pDstVec)
+{
+ const q31_t *pMatSrc = pSrcMat->pData;
+ const q31_t *pMat0, *pMat1;
+ uint32_t numRows = pSrcMat->numRows;
+ uint32_t numCols = pSrcMat->numCols;
+ q31_t *px;
+ int32_t row;
+ uint16_t blkCnt; /* loop counters */
+
+ row = numRows;
+ px = pDstVec;
+
+ /*
+ * compute 3x64-bit accumulators per loop
+ */
+ while (row >= 3)
+ {
+ q31_t const *pMat0Vec, *pMat1Vec, *pMat2Vec, *pVec;
+ const q31_t *pMat2;
+ q31_t const *pSrcVecPtr = pSrcVec;
+ q63_t acc0, acc1, acc2;
+ q31x4_t vecMatA0, vecMatA1, vecMatA2, vecIn;
+
+
+ pVec = pSrcVec;
+ /*
+ * Initialize the pointer pIn1 to point to the starting address of the column being processed
+ */
+ pMat0 = pMatSrc;
+ pMat1 = pMat0 + numCols;
+ pMat2 = pMat1 + numCols;
+
+ acc0 = 0LL;
+ acc1 = 0LL;
+ acc2 = 0LL;
+
+ pMat0Vec = pMat0;
+ pMat1Vec = pMat1;
+ pMat2Vec = pMat2;
+ pVec = pSrcVecPtr;
+
+ /* 4 columns (one 128-bit vector of q31) per iteration */
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U)
+ {
+ vecMatA0 = vld1q(pMat0Vec);
+ pMat0Vec += 4;
+ vecMatA1 = vld1q(pMat1Vec);
+ pMat1Vec += 4;
+ vecMatA2 = vld1q(pMat2Vec);
+ pMat2Vec += 4;
+ vecIn = vld1q(pVec);
+ pVec += 4;
+
+ acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+ acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+ acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+
+ /* only the vector load is predicated: inactive lanes of vecIn are
+ zero, so the matrix lanes multiplied with them add nothing */
+ vecMatA0 = vld1q(pMat0Vec);
+ vecMatA1 = vld1q(pMat1Vec);
+ vecMatA2 = vld1q(pMat2Vec);
+ vecIn = vldrwq_z_s32(pVec, p0);
+
+ acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+ acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+ acc2 = vmlaldavaq(acc2, vecIn, vecMatA2);
+ }
+
+ /* accumulators hold sums of Q31 x Q31 products: shift back down to Q31 */
+ *px++ = asrl(acc0, 31);
+ *px++ = asrl(acc1, 31);
+ *px++ = asrl(acc2, 31);
+
+ pMatSrc += numCols * 3;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 3;
+ }
+
+ /*
+ * process any remaining rows pair
+ */
+ if (row >= 2)
+ {
+ q31_t const *pMat0Vec, *pMat1Vec, *pVec;
+ q31_t const *pSrcVecPtr = pSrcVec;
+ q63_t acc0, acc1;
+ q31x4_t vecMatA0, vecMatA1, vecIn;
+
+ /*
+ * For every row wise process, the pInVec pointer is set
+ * to the starting address of the vector
+ */
+ pVec = pSrcVec;
+
+ /*
+ * Initialize the pointer pIn1 to point to the starting address of the column being processed
+ */
+ pMat0 = pMatSrc;
+ pMat1 = pMat0 + numCols;
+
+ acc0 = 0LL;
+ acc1 = 0LL;
+
+ pMat0Vec = pMat0;
+ pMat1Vec = pMat1;
+ pVec = pSrcVecPtr;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U)
+ {
+ vecMatA0 = vld1q(pMat0Vec);
+ pMat0Vec += 4;
+ vecMatA1 = vld1q(pMat1Vec);
+ pMat1Vec += 4;
+ vecIn = vld1q(pVec);
+ pVec += 4;
+
+ acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+ acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+
+ blkCnt--;
+ }
+
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+
+ vecMatA0 = vld1q(pMat0Vec);
+ vecMatA1 = vld1q(pMat1Vec);
+ vecIn = vldrwq_z_s32(pVec, p0);
+
+ acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+ acc1 = vmlaldavaq(acc1, vecIn, vecMatA1);
+ }
+
+ *px++ = asrl(acc0, 31);
+ *px++ = asrl(acc1, 31);
+
+ pMatSrc += numCols * 2;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 2;
+ }
+
+ if (row >= 1)
+ {
+ q31_t const *pMat0Vec, *pVec;
+ q31_t const *pSrcVecPtr = pSrcVec;
+ q63_t acc0;
+ q31x4_t vecMatA0, vecIn;
+
+ /*
+ * For every row wise process, the pInVec pointer is set
+ * to the starting address of the vector
+ */
+ pVec = pSrcVec;
+
+ /*
+ * Initialize the pointer pIn1 to point to the starting address of the column being processed
+ */
+ pMat0 = pMatSrc;
+
+ acc0 = 0LL;
+
+ pMat0Vec = pMat0;
+ pVec = pSrcVecPtr;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U)
+ {
+ vecMatA0 = vld1q(pMat0Vec);
+ pMat0Vec += 4;
+ vecIn = vld1q(pVec);
+ pVec += 4;
+ acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+
+ vecMatA0 = vld1q(pMat0Vec);
+ vecIn = vldrwq_z_s32(pVec, p0);
+ acc0 = vmlaldavaq(acc0, vecIn, vecMatA0);
+ }
+
+ *px++ = asrl(acc0, 31);
+ }
+}
+#else
+/*
+ * Scalar implementation: dot product of each matrix row with the vector,
+ * 4 rows per outer iteration plus a tail loop for the remaining rows.
+ *
+ * Accumulation is done in Q63 (Q31 x Q31 products); the result is shifted
+ * back to Q31 by an arithmetic right shift. No saturation is applied.
+ */
+void arm_mat_vec_mult_q31(const arm_matrix_instance_q31 *pSrcMat, const q31_t *pVec, q31_t *pDst)
+{
+    uint32_t numRows = pSrcMat->numRows;
+    uint32_t numCols = pSrcMat->numCols;
+    const q31_t *pSrcA = pSrcMat->pData;
+    const q31_t *pInA1;             /* read pointer, row n     */
+    const q31_t *pInA2;             /* read pointer, row n + 1 */
+    const q31_t *pInA3;             /* read pointer, row n + 2 */
+    const q31_t *pInA4;             /* read pointer, row n + 3 */
+    const q31_t *pInVec;            /* vector read pointer     */
+    q31_t *px;                      /* output write pointer    */
+    uint32_t i, row, colCnt;        /* loop counters; 32-bit so that
+                                       i = i + numCols * 4 cannot wrap for
+                                       matrices with > 65535 elements
+                                       (matches the q7/q15 variants) */
+    q31_t matData, matData2, vecData, vecData2;
+
+    /* Process 4 rows at a time */
+    row = numRows >> 2;
+    i = 0u;
+    px = pDst;
+
+    /* The following loop performs the dot-product of each row in pSrcA with the vector */
+    /* row loop */
+    while (row > 0) {
+        /* For every row-wise pass, reset the vector pointer to its start */
+        pInVec = pVec;
+
+        /* Initialize accumulators */
+        q63_t sum1 = 0;
+        q63_t sum2 = 0;
+        q63_t sum3 = 0;
+        q63_t sum4 = 0;
+
+        /* One column per iteration: each product needs a full 64-bit MAC */
+        colCnt = numCols;
+
+        /* Point at the 4 rows being processed */
+        pInA1 = pSrcA + i;
+        pInA2 = pInA1 + numCols;
+        pInA3 = pInA2 + numCols;
+        pInA4 = pInA3 + numCols;
+
+        /* Main loop: matrix-vector multiplication */
+        while (colCnt > 0u) {
+            /* Read one value from the vector */
+            vecData = *pInVec++;
+
+            /* Read one value from each of the 4 rows and multiply-accumulate */
+            matData = *pInA1++;
+            sum1 += (q63_t)matData * vecData;
+            matData = *pInA2++;
+            sum2 += (q63_t)matData * vecData;
+            matData = *pInA3++;
+            sum3 += (q63_t)matData * vecData;
+            matData = *pInA4++;
+            sum4 += (q63_t)matData * vecData;
+
+            /* Decrement the loop counter */
+            colCnt--;
+        }
+
+        /* Truncate the Q63 accumulators back to Q31 and store */
+        *px++ = (q31_t)(sum1 >> 31);
+        *px++ = (q31_t)(sum2 >> 31);
+        *px++ = (q31_t)(sum3 >> 31);
+        *px++ = (q31_t)(sum4 >> 31);
+
+        i = i + numCols * 4;
+
+        /* Decrement the row loop counter */
+        row--;
+    }
+
+    /* Process any remaining rows */
+    row = numRows & 3u;
+    while (row > 0) {
+
+        q63_t sum = 0;
+        pInVec = pVec;
+        pInA1 = pSrcA + i;
+
+        /* Loop unrolling: 2 columns per iteration */
+        colCnt = numCols >> 1;
+
+        while (colCnt > 0) {
+            vecData = *pInVec++;
+            vecData2 = *pInVec++;
+            matData = *pInA1++;
+            matData2 = *pInA1++;
+            sum += (q63_t)matData * vecData;
+            sum += (q63_t)matData2 * vecData2;
+            colCnt--;
+        }
+
+        /* Remaining odd column, if any */
+        colCnt = numCols & 1u;
+        while (colCnt > 0) {
+            sum += (q63_t)*pInA1++ * *pInVec++;
+            colCnt--;
+        }
+
+        *px++ = (q31_t)(sum >> 31);
+        i = i + numCols;
+        row--;
+    }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ * @} end of MatrixVectMult group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q7.c
new file mode 100644
index 0000000000..0f031d924c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/MatrixFunctions/arm_mat_vec_mult_q7.c
@@ -0,0 +1,420 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mat_vec_mult_q7.c
+ * Description: Q7 matrix and vector multiplication
+ *
+ * $Date: 23 April 2021
+ *
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/matrix_functions.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+
+
+/**
+ * @addtogroup MatrixVectMult
+ * @{
+ */
+
+/**
+ * @brief Q7 matrix and vector multiplication.
+ * @param[in] *pSrcMat points to the input matrix structure
+ * @param[in] *pVec points to the input vector
+ * @param[out] *pDst points to the output vector
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/*
+ * Helium/MVE implementation: 4 rows per outer iteration, 16 q7 elements per
+ * 128-bit vector load, with 32-bit accumulators (vmladavaq). Column tails
+ * are handled with predicated loads (vctp8q / vldrbq_z_s8) so the unused
+ * lanes contribute zero.
+ */
+void arm_mat_vec_mult_q7(
+ const arm_matrix_instance_q7 * pSrcMat,
+ const q7_t *pSrcVec,
+ q7_t *pDstVec)
+{
+ const q7_t *pMatSrc = pSrcMat->pData;
+ const q7_t *pMat0, *pMat1;
+ uint32_t numRows = pSrcMat->numRows;
+ uint32_t numCols = pSrcMat->numCols;
+ q7_t *px;
+ int32_t row;
+ uint16_t blkCnt; /* loop counters */
+
+ row = numRows;
+ px = pDstVec;
+
+ /*
+ * compute 4x32-bit accumulators per loop
+ */
+ while (row >= 4)
+ {
+ q7_t const *pMat0Vec, *pMat1Vec, *pMat2Vec, *pMat3Vec, *pVec;
+ const q7_t *pMat2, *pMat3;
+ q7_t const *pSrcVecPtr = pSrcVec;
+ q31_t acc0, acc1, acc2, acc3;
+ q7x16_t vecMatA0, vecMatA1, vecMatA2, vecMatA3, vecIn;
+
+ pVec = pSrcVec;
+ /*
+ * Initialize the pointer pIn1 to point to the starting address of the column being processed
+ */
+ pMat0 = pMatSrc;
+ pMat1 = pMat0 + numCols;
+ pMat2 = pMat1 + numCols;
+ pMat3 = pMat2 + numCols;
+
+ acc0 = 0L;
+ acc1 = 0L;
+ acc2 = 0L;
+ acc3 = 0L;
+
+ pMat0Vec = pMat0;
+ pMat1Vec = pMat1;
+ pMat2Vec = pMat2;
+ pMat3Vec = pMat3;
+ pVec = pSrcVecPtr;
+
+ /* 16 columns (one 128-bit vector of q7) per iteration */
+ blkCnt = numCols >> 4;
+ while (blkCnt > 0U)
+ {
+
+ vecMatA0 = vld1q(pMat0Vec);
+ pMat0Vec += 16;
+ vecMatA1 = vld1q(pMat1Vec);
+ pMat1Vec += 16;
+ vecMatA2 = vld1q(pMat2Vec);
+ pMat2Vec += 16;
+ vecMatA3 = vld1q(pMat3Vec);
+ pMat3Vec += 16;
+ vecIn = vld1q(pVec);
+ pVec += 16;
+
+ acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+ acc1 = vmladavaq(acc1, vecIn, vecMatA1);
+ acc2 = vmladavaq(acc2, vecIn, vecMatA2);
+ acc3 = vmladavaq(acc3, vecIn, vecMatA3);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 0xF;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp8q(blkCnt);
+
+ /* only the vector load is predicated: inactive lanes of vecIn are
+ zero, so the matrix lanes multiplied with them add nothing */
+ vecMatA0 = vld1q(pMat0Vec);
+ vecMatA1 = vld1q(pMat1Vec);
+ vecMatA2 = vld1q(pMat2Vec);
+ vecMatA3 = vld1q(pMat3Vec);
+ vecIn = vldrbq_z_s8(pVec, p0);
+
+ acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+ acc1 = vmladavaq(acc1, vecIn, vecMatA1);
+ acc2 = vmladavaq(acc2, vecIn, vecMatA2);
+ acc3 = vmladavaq(acc3, vecIn, vecMatA3);
+ }
+
+ /* scale the accumulated Q7 x Q7 products back to Q7 and saturate */
+ *px++ = __SSAT(acc0 >> 7, 8);
+ *px++ = __SSAT(acc1 >> 7, 8);
+ *px++ = __SSAT(acc2 >> 7, 8);
+ *px++ = __SSAT(acc3 >> 7, 8);
+
+ pMatSrc += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * process any remaining rows pair
+ */
+ if (row >= 2)
+ {
+ q7_t const *pMat0Vec, *pMat1Vec, *pVec;
+ q7_t const *pSrcVecPtr = pSrcVec;
+ q31_t acc0, acc1;
+ q7x16_t vecMatA0, vecMatA1, vecIn;
+
+ /*
+ * For every row wise process, the pInVec pointer is set
+ * to the starting address of the vector
+ */
+ pVec = pSrcVec;
+
+ /*
+ * Initialize the pointer pIn1 to point to the starting address of the column being processed
+ */
+ pMat0 = pMatSrc;
+ pMat1 = pMat0 + numCols;
+
+ acc0 = 0;
+ acc1 = 0;
+
+ pMat0Vec = pMat0;
+ pMat1Vec = pMat1;
+ pVec = pSrcVecPtr;
+
+ blkCnt = numCols >> 4;
+ while (blkCnt > 0U)
+ {
+ vecMatA0 = vld1q(pMat0Vec);
+ pMat0Vec += 16;
+ vecMatA1 = vld1q(pMat1Vec);
+ pMat1Vec += 16;
+ vecIn = vld1q(pVec);
+ pVec += 16;
+
+ acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+ acc1 = vmladavaq(acc1, vecIn, vecMatA1);
+
+ blkCnt--;
+ }
+
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 0xF;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp8q(blkCnt);
+
+ vecMatA0 = vld1q(pMat0Vec);
+ vecMatA1 = vld1q(pMat1Vec);
+ vecIn = vldrbq_z_s8(pVec, p0);
+
+ acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+ acc1 = vmladavaq(acc1, vecIn, vecMatA1);
+ }
+
+ *px++ = __SSAT(acc0 >> 7, 8);
+ *px++ = __SSAT(acc1 >> 7, 8);
+
+ pMatSrc += numCols * 2;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 2;
+ }
+
+ if (row >= 1)
+ {
+ q7_t const *pMat0Vec, *pVec;
+ q7_t const *pSrcVecPtr = pSrcVec;
+ q31_t acc0;
+ q7x16_t vecMatA0, vecIn;
+
+ /*
+ * For every row wise process, the pInVec pointer is set
+ * to the starting address of the vector
+ */
+ pVec = pSrcVec;
+
+ /*
+ * Initialize the pointer pIn1 to point to the starting address of the column being processed
+ */
+ pMat0 = pMatSrc;
+
+ acc0 = 0LL;
+
+ pMat0Vec = pMat0;
+ pVec = pSrcVecPtr;
+
+ blkCnt = numCols >> 4;
+ while (blkCnt > 0U)
+ {
+ vecMatA0 = vld1q(pMat0Vec);
+ pMat0Vec += 16;
+ vecIn = vld1q(pVec);
+ pVec += 16;
+
+ acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 0xF;
+ if (blkCnt > 0U)
+ {
+ mve_pred16_t p0 = vctp8q(blkCnt);
+
+ vecMatA0 = vld1q(pMat0Vec);
+ vecIn = vldrbq_z_s8(pVec, p0);
+ acc0 = vmladavaq(acc0, vecIn, vecMatA0);
+ }
+ *px++ = __SSAT(acc0 >> 7, 8);
+ }
+}
+
+#else
+/*
+ * Scalar/DSP-extension implementation: 4 rows per outer iteration. Four
+ * packed q7 values are loaded at once (read_q7x4_ia) and unpacked with
+ * __SXTB16/__ROR into two halfword pairs so that __SMLAD can perform a
+ * dual 16-bit multiply-accumulate per call.
+ */
+void arm_mat_vec_mult_q7(const arm_matrix_instance_q7 *pSrcMat, const q7_t *pVec, q7_t *pDst)
+{
+ uint32_t numRows = pSrcMat->numRows;
+ uint32_t numCols = pSrcMat->numCols;
+ const q7_t *pSrcA = pSrcMat->pData;
+ const q7_t *pInA1; /* input data matrix pointer of Q7 type */
+ const q7_t *pInA2; /* input data matrix pointer of Q7 type */
+ const q7_t *pInA3; /* input data matrix pointer of Q7 type */
+ const q7_t *pInA4; /* input data matrix pointer of Q7 type */
+ const q7_t *pInVec; /* input data vector pointer of Q7 type */
+ q7_t *px; /* output data pointer */
+ uint32_t i, row, colCnt; /* loop counters */
+
+ q31_t matData, matData2, vecData, vecData2;
+
+
+ /* Process 4 rows at a time */
+ row = numRows >> 2;
+ i = 0u;
+ px = pDst;
+
+
+
+ /* The following loop performs the dot-product of each row in pSrcA with the vector */
+ while (row > 0) {
+ /* For every row wise process, the pInVec pointer is set
+ ** to the starting address of the vector */
+ pInVec = pVec;
+
+ /* Initialize accumulators */
+ q31_t sum1 = 0;
+ q31_t sum2 = 0;
+ q31_t sum3 = 0;
+ q31_t sum4 = 0;
+
+ /* Loop unrolling: process 4 columns per iteration */
+ colCnt = numCols >> 2;
+
+ /* Initialize row pointers so we can track 4 rows at once */
+ pInA1 = pSrcA + i;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ pInA4 = pInA3 + numCols;
+
+
+ // Inner loop: matrix-vector multiplication
+
+ while (colCnt > 0u) {
+ // Read 4 values from vector
+ vecData = read_q7x4_ia ((q7_t **) &pInVec);
+ // vecData2 holds sign-extended bytes 1 and 3, vecData bytes 0 and 2
+ vecData2 = __SXTB16(__ROR(vecData, 8));
+ vecData = __SXTB16(vecData);
+ // Read 16 values from the matrix - 4 values from each of 4 rows, and do multiply accumulate
+ matData = read_q7x4_ia ((q7_t **) &pInA1);
+ matData2 = __SXTB16(__ROR(matData, 8));
+ matData = __SXTB16(matData);
+ sum1 = __SMLAD(matData, vecData, sum1);
+ sum1 = __SMLAD(matData2, vecData2, sum1);
+ matData = read_q7x4_ia ((q7_t **) &pInA2);
+ matData2 = __SXTB16(__ROR(matData, 8));
+ matData = __SXTB16(matData);
+ sum2 = __SMLAD(matData, vecData, sum2);
+ sum2 = __SMLAD(matData2, vecData2, sum2);
+ matData = read_q7x4_ia ((q7_t **) &pInA3);
+ matData2 = __SXTB16(__ROR(matData, 8));
+ matData = __SXTB16(matData);
+ sum3 = __SMLAD(matData, vecData, sum3);
+ sum3 = __SMLAD(matData2, vecData2, sum3);
+ matData = read_q7x4_ia ((q7_t **) &pInA4);
+ matData2 = __SXTB16(__ROR(matData, 8));
+ matData = __SXTB16(matData);
+ sum4 = __SMLAD(matData, vecData, sum4);
+ sum4 = __SMLAD(matData2, vecData2, sum4);
+
+ // Decrement the loop counter
+ colCnt--;
+ }
+
+ /* process any remaining columns */
+
+ colCnt = numCols & 3u;
+
+ while (colCnt > 0) {
+ vecData = *pInVec++;
+ sum1 += *pInA1++ * vecData;
+ sum2 += *pInA2++ * vecData;
+ sum3 += *pInA3++ * vecData;
+ sum4 += *pInA4++ * vecData;
+ colCnt--;
+ }
+
+ /* Saturate and store the result in the destination buffer */
+ *px++ = (q7_t)(__SSAT((sum1 >> 7), 8));
+ *px++ = (q7_t)(__SSAT((sum2 >> 7), 8));
+ *px++ = (q7_t)(__SSAT((sum3 >> 7), 8));
+ *px++ = (q7_t)(__SSAT((sum4 >> 7), 8));
+
+ i = i + numCols * 4;
+
+ /* Decrement the row loop counter */
+ row--;
+ }
+
+ /* process any remaining rows */
+ row = numRows & 3u;
+ while (row > 0) {
+
+ q31_t sum = 0;
+ pInVec = pVec;
+ pInA1 = pSrcA + i;
+
+ // loop unrolling - process 4 elements at a time
+ colCnt = numCols >> 2;
+
+ while (colCnt > 0) {
+ vecData = read_q7x4_ia ((q7_t **) &pInVec);
+ vecData2 = __SXTB16(__ROR(vecData, 8));
+ vecData = __SXTB16(vecData);
+ matData = read_q7x4_ia ((q7_t **) &pInA1);
+ matData2 = __SXTB16(__ROR(matData, 8));
+ matData = __SXTB16(matData);
+ sum = __SMLAD(matData, vecData, sum);
+ sum = __SMLAD(matData2, vecData2, sum);
+ colCnt--;
+ }
+
+ // process remainder of row
+ colCnt = numCols & 3u;
+ while (colCnt > 0) {
+ sum += *pInA1++ * *pInVec++;
+ colCnt--;
+ }
+ *px++ = (q7_t)(__SSAT((sum >> 7), 8));
+ i = i + numCols;
+ row--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ * @} end of MatrixVectMult group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/QuaternionMathFunctions.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/QuaternionMathFunctions.c
new file mode 100644
index 0000000000..7c48178eec
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/QuaternionMathFunctions.c
@@ -0,0 +1,34 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: QuaternionMathFunctions.c
+ * Description: Combination of all quaternion math function source files.
+ *
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_quaternion_norm_f32.c"
+#include "arm_quaternion_inverse_f32.c"
+#include "arm_quaternion_conjugate_f32.c"
+#include "arm_quaternion_normalize_f32.c"
+#include "arm_quaternion_product_single_f32.c"
+#include "arm_quaternion_product_f32.c"
+#include "arm_quaternion2rotation_f32.c"
+#include "arm_rotation2quaternion_f32.c"
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion2rotation_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion2rotation_f32.c
new file mode 100644
index 0000000000..14d6669cda
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion2rotation_f32.c
@@ -0,0 +1,180 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_quaternion2rotation_f32.c
+ * Description: Floating-point quaternion 2 rotation conversion
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+/**
+ @ingroup groupQuaternionMath
+ */
+
+/**
+ @defgroup QuatConv Quaternion conversions
+
+ Conversions between quaternion and rotation representations.
+ */
+
+/**
+ @ingroup QuatConv
+ */
+
+/**
+ @defgroup QuatRot Quaternion to Rotation
+
+ Conversions from quaternion to rotation.
+ */
+
+/**
+ @addtogroup QuatRot
+ @{
+ */
+
+/**
+ @brief Conversion of quaternion to equivalent rotation matrix.
+ @param[in] pInputQuaternions points to an array of normalized quaternions
+ @param[out] pOutputRotations points to an array of 3x3 rotations (in row order)
+ @param[in] nbQuaternions number of quaternions in the array
+ @return none.
+
+ @par
+ Format of rotation matrix
+
+
+ The quaternion a + ib + jc + kd is converted into rotation matrix:
+
+ a^2 + b^2 - c^2 - d^2 2bc - 2ad 2bd + 2ac
+ 2bc + 2ad a^2 - b^2 + c^2 - d^2 2cd - 2ab
+ 2bd - 2ac 2cd + 2ab a^2 - b^2 - c^2 + d^2
+
+ Rotation matrix is saved in row order : R00 R01 R02 R10 R11 R12 R20 R21 R22
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/*
+ * Helium/MVE implementation. The first 8 elements of each 3x3 rotation are
+ * assembled in two 4-lane vectors (row order R00 R01 R02 R10 | R11 R12 R20
+ * R21) and the final element R22 is stored as a scalar, so 9 floats are
+ * written per quaternion.
+ */
+void arm_quaternion2rotation_f32(const float32_t *pInputQuaternions,
+ float32_t *pOutputRotations,
+ uint32_t nbQuaternions)
+{
+ f32x4_t vec0,vec1, vec2 ,vec3;
+ float32_t q2q3, tmp1, tmp2 ;
+
+ for(uint32_t nb=0; nb < nbQuaternions; nb++)
+ {
+
+ // q0 q1 q2 q3
+ vec0 = vld1q(pInputQuaternions);
+
+ // q0^2 q1^2 q2^2 q3^2
+ vec1 = vmulq(vec0,vec0);
+
+ // q0^2 q1q0 q2q0 q3q0
+ vec2 = vmulq_n_f32(vec0, vgetq_lane(vec0,0));
+
+ // 2 (q0^2 q1q0 q2q0 q3q0)
+ vec2 = vmulq_n_f32(vec2, 2.0f);
+
+
+ // 2 q2q3
+ q2q3 = vgetq_lane(vec0,2) * vgetq_lane(vec0,3);
+ q2q3 = q2q3 * 2.0f;
+
+ // 2 (q0q1 q1^2 q2q1 q3q1)
+ vec3 = vmulq_n_f32(vec0, vgetq_lane(vec0,1));
+ vec3 = vmulq_n_f32(vec3, 2.0f);
+
+
+ /* assemble the first 4 rotation elements: R00 R01 R02 R10 */
+ vec0 = vsetq_lane(vgetq_lane(vec1,0) + vgetq_lane(vec1,1),vec0,0);
+ vec0 = vsetq_lane(vgetq_lane(vec0,0) - vgetq_lane(vec1,2),vec0,0);
+ vec0 = vsetq_lane(vgetq_lane(vec0,0) - vgetq_lane(vec1,3),vec0,0);
+ vec0 = vsetq_lane(vgetq_lane(vec3,2) - vgetq_lane(vec2,3),vec0,1);
+ vec0 = vsetq_lane(vgetq_lane(vec3,3) + vgetq_lane(vec2,2),vec0,2);
+ vec0 = vsetq_lane(vgetq_lane(vec3,2) + vgetq_lane(vec2,3),vec0,3);
+
+ vst1q(pOutputRotations, vec0);
+ pOutputRotations += 4;
+
+ tmp1 = vgetq_lane(vec1,0) - vgetq_lane(vec1,1);
+ tmp2 = vgetq_lane(vec1,2) - vgetq_lane(vec1,3);
+
+
+ /* assemble the next 4 rotation elements: R11 R12 R20 R21 */
+ vec0 = vsetq_lane(tmp1 + tmp2,vec0,0);
+ vec0 = vsetq_lane(q2q3 - vgetq_lane(vec2,1) ,vec0,1);
+ vec0 = vsetq_lane(vgetq_lane(vec3,3) - vgetq_lane(vec2,2),vec0,2);
+ vec0 = vsetq_lane(q2q3 + vgetq_lane(vec2,1) ,vec0,3);
+
+ vst1q(pOutputRotations, vec0);
+ pOutputRotations += 4;
+
+ /* last rotation element R22, stored as a scalar */
+ *pOutputRotations = tmp1 - tmp2;
+ pOutputRotations ++;
+
+ pInputQuaternions += 4;
+ }
+}
+
+#else
+void arm_quaternion2rotation_f32(const float32_t *pInputQuaternions,
+    float32_t *pOutputRotations,
+    uint32_t nbQuaternions)
+{
+    const float32_t *pq = pInputQuaternions;  /* current quaternion (a, b, c, d) */
+    float32_t *pr = pOutputRotations;         /* current 3x3 output, row order   */
+
+    /* For each quaternion a + ib + jc + kd, emit the equivalent rotation
+       matrix as 9 floats in row order: R00 R01 R02 R10 R11 R12 R20 R21 R22. */
+    for (uint32_t nb = 0; nb < nbQuaternions; nb++)
+    {
+        const float32_t a = pq[0];
+        const float32_t b = pq[1];
+        const float32_t c = pq[2];
+        const float32_t d = pq[3];
+
+        /* squares and pairwise products of the components */
+        const float32_t q00 = SQ(a);
+        const float32_t q11 = SQ(b);
+        const float32_t q22 = SQ(c);
+        const float32_t q33 = SQ(d);
+        const float32_t q01 = a * b;
+        const float32_t q02 = a * c;
+        const float32_t q03 = a * d;
+        const float32_t q12 = b * c;
+        const float32_t q13 = b * d;
+        const float32_t q23 = c * d;
+
+        pr[0] = q00 + q11 - q22 - q33;   /* R00 */
+        pr[1] = 2*(q12 - q03);           /* R01 */
+        pr[2] = 2*(q13 + q02);           /* R02 */
+        pr[3] = 2*(q12 + q03);           /* R10 */
+        pr[4] = q00 - q11 + q22 - q33;   /* R11 */
+        pr[5] = 2*(q23 - q01);           /* R12 */
+        pr[6] = 2*(q13 - q02);           /* R20 */
+        pr[7] = 2*(q23 + q01);           /* R21 */
+        pr[8] = q00 - q11 - q22 + q33;   /* R22 */
+
+        pq += 4;
+        pr += 9;
+    }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of QuatRot group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_conjugate_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_conjugate_f32.c
new file mode 100644
index 0000000000..c97cffdb94
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_conjugate_f32.c
@@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_quaternion_conjugate_f32.c
+ * Description: Floating-point quaternion conjugate
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+/**
+ @ingroup groupQuaternionMath
+ */
+
+/**
+ @defgroup QuatConjugate Quaternion Conjugate
+
+ Compute the conjugate of a quaternion.
+ */
+
+/**
+ @addtogroup QuatConjugate
+ @{
+ */
+
+/**
+ @brief Floating-point quaternion conjugates.
+ @param[in] pInputQuaternions points to the input vector of quaternions
+ @param[out] pConjugateQuaternions points to the output vector of conjugate quaternions
+ @param[in] nbQuaternions number of quaternions in each vector
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+/*
+ * Helium/MVE implementation: lane 0 (the real part) is negated first and
+ * the whole vector is then negated, so the net effect is w unchanged and
+ * x, y, z negated.
+ */
+void arm_quaternion_conjugate_f32(const float32_t *pInputQuaternions,
+ float32_t *pConjugateQuaternions,
+ uint32_t nbQuaternions)
+{
+ f32x4_t vec1;
+
+ for(uint32_t i=0; i < nbQuaternions; i++)
+ {
+ vec1 = vld1q(pInputQuaternions);
+
+
+ /* double negation trick: -(-w, x, y, z) = (w, -x, -y, -z) */
+ vec1 = vsetq_lane_f32(-vgetq_lane(vec1, 0),vec1,0);
+ vec1 = vnegq_f32(vec1);
+
+ vst1q(pConjugateQuaternions, vec1);
+
+
+ pInputQuaternions += 4;
+ pConjugateQuaternions += 4;
+ }
+}
+#else
+void arm_quaternion_conjugate_f32(const float32_t *pInputQuaternions,
+    float32_t *pConjugateQuaternions,
+    uint32_t nbQuaternions)
+{
+    const float32_t *pIn = pInputQuaternions;
+    float32_t *pOut = pConjugateQuaternions;
+
+    /* Conjugate: keep the real part, negate the three imaginary parts. */
+    for (uint32_t i = 0; i < nbQuaternions; i++)
+    {
+        *pOut++ = *pIn++;        /*  w */
+        *pOut++ = -(*pIn++);     /* -x */
+        *pOut++ = -(*pIn++);     /* -y */
+        *pOut++ = -(*pIn++);     /* -z */
+    }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of QuatConjugate group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_inverse_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_inverse_f32.c
new file mode 100644
index 0000000000..6c6f934e21
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_inverse_f32.c
@@ -0,0 +1,113 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_quaternion_inverse_f32.c
+ * Description: Floating-point quaternion inverse
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+/**
+ @ingroup groupQuaternionMath
+ */
+
+/**
+ @defgroup QuatInverse Quaternion Inverse
+
+ Compute the inverse of a quaternion.
+ */
+
+/**
+ @addtogroup QuatInverse
+ @{
+ */
+
+/**
+ @brief Floating-point quaternion inverse.
+ @param[in] pInputQuaternions points to the input vector of quaternions
+ @param[out] pInverseQuaternions points to the output vector of inverse quaternions
+ @param[in] nbQuaternions number of quaternions in each vector
+ @return none
+ */
+
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/*
+ * Helium/MVE implementation: the squared norm is computed with an
+ * element-wise multiply followed by an across-vector add, then the
+ * quaternion is scaled by its reciprocal and conjugated in-register
+ * (negate lane 0, then negate the whole vector).
+ */
+void arm_quaternion_inverse_f32(const float32_t *pInputQuaternions,
+ float32_t *pInverseQuaternions,
+ uint32_t nbQuaternions)
+{
+ f32x4_t vec1,vec2;
+ float32_t squaredSum;
+
+ for(uint32_t i=0; i < nbQuaternions; i++)
+ {
+
+ vec1 = vld1q(pInputQuaternions);
+ vec2 = vmulq(vec1,vec1);
+ squaredSum = vecAddAcrossF32Mve(vec2);
+
+
+ /* inverse = conjugate / squared norm */
+ vec1 = vmulq_n_f32(vec1, 1.0f / squaredSum);
+ vec1 = vsetq_lane_f32(-vgetq_lane(vec1, 0),vec1,0);
+ vec1 = vnegq_f32(vec1);
+
+ vst1q(pInverseQuaternions, vec1);
+
+
+ pInputQuaternions += 4;
+ pInverseQuaternions += 4;
+
+ }
+}
+
+#else
+void arm_quaternion_inverse_f32(const float32_t *pInputQuaternions,
+    float32_t *pInverseQuaternions,
+    uint32_t nbQuaternions)
+{
+    for (uint32_t i = 0; i < nbQuaternions; i++)
+    {
+        const float32_t *q = &pInputQuaternions[4 * i];
+        float32_t *inv = &pInverseQuaternions[4 * i];
+
+        /* squared norm of the quaternion */
+        const float32_t normSq = SQ(q[0]) + SQ(q[1]) + SQ(q[2]) + SQ(q[3]);
+
+        /* inverse = conjugate / squared norm */
+        inv[0] = q[0] / normSq;
+        inv[1] = -q[1] / normSq;
+        inv[2] = -q[2] / normSq;
+        inv[3] = -q[3] / normSq;
+    }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of QuatInverse group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_norm_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_norm_f32.c
new file mode 100644
index 0000000000..59d8559378
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_norm_f32.c
@@ -0,0 +1,101 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_quaternion_norm_f32.c
+ * Description: Floating-point quaternion Norm
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+/**
+ @ingroup groupQuaternionMath
+ */
+
+/**
+ @defgroup QuatNorm Quaternion Norm
+
+ Compute the norm of a quaternion.
+ */
+
+/**
+ @addtogroup QuatNorm
+ @{
+ */
+
+/**
+ @brief Floating-point quaternion Norm.
+ @param[in] pInputQuaternions points to the input vector of quaternions
+ @param[out] pNorms points to the output vector of norms
+ @param[in] nbQuaternions number of quaternions in the input vector
+ @return none
+ */
+
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/* Helium/MVE implementation: norm = sqrt(lane-wise square then across-vector add). */
+void arm_quaternion_norm_f32(const float32_t *pInputQuaternions,
+ float32_t *pNorms,
+ uint32_t nbQuaternions)
+{
+ f32x4_t vec1;
+ float32_t squaredSum;
+
+ for(uint32_t i=0; i < nbQuaternions; i++)
+ {
+ vec1 = vld1q(pInputQuaternions);
+ vec1 = vmulq(vec1,vec1);
+ squaredSum = vecAddAcrossF32Mve(vec1);
+ /* arm_sqrt_f32 writes its result through the pointer, directly into pNorms. */
+ arm_sqrt_f32(squaredSum,pNorms);
+
+ pInputQuaternions+= 4;
+ pNorms ++;
+ }
+
+}
+
+#else
+
+/* Scalar implementation: pNorms[i] = sqrt(sum of the 4 squared components). */
+void arm_quaternion_norm_f32(const float32_t *pInputQuaternions,
+ float32_t *pNorms,
+ uint32_t nbQuaternions)
+{
+ float32_t temp; /* squared norm accumulator */
+
+ for(uint32_t i=0; i < nbQuaternions; i++)
+ {
+ temp = SQ(pInputQuaternions[4 * i + 0]) +
+ SQ(pInputQuaternions[4 * i + 1]) +
+ SQ(pInputQuaternions[4 * i + 2]) +
+ SQ(pInputQuaternions[4 * i + 3]);
+ pNorms[i] = sqrtf(temp);
+ }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of QuatNorm group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_normalize_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_normalize_f32.c
new file mode 100644
index 0000000000..56195c2033
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_normalize_f32.c
@@ -0,0 +1,106 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_quaternion_normalize_f32.c
+ * Description: Floating-point quaternion normalization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+/**
+ @ingroup groupQuaternionMath
+ */
+
+/**
+ @defgroup QuatNormalized Quaternion normalization
+
+ Compute a normalized quaternion.
+ */
+
+/**
+ @addtogroup QuatNormalized
+ @{
+ */
+
+/**
+ @brief Floating-point normalization of quaternions.
+ @param[in] pInputQuaternions points to the input vector of quaternions
+ @param[out] pNormalizedQuaternions points to the output vector of normalized quaternions
+ @param[in] nbQuaternions number of quaternions in each vector
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/* Helium/MVE implementation: scale each quaternion by the reciprocal of its norm. */
+void arm_quaternion_normalize_f32(const float32_t *pInputQuaternions,
+ float32_t *pNormalizedQuaternions,
+ uint32_t nbQuaternions)
+{
+ f32x4_t vec1,vec2;
+ float32_t squaredSum,norm;
+
+ for(uint32_t i=0; i < nbQuaternions; i++)
+ {
+ /* squaredSum = sum of squared lanes; norm = sqrt(squaredSum). */
+ vec1 = vld1q(pInputQuaternions);
+ vec2 = vmulq(vec1,vec1);
+ squaredSum = vecAddAcrossF32Mve(vec2);
+ arm_sqrt_f32(squaredSum,&norm);
+ vec1 = vmulq_n_f32(vec1, 1.0f / norm);
+ vst1q(pNormalizedQuaternions, vec1);
+
+ pInputQuaternions += 4;
+ pNormalizedQuaternions += 4;
+
+ }
+}
+
+#else
+/* Scalar implementation: divide all 4 components by the quaternion norm. */
+void arm_quaternion_normalize_f32(const float32_t *pInputQuaternions,
+ float32_t *pNormalizedQuaternions,
+ uint32_t nbQuaternions)
+{
+ float32_t temp;
+
+ for(uint32_t i=0; i < nbQuaternions; i++)
+ {
+ temp = SQ(pInputQuaternions[4 * i + 0]) +
+ SQ(pInputQuaternions[4 * i + 1]) +
+ SQ(pInputQuaternions[4 * i + 2]) +
+ SQ(pInputQuaternions[4 * i + 3]);
+ /* temp is reused: it now holds the norm, not the squared norm. */
+ temp = sqrtf(temp);
+
+ pNormalizedQuaternions[4 * i + 0] = pInputQuaternions[4 * i + 0] / temp;
+ pNormalizedQuaternions[4 * i + 1] = pInputQuaternions[4 * i + 1] / temp;
+ pNormalizedQuaternions[4 * i + 2] = pInputQuaternions[4 * i + 2] / temp;
+ pNormalizedQuaternions[4 * i + 3] = pInputQuaternions[4 * i + 3] / temp;
+ }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of QuatNormalized group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_product_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_product_f32.c
new file mode 100644
index 0000000000..8f566edfaf
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_product_f32.c
@@ -0,0 +1,148 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_quaternion_product_f32.c
+ * Description: Floating-point quaternion product
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+/**
+ @ingroup groupQuaternionMath
+ */
+
+/**
+ @defgroup QuatProd Quaternion Product
+
+ Compute the product of quaternions.
+ */
+
+/**
+ @ingroup QuatProd
+ */
+
+/**
+ @defgroup QuatProdVect Elementwise Quaternion Product
+
+ Compute the elementwise product of quaternions.
+ */
+
+/**
+ @addtogroup QuatProdVect
+ @{
+ */
+
+/**
+ @brief Floating-point elementwise product two quaternions.
+ @param[in] qa first array of quaternions
+ @param[in] qb second array of quaternions
+ @param[out] qr elementwise product of quaternions
+ @param[in] nbQuaternions number of quaternions in the array
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/* Helium/MVE implementation: the Hamilton product is assembled from complex
+ multiply / multiply-accumulate intrinsics applied to gathered lane patterns. */
+void arm_quaternion_product_f32(const float32_t *qa,
+ const float32_t *qb,
+ float32_t *qr,
+ uint32_t nbQuaternions)
+{
+ /* Element-offset gather patterns into the current 4-float quaternion,
+ and the sign mask used to build the reordered second operand. */
+ static uint32_t patternA[4] = { 0, 1, 0, 1 };
+ static uint32_t patternB[4] = { 3, 2, 3, 2 };
+ static uint32_t patternC[4] = { 3, 2, 1, 0 };
+ static float32_t signA[4] = { -1, -1, 1, 1 };
+
+ uint32x4_t vecA = vld1q_u32(patternA);
+ uint32x4_t vecB = vld1q_u32(patternB);
+ uint32x4_t vecC = vld1q_u32(patternC);
+ f32x4_t vecSignA = vld1q_f32(signA);
+
+ while (nbQuaternions > 0U)
+ {
+ f32x4_t vecTmpA, vecTmpB, vecAcc;
+
+ /* Gather [a1, a2, a1, a2] from qa; load [b1, b2, b3, b4] from qb. */
+ vecTmpA = vldrwq_gather_shifted_offset_f32(qa, vecA);
+ vecTmpB = vld1q(qb);
+ /*
+ * vcmul(r, [a1, a2, a1, a2], [b1, b2, b3, b4], 0)
+ */
+ vecAcc = vcmulq(vecTmpA, vecTmpB);
+ /*
+ * vcmla(r, [a1, a2, a1, a2], [b1, b2, b3, b4], 90)
+ */
+ vecAcc = vcmlaq_rot90(vecAcc, vecTmpA, vecTmpB);
+
+ /* Gather [a4, a3, a4, a3] from qa and the reversed qb. */
+ vecTmpA = vldrwq_gather_shifted_offset_f32(qa, vecB);
+ vecTmpB = vldrwq_gather_shifted_offset_f32(qb, vecC);
+ /*
+ * build [-b4, -b3, b2, b1]
+ */
+ vecTmpB = vecTmpB * vecSignA;
+ /*
+ * vcmla(r, [a4, a3, a4, a3], [-b4, -b3, b2, b1], 270)
+ */
+ vecAcc = vcmlaq_rot270(vecAcc, vecTmpA, vecTmpB);
+ /*
+ * vcmla(r, [a4, a3, a4, a3], [-b4, -b3, b2, b1], 0)
+ */
+ vecAcc = vcmlaq(vecAcc, vecTmpA, vecTmpB);
+ /*
+ * store accumulator
+ */
+ vst1q_f32(qr, vecAcc);
+
+ /* move to next quaternion */
+ qa += 4;
+ qb += 4;
+ qr += 4;
+
+ nbQuaternions--;
+ }
+}
+
+#else
+
+/* Scalar implementation: delegate each product to the single-quaternion kernel. */
+void arm_quaternion_product_f32(const float32_t *qa,
+ const float32_t *qb,
+ float32_t *qr,
+ uint32_t nbQuaternions)
+{
+ for(uint32_t i=0; i < nbQuaternions; i++)
+ {
+ arm_quaternion_product_single_f32(qa, qb, qr);
+
+ /* Advance to the next quaternion (4 floats each). */
+ qa += 4;
+ qb += 4;
+ qr += 4;
+ }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of QuatProdVect group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_product_single_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_product_single_f32.c
new file mode 100644
index 0000000000..e72b3297ef
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_quaternion_product_single_f32.c
@@ -0,0 +1,107 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_quaternion_product_single_f32.c
+ * Description: Floating-point quaternion product
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+
+/**
+ @ingroup QuatProd
+ */
+
+/**
+ @defgroup QuatProdSingle Quaternion Product
+
+ Compute the product of two quaternions.
+ */
+
+/**
+ @addtogroup QuatProdSingle
+ @{
+ */
+
+/**
+ @brief Floating-point product of two quaternions.
+ @param[in] qa first quaternion
+ @param[in] qb second quaternion
+ @param[out] qr product of two quaternions
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+/* Helium/MVE implementation: single Hamilton product qr = qa * qb, using the
+ same gather-pattern / complex-MLA scheme as the vectorized batch version. */
+void arm_quaternion_product_single_f32(const float32_t *qa,
+ const float32_t *qb,
+ float32_t *qr)
+{
+ /* Element-offset gather patterns and sign mask for operand reordering. */
+ static uint32_t patternA[4] = { 0, 1, 0, 1 };
+ static uint32_t patternB[4] = { 3, 2, 3, 2 };
+ static uint32_t patternC[4] = { 3, 2, 1, 0 };
+ static float32_t signA[4] = { -1, -1, 1, 1 };
+
+ uint32x4_t vecA = vld1q_u32(patternA);
+ uint32x4_t vecB = vld1q_u32(patternB);
+ uint32x4_t vecC = vld1q_u32(patternC);
+ f32x4_t vecSignA = vld1q_f32(signA);
+
+
+ f32x4_t vecTmpA, vecTmpB, vecAcc;
+
+ /* [a1, a2, a1, a2] gathered from qa; [b1, b2, b3, b4] loaded from qb. */
+ vecTmpA = vldrwq_gather_shifted_offset_f32(qa, vecA);
+ vecTmpB = vld1q_f32(qb);
+
+ vecAcc = vcmulq_f32(vecTmpA, vecTmpB);
+ vecAcc = vcmlaq_rot90_f32(vecAcc, vecTmpA, vecTmpB);
+
+ /* [a4, a3, a4, a3] and reversed qb; sign mask builds [-b4, -b3, b2, b1]. */
+ vecTmpA = vldrwq_gather_shifted_offset_f32(qa, vecB);
+ vecTmpB = vldrwq_gather_shifted_offset_f32(qb, vecC);
+
+ vecTmpB = vecTmpB * vecSignA;
+
+ vecAcc = vcmlaq_rot270_f32(vecAcc, vecTmpA, vecTmpB);
+ vecAcc = vcmlaq_f32(vecAcc, vecTmpA, vecTmpB);
+
+ vst1q_f32(qr, vecAcc);
+}
+
+#else
+/* Scalar Hamilton product qr = qa * qb: element 0 is the scalar part
+ (product of scalars minus dot of vector parts); 1..3 are the vector part. */
+void arm_quaternion_product_single_f32(const float32_t *qa,
+ const float32_t *qb,
+ float32_t *qr)
+{
+ qr[0] = qa[0] * qb[0] - qa[1] * qb[1] - qa[2] * qb[2] - qa[3] * qb[3];
+ qr[1] = qa[0] * qb[1] + qa[1] * qb[0] + qa[2] * qb[3] - qa[3] * qb[2];
+ qr[2] = qa[0] * qb[2] + qa[2] * qb[0] + qa[3] * qb[1] - qa[1] * qb[3];
+ qr[3] = qa[0] * qb[3] + qa[3] * qb[0] + qa[1] * qb[2] - qa[2] * qb[1];
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of QuatProdSingle group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_rotation2quaternion_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_rotation2quaternion_f32.c
new file mode 100644
index 0000000000..1ce8a71191
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/QuaternionMathFunctions/arm_rotation2quaternion_f32.c
@@ -0,0 +1,224 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rotation2quaternion_f32.c
+ * Description: Floating-point rotation to quaternion conversion
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/quaternion_math_functions.h"
+#include <math.h>
+
+#define RI(x,y) r[(3*(x) + (y))]
+
+
+/**
+ @ingroup QuatConv
+ */
+
+/**
+ @defgroup RotQuat Rotation to Quaternion
+
+ Conversions from rotation to quaternion.
+ */
+
+/**
+ @addtogroup RotQuat
+ @{
+ */
+
+/**
+ * @brief Conversion of a rotation matrix to an equivalent quaternion.
+ * @param[in] pInputRotations points to an array 3x3 rotation matrix (in row order)
+ * @param[out] pOutputQuaternions points to an array quaternions
+ * @param[in] nbQuaternions number of quaternions in the array
+ * @return none.
+ *
+ * q and -q are representing the same rotation. This ambiguity must be taken into
+ * account when using the output of this function.
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+#define R00 vgetq_lane(q1,0)
+#define R01 vgetq_lane(q1,1)
+#define R02 vgetq_lane(q1,2)
+#define R10 vgetq_lane(q1,3)
+#define R11 vgetq_lane(q2,0)
+#define R12 vgetq_lane(q2,1)
+#define R20 vgetq_lane(q2,2)
+#define R21 vgetq_lane(q2,3)
+#define R22 ro22
+
+/* Helium/MVE implementation. The 3x3 row-major matrix is read as two f32x4
+ loads plus one scalar: q1 = {r00,r01,r02,r10}, q2 = {r11,r12,r20,r21},
+ ro22 = r22 (lane access goes through the R00..R22 macros above).
+ The branch picks trace vs. the largest diagonal element so the sqrt
+ argument stays positive (standard matrix-to-quaternion recovery). */
+void arm_rotation2quaternion_f32(const float32_t *pInputRotations,
+ float32_t *pOutputQuaternions,
+ uint32_t nbQuaternions)
+{
+ float32_t ro22, trace;
+ f32x4_t q1,q2, q;
+
+ float32_t doubler; /* 2*sqrt(...) = 4*|selected component| */
+ float32_t s; /* reciprocal scale applied to the other components */
+
+ q = vdupq_n_f32(0.0f);
+
+ for(uint32_t nb=0; nb < nbQuaternions; nb++)
+ {
+ q1 = vld1q(pInputRotations);
+ pInputRotations += 4;
+
+ q2 = vld1q(pInputRotations);
+ pInputRotations += 4;
+
+ ro22 = *pInputRotations++;
+
+ trace = R00 + R11 + R22;
+
+
+ if (trace > 0)
+ {
+ (void)arm_sqrt_f32(trace + 1.0, &doubler) ; // invs=4*qw
+ doubler = 2*doubler;
+ s = 1.0 / doubler;
+
+ /* Pre-scale the matrix lanes so the R macros below yield (Rij * s). */
+ q1 = vmulq_n_f32(q1,s);
+ q2 = vmulq_n_f32(q2,s);
+
+ q[0] = 0.25 * doubler;
+ q[1] = R21 - R12;
+ q[2] = R02 - R20;
+ q[3] = R10 - R01;
+ }
+ else if ((R00 > R11) && (R00 > R22) )
+ {
+ (void)arm_sqrt_f32(1.0 + R00 - R11 - R22,&doubler); // invs=4*qx
+ doubler = 2*doubler;
+ s = 1.0 / doubler;
+
+ q1 = vmulq_n_f32(q1,s);
+ q2 = vmulq_n_f32(q2,s);
+
+ q[0] = R21 - R12;
+ q[1] = 0.25 * doubler;
+ q[2] = R01 + R10;
+ q[3] = R02 + R20;
+ }
+ else if (R11 > R22)
+ {
+ (void)arm_sqrt_f32(1.0 + R11 - R00 - R22,&doubler); // invs=4*qy
+ doubler = 2*doubler;
+ s = 1.0 / doubler;
+
+ q1 = vmulq_n_f32(q1,s);
+ q2 = vmulq_n_f32(q2,s);
+
+ q[0] = R02 - R20;
+ q[1] = R01 + R10;
+ q[2] = 0.25 * doubler;
+ q[3] = R12 + R21;
+ }
+ else
+ {
+ (void)arm_sqrt_f32(1.0 + R22 - R00 - R11,&doubler); // invs=4*qz
+ doubler = 2*doubler;
+ s = 1.0 / doubler;
+
+ q1 = vmulq_n_f32(q1,s);
+ q2 = vmulq_n_f32(q2,s);
+
+ q[0] = R10 - R01;
+ q[1] = R02 + R20;
+ q[2] = R12 + R21;
+ q[3] = 0.25 * doubler;
+ }
+
+ vst1q(pOutputQuaternions, q);
+ pOutputQuaternions += 4;
+
+ }
+}
+
+#else
+/* Scalar implementation. For each 3x3 row-major matrix (RI macro indexes
+ row,col) the branch selects trace vs. largest diagonal element so the
+ sqrt argument stays positive (standard matrix-to-quaternion recovery). */
+void arm_rotation2quaternion_f32(const float32_t *pInputRotations,
+ float32_t *pOutputQuaternions,
+ uint32_t nbQuaternions)
+{
+ for(uint32_t nb=0; nb < nbQuaternions; nb++)
+ {
+ /* 9 floats per rotation matrix in, 4 floats per quaternion out. */
+ const float32_t *r=&pInputRotations[nb*9];
+ float32_t *q=&pOutputQuaternions[nb*4];
+
+ float32_t trace = RI(0,0) + RI(1,1) + RI(2,2);
+
+ float32_t doubler; /* 2*sqrt(...) = 4*|selected component| */
+ float32_t s; /* reciprocal scale for the remaining components */
+
+
+
+ if (trace > 0)
+ {
+ doubler = sqrtf(trace + 1.0) * 2; // invs=4*qw
+ s = 1.0 / doubler;
+ q[0] = 0.25 * doubler;
+ q[1] = (RI(2,1) - RI(1,2)) * s;
+ q[2] = (RI(0,2) - RI(2,0)) * s;
+ q[3] = (RI(1,0) - RI(0,1)) * s;
+ }
+ else if ((RI(0,0) > RI(1,1)) && (RI(0,0) > RI(2,2)) )
+ {
+ doubler = sqrtf(1.0 + RI(0,0) - RI(1,1) - RI(2,2)) * 2; // invs=4*qx
+ s = 1.0 / doubler;
+ q[0] = (RI(2,1) - RI(1,2)) * s;
+ q[1] = 0.25 * doubler;
+ q[2] = (RI(0,1) + RI(1,0)) * s;
+ q[3] = (RI(0,2) + RI(2,0)) * s;
+ }
+ else if (RI(1,1) > RI(2,2))
+ {
+ doubler = sqrtf(1.0 + RI(1,1) - RI(0,0) - RI(2,2)) * 2; // invs=4*qy
+ s = 1.0 / doubler;
+ q[0] = (RI(0,2) - RI(2,0)) * s;
+ q[1] = (RI(0,1) + RI(1,0)) * s;
+ q[2] = 0.25 * doubler;
+ q[3] = (RI(1,2) + RI(2,1)) * s;
+ }
+ else
+ {
+ doubler = sqrtf(1.0 + RI(2,2) - RI(0,0) - RI(1,1)) * 2; // invs=4*qz
+ s = 1.0 / doubler;
+ q[0] = (RI(1,0) - RI(0,1)) * s;
+ q[1] = (RI(0,2) + RI(2,0)) * s;
+ q[2] = (RI(1,2) + RI(2,1)) * s;
+ q[3] = 0.25 * doubler;
+ }
+
+ }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of RotQuat group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c
new file mode 100644
index 0000000000..85f19b9eb5
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/SVMFunctions.c
@@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: SVMFunctions.c
+ * Description: Combination of all SVM function source files.
+ *
+ * $Date: 16. March 2020
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_svm_linear_init_f32.c"
+#include "arm_svm_linear_predict_f32.c"
+#include "arm_svm_polynomial_init_f32.c"
+#include "arm_svm_polynomial_predict_f32.c"
+#include "arm_svm_rbf_init_f32.c"
+#include "arm_svm_rbf_predict_f32.c"
+#include "arm_svm_sigmoid_init_f32.c"
+#include "arm_svm_sigmoid_predict_f32.c"
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/SVMFunctionsF16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/SVMFunctionsF16.c
new file mode 100644
index 0000000000..74d2665200
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/SVMFunctionsF16.c
@@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: SVMFunctionsF16.c
+ * Description: Combination of all SVM function source files.
+ *
+ * $Date: 16. March 2020
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_svm_linear_init_f16.c"
+#include "arm_svm_linear_predict_f16.c"
+#include "arm_svm_polynomial_init_f16.c"
+#include "arm_svm_polynomial_predict_f16.c"
+#include "arm_svm_rbf_init_f16.c"
+#include "arm_svm_rbf_predict_f16.c"
+#include "arm_svm_sigmoid_init_f16.c"
+#include "arm_svm_sigmoid_predict_f16.c"
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c
new file mode 100644
index 0000000000..1e9756f2ca
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f16.c
@@ -0,0 +1,98 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_linear_init_f16.c
+ * Description: SVM Linear Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @defgroup groupSVM SVM Functions
+ *
+ */
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup linearsvm Linear SVM
+
+ Linear SVM classifier
+ */
+
+/**
+ * @addtogroup linearsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM linear instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S Parameters for the SVM function
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @return none.
+ *
+ */
+
+
+/* Copies the scalar parameters and stores the array pointers as-is (no deep
+ copy): the coefficient, support-vector and class arrays must stay valid
+ for as long as the instance S is used. */
+void arm_svm_linear_init_f16(arm_svm_linear_instance_f16 *S,
+ uint32_t nbOfSupportVectors,
+ uint32_t vectorDimension,
+ float16_t intercept,
+ const float16_t *dualCoefficients,
+ const float16_t *supportVectors,
+ const int32_t *classes)
+{
+ S->nbOfSupportVectors = nbOfSupportVectors;
+ S->vectorDimension = vectorDimension;
+ S->intercept = intercept;
+ S->dualCoefficients = dualCoefficients;
+ S->supportVectors = supportVectors;
+ S->classes = classes;
+}
+
+
+
+/**
+ * @} end of linearsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c
new file mode 100644
index 0000000000..d58b737fa1
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_init_f32.c
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_linear_init_f32.c
+ * Description: SVM Linear Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @defgroup groupSVM SVM Functions
+ *
+ */
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup linearsvm Linear SVM
+
+ Linear SVM classifier
+ */
+
+/**
+ * @addtogroup linearsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM linear instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S Parameters for the SVM function
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @return none.
+ *
+ */
+
+
+/* Copies the scalar parameters and stores the array pointers as-is (no deep
+ copy): the coefficient, support-vector and class arrays must stay valid
+ for as long as the instance S is used. */
+void arm_svm_linear_init_f32(arm_svm_linear_instance_f32 *S,
+ uint32_t nbOfSupportVectors,
+ uint32_t vectorDimension,
+ float32_t intercept,
+ const float32_t *dualCoefficients,
+ const float32_t *supportVectors,
+ const int32_t *classes)
+{
+ S->nbOfSupportVectors = nbOfSupportVectors;
+ S->vectorDimension = vectorDimension;
+ S->intercept = intercept;
+ S->dualCoefficients = dualCoefficients;
+ S->supportVectors = supportVectors;
+ S->classes = classes;
+}
+
+
+
+/**
+ * @} end of linearsvm group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c
new file mode 100644
index 0000000000..2eb25d58d7
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f16.c
@@ -0,0 +1,314 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_linear_predict_f16.c
+ * Description: SVM Linear Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup linearsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM linear prediction
+ * @param[in] S Pointer to an instance of the linear SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult Decision value
+ * @return none.
+ *
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_svm_linear_predict_f16(
+ const arm_svm_linear_instance_f16 *S,
+ const float16_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float16_t *pSupport = S->supportVectors;
+ const float16_t *pSrcA = pSupport;
+ const float16_t *pInA0;
+ const float16_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float16_t *pDualCoef = S->dualCoefficients;
+ _Float16 sum = S->intercept;
+ row = numRows;
+
+ /*
+ * compute 4 rows in parallel
+ */
+ while (row >= 4)
+ {
+ const float16_t *pInA2, *pInA3;
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1, acc2, acc3;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ acc2 = vdupq_n_f16(0.0f);
+ acc3 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 8;
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 8;
+ acc3 = vfmaq(acc3, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA2Vec, p0);
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+ acc3 = vfmaq(acc3, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ acc0 = vmulq_n_f16(acc0,*pDualCoef++);
+ acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
+ acc0 = vfmaq_n_f16(acc0,acc2,*pDualCoef++);
+ acc0 = vfmaq_n_f16(acc0,acc3,*pDualCoef++);
+
+ sum += (_Float16)vecAddAcrossF16Mve(acc0);
+
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parallel
+ */
+ if (row >= 2) {
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ acc0 = vmulq_n_f16(acc0,*pDualCoef++);
+ acc0 = vfmaq_n_f16(acc0,acc1,*pDualCoef++);
+
+ sum += (_Float16)vecAddAcrossF16Mve(acc0);
+
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f16x8_t vecIn, acc0;
+ float16_t const *pSrcA0Vec, *pInVec;
+ float16_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ sum += (_Float16)*pDualCoef++ * (_Float16)vecAddAcrossF16Mve(acc0);
+
+ }
+
+ *pResult = S->classes[STEP(sum)];
+}
+
+#else
+void arm_svm_linear_predict_f16(
+ const arm_svm_linear_instance_f16 *S,
+ const float16_t * in,
+ int32_t * pResult)
+{
+ _Float16 sum=S->intercept;
+ _Float16 dot=0;
+ uint32_t i,j;
+ const float16_t *pSupport = S->supportVectors;
+
+ for(i=0; i < S->nbOfSupportVectors; i++)
+ {
+ dot=0;
+ for(j=0; j < S->vectorDimension; j++)
+ {
+ dot = dot + in[j]* *pSupport++;
+ }
+ sum += S->dualCoefficients[i] * dot;
+ }
+ *pResult=S->classes[STEP(sum)];
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of linearsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c
new file mode 100644
index 0000000000..bb6f005cb5
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_linear_predict_f32.c
@@ -0,0 +1,461 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_linear_predict_f32.c
+ * Description: SVM Linear Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup linearsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM linear prediction
+ * @param[in] S Pointer to an instance of the linear SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult Decision value
+ * @return none.
+ *
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_svm_linear_predict_f32(
+ const arm_svm_linear_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float32_t *pSupport = S->supportVectors;
+ const float32_t *pSrcA = pSupport;
+ const float32_t *pInA0;
+ const float32_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float32_t *pDualCoef = S->dualCoefficients;
+ float32_t sum = S->intercept;
+ row = numRows;
+
+ /*
+ * compute 4 rows in parallel
+ */
+ while (row >= 4)
+ {
+ const float32_t *pInA2, *pInA3;
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1, acc2, acc3;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ acc2 = vdupq_n_f32(0.0f);
+ acc3 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 4;
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 4;
+ acc3 = vfmaq(acc3, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA2Vec, p0);
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA3Vec, p0);
+ acc3 = vfmaq(acc3, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+
+ acc0 = vmulq_n_f32(acc0,*pDualCoef++);
+ acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);
+ acc0 = vfmaq_n_f32(acc0,acc2,*pDualCoef++);
+ acc0 = vfmaq_n_f32(acc0,acc3,*pDualCoef++);
+
+ sum += vecAddAcrossF32Mve(acc0);
+
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parallel
+ */
+ if (row >= 2) {
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ acc0 = vmulq_n_f32(acc0,*pDualCoef++);
+ acc0 = vfmaq_n_f32(acc0,acc1,*pDualCoef++);
+
+ sum += vecAddAcrossF32Mve(acc0);
+
+
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f32x4_t vecIn, acc0;
+ float32_t const *pSrcA0Vec, *pInVec;
+ float32_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ sum += *pDualCoef++ * vecAddAcrossF32Mve(acc0);
+
+ }
+
+ *pResult = S->classes[STEP(sum)];
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+void arm_svm_linear_predict_f32(
+ const arm_svm_linear_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ float32_t sum = S->intercept;
+
+ float32_t dot;
+ float32x4_t dotV;
+
+ float32x4_t accuma,accumb,accumc,accumd,accum;
+ float32x2_t accum2;
+ float32x4_t vec1;
+
+ float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
+
+ uint32_t blkCnt;
+ uint32_t vectorBlkCnt;
+
+ const float32_t *pIn = in;
+
+ const float32_t *pSupport = S->supportVectors;
+
+ const float32_t *pSupporta = S->supportVectors;
+ const float32_t *pSupportb;
+ const float32_t *pSupportc;
+ const float32_t *pSupportd;
+
+ pSupportb = pSupporta + S->vectorDimension;
+ pSupportc = pSupportb + S->vectorDimension;
+ pSupportd = pSupportc + S->vectorDimension;
+
+ const float32_t *pDualCoefs = S->dualCoefficients;
+
+ vectorBlkCnt = S->nbOfSupportVectors >> 2;
+
+ while (vectorBlkCnt > 0U)
+ {
+ accuma = vdupq_n_f32(0);
+ accumb = vdupq_n_f32(0);
+ accumc = vdupq_n_f32(0);
+ accumd = vdupq_n_f32(0);
+
+ pIn = in;
+
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2a = vld1q_f32(pSupporta);
+ vec2b = vld1q_f32(pSupportb);
+ vec2c = vld1q_f32(pSupportc);
+ vec2d = vld1q_f32(pSupportd);
+
+ pIn += 4;
+ pSupporta += 4;
+ pSupportb += 4;
+ pSupportc += 4;
+ pSupportd += 4;
+
+ accuma = vmlaq_f32(accuma, vec1,vec2a);
+ accumb = vmlaq_f32(accumb, vec1,vec2b);
+ accumc = vmlaq_f32(accumc, vec1,vec2c);
+ accumd = vmlaq_f32(accumd, vec1,vec2d);
+
+ blkCnt -- ;
+ }
+ accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
+
+ accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
+
+ accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
+
+ accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
+
+
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
+
+ pIn++;
+
+ blkCnt -- ;
+ }
+
+ vec1 = vld1q_f32(pDualCoefs);
+ pDualCoefs += 4;
+
+ accum = vmulq_f32(vec1,dotV);
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+ pSupporta += 3*S->vectorDimension;
+ pSupportb += 3*S->vectorDimension;
+ pSupportc += 3*S->vectorDimension;
+ pSupportd += 3*S->vectorDimension;
+
+ vectorBlkCnt -- ;
+ }
+
+ pSupport = pSupporta;
+ vectorBlkCnt = S->nbOfSupportVectors & 3;
+ while (vectorBlkCnt > 0U)
+ {
+ accum = vdupq_n_f32(0);
+ dot = 0.0f;
+ pIn = in;
+
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2 = vld1q_f32(pSupport);
+ pIn += 4;
+ pSupport += 4;
+
+ accum = vmlaq_f32(accum, vec1,vec2);
+
+ blkCnt -- ;
+ }
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+ dot = dot + *pIn++ * *pSupport++;
+
+ blkCnt -- ;
+ }
+
+ sum += *pDualCoefs++ * dot;
+ vectorBlkCnt -- ;
+ }
+
+ *pResult=S->classes[STEP(sum)];
+}
+#else
+void arm_svm_linear_predict_f32(
+ const arm_svm_linear_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ float32_t sum=S->intercept;
+ float32_t dot=0;
+ uint32_t i,j;
+ const float32_t *pSupport = S->supportVectors;
+
+ for(i=0; i < S->nbOfSupportVectors; i++)
+ {
+ dot=0;
+ for(j=0; j < S->vectorDimension; j++)
+ {
+ dot = dot + in[j]* *pSupport++;
+ }
+ sum += S->dualCoefficients[i] * dot;
+ }
+ *pResult=S->classes[STEP(sum)];
+}
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of linearsvm group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c
new file mode 100644
index 0000000000..b78d1bd6e3
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f16.c
@@ -0,0 +1,103 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_polynomial_init_f16.c
+ * Description: SVM Polynomial Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup polysvm Polynomial SVM
+
+ Polynomial SVM classifier
+ */
+
+/**
+ * @addtogroup polysvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM polynomial instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S points to an instance of the polynomial SVM structure.
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @param[in] degree Polynomial degree
+ * @param[in] coef0 coeff0 (scikit-learn terminology)
+ * @param[in] gamma gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+
+void arm_svm_polynomial_init_f16(arm_svm_polynomial_instance_f16 *S,
+ uint32_t nbOfSupportVectors,
+ uint32_t vectorDimension,
+ float16_t intercept,
+ const float16_t *dualCoefficients,
+ const float16_t *supportVectors,
+ const int32_t *classes,
+ int32_t degree,
+ float16_t coef0,
+ float16_t gamma
+ )
+{
+ S->nbOfSupportVectors = nbOfSupportVectors;
+ S->vectorDimension = vectorDimension;
+ S->intercept = intercept;
+ S->dualCoefficients = dualCoefficients;
+ S->supportVectors = supportVectors;
+ S->classes = classes;
+ S->degree = degree;
+ S->coef0 = coef0;
+ S->gamma = gamma;
+}
+
+
+
+/**
+ * @} end of polysvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c
new file mode 100644
index 0000000000..aa1c418f9e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_init_f32.c
@@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_polynomial_init_f32.c
+ * Description: SVM Polynomial Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup polysvm Polynomial SVM
+
+ Polynomial SVM classifier
+ */
+
+/**
+ * @addtogroup polysvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM polynomial instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S points to an instance of the polynomial SVM structure.
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @param[in] degree Polynomial degree
+ * @param[in] coef0 coeff0 (scikit-learn terminology)
+ * @param[in] gamma gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+
+void arm_svm_polynomial_init_f32(arm_svm_polynomial_instance_f32 *S,
+ uint32_t nbOfSupportVectors,
+ uint32_t vectorDimension,
+ float32_t intercept,
+ const float32_t *dualCoefficients,
+ const float32_t *supportVectors,
+ const int32_t *classes,
+ int32_t degree,
+ float32_t coef0,
+ float32_t gamma
+ )
+{
+ S->nbOfSupportVectors = nbOfSupportVectors;
+ S->vectorDimension = vectorDimension;
+ S->intercept = intercept;
+ S->dualCoefficients = dualCoefficients;
+ S->supportVectors = supportVectors;
+ S->classes = classes;
+ S->degree = degree;
+ S->coef0 = coef0;
+ S->gamma = gamma;
+}
+
+
+
+/**
+ * @} end of polysvm group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
new file mode 100644
index 0000000000..b52a665c20
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f16.c
@@ -0,0 +1,336 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_polynomial_predict_f16.c
+ * Description: SVM Polynomial Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup polysvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM polynomial prediction
+ * @param[in] S Pointer to an instance of the polynomial SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult Decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+void arm_svm_polynomial_predict_f16(
+ const arm_svm_polynomial_instance_f16 *S,
+ const float16_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float16_t *pSupport = S->supportVectors;
+ const float16_t *pSrcA = pSupport;
+ const float16_t *pInA0;
+ const float16_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float16_t *pDualCoef = S->dualCoefficients;
+ _Float16 sum = S->intercept;
+ f16x8_t vSum = vdupq_n_f16(0.0f);
+
+ row = numRows;
+
+ /*
+ * compute 4 rows in parallel
+ */
+ while (row >= 4) {
+ const float16_t *pInA2, *pInA3;
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1, acc2, acc3;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ acc2 = vdupq_n_f16(0.0f);
+ acc3 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 8;
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 8;
+ acc3 = vfmaq(acc3, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA2Vec, p0);
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+ acc3 = vfmaq(acc3, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
+
+ vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ arm_vec_exponent_f16
+ (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0),
+ S->degree),vctp16q(4));
+
+ pDualCoef += 4;
+
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parallel
+ */
+ if (row >= 2) {
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+
+ vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ arm_vec_exponent_f16
+ (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
+ vctp16q(2));
+
+ pDualCoef += 2;
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f16x8_t vecIn, acc0;
+ float16_t const *pSrcA0Vec, *pInVec;
+ float16_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+ vSum = vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ arm_vec_exponent_f16
+ (vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0), S->degree),
+ vctp16q(1));
+ }
+ sum += (_Float16)vecAddAcrossF16Mve(vSum);
+
+
+ *pResult = S->classes[STEP(sum)];
+}
+
+#else
+void arm_svm_polynomial_predict_f16(
+ const arm_svm_polynomial_instance_f16 *S,
+ const float16_t * in,
+ int32_t * pResult)
+{
+ _Float16 sum=S->intercept;
+ _Float16 dot=0;
+ uint32_t i,j;
+ const float16_t *pSupport = S->supportVectors;
+
+ for(i=0; i < S->nbOfSupportVectors; i++)
+ {
+ dot=0;
+ for(j=0; j < S->vectorDimension; j++)
+ {
+ dot = dot + (_Float16)in[j]* (_Float16)*pSupport++;
+ }
+ sum += S->dualCoefficients[i] * (_Float16)arm_exponent_f16(S->gamma * dot + S->coef0, S->degree);
+ }
+
+ *pResult=S->classes[STEP(sum)];
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+/**
+ * @} end of polysvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c
new file mode 100644
index 0000000000..3970e2eae1
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_polynomial_predict_f32.c
@@ -0,0 +1,490 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_polynomial_predict_f32.c
+ * Description: SVM Polynomial Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include "arm_vec_math.h"
+#endif
+
+/**
+ * @addtogroup polysvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM polynomial prediction
+ * @param[in] S Pointer to an instance of the polynomial SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult Decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+void arm_svm_polynomial_predict_f32(
+ const arm_svm_polynomial_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod */
+ /* Processes 4, then 2, then 1 support-vector rows per pass; each lane of
+  * vSum accumulates dualCoef[i] * (gamma * <x, sv_i> + coef0)^degree. */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float32_t *pSupport = S->supportVectors;
+ const float32_t *pSrcA = pSupport;
+ const float32_t *pInA0;
+ const float32_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float32_t *pDualCoef = S->dualCoefficients;
+ float32_t sum = S->intercept;
+ f32x4_t vSum = vdupq_n_f32(0.0f);
+
+ row = numRows;
+
+ /*
+ * compute 4 rows in parallel
+ */
+ while (row >= 4) {
+ const float32_t *pInA2, *pInA3;
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1, acc2, acc3;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ acc2 = vdupq_n_f32(0.0f);
+ acc3 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 4;
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 4;
+ acc3 = vfmaq(acc3, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail: remaining (numCols & 3) columns handled with a zeroing
+ * predicated load so inactive lanes contribute nothing
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA2Vec, p0);
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA3Vec, p0);
+ acc3 = vfmaq(acc3, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ * (vtmp lanes 0..3 are all written below before being read)
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
+
+ vSum = vfmaq_f32(vSum, vld1q(pDualCoef),
+ arm_vec_exponent_f32
+ (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree));
+
+ pDualCoef += 4;
+
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parallel
+ */
+ if (row >= 2) {
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged through tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+
+ /* Predicated FMA: only the first 2 lanes contribute. */
+ vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
+ arm_vec_exponent_f32
+ (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree),
+ vctp32q(2));
+
+ pDualCoef += 2;
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f32x4_t vecIn, acc0;
+ float32_t const *pSrcA0Vec, *pInVec;
+ float32_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged through tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+ /* Predicated FMA: only lane 0 contributes. */
+ vSum = vfmaq_m_f32(vSum, vld1q(pDualCoef),
+ arm_vec_exponent_f32
+ (vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0), S->degree),
+ vctp32q(1));
+ }
+ /* Horizontal add of the per-row contributions, then classify. */
+ sum += vecAddAcrossF32Mve(vSum);
+
+
+ *pResult = S->classes[STEP(sum)];
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+void arm_svm_polynomial_predict_f32(
+ const arm_svm_polynomial_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ /* NEON version: 4 support vectors per outer iteration, 4 columns per
+  * inner iteration; decision value seeded with the intercept. */
+ float32_t sum = S->intercept;
+
+ float32_t dot;
+ float32x4_t dotV;
+
+ float32x4_t accuma,accumb,accumc,accumd,accum;
+ float32x2_t accum2;
+ float32x4_t vec1;
+ float32x4_t coef0 = vdupq_n_f32(S->coef0);
+
+ float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
+
+ uint32_t blkCnt;
+ uint32_t vectorBlkCnt;
+
+ const float32_t *pIn = in;
+
+ const float32_t *pSupport = S->supportVectors;
+
+ /* Four row pointers into consecutive support vectors. */
+ const float32_t *pSupporta = S->supportVectors;
+ const float32_t *pSupportb;
+ const float32_t *pSupportc;
+ const float32_t *pSupportd;
+
+ pSupportb = pSupporta + S->vectorDimension;
+ pSupportc = pSupportb + S->vectorDimension;
+ pSupportd = pSupportc + S->vectorDimension;
+
+ const float32_t *pDualCoefs = S->dualCoefficients;
+
+ /* Main loop: groups of 4 support vectors. */
+ vectorBlkCnt = S->nbOfSupportVectors >> 2;
+ while (vectorBlkCnt > 0U)
+ {
+ accuma = vdupq_n_f32(0);
+ accumb = vdupq_n_f32(0);
+ accumc = vdupq_n_f32(0);
+ accumd = vdupq_n_f32(0);
+
+ pIn = in;
+
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2a = vld1q_f32(pSupporta);
+ vec2b = vld1q_f32(pSupportb);
+ vec2c = vld1q_f32(pSupportc);
+ vec2d = vld1q_f32(pSupportd);
+
+ pIn += 4;
+ pSupporta += 4;
+ pSupportb += 4;
+ pSupportc += 4;
+ pSupportd += 4;
+
+ accuma = vmlaq_f32(accuma, vec1,vec2a);
+ accumb = vmlaq_f32(accumb, vec1,vec2b);
+ accumc = vmlaq_f32(accumc, vec1,vec2c);
+ accumd = vmlaq_f32(accumd, vec1,vec2d);
+
+ blkCnt -- ;
+ }
+ /* Horizontal add of each accumulator into one lane of dotV.
+  * NOTE(review): dotV is declared without an initializer; all 4 lanes
+  * are written here before any lane is read. */
+ accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
+
+ accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
+
+ accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
+
+ accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
+
+
+ /* Scalar tail over the remaining (vectorDimension & 3) columns. */
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
+
+ pIn++;
+
+ blkCnt -- ;
+ }
+
+ vec1 = vld1q_f32(pDualCoefs);
+ pDualCoefs += 4;
+
+ /* Polynomial kernel per lane: (gamma * dot + coef0) ^ degree. */
+ // To vectorize later
+ dotV = vmulq_n_f32(dotV, S->gamma);
+ dotV = vaddq_f32(dotV, coef0);
+
+ dotV = arm_vec_exponent_f32(dotV,S->degree);
+
+ accum = vmulq_f32(vec1,dotV);
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+ /* Each row pointer already advanced one row; skip the other 3 rows. */
+ pSupporta += 3*S->vectorDimension;
+ pSupportb += 3*S->vectorDimension;
+ pSupportc += 3*S->vectorDimension;
+ pSupportd += 3*S->vectorDimension;
+
+ vectorBlkCnt -- ;
+ }
+
+ /* Residual: remaining (nbOfSupportVectors & 3) support vectors. */
+ pSupport = pSupporta;
+ vectorBlkCnt = S->nbOfSupportVectors & 3;
+
+ while (vectorBlkCnt > 0U)
+ {
+ accum = vdupq_n_f32(0);
+ dot = 0.0f;
+ pIn = in;
+
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2 = vld1q_f32(pSupport);
+ pIn += 4;
+ pSupport += 4;
+
+ accum = vmlaq_f32(accum, vec1,vec2);
+
+ blkCnt -- ;
+ }
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+ dot = dot + *pIn++ * *pSupport++;
+
+ blkCnt -- ;
+ }
+
+ sum += *pDualCoefs++ * arm_exponent_f32(S->gamma * dot + S->coef0, S->degree);
+ vectorBlkCnt -- ;
+ }
+
+ /* Map the decision value through STEP() to one of the two class IDs. */
+ *pResult=S->classes[STEP(sum)];
+}
+#else
+void arm_svm_polynomial_predict_f32(
+    const arm_svm_polynomial_instance_f32 *S,
+    const float32_t * in,
+    int32_t * pResult)
+{
+    /* Decision value, seeded with the intercept term. */
+    float32_t decision = S->intercept;
+    const float32_t *pSV = S->supportVectors;
+
+    /* Accumulate the kernel response of every support vector. */
+    for (uint32_t sv = 0; sv < S->nbOfSupportVectors; sv++)
+    {
+        /* Dot product between the input and the current support vector. */
+        float32_t dotProd = 0.0f;
+        const float32_t *pX = in;
+
+        for (uint32_t k = 0; k < S->vectorDimension; k++)
+        {
+            dotProd += *pX++ * *pSV++;
+        }
+
+        /* Polynomial kernel: (gamma * <x, sv> + coef0) ^ degree */
+        decision += S->dualCoefficients[sv] * arm_exponent_f32(S->gamma * dotProd + S->coef0, S->degree);
+    }
+
+    /* Map the decision value through STEP() to one of the two class IDs. */
+    *pResult = S->classes[STEP(decision)];
+}
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+
+/**
+ * @} end of polysvm group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c
new file mode 100644
index 0000000000..75978e028c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f16.c
@@ -0,0 +1,97 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_rbf_init_f16.c
+ * Description: SVM Radial Basis Function Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup rbfsvm RBF SVM
+
+ RBF SVM classifier
+ */
+
+
+/**
+ * @addtogroup rbfsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM radial basis function instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S points to an instance of the polynomial SVM structure.
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @param[in] gamma gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_rbf_init_f16(arm_svm_rbf_instance_f16 *S,
+    uint32_t nbOfSupportVectors,
+    uint32_t vectorDimension,
+    float16_t intercept,
+    const float16_t *dualCoefficients,
+    const float16_t *supportVectors,
+    const int32_t *classes,
+    float16_t gamma
+    )
+{
+    /* Problem dimensions. */
+    S->vectorDimension = vectorDimension;
+    S->nbOfSupportVectors = nbOfSupportVectors;
+
+    /* Model parameters; the arrays are referenced, not copied. */
+    S->gamma = gamma;
+    S->intercept = intercept;
+    S->dualCoefficients = dualCoefficients;
+    S->supportVectors = supportVectors;
+    S->classes = classes;
+}
+
+
+
+/**
+ * @} end of rbfsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c
new file mode 100644
index 0000000000..edd915f566
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_init_f32.c
@@ -0,0 +1,91 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_rbf_init_f32.c
+ * Description: SVM Radial Basis Function Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup rbfsvm RBF SVM
+
+ RBF SVM classifier
+ */
+
+
+/**
+ * @addtogroup rbfsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM radial basis function instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S points to an instance of the polynomial SVM structure.
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @param[in] gamma gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_rbf_init_f32(arm_svm_rbf_instance_f32 *S,
+    uint32_t nbOfSupportVectors,
+    uint32_t vectorDimension,
+    float32_t intercept,
+    const float32_t *dualCoefficients,
+    const float32_t *supportVectors,
+    const int32_t *classes,
+    float32_t gamma
+    )
+{
+    /* Problem dimensions. */
+    S->vectorDimension = vectorDimension;
+    S->nbOfSupportVectors = nbOfSupportVectors;
+
+    /* Model parameters; the arrays are referenced, not copied. */
+    S->gamma = gamma;
+    S->intercept = intercept;
+    S->dualCoefficients = dualCoefficients;
+    S->supportVectors = supportVectors;
+    S->classes = classes;
+}
+
+
+
+/**
+ * @} end of rbfsvm group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c
new file mode 100644
index 0000000000..a1b9079fb1
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f16.c
@@ -0,0 +1,352 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_rbf_predict_f16.c
+ * Description: SVM Radial Basis Function Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup rbfsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM rbf prediction
+ * @param[in] S Pointer to an instance of the rbf SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+void arm_svm_rbf_predict_f16(
+ const arm_svm_rbf_instance_f16 *S,
+ const float16_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod */
+ /* Processes 4, then 2, then 1 support-vector rows per pass; each lane of
+  * vSum accumulates dualCoef[i] * exp(-gamma * ||x - sv_i||^2). */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float16_t *pSupport = S->supportVectors;
+ const float16_t *pSrcA = pSupport;
+ const float16_t *pInA0;
+ const float16_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float16_t *pDualCoef = S->dualCoefficients;
+ _Float16 sum = S->intercept;
+ f16x8_t vSum = vdupq_n_f16(0);
+
+ row = numRows;
+
+ /*
+ * compute 4 rows in parallel
+ */
+ while (row >= 4) {
+ const float16_t *pInA2, *pInA3;
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1, acc2, acc3;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators (each accN accumulates a squared distance)
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ acc2 = vdupq_n_f16(0.0f);
+ acc3 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+ f16x8_t vecDif;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 8;
+ vecDif = vsubq(vecIn, vecA);
+ acc2 = vfmaq(acc2, vecDif, vecDif);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 8;
+ vecDif = vsubq(vecIn, vecA);
+ acc3 = vfmaq(acc3, vecDif, vecDif);
+
+ blkCnt--;
+ }
+ /*
+ * tail: remaining (numCols & 7) columns handled with a zeroing
+ * predicated load so inactive lanes contribute nothing
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+ f16x8_t vecDif;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+ vecA = vldrhq_z_f16(pSrcA2Vec, p0);;
+ vecDif = vsubq(vecIn, vecA);
+ acc2 = vfmaq(acc2, vecDif, vecDif);
+ vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc3 = vfmaq(acc3, vecDif, vecDif);
+ }
+ /*
+ * Sum the partial parts
+ * (vtmp lanes 0..3 are all written below before being read)
+ */
+
+ //sum += *pDualCoef++ * expf(-S->gamma * vecReduceF16Mve(acc0));
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
+
+ /* Predicated FMA: only the first 4 lanes contribute. */
+ vSum =
+ vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ vexpq_f16(vmulq_n_f16(vtmp, -S->gamma)),vctp16q(4));
+ pDualCoef += 4;
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parallel
+ */
+ if (row >= 2) {
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+ f16x8_t vecDif;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);;
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged through tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA, vecDif;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+
+ /* Predicated FMA: only the first 2 lanes contribute. */
+ vSum =
+ vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ vexpq_f16(vmulq_n_f16(vtmp, -S->gamma)), vctp16q(2));
+ pDualCoef += 2;
+
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f16x8_t vecIn, acc0;
+ float16_t const *pSrcA0Vec, *pInVec;
+ float16_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA, vecDif;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged through tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA, vecDif;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+
+ /* Predicated FMA: only lane 0 contributes. */
+ vSum =
+ vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ vexpq_f16(vmulq_n_f16(vtmp, -S->gamma)), vctp16q(1));
+
+ }
+
+
+ /* Horizontal add of the per-row contributions, then classify. */
+ sum += vecAddAcrossF16Mve(vSum);
+ *pResult = S->classes[STEP(sum)];
+}
+
+#else
+void arm_svm_rbf_predict_f16(
+    const arm_svm_rbf_instance_f16 *S,
+    const float16_t * in,
+    int32_t * pResult)
+{
+    /* Decision value, seeded with the intercept term. */
+    _Float16 decision = S->intercept;
+    const float16_t *pSV = S->supportVectors;
+
+    /* Accumulate the kernel response of every support vector. */
+    for (uint32_t sv = 0; sv < S->nbOfSupportVectors; sv++)
+    {
+        /* Squared euclidean distance between input and support vector. */
+        _Float16 sqDist = 0.0f16;
+
+        for (uint32_t k = 0; k < S->vectorDimension; k++)
+        {
+            sqDist += SQ((_Float16)in[k] - (_Float16)*pSV);
+            pSV++;
+        }
+
+        /* RBF kernel: exp(-gamma * ||x - sv||^2) */
+        decision += (_Float16)S->dualCoefficients[sv] * (_Float16)expf(-(_Float16)S->gamma * sqDist);
+    }
+
+    /* Map the decision value through STEP() to one of the two class IDs. */
+    *pResult = S->classes[STEP(decision)];
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of rbfsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c
new file mode 100644
index 0000000000..cd321015b4
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_rbf_predict_f32.c
@@ -0,0 +1,523 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_rbf_predict_f32.c
+ * Description: SVM Radial Basis Function Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup rbfsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM rbf prediction
+ * @param[in] S Pointer to an instance of the rbf SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+void arm_svm_rbf_predict_f32(
+ const arm_svm_rbf_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float32_t *pSupport = S->supportVectors;
+ const float32_t *pSrcA = pSupport;
+ const float32_t *pInA0;
+ const float32_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float32_t *pDualCoef = S->dualCoefficients;
+ float32_t sum = S->intercept;
+ f32x4_t vSum = vdupq_n_f32(0);
+
+ row = numRows;
+
+ /*
+ * compute 4 rows in parallel
+ */
+ while (row >= 4) {
+ const float32_t *pInA2, *pInA3;
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1, acc2, acc3;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ acc2 = vdupq_n_f32(0.0f);
+ acc3 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+ f32x4_t vecDif;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 4;
+ vecDif = vsubq(vecIn, vecA);
+ acc2 = vfmaq(acc2, vecDif, vecDif);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 4;
+ vecDif = vsubq(vecIn, vecA);
+ acc3 = vfmaq(acc3, vecDif, vecDif);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+ f32x4_t vecDif;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+ vecA = vldrwq_z_f32(pSrcA2Vec, p0);;
+ vecDif = vsubq(vecIn, vecA);
+ acc2 = vfmaq(acc2, vecDif, vecDif);
+ vecA = vldrwq_z_f32(pSrcA3Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc3 = vfmaq(acc3, vecDif, vecDif);
+ }
+ /*
+ * Sum the partial parts
+ */
+
+ //sum += *pDualCoef++ * expf(-S->gamma * vecReduceF32Mve(acc0));
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
+
+ vSum =
+ vfmaq_f32(vSum, vld1q(pDualCoef),
+ vexpq_f32(vmulq_n_f32(vtmp, -S->gamma)));
+ pDualCoef += 4;
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parallel
+ */
+ if (row >= 2) {
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+ f32x4_t vecDif;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);;
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA, vecDif;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc1 = vfmaq(acc1, vecDif, vecDif);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+
+ vSum =
+ vfmaq_m_f32(vSum, vld1q(pDualCoef),
+ vexpq_f32(vmulq_n_f32(vtmp, -S->gamma)), vctp32q(2));
+ pDualCoef += 2;
+
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f32x4_t vecIn, acc0;
+ float32_t const *pSrcA0Vec, *pInVec;
+ float32_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA, vecDif;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA, vecDif;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ vecDif = vsubq(vecIn, vecA);
+ acc0 = vfmaq(acc0, vecDif, vecDif);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+
+ vSum =
+ vfmaq_m_f32(vSum, vld1q(pDualCoef),
+ vexpq_f32(vmulq_n_f32(vtmp, -S->gamma)), vctp32q(1));
+
+ }
+
+
+ sum += vecAddAcrossF32Mve(vSum);
+ *pResult = S->classes[STEP(sum)];
+}
+
+
+#else
+#if defined(ARM_MATH_NEON)
+
+#include "NEMath.h"
+
+void arm_svm_rbf_predict_f32(
+ const arm_svm_rbf_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ float32_t sum = S->intercept;
+
+ float32_t dot;
+ float32x4_t dotV;
+
+ float32x4_t accuma,accumb,accumc,accumd,accum;
+ float32x2_t accum2;
+ float32x4_t temp;
+ float32x4_t vec1;
+
+ float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
+
+ uint32_t blkCnt;
+ uint32_t vectorBlkCnt;
+
+ const float32_t *pIn = in;
+
+ const float32_t *pSupport = S->supportVectors;
+
+ const float32_t *pSupporta = S->supportVectors;
+ const float32_t *pSupportb;
+ const float32_t *pSupportc;
+ const float32_t *pSupportd;
+
+ pSupportb = pSupporta + S->vectorDimension;
+ pSupportc = pSupportb + S->vectorDimension;
+ pSupportd = pSupportc + S->vectorDimension;
+
+ const float32_t *pDualCoefs = S->dualCoefficients;
+
+
+ vectorBlkCnt = S->nbOfSupportVectors >> 2;
+ while (vectorBlkCnt > 0U)
+ {
+ accuma = vdupq_n_f32(0);
+ accumb = vdupq_n_f32(0);
+ accumc = vdupq_n_f32(0);
+ accumd = vdupq_n_f32(0);
+
+ pIn = in;
+
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2a = vld1q_f32(pSupporta);
+ vec2b = vld1q_f32(pSupportb);
+ vec2c = vld1q_f32(pSupportc);
+ vec2d = vld1q_f32(pSupportd);
+
+ pIn += 4;
+ pSupporta += 4;
+ pSupportb += 4;
+ pSupportc += 4;
+ pSupportd += 4;
+
+ temp = vsubq_f32(vec1, vec2a);
+ accuma = vmlaq_f32(accuma, temp, temp);
+
+ temp = vsubq_f32(vec1, vec2b);
+ accumb = vmlaq_f32(accumb, temp, temp);
+
+ temp = vsubq_f32(vec1, vec2c);
+ accumc = vmlaq_f32(accumc, temp, temp);
+
+ temp = vsubq_f32(vec1, vec2d);
+ accumd = vmlaq_f32(accumd, temp, temp);
+
+ blkCnt -- ;
+ }
+ accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
+
+ accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
+
+ accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
+
+ accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
+
+
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + SQ(*pIn - *pSupporta), dotV,0);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + SQ(*pIn - *pSupportb), dotV,1);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + SQ(*pIn - *pSupportc), dotV,2);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + SQ(*pIn - *pSupportd), dotV,3);
+
+ pSupporta++;
+ pSupportb++;
+ pSupportc++;
+ pSupportd++;
+
+ pIn++;
+
+ blkCnt -- ;
+ }
+
+ vec1 = vld1q_f32(pDualCoefs);
+ pDualCoefs += 4;
+
+ // To vectorize later
+ dotV = vmulq_n_f32(dotV, -S->gamma);
+ dotV = vexpq_f32(dotV);
+
+ accum = vmulq_f32(vec1,dotV);
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+ pSupporta += 3*S->vectorDimension;
+ pSupportb += 3*S->vectorDimension;
+ pSupportc += 3*S->vectorDimension;
+ pSupportd += 3*S->vectorDimension;
+
+ vectorBlkCnt -- ;
+ }
+
+ pSupport = pSupporta;
+ vectorBlkCnt = S->nbOfSupportVectors & 3;
+
+ while (vectorBlkCnt > 0U)
+ {
+ accum = vdupq_n_f32(0);
+ dot = 0.0f;
+ pIn = in;
+
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2 = vld1q_f32(pSupport);
+ pIn += 4;
+ pSupport += 4;
+
+ temp = vsubq_f32(vec1,vec2);
+ accum = vmlaq_f32(accum, temp,temp);
+
+ blkCnt -- ;
+ }
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+
+ dot = dot + SQ(*pIn - *pSupport);
+ pIn++;
+ pSupport++;
+
+ blkCnt -- ;
+ }
+
+ sum += *pDualCoefs++ * expf(-S->gamma * dot);
+ vectorBlkCnt -- ;
+ }
+
+ *pResult=S->classes[STEP(sum)];
+}
+#else
+void arm_svm_rbf_predict_f32(
+ const arm_svm_rbf_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ float32_t sum=S->intercept;
+ float32_t dot=0;
+ uint32_t i,j;
+ const float32_t *pSupport = S->supportVectors;
+
+ for(i=0; i < S->nbOfSupportVectors; i++)
+ {
+ dot=0;
+ for(j=0; j < S->vectorDimension; j++)
+ {
+ dot = dot + SQ(in[j] - *pSupport);
+ pSupport++;
+ }
+ sum += S->dualCoefficients[i] * expf(-S->gamma * dot);
+ }
+ *pResult=S->classes[STEP(sum)];
+}
+#endif
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of rbfsvm group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c
new file mode 100644
index 0000000000..89a31a9abe
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f16.c
@@ -0,0 +1,98 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_sigmoid_predict_f16.c
+ * Description: SVM Sigmoid Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup sigmoidsvm Sigmoid SVM
+
+ Sigmoid SVM classifier
+ */
+
+/**
+ * @addtogroup sigmoidsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM sigmoid instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S points to an instance of the rbf SVM structure.
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @param[in] coef0 coeff0 (scikit-learn terminology)
+ * @param[in] gamma gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_sigmoid_init_f16(arm_svm_sigmoid_instance_f16 *S,
+ uint32_t nbOfSupportVectors,
+ uint32_t vectorDimension,
+ float16_t intercept,
+ const float16_t *dualCoefficients,
+ const float16_t *supportVectors,
+ const int32_t *classes,
+ float16_t coef0,
+ float16_t gamma
+ )
+{
+ S->nbOfSupportVectors = nbOfSupportVectors;
+ S->vectorDimension = vectorDimension;
+ S->intercept = intercept;
+ S->dualCoefficients = dualCoefficients;
+ S->supportVectors = supportVectors;
+ S->classes = classes;
+ S->coef0 = coef0;
+ S->gamma = gamma;
+}
+
+
+/**
+ * @} end of sigmoidsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c
new file mode 100644
index 0000000000..9a6ebfd3fa
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_init_f32.c
@@ -0,0 +1,92 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_sigmoid_predict_f32.c
+ * Description: SVM Sigmoid Instance Initialization
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupSVM
+ */
+
+/**
+ @defgroup sigmoidsvm Sigmoid SVM
+
+ Sigmoid SVM classifier
+ */
+
+/**
+ * @addtogroup sigmoidsvm
+ * @{
+ */
+
+
+/**
+ * @brief SVM sigmoid instance init function
+ *
+ * Classes are integer used as output of the function (instead of having -1,1
+ * as class values).
+ *
+ * @param[in] S points to an instance of the rbf SVM structure.
+ * @param[in] nbOfSupportVectors Number of support vectors
+ * @param[in] vectorDimension Dimension of vector space
+ * @param[in] intercept Intercept
+ * @param[in] dualCoefficients Array of dual coefficients
+ * @param[in] supportVectors Array of support vectors
+ * @param[in] classes Array of 2 classes ID
+ * @param[in] coef0 coeff0 (scikit-learn terminology)
+ * @param[in] gamma gamma (scikit-learn terminology)
+ * @return none.
+ *
+ */
+
+void arm_svm_sigmoid_init_f32(arm_svm_sigmoid_instance_f32 *S,
+ uint32_t nbOfSupportVectors,
+ uint32_t vectorDimension,
+ float32_t intercept,
+ const float32_t *dualCoefficients,
+ const float32_t *supportVectors,
+ const int32_t *classes,
+ float32_t coef0,
+ float32_t gamma
+ )
+{
+ S->nbOfSupportVectors = nbOfSupportVectors;
+ S->vectorDimension = vectorDimension;
+ S->intercept = intercept;
+ S->dualCoefficients = dualCoefficients;
+ S->supportVectors = supportVectors;
+ S->classes = classes;
+ S->coef0 = coef0;
+ S->gamma = gamma;
+}
+
+
+/**
+ * @} end of sigmoidsvm group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c
new file mode 100644
index 0000000000..81e8146ceb
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f16.c
@@ -0,0 +1,333 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_sigmoid_predict_f16.c
+ * Description: SVM Sigmoid Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @addtogroup sigmoidsvm
+ * @{
+ */
+
+
+
+/**
+ * @brief SVM sigmoid prediction
+ * @param[in] S Pointer to an instance of the rbf SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult Decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+void arm_svm_sigmoid_predict_f16(
+ const arm_svm_sigmoid_instance_f16 *S,
+ const float16_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float16_t *pSupport = S->supportVectors;
+ const float16_t *pSrcA = pSupport;
+ const float16_t *pInA0;
+ const float16_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float16_t *pDualCoef = S->dualCoefficients;
+ _Float16 sum = S->intercept;
+ f16x8_t vSum = vdupq_n_f16(0.0f);
+
+ row = numRows;
+
+ /*
+ * compute 4 rows in parallel
+ */
+ while (row >= 4) {
+ const float16_t *pInA2, *pInA3;
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1, acc2, acc3;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ acc2 = vdupq_n_f16(0.0f);
+ acc3 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 8;
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 8;
+ acc3 = vfmaq(acc3, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA2Vec, p0);
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA3Vec, p0);
+ acc3 = vfmaq(acc3, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc2), vtmp, 2);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc3), vtmp, 3);
+
+ vSum =
+ vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),vctp16q(4));
+
+ pDualCoef += 4;
+
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parallel
+ */
+ if (row >= 2) {
+ float16_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f16x8_t vecIn, acc0, acc1;
+ float16_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+ acc1 = vdupq_n_f16(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 8;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrhq_z_f16(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc1), vtmp, 1);
+
+ vSum =
+ vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
+ vctp16q(2));
+
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f16x8_t vecIn, acc0;
+ float16_t const *pSrcA0Vec, *pInVec;
+ float16_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f16(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 3;
+ while (blkCnt > 0U) {
+ f16x8_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 8;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 8;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 7;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ f16x8_t vecA;
+
+ vecIn = vldrhq_z_f16(pInVec, p0);
+ vecA = vldrhq_z_f16(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f16x8_t vtmp = vuninitializedq_f16();
+ vtmp = vsetq_lane(vecAddAcrossF16Mve(acc0), vtmp, 0);
+
+ vSum =
+ vfmaq_m_f16(vSum, vld1q(pDualCoef),
+ vtanhq_f16(vaddq_n_f16(vmulq_n_f16(vtmp, S->gamma), S->coef0)),
+ vctp16q(1));
+ }
+ sum += vecAddAcrossF16Mve(vSum);
+
+ *pResult = S->classes[STEP(sum)];
+}
+
+#else
+void arm_svm_sigmoid_predict_f16(
+ const arm_svm_sigmoid_instance_f16 *S,
+ const float16_t * in,
+ int32_t * pResult)
+{
+ _Float16 sum=S->intercept;
+ _Float16 dot=0.0f16;
+ uint32_t i,j;
+ const float16_t *pSupport = S->supportVectors;
+
+ for(i=0; i < S->nbOfSupportVectors; i++)
+ {
+ dot=0.0f16;
+ for(j=0; j < S->vectorDimension; j++)
+ {
+ dot = dot + (_Float16)in[j] * (_Float16)*pSupport++;
+ }
+ sum += (_Float16)S->dualCoefficients[i] * (_Float16)tanhf((_Float16)S->gamma * dot + (_Float16)S->coef0);
+ }
+ *pResult=S->classes[STEP(sum)];
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of sigmoidsvm group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c
new file mode 100644
index 0000000000..cc072100d7
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SVMFunctions/arm_svm_sigmoid_predict_f32.c
@@ -0,0 +1,487 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_svm_sigmoid_predict_f32.c
+ * Description: SVM Sigmoid Classifier
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/svm_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @addtogroup sigmoidsvm
+ * @{
+ */
+
+
+
+/**
+ * @brief SVM sigmoid prediction
+ * @param[in] S Pointer to an instance of the rbf SVM structure.
+ * @param[in] in Pointer to input vector
+ * @param[out] pResult Decision value
+ * @return none.
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+void arm_svm_sigmoid_predict_f32(
+ const arm_svm_sigmoid_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ /* inlined Matrix x Vector function interleaved with dot prod:
+    rows of supportVectors are dotted with 'in' 4/2/1 at a time and the
+    sigmoid kernel tanh(gamma*dot + coef0) is accumulated into vSum,
+    weighted by the matching dual coefficients. */
+ uint32_t numRows = S->nbOfSupportVectors;
+ uint32_t numCols = S->vectorDimension;
+ const float32_t *pSupport = S->supportVectors;
+ const float32_t *pSrcA = pSupport;
+ const float32_t *pInA0;
+ const float32_t *pInA1;
+ uint32_t row;
+ uint32_t blkCnt; /* loop counters */
+ const float32_t *pDualCoef = S->dualCoefficients;
+ float32_t sum = S->intercept;
+ f32x4_t vSum = vdupq_n_f32(0.0f);
+
+ row = numRows;
+
+ /*
+ * compute 4 rows in parallel
+ */
+ while (row >= 4) {
+ const float32_t *pInA2, *pInA3;
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pSrcA2Vec, *pSrcA3Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1, acc2, acc3;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 4 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ pInA2 = pInA1 + numCols;
+ pInA3 = pInA2 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ acc2 = vdupq_n_f32(0.0f);
+ acc3 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+ pSrcA2Vec = pInA2;
+ pSrcA3Vec = pInA3;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vld1q(pSrcA2Vec);
+ pSrcA2Vec += 4;
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vld1q(pSrcA3Vec);
+ pSrcA3Vec += 4;
+ acc3 = vfmaq(acc3, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA2Vec, p0);
+ acc2 = vfmaq(acc2, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA3Vec, p0);
+ acc3 = vfmaq(acc3, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts: one horizontal reduction per row, packed
+ * into one vector so 4 tanh evaluations run in a single vtanhq_f32
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc2), vtmp, 2);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc3), vtmp, 3);
+
+ vSum =
+ vfmaq_f32(vSum, vld1q(pDualCoef),
+ vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)));
+
+ pDualCoef += 4;
+
+ pSrcA += numCols * 4;
+ /*
+ * Decrement the row loop counter
+ */
+ row -= 4;
+ }
+
+ /*
+ * compute 2 rows in parallel
+ */
+ if (row >= 2) {
+ float32_t const *pSrcA0Vec, *pSrcA1Vec, *pInVec;
+ f32x4_t vecIn, acc0, acc1;
+ float32_t const *pSrcVecPtr = in;
+
+ /*
+ * Initialize the pointers to 2 consecutive MatrixA rows
+ */
+ pInA0 = pSrcA;
+ pInA1 = pInA0 + numCols;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+ acc1 = vdupq_n_f32(0.0f);
+ pSrcA0Vec = pInA0;
+ pSrcA1Vec = pInA1;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vld1q(pSrcA1Vec);
+ pSrcA1Vec += 4;
+ acc1 = vfmaq(acc1, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ vecA = vldrwq_z_f32(pSrcA1Vec, p0);
+ acc1 = vfmaq(acc1, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc1), vtmp, 1);
+
+ /* NOTE(review): vld1q reads 4 coefficients although only 2 rows remain;
+ the vctp32q(2) predicate keeps the extra lanes out of vSum -- confirm
+ the over-read past dualCoefficients is acceptable */
+ vSum =
+ vfmaq_m_f32(vSum, vld1q(pDualCoef),
+ vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
+ vctp32q(2));
+
+ /* advance past the 2 coefficients consumed above so a trailing single
+ row reads the correct dual coefficient (fixes wrong decision value
+ when nbOfSupportVectors % 4 == 3) */
+ pDualCoef += 2;
+
+ pSrcA += numCols * 2;
+ row -= 2;
+ }
+
+ if (row >= 1) {
+ f32x4_t vecIn, acc0;
+ float32_t const *pSrcA0Vec, *pInVec;
+ float32_t const *pSrcVecPtr = in;
+ /*
+ * Initialize the pointers to last MatrixA row
+ */
+ pInA0 = pSrcA;
+ /*
+ * Initialize the vector pointer
+ */
+ pInVec = pSrcVecPtr;
+ /*
+ * reset accumulators
+ */
+ acc0 = vdupq_n_f32(0.0f);
+
+ pSrcA0Vec = pInA0;
+
+ blkCnt = numCols >> 2;
+ while (blkCnt > 0U) {
+ f32x4_t vecA;
+
+ vecIn = vld1q(pInVec);
+ pInVec += 4;
+ vecA = vld1q(pSrcA0Vec);
+ pSrcA0Vec += 4;
+ acc0 = vfmaq(acc0, vecIn, vecA);
+
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = numCols & 3;
+ if (blkCnt > 0U) {
+ mve_pred16_t p0 = vctp32q(blkCnt);
+ f32x4_t vecA;
+
+ vecIn = vldrwq_z_f32(pInVec, p0);
+ vecA = vldrwq_z_f32(pSrcA0Vec, p0);
+ acc0 = vfmaq(acc0, vecIn, vecA);
+ }
+ /*
+ * Sum the partial parts
+ */
+ f32x4_t vtmp = vuninitializedq_f32();
+ vtmp = vsetq_lane(vecAddAcrossF32Mve(acc0), vtmp, 0);
+
+ vSum =
+ vfmaq_m_f32(vSum, vld1q(pDualCoef),
+ vtanhq_f32(vaddq_n_f32(vmulq_n_f32(vtmp, S->gamma), S->coef0)),
+ vctp32q(1));
+ }
+ /* reduce the 4 partial kernel sums and add them to the intercept */
+ sum += vecAddAcrossF32Mve(vSum);
+
+ *pResult = S->classes[STEP(sum)];
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+#include "NEMath.h"
+
+void arm_svm_sigmoid_predict_f32(
+ const arm_svm_sigmoid_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ /* NEON variant: computes the dot product of 'in' against 4 support
+    vectors at a time (rows a..d), feeds the 4 results through one
+    vectorized tanh, then handles the remaining 1..3 rows scalar-wise. */
+ float32_t sum = S->intercept;
+
+ float32_t dot;
+ /* dotV lanes are written one at a time by vsetq_lane_f32 before being
+    read -- lane-wise initialization of an otherwise uninitialized vector */
+ float32x4_t dotV;
+
+ float32x4_t accuma,accumb,accumc,accumd,accum;
+ float32x2_t accum2;
+ float32x4_t vec1;
+ float32x4_t coef0 = vdupq_n_f32(S->coef0);
+
+ float32x4_t vec2,vec2a,vec2b,vec2c,vec2d;
+
+ uint32_t blkCnt;
+ uint32_t vectorBlkCnt;
+
+ const float32_t *pIn = in;
+
+ const float32_t *pSupport = S->supportVectors;
+
+ /* pSupporta..d walk 4 consecutive rows of the support-vector matrix */
+ const float32_t *pSupporta = S->supportVectors;
+ const float32_t *pSupportb;
+ const float32_t *pSupportc;
+ const float32_t *pSupportd;
+
+ pSupportb = pSupporta + S->vectorDimension;
+ pSupportc = pSupportb + S->vectorDimension;
+ pSupportd = pSupportc + S->vectorDimension;
+
+ const float32_t *pDualCoefs = S->dualCoefficients;
+
+ /* groups of 4 support vectors */
+ vectorBlkCnt = S->nbOfSupportVectors >> 2;
+ while (vectorBlkCnt > 0U)
+ {
+ accuma = vdupq_n_f32(0);
+ accumb = vdupq_n_f32(0);
+ accumc = vdupq_n_f32(0);
+ accumd = vdupq_n_f32(0);
+
+ pIn = in;
+
+ /* 4 input samples per iteration, multiply-accumulated into each row */
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2a = vld1q_f32(pSupporta);
+ vec2b = vld1q_f32(pSupportb);
+ vec2c = vld1q_f32(pSupportc);
+ vec2d = vld1q_f32(pSupportd);
+
+ pIn += 4;
+ pSupporta += 4;
+ pSupportb += 4;
+ pSupportc += 4;
+ pSupportd += 4;
+
+ accuma = vmlaq_f32(accuma, vec1,vec2a);
+ accumb = vmlaq_f32(accumb, vec1,vec2b);
+ accumc = vmlaq_f32(accumc, vec1,vec2c);
+ accumd = vmlaq_f32(accumd, vec1,vec2d);
+
+ blkCnt -- ;
+ }
+ /* horizontal add of each accumulator -> one dot product per dotV lane */
+ accum2 = vpadd_f32(vget_low_f32(accuma),vget_high_f32(accuma));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,0);
+
+ accum2 = vpadd_f32(vget_low_f32(accumb),vget_high_f32(accumb));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,1);
+
+ accum2 = vpadd_f32(vget_low_f32(accumc),vget_high_f32(accumc));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,2);
+
+ accum2 = vpadd_f32(vget_low_f32(accumd),vget_high_f32(accumd));
+ dotV = vsetq_lane_f32(vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1),dotV,3);
+
+
+ /* scalar tail over the remaining 1..3 dimensions of all 4 rows */
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,0) + *pIn * *pSupporta++, dotV,0);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,1) + *pIn * *pSupportb++, dotV,1);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,2) + *pIn * *pSupportc++, dotV,2);
+ dotV = vsetq_lane_f32(vgetq_lane_f32(dotV,3) + *pIn * *pSupportd++, dotV,3);
+
+ pIn++;
+
+ blkCnt -- ;
+ }
+
+ vec1 = vld1q_f32(pDualCoefs);
+ pDualCoefs += 4;
+
+ // To vectorize later
+ /* sigmoid kernel: tanh(gamma*dot + coef0), 4 rows at once */
+ dotV = vmulq_n_f32(dotV, S->gamma);
+ dotV = vaddq_f32(dotV, coef0);
+
+ dotV = vtanhq_f32(dotV);
+
+ /* weight by dual coefficients and fold into the scalar sum */
+ accum = vmulq_f32(vec1,dotV);
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ sum += vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+ /* each pointer already advanced one row; skip the other 3 rows of
+    this group so a..d land on the next group of 4 support vectors */
+ pSupporta += 3*S->vectorDimension;
+ pSupportb += 3*S->vectorDimension;
+ pSupportc += 3*S->vectorDimension;
+ pSupportd += 3*S->vectorDimension;
+
+ vectorBlkCnt -- ;
+ }
+
+ /* remaining 1..3 support vectors, processed one at a time */
+ pSupport = pSupporta;
+ vectorBlkCnt = S->nbOfSupportVectors & 3;
+
+ while (vectorBlkCnt > 0U)
+ {
+ accum = vdupq_n_f32(0);
+ dot = 0.0f;
+ pIn = in;
+
+ blkCnt = S->vectorDimension >> 2;
+ while (blkCnt > 0U)
+ {
+
+ vec1 = vld1q_f32(pIn);
+ vec2 = vld1q_f32(pSupport);
+ pIn += 4;
+ pSupport += 4;
+
+ accum = vmlaq_f32(accum, vec1,vec2);
+
+ blkCnt -- ;
+ }
+ accum2 = vpadd_f32(vget_low_f32(accum),vget_high_f32(accum));
+ dot = vget_lane_f32(accum2, 0) + vget_lane_f32(accum2, 1);
+
+
+ blkCnt = S->vectorDimension & 3;
+ while (blkCnt > 0U)
+ {
+ dot = dot + *pIn++ * *pSupport++;
+
+ blkCnt -- ;
+ }
+
+ sum += *pDualCoefs++ * tanhf(S->gamma * dot + S->coef0);
+ vectorBlkCnt -- ;
+ }
+
+ *pResult=S->classes[STEP(sum)];
+}
+#else
+void arm_svm_sigmoid_predict_f32(
+ const arm_svm_sigmoid_instance_f32 *S,
+ const float32_t * in,
+ int32_t * pResult)
+{
+ /* Scalar reference implementation. Decision value is
+    intercept + sum_i dualCoef[i] * tanh(gamma * <in, sv_i> + coef0),
+    then mapped to a class index with STEP() before the table lookup. */
+ float32_t sum=S->intercept;
+ float32_t dot=0;
+ uint32_t i,j;
+ const float32_t *pSupport = S->supportVectors;
+
+ for(i=0; i < S->nbOfSupportVectors; i++)
+ {
+ dot=0;
+ /* dot product between the input vector and support vector i */
+ for(j=0; j < S->vectorDimension; j++)
+ {
+ dot = dot + in[j]* *pSupport++;
+ }
+ sum += S->dualCoefficients[i] * tanhf(S->gamma * dot + S->coef0);
+ }
+ *pResult=S->classes[STEP(sum)];
+}
+
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of sigmoidsvm group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/StatisticsFunctions.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/StatisticsFunctions.c
new file mode 100644
index 0000000000..e6d0003590
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/StatisticsFunctions.c
@@ -0,0 +1,68 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: StatisticsFunctions.c
+ * Description: Combination of all statistics function source files.
+ *
+ * $Date: 16. March 2020
+ * $Revision: V1.1.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_entropy_f32.c"
+#include "arm_entropy_f64.c"
+#include "arm_kullback_leibler_f32.c"
+#include "arm_kullback_leibler_f64.c"
+#include "arm_logsumexp_dot_prod_f32.c"
+#include "arm_logsumexp_f32.c"
+#include "arm_max_f32.c"
+#include "arm_max_q15.c"
+#include "arm_max_q31.c"
+#include "arm_max_q7.c"
+#include "arm_max_no_idx_f32.c"
+#include "arm_mean_f32.c"
+#include "arm_mean_q15.c"
+#include "arm_mean_q31.c"
+#include "arm_mean_q7.c"
+#include "arm_min_f32.c"
+#include "arm_min_q15.c"
+#include "arm_min_q31.c"
+#include "arm_min_q7.c"
+#include "arm_power_f32.c"
+#include "arm_power_q15.c"
+#include "arm_power_q31.c"
+#include "arm_power_q7.c"
+#include "arm_rms_f32.c"
+#include "arm_rms_q15.c"
+#include "arm_rms_q31.c"
+#include "arm_std_f32.c"
+#include "arm_std_q15.c"
+#include "arm_std_q31.c"
+#include "arm_var_f32.c"
+#include "arm_var_q15.c"
+#include "arm_var_q31.c"
+#include "arm_absmax_f32.c"
+#include "arm_absmax_q15.c"
+#include "arm_absmax_q31.c"
+#include "arm_absmax_q7.c"
+#include "arm_absmin_f32.c"
+#include "arm_absmin_q15.c"
+#include "arm_absmin_q31.c"
+#include "arm_absmin_q7.c"
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/StatisticsFunctionsF16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/StatisticsFunctionsF16.c
new file mode 100644
index 0000000000..307fff830c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/StatisticsFunctionsF16.c
@@ -0,0 +1,42 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: StatisticsFunctions.c
+ * Description: Combination of all statistics function source files.
+ *
+ * $Date: 16. March 2020
+ * $Revision: V1.1.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_max_f16.c"
+#include "arm_min_f16.c"
+#include "arm_mean_f16.c"
+#include "arm_power_f16.c"
+#include "arm_rms_f16.c"
+#include "arm_std_f16.c"
+#include "arm_var_f16.c"
+#include "arm_entropy_f16.c"
+#include "arm_kullback_leibler_f16.c"
+#include "arm_logsumexp_dot_prod_f16.c"
+#include "arm_logsumexp_f16.c"
+#include "arm_max_no_idx_f16.c"
+#include "arm_absmax_f16.c"
+#include "arm_absmin_f16.c"
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_f16.c
new file mode 100644
index 0000000000..c2de94e90d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_f16.c
@@ -0,0 +1,274 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_absmax_f16.c
+ * Description: Maximum value of a absolute values of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+/**
+ @ingroup groupStats
+ */
+
+
+/**
+ @addtogroup AbsMax
+ @{
+ */
+
+/**
+ @brief Maximum value of absolute values of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
void arm_absmax_f16(
+ const float16_t * pSrc,
+ uint32_t blockSize,
+ float16_t * pResult,
+ uint32_t * pIndex)
+{
+ /* MVE variant: maintains an 8-lane running |max| together with the
+    index where each lane's extremum was seen, then reduces across
+    lanes, picking the lowest index on ties. */
+ uint16_t blkCnt; /* loop counters */
+ f16x8_t vecSrc;
+ float16_t const *pSrcVec;
+ f16x8_t curExtremValVec = vdupq_n_f16(F16_ABSMIN);
+ float16_t maxValue = F16_ABSMIN;
+ uint16_t idx = blockSize;
+ uint16x8_t indexVec;
+ uint16x8_t curExtremIdxVec;
+ mve_pred16_t p0;
+
+
+ indexVec = vidupq_u16((uint32_t)0, 1);
+ curExtremIdxVec = vdupq_n_u16(0);
+
+ pSrcVec = (float16_t const *) pSrc;
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0U)
+ {
+ vecSrc = vldrhq_f16(pSrcVec);
+ pSrcVec += 8;
+ vecSrc = vabsq(vecSrc);
+ /*
+ * Get current max per lane and current index per lane
+ * when a max is selected
+ */
+ p0 = vcmpgeq(vecSrc, curExtremValVec);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+ indexVec = indexVec + 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = blockSize & 7;
+ if (blkCnt > 0U)
+ {
+ /* NOTE(review): this load is unpredicated, so up to 7 halfwords
+    beyond pSrc[blockSize-1] may be read before the vctp16q predicate
+    below discards them -- confirm the over-read is acceptable */
+ vecSrc = vldrhq_f16(pSrcVec);
+ pSrcVec += 8;
+ vecSrc = vabsq(vecSrc);
+
+ p0 = vctp16q(blkCnt);
+ /*
+ * Get current max per lane and current index per lane
+ * when a max is selected
+ */
+ p0 = vcmpgeq_m(vecSrc, curExtremValVec, p0);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+ }
+ /*
+ * Get max value across the vector
+ */
+ maxValue = vmaxnmvq(maxValue, curExtremValVec);
+ /*
+ * set index for lower values to max possible index
+ */
+ p0 = vcmpgeq(curExtremValVec, maxValue);
+ indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
+ /*
+ * Get min index which is thus for a max value
+ * (vminvq resolves ties toward the smallest index)
+ */
+ idx = vminvq(idx, indexVec);
+ /*
+ * Save result
+ */
+ *pIndex = idx;
+ *pResult = maxValue;
+}
+#else
+#if defined(ARM_MATH_LOOPUNROLL)
void arm_absmax_f16(
+ const float16_t * pSrc,
+ uint32_t blockSize,
+ float16_t * pResult,
+ uint32_t * pIndex)
+{
+ /* Loop-unrolled scalar variant: compares |x| four samples per
+    iteration against the running extremum, tracking its index.
+    NOTE(review): the trailing backslash continuations appear to be
+    leftovers from a shared macro body; they splice physical lines but
+    do not change the statements -- harmless, left untouched. */
+ float16_t cur_absmax, out; /* Temporary variables to store the output value. */\
+ uint32_t blkCnt, outIndex; /* Loop counter */ \
+ uint32_t index; /* index of maximum value */ \
+ \
+ /* Initialize index value to zero. */ \
+ outIndex = 0U; \
+ /* Load first input value that act as reference value for comparision */ \
+ out = *pSrc++; \
+ out = (out > 0.0f16) ? out : -out; \
+ /* Initialize index of extrema value. */ \
+ index = 0U; \
+ \
+ /* Loop unrolling: Compute 4 outputs at a time */ \
+ blkCnt = (blockSize - 1U) >> 2U; \
+ \
+ while (blkCnt > 0U) \
+ { \
+ /* Initialize cur_absmax to next consecutive values one by one */ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0.0f16) ? cur_absmax : -cur_absmax; \
+ /* compare for the extrema value */ \
+ if (cur_absmax > out) \
+ { \
+ /* Update the extrema value and it's index */ \
+ out = cur_absmax; \
+ outIndex = index + 1U; \
+ } \
+ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0.0f16) ? cur_absmax : -cur_absmax; \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = index + 2U; \
+ } \
+ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0.0f16) ? cur_absmax : -cur_absmax; \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = index + 3U; \
+ } \
+ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0.0f16) ? cur_absmax : -cur_absmax; \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = index + 4U; \
+ } \
+ \
+ index += 4U; \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Loop unrolling: Compute remaining outputs */ \
+ blkCnt = (blockSize - 1U) % 4U; \
+ \
+ \
+ while (blkCnt > 0U) \
+ { \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0.0f16) ? cur_absmax : -cur_absmax; \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = blockSize - blkCnt; \
+ } \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Store the extrema value and it's index into destination pointers */ \
+ *pResult = out; \
+ *pIndex = outIndex;
+}
+#else
void arm_absmax_f16(
+ const float16_t * pSrc,
+ uint32_t blockSize,
+ float16_t * pResult,
+ uint32_t * pIndex)
+{
+ /* Plain scalar variant: linear scan of |x| keeping the running maximum
+    and the index where it was found. */
+ float16_t maxVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+
+ /* Load first input value that act as reference value for comparision */
+ /* fabsf promotes the float16_t value to float; the result is narrowed
+    back on assignment */
+ out = fabsf(*pSrc++);
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal = fabsf(*pSrc++);
+
+ /* compare for the maximum value */
+ if (out < maxVal)
+ {
+ /* Update the maximum value and it's index */
+ /* strict '<' keeps the earliest index on ties */
+ out = maxVal;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the maximum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_LOOPUNROLL) */
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+ @} end of AbsMax group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_f32.c
new file mode 100644
index 0000000000..1eeacdaf38
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_f32.c
@@ -0,0 +1,260 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_absmax_f32.c
+ * Description: Maximum value of absolute values of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @defgroup AbsMax Absolute Maximum
+
+ Computes the maximum value of absolute values of an array of data.
+ The function returns both the maximum value and its position within the array.
+ There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup AbsMax
+ @{
+ */
+
+/**
+ @brief Maximum value of absolute values of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
void arm_absmax_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult,
+ uint32_t * pIndex)
+{
+ /* MVE variant with tail predication: keeps a 4-lane running |max| and
+    the index where each lane's extremum was seen, then reduces across
+    lanes, picking the lowest index on ties. */
+ int32_t blkSize = blockSize;
+ f32x4_t vecSrc;
+ f32x4_t curExtremValVec = vdupq_n_f32(F32_ABSMIN);
+ float32_t maxValue = F32_ABSMIN;
+ uint32_t idx = blockSize;
+ uint32x4_t indexVec;
+ uint32x4_t curExtremIdxVec;
+ uint32_t curIdx = 0;
+ mve_pred16_t p0;
+
+
+ /* vidupq_wb_u32 yields {0,1,2,3} and writes curIdx back as 4 */
+ indexVec = vidupq_wb_u32(&curIdx, 1);
+ curExtremIdxVec = vdupq_n_u32(0);
+
+ do {
+ /* vctp32q(blkSize) masks off lanes beyond the remaining samples */
+ mve_pred16_t p = vctp32q(blkSize);
+
+ vecSrc = vldrwq_z_f32((float32_t const *) pSrc, p);
+ vecSrc = vabsq_m(vuninitializedq_f32(), vecSrc, p);
+ /*
+ * Get current max per lane and current index per lane
+ * when a max is selected
+ */
+ p0 = vcmpgeq_m(vecSrc, curExtremValVec, p);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+ /* Does TP detection works here ?? */
+ indexVec = vidupq_wb_u32(&curIdx, 1);
+
+ blkSize -= 4;
+ pSrc += 4;
+ }
+ while (blkSize > 0);
+
+ /*
+ * Get max value across the vector
+ */
+ maxValue = vmaxnmvq(maxValue, curExtremValVec);
+ /*
+ * set index for lower values to max possible index
+ */
+ p0 = vcmpgeq(curExtremValVec, maxValue);
+ indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
+ /*
+ * Get min index which is thus for a max value
+ * (vminvq resolves ties toward the smallest index)
+ */
+ idx = vminvq(idx, indexVec);
+ /*
+ * Save result
+ */
+ *pIndex = idx;
+ *pResult = maxValue;
+}
+
+
+#else
+#if defined(ARM_MATH_LOOPUNROLL)
void arm_absmax_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult,
+ uint32_t * pIndex)
+{
+ /* Loop-unrolled scalar variant: compares |x| four samples per
+    iteration against the running extremum, tracking its index.
+    NOTE(review): the trailing backslash continuations appear to be
+    leftovers from a shared macro body; they splice physical lines but
+    do not change the statements -- harmless, left untouched. */
+ float32_t cur_absmax, out; /* Temporary variables to store the output value. */\
+ uint32_t blkCnt, outIndex; /* Loop counter */ \
+ uint32_t index; /* index of maximum value */ \
+ \
+ /* Initialize index value to zero. */ \
+ outIndex = 0U; \
+ /* Load first input value that act as reference value for comparision */ \
+ out = *pSrc++; \
+ out = (out > 0.0f) ? out : -out; \
+ /* Initialize index of extrema value. */ \
+ index = 0U; \
+ \
+ /* Loop unrolling: Compute 4 outputs at a time */ \
+ blkCnt = (blockSize - 1U) >> 2U; \
+ \
+ while (blkCnt > 0U) \
+ { \
+ /* Initialize cur_absmax to next consecutive values one by one */ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0.0f) ? cur_absmax : -cur_absmax; \
+ /* compare for the extrema value */ \
+ if (cur_absmax > out) \
+ { \
+ /* Update the extrema value and it's index */ \
+ out = cur_absmax; \
+ outIndex = index + 1U; \
+ } \
+ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0.0f) ? cur_absmax : -cur_absmax; \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = index + 2U; \
+ } \
+ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0.0f) ? cur_absmax : -cur_absmax; \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = index + 3U; \
+ } \
+ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0.0f) ? cur_absmax : -cur_absmax; \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = index + 4U; \
+ } \
+ \
+ index += 4U; \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Loop unrolling: Compute remaining outputs */ \
+ blkCnt = (blockSize - 1U) % 4U; \
+ \
+ \
+ while (blkCnt > 0U) \
+ { \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0.0f) ? cur_absmax : -cur_absmax; \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = blockSize - blkCnt; \
+ } \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Store the extrema value and it's index into destination pointers */ \
+ *pResult = out; \
+ *pIndex = outIndex;
+}
+#else
void arm_absmax_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult,
+ uint32_t * pIndex)
+{
+ /* Plain scalar variant: linear scan of |x| keeping the running maximum
+    and the index where it was found. */
+ float32_t maxVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+
+ /* Load first input value that act as reference value for comparision */
+ out = fabsf(*pSrc++);
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal = fabsf(*pSrc++);
+
+ /* compare for the maximum value */
+ if (out < maxVal)
+ {
+ /* Update the maximum value and it's index */
+ /* strict '<' keeps the earliest index on ties */
+ out = maxVal;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the maximum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_LOOPUNROLL) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+ @} end of AbsMax group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q15.c
new file mode 100644
index 0000000000..03e2bd0ce3
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q15.c
@@ -0,0 +1,236 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_absmax_q15.c
+ * Description: Maximum value of absolute values of a Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup AbsMax
+ @{
+ */
+
+/**
+ @brief Maximum value of absolute values of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_absmax_q15(
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult,
+ uint32_t * pIndex)
+{
+ int32_t blkCnt; /* loop counters */
+ q15x8_t extremValVec = vdupq_n_s16(Q15_ABSMIN);
+ q15_t maxValue = Q15_ABSMIN;
+ uint16x8_t indexVec;
+ uint16x8_t extremIdxVec;
+ mve_pred16_t p0;
+ uint16_t extremIdxArr[8];
+
+ indexVec = vidupq_u16(0U, 1);
+
+ blkCnt = blockSize;
+ do {
+ mve_pred16_t p = vctp16q(blkCnt);
+ q15x8_t extremIdxVal = vld1q_z_s16(pSrc, p);
+
+ extremIdxVal = vabsq(extremIdxVal);
+ /*
+ * Get current max per lane and current index per lane
+ * when a max is selected
+ */
+ p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
+
+ extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+ /* store per-lane extrema indexes */
+ vst1q_p_u16(extremIdxArr, indexVec, p0);
+
+ indexVec += 8;
+ pSrc += 8;
+ blkCnt -= 8;
+ }
+ while (blkCnt > 0);
+
+
+ /* Get max value across the vector */
+ maxValue = vmaxvq(maxValue, extremValVec);
+
+ /* set index for lower values to max possible index */
+ p0 = vcmpgeq(extremValVec, maxValue);
+ extremIdxVec = vld1q_u16(extremIdxArr);
+
+ indexVec = vpselq(extremIdxVec, vdupq_n_u16(blockSize - 1), p0);
+ *pIndex = vminvq(blockSize - 1, indexVec);
+ *pResult = maxValue;
+}
+
+#else
+#if defined(ARM_MATH_DSP)
+void arm_absmax_q15(
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult,
+ uint32_t * pIndex)
+{
+ q15_t cur_absmax, out; /* Temporary variables to store the output value. */\
+ uint32_t blkCnt, outIndex; /* Loop counter */ \
+ uint32_t index; /* index of maximum value */ \
+ \
+ /* Initialize index value to zero. */ \
+ outIndex = 0U; \
+ /* Load first input value that act as reference value for comparision */ \
+ out = *pSrc++; \
+ out = (out > 0) ? out : (q15_t)__QSUB16(0, out); \
+ /* Initialize index of extrema value. */ \
+ index = 0U; \
+ \
+ /* Loop unrolling: Compute 4 outputs at a time */ \
+ blkCnt = (blockSize - 1U) >> 2U; \
+ \
+ while (blkCnt > 0U) \
+ { \
+ /* Initialize cur_absmax to next consecutive values one by one */ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0) ? cur_absmax : (q15_t)__QSUB16(0, cur_absmax); \
+ /* compare for the extrema value */ \
+ if (cur_absmax > out) \
+ { \
+ /* Update the extrema value and it's index */ \
+ out = cur_absmax; \
+ outIndex = index + 1U; \
+ } \
+ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0) ? cur_absmax : (q15_t)__QSUB16(0, cur_absmax); \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = index + 2U; \
+ } \
+ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0) ? cur_absmax : (q15_t)__QSUB16(0, cur_absmax); \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = index + 3U; \
+ } \
+ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0) ? cur_absmax : (q15_t)__QSUB16(0, cur_absmax); \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = index + 4U; \
+ } \
+ \
+ index += 4U; \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Loop unrolling: Compute remaining outputs */ \
+ blkCnt = (blockSize - 1U) % 4U; \
+ \
+ \
+ while (blkCnt > 0U) \
+ { \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0) ? cur_absmax : (q15_t)__QSUB16(0, cur_absmax); \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = blockSize - blkCnt; \
+ } \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Store the extrema value and it's index into destination pointers */ \
+ *pResult = out; \
+ *pIndex = outIndex;
+}
+#else
+void arm_absmax_q15(
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult,
+ uint32_t * pIndex)
+{
+ q15_t maxVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+ /* Load first input value that act as reference value for comparision */
+ out = (*pSrc > 0) ? *pSrc : ((*pSrc == (q15_t) 0x8000) ? 0x7fff : -*pSrc);
+ pSrc++;
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal = (*pSrc > 0) ? *pSrc : ((*pSrc == (q15_t) 0x8000) ? 0x7fff : -*pSrc);
+ pSrc++;
+
+ /* compare for the maximum value */
+ if (out < maxVal)
+ {
+ /* Update the maximum value and it's index */
+ out = maxVal;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the maximum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_DSP) */
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+ @} end of AbsMax group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q31.c
new file mode 100644
index 0000000000..359d3a16ac
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q31.c
@@ -0,0 +1,236 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_absmax_q31.c
+ * Description: Maximum value of absolute values of a Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup AbsMax
+ @{
+ */
+
+/**
+ @brief Maximum value of absolute values of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+void arm_absmax_q31(
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult,
+ uint32_t * pIndex)
+{
+ int32_t blkCnt; /* loop counters */
+ q31x4_t extremValVec = vdupq_n_s32(Q31_ABSMIN);
+ q31_t maxValue = Q31_ABSMIN;
+ uint32x4_t indexVec;
+ uint32x4_t extremIdxVec;
+ mve_pred16_t p0;
+ uint32_t extremIdxArr[4];
+
+ indexVec = vidupq_u32(0U, 1);
+
+ blkCnt = blockSize;
+ do {
+ mve_pred16_t p = vctp32q(blkCnt);
+ q31x4_t extremIdxVal = vld1q_z_s32(pSrc, p);
+
+ extremIdxVal = vabsq(extremIdxVal);
+ /*
+ * Get current max per lane and current index per lane
+ * when a max is selected
+ */
+ p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
+
+ extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+ /* store per-lane extrema indexes */
+ vst1q_p_u32(extremIdxArr, indexVec, p0);
+
+ indexVec += 4;
+ pSrc += 4;
+ blkCnt -= 4;
+ }
+ while (blkCnt > 0);
+
+
+ /* Get max value across the vector */
+ maxValue = vmaxvq(maxValue, extremValVec);
+
+ /* set index for lower values to max possible index */
+ p0 = vcmpgeq(extremValVec, maxValue);
+ extremIdxVec = vld1q_u32(extremIdxArr);
+
+ indexVec = vpselq(extremIdxVec, vdupq_n_u32(blockSize - 1), p0);
+ *pIndex = vminvq(blockSize - 1, indexVec);
+ *pResult = maxValue;
+}
+#else
+#if defined(ARM_MATH_DSP)
+void arm_absmax_q31(
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult,
+ uint32_t * pIndex)
+{
+ q31_t cur_absmax, out; /* Temporary variables to store the output value. */\
+ uint32_t blkCnt, outIndex; /* Loop counter */ \
+ uint32_t index; /* index of maximum value */ \
+ \
+ /* Initialize index value to zero. */ \
+ outIndex = 0U; \
+ /* Load first input value that act as reference value for comparision */ \
+ out = *pSrc++; \
+ out = (out > 0) ? out : (q31_t)__QSUB(0, out); \
+ /* Initialize index of extrema value. */ \
+ index = 0U; \
+ \
+ /* Loop unrolling: Compute 4 outputs at a time */ \
+ blkCnt = (blockSize - 1U) >> 2U; \
+ \
+ while (blkCnt > 0U) \
+ { \
+ /* Initialize cur_absmax to next consecutive values one by one */ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0) ? cur_absmax : (q31_t)__QSUB(0, cur_absmax); \
+ /* compare for the extrema value */ \
+ if (cur_absmax > out) \
+ { \
+ /* Update the extrema value and it's index */ \
+ out = cur_absmax; \
+ outIndex = index + 1U; \
+ } \
+ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0) ? cur_absmax : (q31_t)__QSUB(0, cur_absmax); \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = index + 2U; \
+ } \
+ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0) ? cur_absmax : (q31_t)__QSUB(0, cur_absmax); \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = index + 3U; \
+ } \
+ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0) ? cur_absmax : (q31_t)__QSUB(0, cur_absmax); \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = index + 4U; \
+ } \
+ \
+ index += 4U; \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Loop unrolling: Compute remaining outputs */ \
+ blkCnt = (blockSize - 1U) % 4U; \
+ \
+ \
+ while (blkCnt > 0U) \
+ { \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0) ? cur_absmax : (q31_t)__QSUB(0, cur_absmax); \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = blockSize - blkCnt; \
+ } \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Store the extrema value and it's index into destination pointers */ \
+ *pResult = out; \
+ *pIndex = outIndex;
+}
+#else
+void arm_absmax_q31(
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult,
+ uint32_t * pIndex)
+{
+ q31_t maxVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+ /* Load first input value that act as reference value for comparision */
+ out = (*pSrc > 0) ? *pSrc : ((*pSrc == INT32_MIN) ? INT32_MAX : -*pSrc);
+ pSrc++;
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal = (*pSrc > 0) ? *pSrc : ((*pSrc == INT32_MIN) ? INT32_MAX : -*pSrc);
+ pSrc++;
+
+ /* compare for the maximum value */
+ if (out < maxVal)
+ {
+ /* Update the maximum value and it's index */
+ out = maxVal;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the maximum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_DSP) */
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+ @} end of AbsMax group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q7.c
new file mode 100644
index 0000000000..7b40cf566e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmax_q7.c
@@ -0,0 +1,294 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_absmax_q7.c
+ * Description: Maximum value of absolute values of a Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup AbsMax
+ @{
+ */
+
+/**
+ @brief Maximum value of absolute values of a Q7 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include <stdint.h>
+#include "arm_helium_utils.h"
+
+#define MAX_BLKSZ_S8 (UINT8_MAX+1)
+
+static void arm_small_blk_absmax_q7(
+ const q7_t * pSrc,
+ uint16_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
+{
+ int32_t blkCnt; /* loop counters */
+ q7x16_t extremValVec = vdupq_n_s8(Q7_ABSMIN);
+ q7_t maxValue = Q7_ABSMIN;
+ uint8x16_t indexVec;
+ uint8x16_t extremIdxVec;
+ mve_pred16_t p0;
+ uint8_t extremIdxArr[16];
+
+ indexVec = vidupq_u8(0U, 1);
+
+ blkCnt = blockSize;
+ do {
+ mve_pred16_t p = vctp8q(blkCnt);
+ q7x16_t extremIdxVal = vld1q_z_s8(pSrc, p);
+
+ extremIdxVal = vabsq(extremIdxVal);
+ /*
+ * Get current max per lane and current index per lane
+ * when a max is selected
+ */
+ p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
+
+ extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+ /* store per-lane extrema indexes */
+ vst1q_p_u8(extremIdxArr, indexVec, p0);
+
+ indexVec += 16;
+ pSrc += 16;
+ blkCnt -= 16;
+ }
+ while (blkCnt > 0);
+
+
+ /* Get max value across the vector */
+ maxValue = vmaxvq(maxValue, extremValVec);
+
+ /* set index for lower values to max possible index */
+ p0 = vcmpgeq(extremValVec, maxValue);
+ extremIdxVec = vld1q_u8(extremIdxArr);
+
+ indexVec = vpselq(extremIdxVec, vdupq_n_u8(blockSize - 1), p0);
+ *pIndex = vminvq_u8(blockSize - 1, indexVec);
+ *pResult = maxValue;
+}
+
+void arm_absmax_q7(
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
+{
+ int32_t totalSize = blockSize;
+
+ if (totalSize <= MAX_BLKSZ_S8)
+ {
+ arm_small_blk_absmax_q7(pSrc, blockSize, pResult, pIndex);
+ }
+ else
+ {
+ uint32_t curIdx = 0;
+ q7_t curBlkExtr = Q7_MIN;
+ uint32_t curBlkPos = 0;
+ uint32_t curBlkIdx = 0;
+ /*
+ * process blocks of 255 elts
+ */
+ while (totalSize >= MAX_BLKSZ_S8)
+ {
+ const q7_t *curSrc = pSrc;
+
+ arm_small_blk_absmax_q7(curSrc, MAX_BLKSZ_S8, pResult, pIndex);
+ if (*pResult > curBlkExtr)
+ {
+ /*
+ * update partial extrema
+ */
+ curBlkExtr = *pResult;
+ curBlkPos = *pIndex;
+ curBlkIdx = curIdx;
+ }
+ curIdx++;
+ pSrc += MAX_BLKSZ_S8;
+ totalSize -= MAX_BLKSZ_S8;
+ }
+ /*
+ * remainder
+ */
+ arm_small_blk_absmax_q7(pSrc, totalSize, pResult, pIndex);
+ if (*pResult > curBlkExtr)
+ {
+ curBlkExtr = *pResult;
+ curBlkPos = *pIndex;
+ curBlkIdx = curIdx;
+ }
+ *pIndex = curBlkIdx * MAX_BLKSZ_S8 + curBlkPos;
+ *pResult = curBlkExtr;
+ }
+}
+#else
+#if defined(ARM_MATH_DSP)
+void arm_absmax_q7(
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
+{
+ q7_t cur_absmax, out; /* Temporary variables to store the output value. */\
+ uint32_t blkCnt, outIndex; /* Loop counter */ \
+ uint32_t index; /* index of maximum value */ \
+ \
+ /* Initialize index value to zero. */ \
+ outIndex = 0U; \
+ /* Load first input value that act as reference value for comparision */ \
+ out = *pSrc++; \
+ out = (out > 0) ? out : (q7_t)__QSUB8(0, out); \
+ /* Initialize index of extrema value. */ \
+ index = 0U; \
+ \
+ /* Loop unrolling: Compute 4 outputs at a time */ \
+ blkCnt = (blockSize - 1U) >> 2U; \
+ \
+ while (blkCnt > 0U) \
+ { \
+ /* Initialize cur_absmax to next consecutive values one by one */ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0) ? cur_absmax : (q7_t)__QSUB8(0, cur_absmax); \
+ /* compare for the extrema value */ \
+ if (cur_absmax > out) \
+ { \
+ /* Update the extrema value and it's index */ \
+ out = cur_absmax; \
+ outIndex = index + 1U; \
+ } \
+ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0) ? cur_absmax : (q7_t)__QSUB8(0, cur_absmax); \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = index + 2U; \
+ } \
+ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0) ? cur_absmax : (q7_t)__QSUB8(0, cur_absmax); \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = index + 3U; \
+ } \
+ \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0) ? cur_absmax : (q7_t)__QSUB8(0, cur_absmax); \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = index + 4U; \
+ } \
+ \
+ index += 4U; \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Loop unrolling: Compute remaining outputs */ \
+ blkCnt = (blockSize - 1U) % 4U; \
+ \
+ \
+ while (blkCnt > 0U) \
+ { \
+ cur_absmax = *pSrc++; \
+ cur_absmax = (cur_absmax > 0) ? cur_absmax : (q7_t)__QSUB8(0, cur_absmax); \
+ if (cur_absmax > out) \
+ { \
+ out = cur_absmax; \
+ outIndex = blockSize - blkCnt; \
+ } \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Store the extrema value and it's index into destination pointers */ \
+ *pResult = out; \
+ *pIndex = outIndex;
+}
+#else
+void arm_absmax_q7(
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
+{
+ q7_t maxVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+ /* Load first input value that act as reference value for comparision */
+ out = (*pSrc > 0) ? *pSrc : ((*pSrc == (q7_t) 0x80) ? (q7_t) 0x7f : -*pSrc);
+ pSrc++;
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal = (*pSrc > 0) ? *pSrc : ((*pSrc == (q7_t) 0x80) ? (q7_t) 0x7f : -*pSrc);
+ pSrc++;
+
+ /* compare for the maximum value */
+ if (out < maxVal)
+ {
+ /* Update the maximum value and it's index */
+ out = maxVal;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the maximum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_DSP) */
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+ @} end of AbsMax group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_f16.c
new file mode 100644
index 0000000000..4e8dde279a
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_f16.c
@@ -0,0 +1,276 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_absmin_f16.c
+ * Description: Minimum value of absolute values of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup AbsMin
+ @{
+ */
+
+/**
+ @brief Minimum value of absolute values of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+void arm_absmin_f16(
+ const float16_t * pSrc,
+ uint32_t blockSize,
+ float16_t * pResult,
+ uint32_t * pIndex)
+{
+ uint16_t blkCnt; /* loop counters */
+ f16x8_t vecSrc;
+ float16_t const *pSrcVec;
+ f16x8_t curExtremValVec = vdupq_n_f16(F16_ABSMAX);
+ float16_t minValue = F16_ABSMAX;
+ uint16_t idx = blockSize;
+ uint16x8_t indexVec;
+ uint16x8_t curExtremIdxVec;
+ mve_pred16_t p0;
+
+
+ indexVec = vidupq_u16((uint32_t)0, 1);
+ curExtremIdxVec = vdupq_n_u16(0);
+
+ pSrcVec = (float16_t const *) pSrc;
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0U)
+ {
+ vecSrc = vldrhq_f16(pSrcVec);
+ pSrcVec += 8;
+ vecSrc = vabsq(vecSrc);
+ /*
+ * Get current max per lane and current index per lane
+ * when a max is selected
+ */
+ p0 = vcmpleq(vecSrc, curExtremValVec);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+ indexVec = indexVec + 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = blockSize & 7;
+ if (blkCnt > 0U)
+ {
+ p0 = vctp16q(blkCnt);
+
+ vecSrc = vldrhq_f16(pSrcVec);
+ pSrcVec += 8;
+ vecSrc = vabsq(vecSrc);
+ /*
+ * Get current max per lane and current index per lane
+ * when a max is selected
+ */
+ p0 = vcmpleq_m(vecSrc, curExtremValVec, p0);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+ }
+ /*
+ * Get min value across the vector
+ */
+ minValue = vminnmvq(minValue, curExtremValVec);
+ /*
+ * set index for lower values to max possible index
+ */
+ p0 = vcmpleq(curExtremValVec, minValue);
+ indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
+ /*
+ * Get min index which is thus for a max value
+ */
+ idx = vminvq(idx, indexVec);
+ /*
+ * Save result
+ */
+ *pIndex = idx;
+ *pResult = minValue;
+}
+
+#else
+#if defined(ARM_MATH_LOOPUNROLL)
+void arm_absmin_f16(
+ const float16_t * pSrc,
+ uint32_t blockSize,
+ float16_t * pResult,
+ uint32_t * pIndex)
+{
+ float16_t cur_absmin, out; /* Temporary variables to store the output value. */\
+ uint32_t blkCnt, outIndex; /* Loop counter */ \
+ uint32_t index; /* index of maximum value */ \
+ \
+ /* Initialize index value to zero. */ \
+ outIndex = 0U; \
+ /* Load first input value that act as reference value for comparision */ \
+ out = *pSrc++; \
+ out = (out > 0.0f16) ? out : -out; \
+ /* Initialize index of extrema value. */ \
+ index = 0U; \
+ \
+ /* Loop unrolling: Compute 4 outputs at a time */ \
+ blkCnt = (blockSize - 1U) >> 2U; \
+ \
+ while (blkCnt > 0U) \
+ { \
+ /* Initialize cur_absmin to next consecutive values one by one */ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0.0f16) ? cur_absmin : -cur_absmin; \
+ /* compare for the extrema value */ \
+ if (cur_absmin < out) \
+ { \
+ /* Update the extrema value and it's index */ \
+ out = cur_absmin; \
+ outIndex = index + 1U; \
+ } \
+ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0.0f16) ? cur_absmin : -cur_absmin; \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = index + 2U; \
+ } \
+ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0.0f16) ? cur_absmin : -cur_absmin; \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = index + 3U; \
+ } \
+ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0.0f16) ? cur_absmin : -cur_absmin; \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = index + 4U; \
+ } \
+ \
+ index += 4U; \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Loop unrolling: Compute remaining outputs */ \
+ blkCnt = (blockSize - 1U) % 4U; \
+ \
+ \
+ while (blkCnt > 0U) \
+ { \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0.0f16) ? cur_absmin : -cur_absmin; \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = blockSize - blkCnt; \
+ } \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Store the extrema value and it's index into destination pointers */ \
+ *pResult = out; \
+ *pIndex = outIndex;
+}
+#else
+void arm_absmin_f16(
+ const float16_t * pSrc,
+ uint32_t blockSize,
+ float16_t * pResult,
+ uint32_t * pIndex)
+{
+ float16_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+
+ /* Load first input value that act as reference value for comparision */
+ out = fabsf(*pSrc++);
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal = fabsf(*pSrc++);
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and it's index */
+ out = minVal;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the minimum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_LOOPUNROLL) */
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+ @} end of AbsMin group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_f32.c
new file mode 100644
index 0000000000..df711ac9e6
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_f32.c
@@ -0,0 +1,279 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_absmin_f32.c
+ * Description: Minimum value of absolute values of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @defgroup AbsMin Absolute Minimum
+
+ Computes the minimum value of absolute values of an array of data.
+ The function returns both the minimum value and its position within the array.
+ There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup AbsMin
+ @{
+ */
+
+/**
+ @brief Minimum value of absolute values of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+void arm_absmin_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult,
+ uint32_t * pIndex)
+{
+ int32_t blkCnt; /* loop counters */
+ f32x4_t vecSrc;
+ float32_t const *pSrcVec;
+ f32x4_t curExtremValVec = vdupq_n_f32(F32_ABSMAX);
+ float32_t minValue = F32_ABSMAX;
+ uint32_t idx = blockSize;
+ uint32x4_t indexVec;
+ uint32x4_t curExtremIdxVec;
+ mve_pred16_t p0;
+
+
+ indexVec = vidupq_u32((uint32_t)0, 1);
+ curExtremIdxVec = vdupq_n_u32(0);
+
+ pSrcVec = (float32_t const *) pSrc;
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0)
+ {
+ vecSrc = vldrwq_f32(pSrcVec);
+ pSrcVec += 4;
+ vecSrc = vabsq(vecSrc);
+ /*
+ * Get current min per lane and current index per lane
+ * when a min is selected
+ */
+ p0 = vcmpleq(vecSrc, curExtremValVec);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+ indexVec = indexVec + 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = blockSize & 3;
+ if (blkCnt > 0)
+ {
+ p0 = vctp32q(blkCnt);
+
+ vecSrc = vldrwq_f32(pSrcVec);
+ pSrcVec += 4;
+ vecSrc = vabsq(vecSrc);
+ /*
+ * Get current min per lane and current index per lane
+ * when a min is selected
+ */
+ p0 = vcmpleq_m(vecSrc, curExtremValVec, p0);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+ }
+ /*
+ * Get min value across the vector
+ */
+ minValue = vminnmvq(minValue, curExtremValVec);
+ /*
+ * set index for lower values to max possible index
+ */
+ p0 = vcmpleq(curExtremValVec, minValue);
+ indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
+ /*
+ * Get min index which is thus for a min value
+ */
+ idx = vminvq(idx, indexVec);
+ /*
+ * Save result
+ */
+ *pIndex = idx;
+ *pResult = minValue;
+}
+
+#else
+#if defined(ARM_MATH_LOOPUNROLL)
+void arm_absmin_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult,
+ uint32_t * pIndex)
+{
+ float32_t cur_absmin, out; /* Temporary variables to store the output value. */\
+ uint32_t blkCnt, outIndex; /* Loop counter */ \
+ uint32_t index; /* index of minimum value */ \
+ \
+ /* Initialize index value to zero. */ \
+ outIndex = 0U; \
+ /* Load first input value that acts as reference value for comparison */ \
+ out = *pSrc++; \
+ out = (out > 0.0f) ? out : -out; \
+ /* Initialize index of extrema value. */ \
+ index = 0U; \
+ \
+ /* Loop unrolling: Compute 4 outputs at a time */ \
+ blkCnt = (blockSize - 1U) >> 2U; \
+ \
+ while (blkCnt > 0U) \
+ { \
+ /* Initialize cur_absmin to next consecutive values one by one */ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0.0f) ? cur_absmin : -cur_absmin; \
+ /* compare for the extrema value */ \
+ if (cur_absmin < out) \
+ { \
+ /* Update the extrema value and its index */ \
+ out = cur_absmin; \
+ outIndex = index + 1U; \
+ } \
+ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0.0f) ? cur_absmin : -cur_absmin; \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = index + 2U; \
+ } \
+ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0.0f) ? cur_absmin : -cur_absmin; \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = index + 3U; \
+ } \
+ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0.0f) ? cur_absmin : -cur_absmin; \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = index + 4U; \
+ } \
+ \
+ index += 4U; \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Loop unrolling: Compute remaining outputs */ \
+ blkCnt = (blockSize - 1U) % 4U; \
+ \
+ \
+ while (blkCnt > 0U) \
+ { \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0.0f) ? cur_absmin : -cur_absmin; \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = blockSize - blkCnt; \
+ } \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Store the extrema value and its index into destination pointers */ \
+ *pResult = out; \
+ *pIndex = outIndex;
+}
+#else
+void arm_absmin_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult,
+ uint32_t * pIndex)
+{
+ float32_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+
+ /* Load first input value that acts as reference value for comparison */
+ out = fabsf(*pSrc++);
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal = fabsf(*pSrc++);
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and its index */
+ out = minVal;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the minimum value and its index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+
+#endif /* defined(ARM_MATH_LOOPUNROLL) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+ @} end of AbsMin group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q15.c
new file mode 100644
index 0000000000..525a13692b
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q15.c
@@ -0,0 +1,269 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_absmin_q15.c
+ * Description: Minimum value of absolute values of a Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+
+/**
+ @addtogroup AbsMin
+ @{
+ */
+
+/**
+ @brief Minimum value of absolute values of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_absmin_q15(
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult,
+ uint32_t * pIndex)
+{
+ uint16_t blkCnt; /* loop counters */
+ q15x8_t vecSrc;
+ q15_t const *pSrcVec;
+ q15x8_t curExtremValVec = vdupq_n_s16(Q15_ABSMAX);
+ q15_t minValue = Q15_ABSMAX;
+ uint16_t idx = blockSize; /* NOTE(review): 16-bit index assumes blockSize <= 0xFFFF -- confirm */
+ uint16x8_t indexVec;
+ uint16x8_t curExtremIdxVec;
+ uint32_t startIdx = 0;
+ mve_pred16_t p0;
+
+
+ indexVec = vidupq_wb_u16(&startIdx, 1);
+ curExtremIdxVec = vdupq_n_u16(0);
+
+ pSrcVec = (q15_t const *) pSrc;
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0U)
+ {
+ vecSrc = vld1q(pSrcVec);
+ pSrcVec += 8;
+ vecSrc = vabsq(vecSrc);
+ /*
+ * Get current min per lane and current index per lane
+ * when a min is selected
+ */
+ p0 = vcmpleq(vecSrc, curExtremValVec);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+ indexVec = vidupq_wb_u16(&startIdx, 1);
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = blockSize & 7;
+ if (blkCnt > 0U)
+ {
+ vecSrc = vld1q(pSrcVec);
+ pSrcVec += 8;
+ vecSrc = vabsq(vecSrc);
+
+ p0 = vctp16q(blkCnt);
+ /*
+ * Get current min per lane and current index per lane
+ * when a min is selected
+ */
+ p0 = vcmpleq_m(vecSrc, curExtremValVec, p0);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+ }
+ /*
+ * Get min value across the vector
+ */
+ minValue = vminvq(minValue, curExtremValVec);
+ /*
+ * set index for lower values to max possible index (blockSize)
+ */
+ p0 = vcmpleq(curExtremValVec, minValue);
+ indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
+ /*
+ * Get min index which is thus for a min value
+ */
+ idx = vminvq(idx, indexVec);
+ /*
+ * Save result
+ */
+ *pIndex = idx;
+ *pResult = minValue;
+}
+
+#else
+#if defined(ARM_MATH_DSP)
+void arm_absmin_q15(
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult,
+ uint32_t * pIndex)
+{
+ q15_t cur_absmin, out; /* Temporary variables to store the output value. */\
+ uint32_t blkCnt, outIndex; /* Loop counter */ \
+ uint32_t index; /* index of minimum value */ \
+ \
+ /* Initialize index value to zero. */ \
+ outIndex = 0U; \
+ /* Load first input value that acts as reference value for comparison */ \
+ out = *pSrc++; \
+ out = (out > 0) ? out : (q15_t)__QSUB16(0, out); /* saturating negate: abs(0x8000) -> 0x7fff */ \
+ /* Initialize index of extrema value. */ \
+ index = 0U; \
+ \
+ /* Loop unrolling: Compute 4 outputs at a time */ \
+ blkCnt = (blockSize - 1U) >> 2U; \
+ \
+ while (blkCnt > 0U) \
+ { \
+ /* Initialize cur_absmin to next consecutive values one by one */ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0) ? cur_absmin : (q15_t)__QSUB16(0, cur_absmin); \
+ /* compare for the extrema value */ \
+ if (cur_absmin < out) \
+ { \
+ /* Update the extrema value and its index */ \
+ out = cur_absmin; \
+ outIndex = index + 1U; \
+ } \
+ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0) ? cur_absmin : (q15_t)__QSUB16(0, cur_absmin); \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = index + 2U; \
+ } \
+ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0) ? cur_absmin : (q15_t)__QSUB16(0, cur_absmin); \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = index + 3U; \
+ } \
+ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0) ? cur_absmin : (q15_t)__QSUB16(0, cur_absmin); \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = index + 4U; \
+ } \
+ \
+ index += 4U; \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Loop unrolling: Compute remaining outputs */ \
+ blkCnt = (blockSize - 1U) % 4U; \
+ \
+ \
+ while (blkCnt > 0U) \
+ { \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0) ? cur_absmin : (q15_t)__QSUB16(0, cur_absmin); \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = blockSize - blkCnt; \
+ } \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Store the extrema value and its index into destination pointers */ \
+ *pResult = out; \
+ *pIndex = outIndex;
+}
+#else
+void arm_absmin_q15(
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult,
+ uint32_t * pIndex)
+{
+ q15_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+ /* Load first input value that acts as reference value for comparison */
+ out = (*pSrc > 0) ? *pSrc : ((*pSrc == (q15_t) 0x8000) ? 0x7fff : -*pSrc); /* abs(0x8000) saturates to 0x7fff */
+ pSrc++;
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal = (*pSrc > 0) ? *pSrc : ((*pSrc == (q15_t) 0x8000) ? 0x7fff : -*pSrc);
+ pSrc++;
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and its index */
+ out = minVal;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the minimum value and its index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_DSP) */
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+ @} end of AbsMin group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q31.c
new file mode 100644
index 0000000000..808f6a0032
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q31.c
@@ -0,0 +1,269 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_absmin_q31.c
+ * Description: Minimum value of absolute values of a Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+
+/**
+ @addtogroup AbsMin
+ @{
+ */
+
+/**
+ @brief Minimum value of absolute values of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_absmin_q31(
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult,
+ uint32_t * pIndex)
+{
+ uint16_t blkCnt; /* loop counters */
+ q31x4_t vecSrc;
+ q31_t const *pSrcVec;
+ q31x4_t curExtremValVec = vdupq_n_s32(Q31_ABSMAX);
+ q31_t minValue = Q31_ABSMAX;
+ uint16_t idx = blockSize; /* NOTE(review): 16-bit index assumes blockSize <= 0xFFFF -- confirm */
+ uint32x4_t indexVec;
+ uint32x4_t curExtremIdxVec;
+ uint32_t startIdx = 0;
+ mve_pred16_t p0;
+
+
+ indexVec = vidupq_wb_u32(&startIdx, 1);
+ curExtremIdxVec = vdupq_n_u32(0);
+
+ pSrcVec = (q31_t const *) pSrc;
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0U)
+ {
+ vecSrc = vldrwq_s32(pSrcVec);
+ pSrcVec += 4;
+ vecSrc = vabsq(vecSrc);
+ /*
+ * Get current min per lane and current index per lane
+ * when a min is selected
+ */
+ p0 = vcmpleq(vecSrc, curExtremValVec);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+ indexVec = vidupq_wb_u32(&startIdx, 1);
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = blockSize & 3;
+ if (blkCnt > 0U)
+ {
+ vecSrc = vldrwq_s32(pSrcVec);
+ pSrcVec += 4;
+ vecSrc = vabsq(vecSrc);
+
+ p0 = vctp32q(blkCnt);
+ /*
+ * Get current min per lane and current index per lane
+ * when a min is selected
+ */
+ p0 = vcmpleq_m(vecSrc, curExtremValVec, p0);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+ }
+ /*
+ * Get min value across the vector
+ */
+ minValue = vminvq(minValue, curExtremValVec);
+ /*
+ * set index for lower values to max possible index (blockSize)
+ */
+ p0 = vcmpleq(curExtremValVec, minValue);
+ indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
+ /*
+ * Get min index which is thus for a min value
+ */
+ idx = vminvq(idx, indexVec);
+ /*
+ * Save result
+ */
+ *pIndex = idx;
+ *pResult = minValue;
+}
+
+#else
+#if defined(ARM_MATH_DSP)
+void arm_absmin_q31(
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult,
+ uint32_t * pIndex)
+{
+ q31_t cur_absmin, out; /* Temporary variables to store the output value. */\
+ uint32_t blkCnt, outIndex; /* Loop counter */ \
+ uint32_t index; /* index of minimum value */ \
+ \
+ /* Initialize index value to zero. */ \
+ outIndex = 0U; \
+ /* Load first input value that acts as reference value for comparison */ \
+ out = *pSrc++; \
+ out = (out > 0) ? out : (q31_t)__QSUB(0, out); /* saturating negate: abs(INT32_MIN) -> INT32_MAX */ \
+ /* Initialize index of extrema value. */ \
+ index = 0U; \
+ \
+ /* Loop unrolling: Compute 4 outputs at a time */ \
+ blkCnt = (blockSize - 1U) >> 2U; \
+ \
+ while (blkCnt > 0U) \
+ { \
+ /* Initialize cur_absmin to next consecutive values one by one */ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0) ? cur_absmin : (q31_t)__QSUB(0, cur_absmin); \
+ /* compare for the extrema value */ \
+ if (cur_absmin < out) \
+ { \
+ /* Update the extrema value and its index */ \
+ out = cur_absmin; \
+ outIndex = index + 1U; \
+ } \
+ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0) ? cur_absmin : (q31_t)__QSUB(0, cur_absmin); \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = index + 2U; \
+ } \
+ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0) ? cur_absmin : (q31_t)__QSUB(0, cur_absmin); \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = index + 3U; \
+ } \
+ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0) ? cur_absmin : (q31_t)__QSUB(0, cur_absmin); \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = index + 4U; \
+ } \
+ \
+ index += 4U; \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Loop unrolling: Compute remaining outputs */ \
+ blkCnt = (blockSize - 1U) % 4U; \
+ \
+ \
+ while (blkCnt > 0U) \
+ { \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0) ? cur_absmin : (q31_t)__QSUB(0, cur_absmin); \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = blockSize - blkCnt; \
+ } \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Store the extrema value and its index into destination pointers */ \
+ *pResult = out; \
+ *pIndex = outIndex;
+}
+#else
+void arm_absmin_q31(
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult,
+ uint32_t * pIndex)
+{
+ q31_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+ /* Load first input value that acts as reference value for comparison */
+ out = (*pSrc > 0) ? *pSrc : ((*pSrc == INT32_MIN) ? INT32_MAX : -*pSrc); /* abs(INT32_MIN) saturates to INT32_MAX */
+ pSrc++;
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal = (*pSrc > 0) ? *pSrc : ((*pSrc == INT32_MIN) ? INT32_MAX : -*pSrc);
+ pSrc++;
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and its index */
+ out = minVal;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the minimum value and its index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_DSP) */
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+ @} end of AbsMin group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q7.c
new file mode 100644
index 0000000000..3d6345ff2d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_absmin_q7.c
@@ -0,0 +1,322 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_absmin_q7.c
+ * Description: Minimum value of absolute values of a Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+
+/**
+ @addtogroup AbsMin
+ @{
+ */
+
+/**
+ @brief Minimum value of absolute values of a Q7 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include <stdint.h>
+#include "arm_helium_utils.h"
+
+#define MAX_BLKSZ_S8 (UINT8_MAX+1)
+
+static void arm_small_blk_absmin_q7(
+    const q7_t *pSrc,
+    uint32_t blockSize,
+    q7_t *pResult,
+    uint32_t *pIndex)
+{
+ uint16_t blkCnt; /* loop counters */
+ q7x16_t vecSrc;
+ q7_t const *pSrcVec;
+ q7x16_t curExtremValVec = vdupq_n_s8(Q7_ABSMAX);
+ q7_t minValue = Q7_ABSMAX;
+ uint16_t idx = blockSize - 1; /* highest valid index; lowered by the predicated vminvq below */
+ uint8x16_t indexVec;
+ uint8x16_t curExtremIdxVec;
+ uint32_t startIdx = 0;
+ mve_pred16_t p0;
+
+
+ indexVec = vidupq_wb_u8(&startIdx, 1);
+ curExtremIdxVec = vdupq_n_u8(0);
+
+ pSrcVec = (q7_t const *) pSrc;
+ blkCnt = blockSize >> 4;
+ while (blkCnt > 0U)
+ {
+ vecSrc = vld1q(pSrcVec);
+ pSrcVec += 16;
+ vecSrc = vabsq(vecSrc);
+ /*
+ * Get current min per lane and current index per lane
+ * when a min is selected
+ */
+ p0 = vcmpleq(vecSrc, curExtremValVec);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+ indexVec = vidupq_wb_u8(&startIdx, 1);
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = blockSize & 0xF;
+ if (blkCnt > 0U)
+ {
+ vecSrc = vld1q(pSrcVec);
+ pSrcVec += 16;
+ vecSrc = vabsq(vecSrc);
+
+ p0 = vctp8q(blkCnt);
+ /*
+ * Get current min per lane and current index per lane
+ * when a min is selected
+ */
+ p0 = vcmpleq_m(vecSrc, curExtremValVec, p0);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+ }
+ /*
+ * Get min value across the vector
+ */
+ minValue = vminvq(minValue, curExtremValVec);
+ /*
+ * Get the lowest index among lanes that hold the minimum value
+ */
+ p0 = vcmpleq(curExtremValVec, minValue);
+ idx = vminvq_p_u8(idx, curExtremIdxVec, p0);
+ /*
+ * Save result
+ */
+ *pIndex = idx;
+ *pResult = minValue;
+}
+
+
+void arm_absmin_q7(
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
+{
+ int32_t totalSize = blockSize;
+
+ if (totalSize <= MAX_BLKSZ_S8)
+ {
+ arm_small_blk_absmin_q7(pSrc, blockSize, pResult, pIndex);
+ }
+ else
+ {
+ uint32_t curIdx = 0;
+ q7_t curBlkExtr = Q7_MAX;
+ uint32_t curBlkPos = 0;
+ uint32_t curBlkIdx = 0;
+ /*
+ * process blocks of MAX_BLKSZ_S8 (256) elts so 8-bit lane indices cannot overflow
+ */
+ while (totalSize >= MAX_BLKSZ_S8)
+ {
+ const q7_t *curSrc = pSrc;
+
+ arm_small_blk_absmin_q7(curSrc, MAX_BLKSZ_S8, pResult, pIndex);
+ if (*pResult < curBlkExtr)
+ {
+ /*
+ * update partial extrema
+ */
+ curBlkExtr = *pResult;
+ curBlkPos = *pIndex;
+ curBlkIdx = curIdx;
+ }
+ curIdx++;
+ pSrc += MAX_BLKSZ_S8;
+ totalSize -= MAX_BLKSZ_S8;
+ }
+ /*
+ * remainder
+ */
+ arm_small_blk_absmin_q7(pSrc, totalSize, pResult, pIndex);
+ if (*pResult < curBlkExtr)
+ {
+ curBlkExtr = *pResult;
+ curBlkPos = *pIndex;
+ curBlkIdx = curIdx;
+ }
+ *pIndex = curBlkIdx * MAX_BLKSZ_S8 + curBlkPos;
+ *pResult = curBlkExtr;
+ }
+}
+
+#else
+#if defined(ARM_MATH_DSP)
+void arm_absmin_q7(
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
+{
+ q7_t cur_absmin, out; /* Temporary variables to store the output value. */\
+ uint32_t blkCnt, outIndex; /* Loop counter */ \
+ uint32_t index; /* index of minimum value */ \
+ \
+ /* Initialize index value to zero. */ \
+ outIndex = 0U; \
+ /* Load first input value that acts as reference value for comparison */ \
+ out = *pSrc++; \
+ out = (out > 0) ? out : (q7_t)__QSUB8(0, out); /* saturating negate: abs(0x80) -> 0x7f */ \
+ /* Initialize index of extrema value. */ \
+ index = 0U; \
+ \
+ /* Loop unrolling: Compute 4 outputs at a time */ \
+ blkCnt = (blockSize - 1U) >> 2U; \
+ \
+ while (blkCnt > 0U) \
+ { \
+ /* Initialize cur_absmin to next consecutive values one by one */ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0) ? cur_absmin : (q7_t)__QSUB8(0, cur_absmin); \
+ /* compare for the extrema value */ \
+ if (cur_absmin < out) \
+ { \
+ /* Update the extrema value and its index */ \
+ out = cur_absmin; \
+ outIndex = index + 1U; \
+ } \
+ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0) ? cur_absmin : (q7_t)__QSUB8(0, cur_absmin); \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = index + 2U; \
+ } \
+ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0) ? cur_absmin : (q7_t)__QSUB8(0, cur_absmin); \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = index + 3U; \
+ } \
+ \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0) ? cur_absmin : (q7_t)__QSUB8(0, cur_absmin); \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = index + 4U; \
+ } \
+ \
+ index += 4U; \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Loop unrolling: Compute remaining outputs */ \
+ blkCnt = (blockSize - 1U) % 4U; \
+ \
+ \
+ while (blkCnt > 0U) \
+ { \
+ cur_absmin = *pSrc++; \
+ cur_absmin = (cur_absmin > 0) ? cur_absmin : (q7_t)__QSUB8(0, cur_absmin); \
+ if (cur_absmin < out) \
+ { \
+ out = cur_absmin; \
+ outIndex = blockSize - blkCnt; \
+ } \
+ \
+ /* Decrement loop counter */ \
+ blkCnt--; \
+ } \
+ \
+ /* Store the extrema value and its index into destination pointers */ \
+ *pResult = out; \
+ *pIndex = outIndex;
+}
+#else
+void arm_absmin_q7(
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
+{
+ q7_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+ /* Load first input value that acts as reference value for comparison */
+ out = (*pSrc > 0) ? *pSrc : ((*pSrc == (q7_t) 0x80) ? (q7_t) 0x7f : -*pSrc); /* abs(0x80) saturates to 0x7f */
+ pSrc++;
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal = (*pSrc > 0) ? *pSrc : ((*pSrc == (q7_t) 0x80) ? (q7_t) 0x7f : -*pSrc);
+ pSrc++;
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and its index */
+ out = minVal;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the minimum value and its index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_DSP) */
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+/**
+ @} end of AbsMin group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f16.c
new file mode 100644
index 0000000000..584cea9694
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f16.c
@@ -0,0 +1,140 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title:        arm_entropy_f16.c
+ * Description:  Entropy
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @defgroup Entropy Entropy
+
+ Computes the entropy of a distribution
+
+ */
+
+/**
+ * @addtogroup Entropy
+ * @{
+ */
+
+
+/**
+ * @brief Entropy
+ *
+ * @param[in] pSrcA Array of input values.
+ * @param[in] blockSize Number of samples in the input array.
+ * @return Entropy -Sum(p ln p)
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+float16_t arm_entropy_f16(const float16_t * pSrcA,uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ _Float16 accum=0.0f16,p;
+
+
+ blkCnt = blockSize;
+
+ f16x8_t vSum = vdupq_n_f16(0.0f);
+ /* Compute 4 outputs at a time */
+ blkCnt = blockSize >> 3U;
+
+ while (blkCnt > 0U)
+ {
+ f16x8_t vecIn = vld1q(pSrcA);
+
+ vSum = vaddq_f16(vSum, vmulq(vecIn, vlogq_f16(vecIn)));
+
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrcA += 8;
+ blkCnt --;
+ }
+
+ accum = vecAddAcrossF16Mve(vSum);
+
+ /* Tail */
+ blkCnt = blockSize & 0x7;
+ while(blkCnt > 0)
+ {
+ p = *pSrcA++;
+ accum += p * logf(p);
+
+ blkCnt--;
+
+ }
+
+ return (-accum);
+}
+
+#else
+
+float16_t arm_entropy_f16(const float16_t * pSrcA,uint32_t blockSize)
+{
+ const float16_t *pIn;
+ uint32_t blkCnt;
+ _Float16 accum, p;
+
+ pIn = pSrcA;
+ blkCnt = blockSize;
+
+ accum = 0.0f;
+
+ while(blkCnt > 0)
+ {
+ p = *pIn++;
+ accum += p * logf(p);
+
+ blkCnt--;
+
+ }
+
+ return(-accum);
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of Entropy group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f32.c
new file mode 100644
index 0000000000..15fde18c5e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f32.c
@@ -0,0 +1,174 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title:        arm_entropy_f32.c
+ * Description:  Entropy
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup Entropy
+ * @{
+ */
+
+
+/**
+ * @brief Entropy
+ *
+ * @param[in] pSrcA Array of input values.
+ * @param[in] blockSize Number of samples in the input array.
+ * @return Entropy -Sum(p ln p)
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+float32_t arm_entropy_f32(const float32_t * pSrcA,uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ float32_t accum=0.0f,p;
+
+
+ blkCnt = blockSize;
+
+ f32x4_t vSum = vdupq_n_f32(0.0f);
+ /* Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ f32x4_t vecIn = vld1q(pSrcA);
+
+ vSum = vaddq_f32(vSum, vmulq(vecIn, vlogq_f32(vecIn)));
+
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrcA += 4;
+ blkCnt --;
+ }
+
+ accum = vecAddAcrossF32Mve(vSum);
+
+ /* Tail */
+ blkCnt = blockSize & 0x3;
+ while(blkCnt > 0)
+ {
+ p = *pSrcA++;
+ accum += p * logf(p);
+
+ blkCnt--;
+
+ }
+
+ return (-accum);
+}
+
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "NEMath.h"
+
+float32_t arm_entropy_f32(const float32_t * pSrcA,uint32_t blockSize)
+{
+ const float32_t *pIn;
+ uint32_t blkCnt;
+ float32_t accum, p;
+
+ float32x4_t accumV;
+ float32x2_t accumV2;
+ float32x4_t tmpV, tmpV2;
+
+ pIn = pSrcA;
+
+ accum = 0.0f;
+ accumV = vdupq_n_f32(0.0f);
+
+ blkCnt = blockSize >> 2;
+ while(blkCnt > 0)
+ {
+ tmpV = vld1q_f32(pIn);
+ pIn += 4;
+
+ tmpV2 = vlogq_f32(tmpV);
+ accumV = vmlaq_f32(accumV, tmpV, tmpV2);
+
+ blkCnt--;
+
+ }
+
+ accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV));
+ accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
+
+
+ blkCnt = blockSize & 3;
+ while(blkCnt > 0)
+ {
+ p = *pIn++;
+ accum += p * logf(p);
+
+ blkCnt--;
+
+ }
+
+ return(-accum);
+}
+
+#else
+float32_t arm_entropy_f32(const float32_t * pSrcA,uint32_t blockSize)
+{
+ const float32_t *pIn;
+ uint32_t blkCnt;
+ float32_t accum, p;
+
+ pIn = pSrcA;
+ blkCnt = blockSize;
+
+ accum = 0.0f;
+
+ while(blkCnt > 0)
+ {
+ p = *pIn++;
+ accum += p * logf(p);
+
+ blkCnt--;
+
+ }
+
+ return(-accum);
+}
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of Entropy group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f64.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f64.c
new file mode 100644
index 0000000000..f3ef913a6e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_entropy_f64.c
@@ -0,0 +1,73 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title:        arm_entropy_f64.c
+ * Description:  Entropy
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @addtogroup Entropy
+ * @{
+ */
+
+/**
+ * @brief Entropy
+ *
+ * @param[in] pSrcA Array of input values.
+ * @param[in] blockSize Number of samples in the input array.
+ * @return Entropy -Sum(p ln p)
+ *
+ */
+
+float64_t arm_entropy_f64(const float64_t * pSrcA, uint32_t blockSize)
+{
+ const float64_t *pIn;
+ uint32_t blkCnt;
+ float64_t accum, p;
+
+ pIn = pSrcA;
+ blkCnt = blockSize;
+
+ accum = 0.0f;
+
+ while(blkCnt > 0)
+ {
+ p = *pIn++;
+
+ accum += p * log(p);
+
+ blkCnt--;
+
+ }
+
+ return(-accum);
+}
+
+/**
+ * @} end of Entropy group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f16.c
new file mode 100644
index 0000000000..6b3c3eb6c7
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f16.c
@@ -0,0 +1,152 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title:        arm_kullback_leibler_f16.c
+ * Description:  Kullback-Leibler divergence
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @defgroup Kullback-Leibler Kullback-Leibler divergence
+
+ Computes the Kullback-Leibler divergence between two distributions
+
+ */
+
+
+/**
+ * @addtogroup Kullback-Leibler
+ * @{
+ */
+
+
+/**
+ * @brief Kullback-Leibler
+ *
+ * Distribution A may contain 0 with Neon version.
+ * Result will be right but some exception flags will be set.
+ *
+ * Distribution B must not contain 0 probability.
+ *
+ * @param[in] *pSrcA points to an array of input values for probability distribution A.
+ * @param[in] *pSrcB points to an array of input values for probability distribution B.
+ * @param[in] blockSize number of samples in the input array.
+ * @return Kullback-Leibler divergence D(A || B)
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+float16_t arm_kullback_leibler_f16(const float16_t * pSrcA,const float16_t * pSrcB,uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ _Float16 accum, pA,pB;
+
+
+ blkCnt = blockSize;
+
+ accum = 0.0f16;
+
+ f16x8_t vSum = vdupq_n_f16(0.0f);
+ blkCnt = blockSize >> 3;
+ while(blkCnt > 0)
+ {
+ f16x8_t vecA = vld1q(pSrcA);
+ f16x8_t vecB = vld1q(pSrcB);
+ f16x8_t vRatio;
+
+ vRatio = vdiv_f16(vecB, vecA);
+ vSum = vaddq_f16(vSum, vmulq(vecA, vlogq_f16(vRatio)));
+
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrcA += 8;
+ pSrcB += 8;
+ blkCnt --;
+ }
+
+ accum = vecAddAcrossF16Mve(vSum);
+
+ blkCnt = blockSize & 7;
+ while(blkCnt > 0)
+ {
+ pA = *pSrcA++;
+ pB = *pSrcB++;
+ accum += pA * logf(pB / pA);
+
+ blkCnt--;
+
+ }
+
+ return(-accum);
+}
+
+#else
+float16_t arm_kullback_leibler_f16(const float16_t * pSrcA,const float16_t * pSrcB,uint32_t blockSize)
+{
+ const float16_t *pInA, *pInB;
+ uint32_t blkCnt;
+ _Float16 accum, pA,pB;
+
+ pInA = pSrcA;
+ pInB = pSrcB;
+ blkCnt = blockSize;
+
+ accum = 0.0f;
+
+ while(blkCnt > 0)
+ {
+ pA = *pInA++;
+ pB = *pInB++;
+ accum += pA * logf(pB / pA);
+
+ blkCnt--;
+
+ }
+
+ return(-accum);
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of Kullback-Leibler group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f32.c
new file mode 100644
index 0000000000..8821d984c5
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f32.c
@@ -0,0 +1,193 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_logsumexp_f32.c
+ * Description: LogSumExp
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup Kullback-Leibler
+ * @{
+ */
+
+
+/**
+ * @brief Kullback-Leibler
+ *
+ * Distribution A may contain 0 with Neon version.
+ * Result will be right but some exception flags will be set.
+ *
+ * Distribution B must not contain 0 probability.
+ *
+ * @param[in] *pSrcA points to an array of input values for probability distribution A.
+ * @param[in] *pSrcB points to an array of input values for probability distribution B.
+ * @param[in] blockSize number of samples in the input array.
+ * @return Kullback-Leibler divergence D(A || B)
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+float32_t arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ float32_t accum, pA,pB;
+
+
+ blkCnt = blockSize;
+
+ accum = 0.0f;
+
+ f32x4_t vSum = vdupq_n_f32(0.0f);
+ blkCnt = blockSize >> 2;
+ while(blkCnt > 0)
+ {
+ f32x4_t vecA = vld1q(pSrcA);
+ f32x4_t vecB = vld1q(pSrcB);
+ f32x4_t vRatio;
+
+ vRatio = vdiv_f32(vecB, vecA);
+ vSum = vaddq_f32(vSum, vmulq(vecA, vlogq_f32(vRatio)));
+
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrcA += 4;
+ pSrcB += 4;
+ blkCnt --;
+ }
+
+ accum = vecAddAcrossF32Mve(vSum);
+
+ blkCnt = blockSize & 3;
+ while(blkCnt > 0)
+ {
+ pA = *pSrcA++;
+ pB = *pSrcB++;
+ accum += pA * logf(pB / pA);
+
+ blkCnt--;
+
+ }
+
+ return(-accum);
+}
+
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "NEMath.h"
+
+float32_t arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)
+{
+ const float32_t *pInA, *pInB;
+ uint32_t blkCnt;
+ float32_t accum, pA,pB;
+
+ float32x4_t accumV;
+ float32x2_t accumV2;
+ float32x4_t tmpVA, tmpVB,tmpV;
+
+ pInA = pSrcA;
+ pInB = pSrcB;
+
+ accum = 0.0f;
+ accumV = vdupq_n_f32(0.0f);
+
+ blkCnt = blockSize >> 2;
+ while(blkCnt > 0)
+ {
+ tmpVA = vld1q_f32(pInA);
+ pInA += 4;
+
+ tmpVB = vld1q_f32(pInB);
+ pInB += 4;
+
+ tmpV = vinvq_f32(tmpVA);
+ tmpVB = vmulq_f32(tmpVB, tmpV);
+ tmpVB = vlogq_f32(tmpVB);
+
+ accumV = vmlaq_f32(accumV, tmpVA, tmpVB);
+
+ blkCnt--;
+
+ }
+
+ accumV2 = vpadd_f32(vget_low_f32(accumV),vget_high_f32(accumV));
+ accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
+
+ blkCnt = blockSize & 3;
+ while(blkCnt > 0)
+ {
+ pA = *pInA++;
+ pB = *pInB++;
+ accum += pA * logf(pB/pA);
+
+ blkCnt--;
+
+ }
+
+ return(-accum);
+}
+
+#else
+float32_t arm_kullback_leibler_f32(const float32_t * pSrcA,const float32_t * pSrcB,uint32_t blockSize)
+{
+ const float32_t *pInA, *pInB;
+ uint32_t blkCnt;
+ float32_t accum, pA,pB;
+
+ pInA = pSrcA;
+ pInB = pSrcB;
+ blkCnt = blockSize;
+
+ accum = 0.0f;
+
+ while(blkCnt > 0)
+ {
+ pA = *pInA++;
+ pB = *pInB++;
+ accum += pA * logf(pB / pA);
+
+ blkCnt--;
+
+ }
+
+ return(-accum);
+}
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of Kullback-Leibler group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f64.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f64.c
new file mode 100644
index 0000000000..7fdc5ee28d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_kullback_leibler_f64.c
@@ -0,0 +1,75 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_logsumexp_f64.c
+ * Description: LogSumExp
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+#include <limits.h>
+#include <math.h>
+
+/**
+ * @addtogroup Kullback-Leibler
+ * @{
+ */
+
+/**
+ * @brief Kullback-Leibler
+ *
+ * @param[in] *pSrcA points to an array of input values for probability distribution A.
+ * @param[in] *pSrcB points to an array of input values for probability distribution B.
+ * @param[in] blockSize number of samples in the input array.
+ * @return Kullback-Leibler divergence D(A || B)
+ *
+ */
+
+float64_t arm_kullback_leibler_f64(const float64_t * pSrcA, const float64_t * pSrcB, uint32_t blockSize)
+{
+ const float64_t *pInA, *pInB;
+ uint32_t blkCnt;
+ float64_t accum, pA,pB;
+
+ pInA = pSrcA;
+ pInB = pSrcB;
+ blkCnt = blockSize;
+
+ accum = 0.0f;
+
+ while(blkCnt > 0)
+ {
+ pA = *pInA++;
+ pB = *pInB++;
+
+ accum += pA * log(pB / pA);
+
+ blkCnt--;
+ }
+
+ return(-accum);
+}
+
+/**
+ * @} end of Kullback-Leibler group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_dot_prod_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_dot_prod_f16.c
new file mode 100644
index 0000000000..42dcdab9fb
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_dot_prod_f16.c
@@ -0,0 +1,84 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_logsumexp_f16.c
+ * Description: LogSumExp
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @defgroup LogSumExp LogSumExp
+
+ LogSumExp optimizations to compute sum of probabilities with Gaussian distributions
+
+ */
+
+/**
+ * @addtogroup LogSumExp
+ * @{
+ */
+
+
+/**
+ * @brief Dot product with log arithmetic
+ *
+ * Vectors are containing the log of the samples
+ *
+ * @param[in] *pSrcA points to the first input vector
+ * @param[in] *pSrcB points to the second input vector
+ * @param[in] blockSize number of samples in each vector
+ * @param[in] *pTmpBuffer temporary buffer of length blockSize
+ * @return The log of the dot product.
+ *
+ */
+
+
+float16_t arm_logsumexp_dot_prod_f16(const float16_t * pSrcA,
+ const float16_t * pSrcB,
+ uint32_t blockSize,
+ float16_t *pTmpBuffer)
+{
+ float16_t result;
+ arm_add_f16((float16_t*)pSrcA, (float16_t*)pSrcB, pTmpBuffer, blockSize);
+
+ result = arm_logsumexp_f16(pTmpBuffer, blockSize);
+ return(result);
+}
+
+/**
+ * @} end of LogSumExp group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_dot_prod_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_dot_prod_f32.c
new file mode 100644
index 0000000000..d2a94dacd1
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_dot_prod_f32.c
@@ -0,0 +1,68 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_logsumexp_f32.c
+ * Description: LogSumExp
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup LogSumExp
+ * @{
+ */
+
+
+/**
+ * @brief Dot product with log arithmetic
+ *
+ * Vectors are containing the log of the samples
+ *
+ * @param[in] *pSrcA points to the first input vector
+ * @param[in] *pSrcB points to the second input vector
+ * @param[in] blockSize number of samples in each vector
+ * @param[in] *pTmpBuffer temporary buffer of length blockSize
+ * @return The log of the dot product.
+ *
+ */
+
+
+float32_t arm_logsumexp_dot_prod_f32(const float32_t * pSrcA,
+ const float32_t * pSrcB,
+ uint32_t blockSize,
+ float32_t *pTmpBuffer)
+{
+ float32_t result;
+ arm_add_f32((float32_t*)pSrcA, (float32_t*)pSrcB, pTmpBuffer, blockSize);
+
+ result = arm_logsumexp_f32(pTmpBuffer, blockSize);
+ return(result);
+}
+
+/**
+ * @} end of LogSumExp group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_f16.c
new file mode 100644
index 0000000000..5fdbb12c63
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_f16.c
@@ -0,0 +1,172 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_logsumexp_f16.c
+ * Description: LogSumExp
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup LogSumExp
+ * @{
+ */
+
+
+/**
+ * @brief Computation of the LogSumExp
+ *
+ * In probabilistic computations, the dynamic of the probability values can be very
+ * wide because they come from gaussian functions.
+ * To avoid underflow and overflow issues, the values are represented by their log.
+ * In this representation, multiplying the original exp values is easy : their logs are added.
+ * But adding the original exp values is requiring some special handling and it is the
+ * goal of the LogSumExp function.
+ *
+ * If the values are x1...xn, the function is computing:
+ *
+ * ln(exp(x1) + ... + exp(xn)) and the computation is done in such a way that
+ * rounding issues are minimised.
+ *
+ * The max xm of the values is extracted and the function is computing:
+ * xm + ln(exp(x1 - xm) + ... + exp(xn - xm))
+ *
+ * @param[in] *in Pointer to an array of input values.
+ * @param[in] blockSize Number of samples in the input array.
+ * @return LogSumExp
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math_f16.h"
+
+float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
+{
+ float16_t maxVal;
+ const float16_t *pIn;
+ int32_t blkCnt;
+ _Float16 accum=0.0f16;
+ _Float16 tmp;
+
+
+ arm_max_no_idx_f16((float16_t *) in, blockSize, &maxVal);
+
+
+ blkCnt = blockSize;
+ pIn = in;
+
+
+ f16x8_t vSum = vdupq_n_f16(0.0f16);
+ blkCnt = blockSize >> 3;
+ while(blkCnt > 0)
+ {
+ f16x8_t vecIn = vld1q(pIn);
+ f16x8_t vecExp;
+
+ vecExp = vexpq_f16(vsubq_n_f16(vecIn, maxVal));
+
+ vSum = vaddq_f16(vSum, vecExp);
+
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pIn += 8;
+ blkCnt --;
+ }
+
+ /* sum + log */
+ accum = vecAddAcrossF16Mve(vSum);
+
+ blkCnt = blockSize & 0x7;
+ while(blkCnt > 0)
+ {
+ tmp = *pIn++;
+ accum += expf(tmp - maxVal);
+ blkCnt--;
+
+ }
+
+ accum = maxVal + logf(accum);
+
+ return (accum);
+}
+
+#else
+float16_t arm_logsumexp_f16(const float16_t *in, uint32_t blockSize)
+{
+ _Float16 maxVal;
+ _Float16 tmp;
+ const float16_t *pIn;
+ uint32_t blkCnt;
+ _Float16 accum;
+
+ pIn = in;
+ blkCnt = blockSize;
+
+ maxVal = *pIn++;
+ blkCnt--;
+
+ while(blkCnt > 0)
+ {
+ tmp = *pIn++;
+
+ if (tmp > maxVal)
+ {
+ maxVal = tmp;
+ }
+ blkCnt--;
+
+ }
+
+ blkCnt = blockSize;
+ pIn = in;
+ accum = 0;
+ while(blkCnt > 0)
+ {
+ tmp = *pIn++;
+ accum += expf(tmp - maxVal);
+ blkCnt--;
+
+ }
+ accum = maxVal + logf(accum);
+
+ return(accum);
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of LogSumExp group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_f32.c
new file mode 100644
index 0000000000..e770fa79f1
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_logsumexp_f32.c
@@ -0,0 +1,277 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_logsumexp_f32.c
+ * Description: LogSumExp
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ * @addtogroup LogSumExp
+ * @{
+ */
+
+
+/**
+ * @brief Computation of the LogSumExp
+ *
+ * In probabilistic computations, the dynamic of the probability values can be very
+ * wide because they come from gaussian functions.
+ * To avoid underflow and overflow issues, the values are represented by their log.
+ * In this representation, multiplying the original exp values is easy : their logs are added.
+ * But adding the original exp values is requiring some special handling and it is the
+ * goal of the LogSumExp function.
+ *
+ * If the values are x1...xn, the function is computing:
+ *
+ * ln(exp(x1) + ... + exp(xn)) and the computation is done in such a way that
+ * rounding issues are minimised.
+ *
+ * The max xm of the values is extracted and the function is computing:
+ * xm + ln(exp(x1 - xm) + ... + exp(xn - xm))
+ *
+ * @param[in] *in Pointer to an array of input values.
+ * @param[in] blockSize Number of samples in the input array.
+ * @return LogSumExp
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_math.h"
+
+/*
+ * Helium/MVE float32 LogSumExp: computes maxVal + log(sum(exp(in[i] - maxVal))).
+ * Subtracting the maximum first keeps every exponent <= 0, avoiding overflow.
+ */
+float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize)
+{
+    float32_t maxVal;
+    const float32_t *pIn;
+    int32_t blkCnt;
+    float32_t accum = 0.0f;
+    float32_t tmp;
+
+    /* First pass: maximum of the whole vector (no index needed). */
+    arm_max_no_idx_f32((float32_t *) in, blockSize, &maxVal);
+
+    blkCnt = blockSize;
+    pIn = in;
+
+    /* Vector pass: accumulate exp(x - max), 4 float32 lanes per step. */
+    f32x4_t vSum = vdupq_n_f32(0.0f);
+    blkCnt = blockSize >> 2;
+    while (blkCnt > 0)
+    {
+        f32x4_t vecIn = vld1q(pIn);
+        f32x4_t vecExp;
+
+        vecExp = vexpq_f32(vsubq_n_f32(vecIn, maxVal));
+
+        vSum = vaddq_f32(vSum, vecExp);
+
+        /*
+         * Decrement the blockSize loop counter
+         * Advance vector source and destination pointers
+         */
+        pIn += 4;
+        blkCnt--;
+    }
+
+    /* sum + log: horizontal add of the 4 lane partial sums. */
+    accum = vecAddAcrossF32Mve(vSum);
+
+    /* Scalar tail: remaining blockSize % 4 samples. */
+    blkCnt = blockSize & 0x3;
+    while (blkCnt > 0)
+    {
+        tmp = *pIn++;
+        accum += expf(tmp - maxVal);
+        blkCnt--;
+    }
+
+    /* Use single-precision logf: the previous double-precision log() forced a
+       float->double->float round trip and was inconsistent with the scalar and
+       Neon variants of this function, which both use logf. */
+    accum = maxVal + logf(accum);
+
+    return (accum);
+}
+
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "NEMath.h"
+/*
+ * Neon float32 LogSumExp: computes maxVal + log(sum(exp(in[i] - maxVal))).
+ * Stage 1 finds the maximum (vectorized when blockSize > 3); stage 2
+ * accumulates the shifted exponentials, then takes the log.
+ */
+float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize)
+{
+    float32_t maxVal;
+    float32_t tmp;
+    float32x4_t tmpV, tmpVb;
+    float32x4_t maxValV;
+    uint32x4_t idxV;
+    float32x4_t accumV;
+    float32x2_t accumV2;
+
+    const float32_t *pIn;
+    uint32_t blkCnt;
+    float32_t accum;
+
+    pIn = in;
+
+    blkCnt = blockSize;
+
+    if (blockSize <= 3)
+    {
+        /* Too few samples for a full 4-lane vector: scalar max scan. */
+        maxVal = *pIn++;
+        blkCnt--;
+
+        while (blkCnt > 0)
+        {
+            tmp = *pIn++;
+
+            if (tmp > maxVal)
+            {
+                maxVal = tmp;
+            }
+            blkCnt--;
+        }
+    }
+    else
+    {
+        /* Seed the per-lane maxima with the first 4 samples. */
+        maxValV = vld1q_f32(pIn);
+        pIn += 4;
+        blkCnt = (blockSize - 4) >> 2;
+
+        while (blkCnt > 0)
+        {
+            tmpVb = vld1q_f32(pIn);
+            pIn += 4;
+
+            /* Keep, per lane, whichever of (new, current) is greater. */
+            idxV = vcgtq_f32(tmpVb, maxValV);
+            maxValV = vbslq_f32(idxV, tmpVb, maxValV);
+
+            blkCnt--;
+        }
+
+        /* Horizontal pairwise-max reduction of the 4 lanes to one scalar. */
+        accumV2 = vpmax_f32(vget_low_f32(maxValV), vget_high_f32(maxValV));
+        accumV2 = vpmax_f32(accumV2, accumV2);
+        maxVal = vget_lane_f32(accumV2, 0);
+
+        /* Scalar tail of the max search: (blockSize - 4) % 4 samples. */
+        blkCnt = (blockSize - 4) & 3;
+
+        while (blkCnt > 0)
+        {
+            tmp = *pIn++;
+
+            if (tmp > maxVal)
+            {
+                maxVal = tmp;
+            }
+            blkCnt--;
+        }
+
+    }
+
+    /* Stage 2: accumulate exp(x - max) over the whole input again. */
+    maxValV = vdupq_n_f32(maxVal);
+    pIn = in;
+    accum = 0;
+    accumV = vdupq_n_f32(0.0f);
+
+    blkCnt = blockSize >> 2;
+
+    while (blkCnt > 0)
+    {
+        tmpV = vld1q_f32(pIn);
+        pIn += 4;
+        tmpV = vsubq_f32(tmpV, maxValV);
+        tmpV = vexpq_f32(tmpV);   /* vectorized exp from NEMath */
+        accumV = vaddq_f32(accumV, tmpV);
+
+        blkCnt--;
+
+    }
+    /* Horizontal add of the 4 partial sums. */
+    accumV2 = vpadd_f32(vget_low_f32(accumV), vget_high_f32(accumV));
+    accum = vget_lane_f32(accumV2, 0) + vget_lane_f32(accumV2, 1);
+
+    /* Scalar tail: remaining blockSize % 4 samples. */
+    blkCnt = blockSize & 0x3;
+    while (blkCnt > 0)
+    {
+        tmp = *pIn++;
+        accum += expf(tmp - maxVal);
+        blkCnt--;
+
+    }
+
+    /* Undo the shift in log space. */
+    accum = maxVal + logf(accum);
+
+    return (accum);
+}
+#else
+float32_t arm_logsumexp_f32(const float32_t *in, uint32_t blockSize)
+{
+    /* Scalar LogSumExp: returns max + log(sum(exp(x[i] - max))).
+       Centering on the maximum keeps the exponentials in range. */
+    const float32_t *pSrc = in;
+    float32_t maxVal = *pSrc++;
+    float32_t sum;
+    float32_t val;
+    uint32_t i;
+
+    /* Pass 1: locate the maximum element. */
+    for (i = 1U; i < blockSize; i++)
+    {
+        val = *pSrc++;
+        if (val > maxVal)
+        {
+            maxVal = val;
+        }
+    }
+
+    /* Pass 2: accumulate the shifted exponentials. */
+    pSrc = in;
+    sum = 0;
+    for (i = 0U; i < blockSize; i++)
+    {
+        val = *pSrc++;
+        sum += expf(val - maxVal);
+    }
+
+    /* Undo the shift in log space. */
+    sum = maxVal + logf(sum);
+
+    return (sum);
+}
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of LogSumExp group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_f16.c
new file mode 100644
index 0000000000..4522392893
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_f16.c
@@ -0,0 +1,246 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_max_f16.c
+ * Description: Maximum value of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+/**
+ @ingroup groupStats
+ */
+
+
+/**
+ @addtogroup Max
+ @{
+ */
+
+/**
+ @brief Maximum value of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/*
+ * Helium/MVE float16 max-with-index. Tracks a per-lane running maximum plus
+ * the per-lane index at which it was last updated, then reduces across lanes.
+ * When the maximum occurs several times, the smallest index is returned.
+ */
+void arm_max_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult,
+        uint32_t * pIndex)
+{
+    int32_t blkCnt;
+    f16x8_t vecSrc;
+    f16x8_t curExtremValVec = vdupq_n_f16(F16_MIN);
+    float16_t maxValue = F16_MIN;
+    uint32_t idx = blockSize;
+    uint16x8_t indexVec;
+    uint16x8_t curExtremIdxVec;
+    uint32_t curIdx = 0;
+    mve_pred16_t p0;
+    float16_t tmp;
+
+    /* indexVec starts as {0..7}; vidupq_wb advances curIdx by 8 each call. */
+    indexVec = vidupq_wb_u16(&curIdx, 1);
+    curExtremIdxVec = vdupq_n_u16(0);
+
+    /* Compute 8 outputs at a time (8 half-precision lanes per vector) */
+    blkCnt = blockSize >> 3;
+    while (blkCnt > 0)
+    {
+        vecSrc = vldrhq_f16(pSrc);
+        /*
+         * Get current max per lane and current index per lane
+         * when a max is selected
+         */
+        p0 = vcmpgeq(vecSrc, curExtremValVec);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+        indexVec = vidupq_wb_u16(&curIdx, 1);
+
+        pSrc += 8;
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+
+    /*
+     * Get max value across the vector
+     */
+    maxValue = vmaxnmvq(maxValue, curExtremValVec);
+    /*
+     * set index for lower values to max possible index
+     */
+    p0 = vcmpgeq(curExtremValVec, maxValue);
+    indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
+    /*
+     * Get min index which is thus for a max value
+     */
+    idx = vminvq(idx, indexVec);
+
+    /* Tail: remaining blockSize % 8 samples handled as scalars. */
+    blkCnt = blockSize & 7;
+
+    while (blkCnt > 0)
+    {
+        /* Initialize tmp to the next consecutive values one by one */
+        tmp = *pSrc++;
+
+        /* compare for the maximum value */
+        if (maxValue < tmp)
+        {
+            /* Update the maximum value and it's index */
+            maxValue = tmp;
+            idx = blockSize - blkCnt;
+        }
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+    /*
+     * Save result
+     */
+    *pIndex = idx;
+    *pResult = maxValue;
+}
+
+#else
+/*
+ * Scalar float16 max-with-index. The first sample seeds the running maximum;
+ * with ARM_MATH_LOOPUNROLL the remaining samples are scanned 4 at a time.
+ * Strict '<' comparison means the FIRST occurrence of the maximum wins.
+ */
+void arm_max_f16(
+  const float16_t * pSrc,
+        uint32_t blockSize,
+        float16_t * pResult,
+        uint32_t * pIndex)
+{
+    float16_t maxVal, out;        /* Temporary variables to store the output value. */
+    uint32_t blkCnt, outIndex;    /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint32_t index;               /* index of maximum value */
+#endif
+
+    /* Initialise index value to zero. */
+    outIndex = 0U;
+
+    /* Load first input value that act as reference value for comparision */
+    out = *pSrc++;
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+    /* Initialise index of maximum value. */
+    index = 0U;
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+    blkCnt = (blockSize - 1U) >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* Initialize maxVal to next consecutive values one by one */
+        maxVal = *pSrc++;
+
+        /* compare for the maximum value */
+        if (out < maxVal)
+        {
+            /* Update the maximum value and it's index */
+            out = maxVal;
+            outIndex = index + 1U;
+        }
+
+        maxVal = *pSrc++;
+        if (out < maxVal)
+        {
+            out = maxVal;
+            outIndex = index + 2U;
+        }
+
+        maxVal = *pSrc++;
+        if (out < maxVal)
+        {
+            out = maxVal;
+            outIndex = index + 3U;
+        }
+
+        maxVal = *pSrc++;
+        if (out < maxVal)
+        {
+            out = maxVal;
+            outIndex = index + 4U;
+        }
+
+        /* index tracks the position of the sample BEFORE this group of 4,
+           hence the +1U..+4U offsets above. */
+        index += 4U;
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining outputs */
+    blkCnt = (blockSize - 1U) % 4U;
+
+#else
+
+    /* Initialize blkCnt with number of samples */
+    blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (blkCnt > 0U)
+    {
+        /* Initialize maxVal to the next consecutive values one by one */
+        maxVal = *pSrc++;
+
+        /* compare for the maximum value */
+        if (out < maxVal)
+        {
+            /* Update the maximum value and it's index */
+            out = maxVal;
+            outIndex = blockSize - blkCnt;
+        }
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+    /* Store the maximum value and it's index into destination pointers */
+    *pResult = out;
+    *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of Max group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_f32.c
new file mode 100644
index 0000000000..9f9c94d559
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_f32.c
@@ -0,0 +1,365 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_max_f32.c
+ * Description: Maximum value of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @defgroup Max Maximum
+
+ Computes the maximum value of an array of data.
+ The function returns both the maximum value and its position within the array.
+ There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup Max
+ @{
+ */
+
+/**
+ @brief Maximum value of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+/*
+ * Helium/MVE float32 max-with-index. Tracks a per-lane running maximum plus
+ * the per-lane index at which it was last updated, then reduces across lanes.
+ * When the maximum occurs several times, the smallest index is returned.
+ */
+void arm_max_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex)
+{
+    uint32_t blkCnt;
+    f32x4_t vecSrc;
+    f32x4_t curExtremValVec = vdupq_n_f32(F32_MIN);
+    float32_t maxValue = F32_MIN;
+    uint32_t idx = blockSize;
+    uint32x4_t indexVec;
+    uint32x4_t curExtremIdxVec;
+    uint32_t curIdx = 0;
+    mve_pred16_t p0;
+    float32_t tmp;
+
+    /* indexVec starts as {0,1,2,3}; vidupq_wb advances curIdx by 4 each call. */
+    indexVec = vidupq_wb_u32(&curIdx, 1);
+    curExtremIdxVec = vdupq_n_u32(0);
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
+        vecSrc = vldrwq_f32(pSrc);
+        /*
+         * Get current max per lane and current index per lane
+         * when a max is selected
+         */
+        p0 = vcmpgeq(vecSrc, curExtremValVec);
+        curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+        curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+        indexVec = vidupq_wb_u32(&curIdx, 1);
+
+        pSrc += 4;
+        /* Decrement the loop counter */
+        blkCnt--;
+    }
+
+
+    /*
+     * Get max value across the vector
+     */
+    maxValue = vmaxnmvq(maxValue, curExtremValVec);
+    /*
+     * set index for lower values to max possible index
+     */
+    p0 = vcmpgeq(curExtremValVec, maxValue);
+    indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
+    /*
+     * Get min index which is thus for a max value
+     */
+    idx = vminvq(idx, indexVec);
+
+    /* Tail: remaining blockSize % 4 samples handled as scalars. */
+    blkCnt = blockSize & 0x3;
+
+    while (blkCnt > 0U)
+    {
+        /* Initialize tmp to the next consecutive values one by one */
+        tmp = *pSrc++;
+
+        /* compare for the maximum value */
+        if (maxValue < tmp)
+        {
+            /* Update the maximum value and it's index */
+            maxValue = tmp;
+            idx = blockSize - blkCnt;
+        }
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+    /*
+     * Save result
+     */
+    *pIndex = idx;
+    *pResult = maxValue;
+}
+
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+/*
+ * Neon float32 max-with-index. For blockSize > 3 the search runs on 4 lanes
+ * in parallel: outV holds per-lane maxima and countV the index at which each
+ * lane's maximum was found; a pairwise reduction then extracts the global
+ * maximum, and the smallest matching index is selected via a min reduction.
+ */
+void arm_max_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex)
+{
+    float32_t maxVal1, out;       /* Temporary variables to store the output value. */
+    uint32_t blkCnt, outIndex;    /* loop counter */
+
+    float32x4_t outV, srcV;
+    float32x2_t outV2;
+
+    uint32x4_t idxV;
+    uint32x4_t maxIdx;
+    static const uint32_t indexInit[4] = {4, 5, 6, 7};
+    static const uint32_t countVInit[4] = {0, 1, 2, 3};
+
+    uint32x4_t index;
+    uint32x4_t delta;
+    uint32x4_t countV;
+    uint32x2_t countV2;
+
+    /* NOTE(review): ULONG_MAX is truncated into 32-bit lanes; it is used only
+       as an "invalid index" sentinel (0xFFFFFFFF), which works, but UINT32_MAX
+       would state the intent more portably — confirm before changing. */
+    maxIdx = vdupq_n_u32(ULONG_MAX);
+    delta = vdupq_n_u32(4);
+    index = vld1q_u32(indexInit);
+    countV = vld1q_u32(countVInit);
+
+
+    /* Initialise the index value to zero. */
+    outIndex = 0U;
+
+    /* Load first input value that act as reference value for comparison */
+    if (blockSize <= 3)
+    {
+        /* Too few samples for a full vector: plain scalar scan. */
+        out = *pSrc++;
+
+        blkCnt = blockSize - 1;
+
+        while (blkCnt > 0U)
+        {
+            /* Initialize maxVal to the next consecutive values one by one */
+            maxVal1 = *pSrc++;
+
+            /* compare for the maximum value */
+            if (out < maxVal1)
+            {
+                /* Update the maximum value and it's index */
+                out = maxVal1;
+                outIndex = blockSize - blkCnt;
+            }
+
+            /* Decrement the loop counter */
+            blkCnt--;
+        }
+    }
+    else
+    {
+        /* Seed the per-lane maxima with the first 4 samples (indices 0..3). */
+        outV = vld1q_f32(pSrc);
+        pSrc += 4;
+
+        /* Compute 4 outputs at a time */
+        blkCnt = (blockSize - 4) >> 2U;
+
+        while (blkCnt > 0U)
+        {
+            srcV = vld1q_f32(pSrc);
+            pSrc += 4;
+
+            /* Per lane: keep the larger value and remember its index. */
+            idxV = vcgtq_f32(srcV, outV);
+            outV = vbslq_f32(idxV, srcV, outV);
+            countV = vbslq_u32(idxV, index, countV);
+
+            index = vaddq_u32(index, delta);
+
+            /* Decrement the loop counter */
+            blkCnt--;
+        }
+
+        /* Pairwise-max reduction of the 4 lane maxima to one scalar. */
+        outV2 = vpmax_f32(vget_low_f32(outV), vget_high_f32(outV));
+        outV2 = vpmax_f32(outV2, outV2);
+        out = vget_lane_f32(outV2, 0);
+
+        /* Keep indexes only for lanes equal to the global max; others get the
+           sentinel, then a pairwise-min reduction picks the smallest index. */
+        idxV = vceqq_f32(outV, vdupq_n_f32(out));
+        countV = vbslq_u32(idxV, countV, maxIdx);
+
+        countV2 = vpmin_u32(vget_low_u32(countV), vget_high_u32(countV));
+        countV2 = vpmin_u32(countV2, countV2);
+        outIndex = vget_lane_u32(countV2, 0);
+
+        /* if (blockSize - 1U) is not multiple of 4 */
+        blkCnt = (blockSize - 4) % 4U;
+
+        while (blkCnt > 0U)
+        {
+            /* Initialize maxVal to the next consecutive values one by one */
+            maxVal1 = *pSrc++;
+
+            /* compare for the maximum value */
+            if (out < maxVal1)
+            {
+                /* Update the maximum value and it's index */
+                out = maxVal1;
+                outIndex = blockSize - blkCnt;
+            }
+
+            /* Decrement the loop counter */
+            blkCnt--;
+        }
+
+
+    }
+
+    /* Store the maximum value and it's index into destination pointers */
+    *pResult = out;
+    *pIndex = outIndex;
+}
+#else
+void arm_max_f32(
+  const float32_t * pSrc,
+        uint32_t blockSize,
+        float32_t * pResult,
+        uint32_t * pIndex)
+{
+    /* Linear search for the largest element and the position of its first
+       occurrence (strict '<' keeps the earliest index on ties). */
+    float32_t cand, best;        /* candidate sample and running maximum */
+    uint32_t cnt, bestIdx;       /* loop counter and index of the maximum */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+    uint32_t base;               /* index of the sample preceding the current group of 4 */
+#endif
+
+    bestIdx = 0U;
+
+    /* The first sample is the initial reference. */
+    best = *pSrc++;
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+    base = 0U;
+
+    /* Scan the remaining samples in groups of 4. */
+    cnt = (blockSize - 1U) >> 2U;
+
+    while (cnt > 0U)
+    {
+        cand = *pSrc++;
+        if (best < cand)
+        {
+            best = cand;
+            bestIdx = base + 1U;
+        }
+
+        cand = *pSrc++;
+        if (best < cand)
+        {
+            best = cand;
+            bestIdx = base + 2U;
+        }
+
+        cand = *pSrc++;
+        if (best < cand)
+        {
+            best = cand;
+            bestIdx = base + 3U;
+        }
+
+        cand = *pSrc++;
+        if (best < cand)
+        {
+            best = cand;
+            bestIdx = base + 4U;
+        }
+
+        base += 4U;
+        cnt--;
+    }
+
+    /* Leftover samples after the unrolled groups. */
+    cnt = (blockSize - 1U) % 4U;
+
+#else
+
+    /* All samples after the first one. */
+    cnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (cnt > 0U)
+    {
+        cand = *pSrc++;
+
+        if (best < cand)
+        {
+            best = cand;
+            bestIdx = blockSize - cnt;
+        }
+
+        cnt--;
+    }
+
+    /* Publish the maximum value and its index. */
+    *pResult = best;
+    *pIndex = bestIdx;
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of Max group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_no_idx_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_no_idx_f16.c
new file mode 100644
index 0000000000..6faddbd7aa
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_no_idx_f16.c
@@ -0,0 +1,144 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_max_no_idx_f16.c
+ * Description: Maximum value of a floating-point vector without returning the index
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+/**
+ @ingroup groupStats
+ */
+
+
+/**
+ @addtogroup Max
+ @{
+ */
+
+/**
+ @brief Maximum value of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @return none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/*
+ * Helium/MVE float16 maximum without index tracking: per-lane running max
+ * over 8-lane vectors, a cross-lane vmaxnmvq reduction, then a scalar tail.
+ */
+void arm_max_no_idx_f16(
+  const float16_t *pSrc,
+  uint32_t blockSize,
+  float16_t *pResult)
+{
+    f16x8_t vecSrc;
+    f16x8_t curExtremValVec = vdupq_n_f16(F16_MIN);
+    float16_t maxValue = F16_MIN;
+    float16_t newVal;
+    uint32_t blkCnt;
+
+    /* Loop unrolling: Compute 8 outputs at a time (8 f16 lanes per vector) */
+    blkCnt = blockSize >> 3U;
+
+    while (blkCnt > 0U)
+    {
+
+        vecSrc = vldrhq_f16(pSrc);
+        /*
+         * update per-lane max.
+         */
+        curExtremValVec = vmaxnmq(vecSrc, curExtremValVec);
+        /*
+         * Decrement the blockSize loop counter
+         * Advance vector source and destination pointers
+         */
+        pSrc += 8;
+        blkCnt--;
+    }
+    /*
+     * Get max value across the vector
+     */
+    maxValue = vmaxnmvq(maxValue, curExtremValVec);
+
+    /* Scalar tail: remaining blockSize % 8 samples. */
+    blkCnt = blockSize & 7;
+
+    while (blkCnt > 0U)
+    {
+        newVal = *pSrc++;
+
+        /* compare for the maximum value */
+        if (maxValue < newVal)
+        {
+            /* Update the maximum value and it's index */
+            maxValue = newVal;
+        }
+
+        blkCnt--;
+    }
+
+    *pResult = maxValue;
+}
+
+#else
+
+void arm_max_no_idx_f16(
+  const float16_t *pSrc,
+  uint32_t blockSize,
+  float16_t *pResult)
+{
+    /* Plain linear scan for the maximum; the index is not tracked. */
+    float16_t best = F16_MIN;
+    float16_t cand;
+    uint32_t cnt;
+
+    for (cnt = blockSize; cnt > 0U; cnt--)
+    {
+        cand = *pSrc++;
+
+        if (best < cand)
+        {
+            best = cand;
+        }
+    }
+
+    *pResult = best;
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of Max group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_no_idx_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_no_idx_f32.c
new file mode 100644
index 0000000000..e4aef9653b
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_no_idx_f32.c
@@ -0,0 +1,138 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_max_no_idx_f32.c
+ * Description: Maximum value of a floating-point vector without returning the index
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+/**
+ @ingroup groupStats
+ */
+
+
+/**
+ @addtogroup Max
+ @{
+ */
+
+/**
+ @brief Maximum value of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+/*
+ * Helium/MVE float32 maximum without index tracking: per-lane running max
+ * over 4-lane vectors, a cross-lane vmaxnmvq reduction, then a scalar tail.
+ */
+void arm_max_no_idx_f32(
+  const float32_t *pSrc,
+  uint32_t blockSize,
+  float32_t *pResult)
+{
+    f32x4_t vecSrc;
+    f32x4_t curExtremValVec = vdupq_n_f32(F32_MIN);
+    float32_t maxValue = F32_MIN;
+    float32_t newVal;
+    uint32_t blkCnt;
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+
+    while (blkCnt > 0U)
+    {
+
+        vecSrc = vldrwq_f32(pSrc);
+        /*
+         * update per-lane max.
+         */
+        curExtremValVec = vmaxnmq(vecSrc, curExtremValVec);
+        /*
+         * Decrement the blockSize loop counter
+         * Advance vector source and destination pointers
+         */
+        pSrc += 4;
+        blkCnt--;
+    }
+    /*
+     * Get max value across the vector
+     */
+    maxValue = vmaxnmvq(maxValue, curExtremValVec);
+
+    /* Scalar tail: remaining blockSize % 4 samples. */
+    blkCnt = blockSize & 3;
+
+    while (blkCnt > 0U)
+    {
+        newVal = *pSrc++;
+
+        /* compare for the maximum value */
+        if (maxValue < newVal)
+        {
+            /* Update the maximum value and it's index */
+            maxValue = newVal;
+        }
+
+        blkCnt--;
+    }
+
+    *pResult = maxValue;
+}
+
+#else
+
+void arm_max_no_idx_f32(
+  const float32_t *pSrc,
+  uint32_t blockSize,
+  float32_t *pResult)
+{
+    /* Plain linear scan for the maximum; the index is not tracked. */
+    float32_t best = F32_MIN;
+    float32_t cand;
+    uint32_t cnt;
+
+    for (cnt = blockSize; cnt > 0U; cnt--)
+    {
+        cand = *pSrc++;
+
+        if (best < cand)
+        {
+            best = cand;
+        }
+    }
+
+    *pResult = best;
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of Max group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q15.c
new file mode 100644
index 0000000000..5715e37bde
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q15.c
@@ -0,0 +1,201 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_max_q15.c
+ * Description: Maximum value of a Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup Max
+ @{
+ */
+
+/**
+ @brief Maximum value of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+/*
+ * Helium/MVE Q15 max-with-index. Uses tail-predicated 8-lane iterations:
+ * vctp16q masks off lanes beyond blockSize, per-lane maxima are kept in
+ * extremValVec, and the index of each lane's last update is spilled to
+ * extremIdxArr. The smallest index holding the global max is returned.
+ */
+void arm_max_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex)
+{
+    int32_t blkCnt;           /* loop counters */
+    q15x8_t extremValVec = vdupq_n_s16(Q15_MIN);
+    q15_t maxValue = Q15_MIN;
+    uint16x8_t indexVec;
+    uint16x8_t extremIdxVec;
+    mve_pred16_t p0;
+    uint16_t extremIdxArr[8];
+
+    /* Lane indices {0..7}, advanced by 8 each iteration. */
+    indexVec = vidupq_u16(0U, 1);
+
+    blkCnt = blockSize;
+    do {
+        /* Predicate masking the lanes still inside the input. */
+        mve_pred16_t p = vctp16q(blkCnt);
+        q15x8_t extremIdxVal = vld1q_z_s16(pSrc, p);
+        /*
+         * Get current max per lane and current index per lane
+         * when a max is selected
+         */
+        p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
+
+        /* OR with itself under predicate == predicated move of the new max. */
+        extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+        /* store per-lane extrema indexes */
+        vst1q_p_u16(extremIdxArr, indexVec, p0);
+
+        indexVec += 8;
+        pSrc += 8;
+        blkCnt -= 8;
+    }
+    while (blkCnt > 0);
+
+
+    /* Get max value across the vector */
+    maxValue = vmaxvq(maxValue, extremValVec);
+
+    /* set index for lower values to max possible index */
+    p0 = vcmpgeq(extremValVec, maxValue);
+    extremIdxVec = vld1q_u16(extremIdxArr);
+
+    /* NOTE(review): blockSize is truncated into 16-bit index lanes here,
+       so this path presumably assumes blockSize <= 65535 — confirm. */
+    indexVec = vpselq(extremIdxVec, vdupq_n_u16(blockSize - 1), p0);
+    *pIndex = vminvq(blockSize - 1, indexVec);
+    *pResult = maxValue;
+}
+
+#else
+/*
+ * Scalar Q15 max-with-index. The first sample seeds the running maximum;
+ * with ARM_MATH_LOOPUNROLL the remaining samples are scanned 4 at a time.
+ * Strict '<' comparison means the FIRST occurrence of the maximum wins.
+ */
+void arm_max_q15(
+  const q15_t * pSrc,
+        uint32_t blockSize,
+        q15_t * pResult,
+        uint32_t * pIndex)
+{
+    q15_t maxVal, out;            /* Temporary variables to store the output value. */
+    uint32_t blkCnt, outIndex;    /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+    uint32_t index;               /* index of maximum value */
+#endif
+
+    /* Initialise index value to zero. */
+    outIndex = 0U;
+    /* Load first input value that act as reference value for comparision */
+    out = *pSrc++;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+    /* Initialise index of maximum value. */
+    index = 0U;
+
+    /* Loop unrolling: Compute 4 outputs at a time */
+    blkCnt = (blockSize - 1U) >> 2U;
+
+    while (blkCnt > 0U)
+    {
+        /* Initialize maxVal to next consecutive values one by one */
+        maxVal = *pSrc++;
+
+        /* compare for the maximum value */
+        if (out < maxVal)
+        {
+            /* Update the maximum value and it's index */
+            out = maxVal;
+            outIndex = index + 1U;
+        }
+
+        maxVal = *pSrc++;
+        if (out < maxVal)
+        {
+            out = maxVal;
+            outIndex = index + 2U;
+        }
+
+        maxVal = *pSrc++;
+        if (out < maxVal)
+        {
+            out = maxVal;
+            outIndex = index + 3U;
+        }
+
+        maxVal = *pSrc++;
+        if (out < maxVal)
+        {
+            out = maxVal;
+            outIndex = index + 4U;
+        }
+
+        /* index tracks the position of the sample BEFORE this group of 4,
+           hence the +1U..+4U offsets above. */
+        index += 4U;
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+    /* Loop unrolling: Compute remaining outputs */
+    blkCnt = (blockSize - 1U) % 4U;
+
+#else
+
+    /* Initialize blkCnt with number of samples */
+    blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+    while (blkCnt > 0U)
+    {
+        /* Initialize maxVal to the next consecutive values one by one */
+        maxVal = *pSrc++;
+
+        /* compare for the maximum value */
+        if (out < maxVal)
+        {
+            /* Update the maximum value and it's index */
+            out = maxVal;
+            outIndex = blockSize - blkCnt;
+        }
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+    /* Store the maximum value and it's index into destination pointers */
+    *pResult = out;
+    *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+/**
+ @} end of Max group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q31.c
new file mode 100644
index 0000000000..fed900b8f5
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q31.c
@@ -0,0 +1,202 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_max_q31.c
+ * Description: Maximum value of a Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup Max
+ @{
+ */
+
+/**
+ @brief Maximum value of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_max_q31(
+  const q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult,
+  uint32_t * pIndex)
+{
+  int32_t blkCnt; /* loop counter (counts down in steps of 4) */
+  q31x4_t extremValVec = vdupq_n_s32(Q31_MIN);
+  q31_t maxValue = Q31_MIN;
+  uint32x4_t indexVec;
+  uint32x4_t extremIdxVec;
+  mve_pred16_t p0;
+  uint32_t extremIdxArr[4];
+
+  indexVec = vidupq_u32(0U, 1); /* lane indexes {0,1,2,3} */
+
+  blkCnt = blockSize;
+  do {
+    mve_pred16_t p = vctp32q(blkCnt); /* tail predicate: inactive lanes beyond blkCnt */
+    q31x4_t extremIdxVal = vld1q_z_s32(pSrc, p);
+    /*
+     * Get current max per lane and current index per lane
+     * when a max is selected
+     */
+    p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
+
+    extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+    /* store per-lane extrema indexes */
+    vst1q_p_u32(extremIdxArr, indexVec, p0);
+
+    indexVec += 4;
+    pSrc += 4;
+    blkCnt -= 4;
+  }
+  while (blkCnt > 0);
+
+
+  /* Get max value across the vector */
+  maxValue = vmaxvq(maxValue, extremValVec);
+
+  /* set lanes below the max to the largest possible index so vminvq ignores them */
+  p0 = vcmpgeq(extremValVec, maxValue);
+  extremIdxVec = vld1q_u32(extremIdxArr);
+
+  indexVec = vpselq(extremIdxVec, vdupq_n_u32(blockSize - 1), p0);
+  *pIndex = vminvq(blockSize - 1, indexVec); /* smallest index at which the max occurs */
+  *pResult = maxValue;
+}
+
+#else
+void arm_max_q31(
+  const q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult,
+  uint32_t * pIndex)
+{
+  q31_t maxVal, out; /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  uint32_t index; /* index of maximum value */
+#endif
+
+  /* Initialise index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that acts as the reference value for comparison */
+  out = *pSrc++;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  /* Initialise index of maximum value. */
+  index = 0U;
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = (blockSize - 1U) >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize maxVal to next consecutive values one by one */
+    maxVal = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal;
+      outIndex = index + 1U;
+    }
+
+    maxVal = *pSrc++;
+    if (out < maxVal)
+    {
+      out = maxVal;
+      outIndex = index + 2U;
+    }
+
+    maxVal = *pSrc++;
+    if (out < maxVal)
+    {
+      out = maxVal;
+      outIndex = index + 3U;
+    }
+
+    maxVal = *pSrc++;
+    if (out < maxVal)
+    {
+      out = maxVal;
+      outIndex = index + 4U;
+    }
+
+    index += 4U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = (blockSize - 1U) % 4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize maxVal to the next consecutive values one by one */
+    maxVal = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal;
+      outIndex = blockSize - blkCnt;
+    }
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store the maximum value and its index into destination pointers */
+  *pResult = out;
+  *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Max group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q7.c
new file mode 100644
index 0000000000..5deae648a3
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_max_q7.c
@@ -0,0 +1,256 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_max_q7.c
+ * Description: Maximum value of a Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup Max
+ @{
+ */
+
+/**
+ @brief Maximum value of a Q7 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult maximum value returned here
+ @param[out] pIndex index of maximum value returned here
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+static void arm_small_blk_max_q7(
+  const q7_t * pSrc,
+  uint16_t blockSize,
+  q7_t * pResult,
+  uint32_t * pIndex)
+{
+  int32_t blkCnt; /* loop counter (counts down in steps of 16) */
+  q7x16_t extremValVec = vdupq_n_s8(Q7_MIN);
+  q7_t maxValue = Q7_MIN;
+  uint8x16_t indexVec;
+  uint8x16_t extremIdxVec;
+  mve_pred16_t p0;
+  uint8_t extremIdxArr[16];
+
+  indexVec = vidupq_u8(0U, 1); /* lane indexes 0..15 */
+
+  blkCnt = blockSize;
+  do {
+    mve_pred16_t p = vctp8q(blkCnt); /* tail predicate: inactive lanes beyond blkCnt */
+    q7x16_t extremIdxVal = vld1q_z_s8(pSrc, p);
+    /*
+     * Get current max per lane and current index per lane
+     * when a max is selected
+     */
+    p0 = vcmpgeq_m(extremIdxVal, extremValVec, p);
+
+    extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+    /* store per-lane extrema indexes */
+    vst1q_p_u8(extremIdxArr, indexVec, p0);
+
+    indexVec += 16;
+    pSrc += 16;
+    blkCnt -= 16;
+  }
+  while (blkCnt > 0);
+
+
+  /* Get max value across the vector */
+  maxValue = vmaxvq(maxValue, extremValVec);
+
+  /* set index for lower values to max possible index */
+  p0 = vcmpgeq(extremValVec, maxValue);
+  extremIdxVec = vld1q_u8(extremIdxArr);
+
+  indexVec = vpselq(extremIdxVec, vdupq_n_u8(blockSize - 1), p0); /* NOTE(review): assumes blockSize >= 1 */
+  *pIndex = vminvq_u8(blockSize - 1, indexVec);
+  *pResult = maxValue;
+}
+
+void arm_max_q7(
+  const q7_t * pSrc,
+  uint32_t blockSize,
+  q7_t * pResult,
+  uint32_t * pIndex)
+{
+  int32_t totalSize = blockSize;
+  const uint16_t sub_blk_sz = UINT8_MAX + 1; /* 256: largest block whose lane indexes fit a uint8_t */
+
+  if (totalSize <= sub_blk_sz)
+  {
+    arm_small_blk_max_q7(pSrc, blockSize, pResult, pIndex);
+  }
+  else
+  {
+    uint32_t curIdx = 0;
+    q7_t curBlkExtr = Q7_MIN;
+    uint32_t curBlkPos = 0;
+    uint32_t curBlkIdx = 0;
+    /*
+     * process sub-blocks of 256 elements
+     */
+    while (totalSize >= sub_blk_sz)
+    {
+      const q7_t *curSrc = pSrc;
+
+      arm_small_blk_max_q7(curSrc, sub_blk_sz, pResult, pIndex);
+      if (*pResult > curBlkExtr) /* strict '>' keeps the first occurrence on ties */
+      {
+        /*
+         * update partial extrema
+         */
+        curBlkExtr = *pResult;
+        curBlkPos = *pIndex;
+        curBlkIdx = curIdx;
+      }
+      curIdx++;
+      pSrc += sub_blk_sz;
+      totalSize -= sub_blk_sz;
+    }
+    /*
+     * remainder
+     */
+    arm_small_blk_max_q7(pSrc, totalSize, pResult, pIndex);
+    if (*pResult > curBlkExtr)
+    {
+      curBlkExtr = *pResult;
+      curBlkPos = *pIndex;
+      curBlkIdx = curIdx;
+    }
+    *pIndex = curBlkIdx * sub_blk_sz + curBlkPos; /* rebuild global index from sub-block number + offset */
+    *pResult = curBlkExtr;
+  }
+}
+#else
+void arm_max_q7(
+  const q7_t * pSrc,
+  uint32_t blockSize,
+  q7_t * pResult,
+  uint32_t * pIndex)
+{
+  q7_t maxVal, out; /* Temporary variables to store the output value. */
+  uint32_t blkCnt, outIndex; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  uint32_t index; /* index of maximum value */
+#endif
+
+  /* Initialise index value to zero. */
+  outIndex = 0U;
+  /* Load first input value that acts as the reference value for comparison */
+  out = *pSrc++;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  /* Initialise index of maximum value. */
+  index = 0U;
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = (blockSize - 1U) >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize maxVal to next consecutive values one by one */
+    maxVal = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal;
+      outIndex = index + 1U;
+    }
+
+    maxVal = *pSrc++;
+    if (out < maxVal)
+    {
+      out = maxVal;
+      outIndex = index + 2U;
+    }
+
+    maxVal = *pSrc++;
+    if (out < maxVal)
+    {
+      out = maxVal;
+      outIndex = index + 3U;
+    }
+
+    maxVal = *pSrc++;
+    if (out < maxVal)
+    {
+      out = maxVal;
+      outIndex = index + 4U;
+    }
+
+    index += 4U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = (blockSize - 1U) % 4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* Initialize maxVal to the next consecutive values one by one */
+    maxVal = *pSrc++;
+
+    /* compare for the maximum value */
+    if (out < maxVal)
+    {
+      /* Update the maximum value and its index */
+      out = maxVal;
+      outIndex = blockSize - blkCnt;
+    }
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store the maximum value and its index into destination pointers */
+  *pResult = out;
+  *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Max group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_f16.c
new file mode 100644
index 0000000000..a392885a41
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_f16.c
@@ -0,0 +1,152 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mean_f16.c
+ * Description: Mean value of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @defgroup mean Mean
+
+ Calculates the mean of the input vector. Mean is defined as the average of the elements in the vector.
+ The underlying algorithm is used:
+
+
+ Result = (pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]) / blockSize;
+
+
+ There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup mean
+ @{
+ */
+
+/**
+ @brief Mean value of a floating-point vector.
+ @param[in] pSrc points to the input vector.
+ @param[in] blockSize number of samples in input vector.
+ @param[out] pResult mean value returned here.
+ @return none
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_mean_f16(
+  const float16_t * pSrc,
+  uint32_t blockSize,
+  float16_t * pResult)
+{
+  int32_t blkCnt; /* loop counter (counts down in steps of 8) */
+  f16x8_t vecSrc;
+  f16x8_t sumVec = vdupq_n_f16(0.0f16);
+
+  blkCnt = blockSize;
+  do {
+    mve_pred16_t p = vctp16q(blkCnt); /* tail predicate for the final partial vector */
+
+    vecSrc = vldrhq_z_f16((float16_t const *) pSrc, p);
+    sumVec = vaddq_m_f16(sumVec, sumVec, vecSrc, p); /* predicated accumulate: inactive lanes unchanged */
+
+    blkCnt -= 8;
+    pSrc += 8;
+  }
+  while (blkCnt > 0);
+
+  *pResult = vecAddAcrossF16Mve(sumVec) / (float16_t) blockSize; /* reduce lanes, then divide by sample count */
+}
+
+
+#else
+
+void arm_mean_f16(
+  const float16_t * pSrc,
+  uint32_t blockSize,
+  float16_t * pResult)
+{
+  uint32_t blkCnt; /* Loop counter */
+  float16_t sum = 0.0f; /* Temporary result storage */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    sum += *pSrc++;
+
+    sum += *pSrc++;
+
+    sum += *pSrc++;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+  /* Store result to destination */
+  *pResult = (sum / (float16_t)blockSize);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of mean group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_f32.c
new file mode 100644
index 0000000000..6dd7a846f9
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_f32.c
@@ -0,0 +1,198 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mean_f32.c
+ * Description: Mean value of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+
+/**
+ @addtogroup mean
+ @{
+ */
+
+/**
+ @brief Mean value of a floating-point vector.
+ @param[in] pSrc points to the input vector.
+ @param[in] blockSize number of samples in input vector.
+ @param[out] pResult mean value returned here.
+ @return none
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_mean_f32(
+  const float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+  uint32_t blkCnt; /* loop counter */
+  f32x4_t vecSrc;
+  f32x4_t sumVec = vdupq_n_f32(0.0f);
+  float32_t sum = 0.0f;
+
+  /* Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+  while (blkCnt > 0U)
+  {
+    vecSrc = vldrwq_f32(pSrc);
+    sumVec = vaddq_f32(sumVec, vecSrc);
+
+    blkCnt --;
+    pSrc += 4;
+  }
+
+  sum = vecAddAcrossF32Mve(sumVec); /* reduce the 4 lanes to a scalar */
+
+  /* Tail */
+  blkCnt = blockSize & 0x3;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  *pResult = sum / (float32_t) blockSize;
+}
+
+
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_mean_f32(
+  const float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+  float32_t sum = 0.0f; /* Scalar accumulator for the tail */
+  float32x4_t sumV = vdupq_n_f32(0.0f); /* Vector accumulator */
+  float32x2_t sumV2;
+
+  uint32_t blkCnt; /* Loop counter */
+
+  float32x4_t inV;
+
+  blkCnt = blockSize >> 2U;
+
+  /* Compute 4 outputs at a time.
+  ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    inV = vld1q_f32(pSrc);
+    sumV = vaddq_f32(sumV, inV);
+
+    pSrc += 4;
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV)); /* pairwise add: fold 4 lanes to 2 */
+  sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+  ** No loop unrolling is used. */
+  blkCnt = blockSize & 3;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+  /* Store the result to the destination */
+  *pResult = sum / (float32_t) blockSize;
+}
+#else
+void arm_mean_f32(
+  const float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+  uint32_t blkCnt; /* Loop counter */
+  float32_t sum = 0.0f; /* Temporary result storage */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    sum += *pSrc++;
+
+    sum += *pSrc++;
+
+    sum += *pSrc++;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+  /* Store result to destination; explicit cast matches the MVE/NEON variants */
+  *pResult = (sum / (float32_t) blockSize);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of mean group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q15.c
new file mode 100644
index 0000000000..f8af0edad8
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q15.c
@@ -0,0 +1,156 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mean_q15.c
+ * Description: Mean value of a Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup mean
+ @{
+ */
+
+/**
+ @brief Mean value of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult mean value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ The input is represented in 1.15 format and is accumulated in a 32-bit
+ accumulator in 17.15 format.
+ There is no risk of internal overflow with this approach, and the
+ full precision of intermediate result is preserved.
+ Finally, the accumulator is truncated to yield a result of 1.15 format.
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_mean_q15(
+  const q15_t * pSrc,
+  uint32_t blockSize,
+  q15_t * pResult)
+{
+  uint32_t blkCnt; /* loop counter */
+  q15x8_t vecSrc;
+  q31_t sum = 0L;
+
+  /* Compute 8 outputs at a time */
+  blkCnt = blockSize >> 3U;
+  while (blkCnt > 0U)
+  {
+    vecSrc = vldrhq_s16(pSrc);
+    /*
+     * sum lanes (32-bit widening accumulate)
+     */
+    sum = vaddvaq(sum, vecSrc);
+
+    blkCnt--;
+    pSrc += 8;
+  }
+
+  /* Tail */
+  blkCnt = blockSize & 0x7;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+  /* Store the result to the destination */
+  *pResult = (q15_t) (sum / (int32_t) blockSize);
+}
+#else
+void arm_mean_q15(
+  const q15_t * pSrc,
+  uint32_t blockSize,
+  q15_t * pResult)
+{
+  uint32_t blkCnt; /* Loop counter */
+  q31_t sum = 0; /* Temporary result storage (17.15 accumulator) */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  q31_t in;
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    in = read_q15x2_ia ((q15_t **) &pSrc);
+    sum += ((in << 16U) >> 16U); /* low halfword, sign-extended */
+    sum += (in >> 16U); /* high halfword (arithmetic shift) */
+
+    in = read_q15x2_ia ((q15_t **) &pSrc);
+    sum += ((in << 16U) >> 16U);
+    sum += (in >> 16U);
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+  /* Store result to destination */
+  *pResult = (q15_t) (sum / (int32_t) blockSize);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of mean group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q31.c
new file mode 100644
index 0000000000..b33ed00c5e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q31.c
@@ -0,0 +1,149 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mean_q31.c
+ * Description: Mean value of a Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup mean
+ @{
+ */
+
+/**
+ @brief Mean value of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult mean value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.31 format and is accumulated in a 64-bit
+ accumulator in 33.31 format.
+ There is no risk of internal overflow with this approach, and the
+ full precision of intermediate result is preserved.
+ Finally, the accumulator is truncated to yield a result of 1.31 format.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_mean_q31(
+  const q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult)
+{
+  uint32_t blkCnt; /* loop counter */
+  q31x4_t vecSrc;
+  q63_t sum = 0LL;
+
+
+  /* Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+  while (blkCnt > 0U)
+  {
+
+    vecSrc = vldrwq_s32(pSrc);
+    /*
+     * sum lanes (64-bit widening accumulate)
+     */
+    sum = vaddlvaq(sum, vecSrc);
+
+    blkCnt --;
+    pSrc += 4;
+  }
+
+  /* Tail */
+  blkCnt = blockSize & 0x3;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+    blkCnt --;
+  }
+
+  *pResult = arm_div_q63_to_q31(sum, blockSize); /* 64/32 division back to 1.31 */
+}
+#else
+void arm_mean_q31(
+  const q31_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult)
+{
+  uint32_t blkCnt; /* Loop counter */
+  q63_t sum = 0; /* Temporary result storage (33.31 accumulator) */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    sum += *pSrc++;
+
+    sum += *pSrc++;
+
+    sum += *pSrc++;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+  /* Store result to destination (truncating average fits in q31) */
+  *pResult = (q31_t) (sum / blockSize);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of mean group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q7.c
new file mode 100644
index 0000000000..b598853c14
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_mean_q7.c
@@ -0,0 +1,153 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_mean_q7.c
+ * Description: Mean value of a Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup mean
+ @{
+ */
+
+/**
+ @brief Mean value of a Q7 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult mean value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ The input is represented in 1.7 format and is accumulated in a 32-bit
+ accumulator in 25.7 format.
+ There is no risk of internal overflow with this approach, and the
+ full precision of intermediate result is preserved.
+ Finally, the accumulator is truncated to yield a result of 1.7 format.
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_mean_q7(
+  const q7_t * pSrc,
+  uint32_t blockSize,
+  q7_t * pResult)
+{
+  uint32_t blkCnt; /* loop counter */
+  q7x16_t vecSrc;
+  q31_t sum = 0L;
+
+
+  blkCnt = blockSize >> 4;
+  while (blkCnt > 0U)
+  {
+    vecSrc = vldrbq_s8(pSrc);
+    /*
+     * sum lanes (32-bit widening accumulate)
+     */
+    sum = vaddvaq(sum, vecSrc);
+
+    blkCnt--;
+    pSrc += 16;
+  }
+
+  blkCnt = blockSize & 0xF;
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+  /* Store the result to the destination */
+  *pResult = (q7_t) (sum / (int32_t) blockSize);
+}
+#else
+void arm_mean_q7(
+  const q7_t * pSrc,
+  uint32_t blockSize,
+  q7_t * pResult)
+{
+  uint32_t blkCnt; /* Loop counter */
+  q31_t sum = 0; /* Temporary result storage (25.7 accumulator) */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+  q31_t in;
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    in = read_q7x4_ia ((q7_t **) &pSrc);
+    sum += ((in << 24U) >> 24U); /* byte 0, sign-extended */
+    sum += ((in << 16U) >> 24U); /* byte 1 */
+    sum += ((in << 8U) >> 24U); /* byte 2 */
+    sum += (in >> 24U); /* byte 3 (arithmetic shift) */
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+    sum += *pSrc++;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+  /* Store result to destination */
+  *pResult = (q7_t) (sum / (int32_t) blockSize);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of mean group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_min_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_min_f16.c
new file mode 100644
index 0000000000..d29318ef65
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_min_f16.c
@@ -0,0 +1,240 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_min_f16.c
+ * Description: Minimum value of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup Min
+ @{
+ */
+
+/**
+ @brief Minimum value of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_min_f16(
+ const float16_t * pSrc,
+ uint32_t blockSize,
+ float16_t * pResult,
+ uint32_t * pIndex)
+{
+ int32_t blkCnt; /* loop counters */
+ f16x8_t vecSrc;
+ float16_t const *pSrcVec;
+ f16x8_t curExtremValVec = vdupq_n_f16(F16_MAX); /* per-lane running minima */
+ float16_t minValue = F16_MAX;
+ uint32_t idx = blockSize;
+ uint16x8_t indexVec; /* per-lane element indices (16-bit!) */
+ uint16x8_t curExtremIdxVec; /* index of each lane's running minimum */
+ mve_pred16_t p0;
+
+ /* NOTE(review): lane indices are held in 16-bit lanes, so this path
+ assumes blockSize <= 65535 -- confirm against the API contract. */
+ indexVec = vidupq_u16((uint32_t)0, 1);
+ curExtremIdxVec = vdupq_n_u16(0);
+
+ pSrcVec = (float16_t const *) pSrc;
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0)
+ {
+ vecSrc = vldrhq_f16(pSrcVec); pSrcVec += 8;
+ /*
+ * Get current min per lane and current index per lane
+ * when a min is selected
+ */
+ p0 = vcmpleq(vecSrc, curExtremValVec);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+ indexVec = indexVec + 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = blockSize & 7;
+ if (blkCnt > 0)
+ {
+ /* NOTE(review): this load is unpredicated, so up to 7 elements past the
+ end of the buffer may be read before the predicate masks them off --
+ confirm this over-read is acceptable on the target. */
+ vecSrc = vldrhq_f16(pSrcVec); pSrcVec += 8;
+ p0 = vctp16q(blkCnt);
+ /*
+ * Get current min per lane and current index per lane
+ * when a min is selected
+ */
+ p0 = vcmpleq_m(vecSrc, curExtremValVec, p0);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+ }
+ /*
+ * Get min value across the vector
+ */
+ minValue = vminnmvq(minValue, curExtremValVec);
+ /*
+ * set index for lower values to min possible index
+ */
+ p0 = vcmpleq(curExtremValVec, minValue);
+ indexVec = vpselq(curExtremIdxVec, vdupq_n_u16(blockSize), p0);
+ /*
+ * Get min index which is thus for a min value
+ */
+ idx = vminvq(idx, indexVec);
+ /*
+ * Save result
+ */
+ *pIndex = idx;
+ *pResult = minValue;
+}
+
+#else
+
+void arm_min_f16(
+ const float16_t * pSrc,
+ uint32_t blockSize,
+ float16_t * pResult,
+ uint32_t * pIndex)
+{
+ float16_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+ uint32_t index; /* base index of the current unrolled group */
+#endif
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+
+ /* Load first input value that act as reference value for comparison */
+ /* NOTE(review): blockSize is assumed >= 1; the first element is read
+ unconditionally and blockSize - 1U would wrap for blockSize == 0. */
+ out = *pSrc++;
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+ /* Initialise base index of the unrolled loop. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = (blockSize - 1U) >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to next consecutive values one by one */
+ minVal = *pSrc++;
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and it's index */
+ out = minVal;
+ outIndex = index + 1U;
+ }
+
+ minVal = *pSrc++;
+ if (out > minVal)
+ {
+ out = minVal;
+ outIndex = index + 2U;
+ }
+
+ minVal = *pSrc++;
+ if (out > minVal)
+ {
+ out = minVal;
+ outIndex = index + 3U;
+ }
+
+ minVal = *pSrc++;
+ if (out > minVal)
+ {
+ out = minVal;
+ outIndex = index + 4U;
+ }
+
+ index += 4U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = (blockSize - 1U) % 4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal = *pSrc++;
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and it's index
+ (blkCnt counts remaining samples, so blockSize - blkCnt is the
+ index of the element just read) */
+ out = minVal;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the minimum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of Min group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_min_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_min_f32.c
new file mode 100644
index 0000000000..53e4f8570d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_min_f32.c
@@ -0,0 +1,363 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_min_f32.c
+ * Description: Minimum value of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+#if (defined(ARM_MATH_NEON) || defined(ARM_MATH_MVEF)) && !defined(ARM_MATH_AUTOVECTORIZE)
+#include <limits.h>
+#endif
+
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @defgroup Min Minimum
+
+ Computes the minimum value of an array of data.
+ The function returns both the minimum value and its position within the array.
+ There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup Min
+ @{
+ */
+
+/**
+ @brief Minimum value of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_min_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult,
+ uint32_t * pIndex)
+{
+ uint32_t blkCnt; /* loop counters */
+ f32x4_t vecSrc;
+ float32_t const *pSrcVec; /* advancing cursor over pSrc */
+ f32x4_t curExtremValVec = vdupq_n_f32(F32_MAX); /* per-lane running minima */
+ float32_t minValue = F32_MAX;
+ uint32_t idx = blockSize;
+ uint32x4_t indexVec; /* per-lane element indices */
+ uint32x4_t curExtremIdxVec; /* index of each lane's running minimum */
+ float32_t tmp;
+ mve_pred16_t p0;
+
+ indexVec = vidupq_u32((uint32_t)0, 1);
+ curExtremIdxVec = vdupq_n_u32(0);
+
+ pSrcVec = (float32_t const *) pSrc;
+ /* Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+ while (blkCnt > 0U)
+ {
+ vecSrc = vldrwq_f32(pSrcVec);
+ pSrcVec += 4;
+ /*
+ * Get current min per lane and current index per lane
+ * when a min is selected
+ */
+ p0 = vcmpleq(vecSrc, curExtremValVec);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+ indexVec = indexVec + 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ /*
+ * Get min value across the vector
+ */
+ minValue = vminnmvq(minValue, curExtremValVec);
+ /*
+ * set index for lower values to min possible index
+ */
+ p0 = vcmpleq(curExtremValVec, minValue);
+ indexVec = vpselq(curExtremIdxVec, vdupq_n_u32(blockSize), p0);
+ /*
+ * Get min index which is thus for a min value
+ */
+ idx = vminvq(idx, indexVec);
+
+ /*
+ * tail
+ */
+ blkCnt = blockSize & 0x3;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to the next consecutive values one by one.
+ BUGFIX: read through pSrcVec, which the vector loop advanced past
+ the processed elements; the original read *pSrc++, but pSrc was
+ never advanced in this function, so the tail re-compared the FIRST
+ (blockSize & 3) elements while recording tail indices. */
+ tmp = *pSrcVec++;
+
+ /* compare for the minimum value */
+ if (minValue > tmp)
+ {
+ /* Update the minimum value and it's index */
+ minValue = tmp;
+ idx = blockSize - blkCnt;
+ }
+ blkCnt--;
+ }
+ /*
+ * Save result
+ */
+ *pIndex = idx;
+ *pResult = minValue;
+}
+
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_min_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult,
+ uint32_t * pIndex)
+{
+ float32_t maxVal1, out; /* Temporary variables to store the output value.
+ (naming is leftover from arm_max; these hold MINIMUM candidates) */
+ uint32_t blkCnt, outIndex; /* loop counter */
+
+ float32x4_t outV, srcV;
+ float32x2_t outV2;
+
+ uint32x4_t idxV;
+ static const uint32_t indexInit[4]={4,5,6,7};
+ static const uint32_t countVInit[4]={0,1,2,3};
+ uint32x4_t maxIdx;
+ uint32x4_t index;
+ uint32x4_t delta;
+ uint32x4_t countV;
+ uint32x2_t countV2;
+
+ /* NOTE(review): ULONG_MAX comes from <limits.h>; it is truncated to
+ 0xFFFFFFFF per lane here -- UINT32_MAX would express the intent directly. */
+ maxIdx = vdupq_n_u32(ULONG_MAX);
+ delta = vdupq_n_u32(4);
+ index = vld1q_u32(indexInit);
+ countV = vld1q_u32(countVInit);
+
+ /* Initialise the index value to zero. */
+ outIndex = 0U;
+
+ /* Scalar fallback for inputs too short to fill a 4-lane vector. */
+ if (blockSize <= 3)
+ {
+ out = *pSrc++;
+
+ blkCnt = blockSize - 1;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ /* compare for the minimum value */
+ if (out > maxVal1)
+ {
+ /* Update the minimum value and it's index */
+ out = maxVal1;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* Seed the per-lane minima with the first 4 elements (indices 0..3). */
+ outV = vld1q_f32(pSrc);
+ pSrc += 4;
+
+ /* Compute 4 outputs at a time */
+ blkCnt = (blockSize - 4 ) >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ srcV = vld1q_f32(pSrc);
+ pSrc += 4;
+
+ /* Per-lane select: keep the smaller value and its index. */
+ idxV = vcltq_f32(srcV, outV);
+ outV = vbslq_f32(idxV, srcV, outV );
+ countV = vbslq_u32(idxV, index,countV );
+
+ index = vaddq_u32(index,delta);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Horizontal reduction: pairwise min down to a single lane. */
+ outV2 = vpmin_f32(vget_low_f32(outV),vget_high_f32(outV));
+ outV2 = vpmin_f32(outV2,outV2);
+ out = vget_lane_f32(outV2,0);
+
+ /* Mask out lanes that do not hold the global minimum, then take the
+ smallest (i.e. first-occurring) index among the remaining lanes. */
+ idxV = vceqq_f32(outV, vdupq_n_f32(out));
+ countV = vbslq_u32(idxV, countV,maxIdx);
+
+ countV2 = vpmin_u32(vget_low_u32(countV),vget_high_u32(countV));
+ countV2 = vpmin_u32(countV2,countV2);
+ outIndex = vget_lane_u32(countV2,0);
+
+ /* if (blockSize - 1U) is not multiple of 4 */
+ blkCnt = (blockSize - 4 ) % 4U;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ /* compare for the minimum value */
+ if (out > maxVal1)
+ {
+ /* Update the minimum value and it's index */
+ out = maxVal1;
+ outIndex = blockSize - blkCnt ;
+ }
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+ /* Store the minimum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#else
+void arm_min_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult,
+ uint32_t * pIndex)
+{
+ float32_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+ uint32_t index; /* base index of the current unrolled group */
+#endif
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+
+ /* Load first input value that act as reference value for comparison */
+ /* NOTE(review): blockSize is assumed >= 1; the first element is read
+ unconditionally and blockSize - 1U would wrap for blockSize == 0. */
+ out = *pSrc++;
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+ /* Initialise base index of the unrolled loop. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = (blockSize - 1U) >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to next consecutive values one by one */
+ minVal = *pSrc++;
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and it's index */
+ out = minVal;
+ outIndex = index + 1U;
+ }
+
+ minVal = *pSrc++;
+ if (out > minVal)
+ {
+ out = minVal;
+ outIndex = index + 2U;
+ }
+
+ minVal = *pSrc++;
+ if (out > minVal)
+ {
+ out = minVal;
+ outIndex = index + 3U;
+ }
+
+ minVal = *pSrc++;
+ if (out > minVal)
+ {
+ out = minVal;
+ outIndex = index + 4U;
+ }
+
+ index += 4U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = (blockSize - 1U) % 4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal = *pSrc++;
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and it's index
+ (blkCnt counts remaining samples, so blockSize - blkCnt is the
+ index of the element just read) */
+ out = minVal;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the minimum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of Min group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q15.c
new file mode 100644
index 0000000000..ad730661c2
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q15.c
@@ -0,0 +1,203 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_min_q15.c
+ * Description: Minimum value of a Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+
+/**
+ @addtogroup Min
+ @{
+ */
+
+/**
+ @brief Minimum value of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_min_q15(
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult,
+ uint32_t * pIndex)
+{
+
+ int32_t blkCnt; /* loop counters */
+ q15x8_t extremValVec = vdupq_n_s16(Q15_MAX); /* per-lane running minima */
+ q15_t minValue = Q15_MAX;
+ uint16x8_t indexVec; /* per-lane element indices (16-bit!) */
+ uint16x8_t extremIdxVec;
+ mve_pred16_t p0;
+ uint16_t extremIdxArr[8]; /* latest min index per lane, spilled to memory */
+
+ /* NOTE(review): lane indices are 16-bit, so this path assumes
+ blockSize <= 65536 -- confirm against the API contract. */
+ indexVec = vidupq_u16(0U, 1);
+
+ /* NOTE(review): do-while assumes blockSize >= 1. Lanes of extremIdxArr
+ that never take a minimum stay uninitialized; they are only selected
+ below when their value ties the global minimum (degenerate all-Q15_MAX
+ inputs shorter than 8) -- confirm this edge case is acceptable. */
+ blkCnt = blockSize;
+ do {
+ mve_pred16_t p = vctp16q(blkCnt);
+ q15x8_t extremIdxVal = vld1q_z_s16(pSrc, p);
+ /*
+ * Get current min per lane and current index per lane
+ * when a min is selected
+ */
+ p0 = vcmpleq_m(extremIdxVal, extremValVec, p);
+
+ /* Predicated OR of the new value with itself acts as a per-lane select:
+ lanes in p0 take the new (smaller) value, others are unchanged. */
+ extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+ /* store per-lane extrema indexes */
+ vst1q_p_u16(extremIdxArr, indexVec, p0);
+
+ indexVec += 8;
+ pSrc += 8;
+ blkCnt -= 8;
+ }
+ while (blkCnt > 0);
+
+ /* Get min value across the vector */
+ minValue = vminvq(minValue, extremValVec);
+
+ /* set index for lower values to min possible index */
+ p0 = vcmpleq(extremValVec, minValue);
+ extremIdxVec = vld1q_u16(extremIdxArr);
+
+ indexVec = vpselq(extremIdxVec, vdupq_n_u16(blockSize - 1), p0);
+ *pIndex = vminvq(blockSize - 1, indexVec);
+ *pResult = minValue;
+
+}
+#else
+void arm_min_q15(
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult,
+ uint32_t * pIndex)
+{
+ q15_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* base index of the current unrolled group */
+#endif
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+ /* Load first input value that act as reference value for comparison */
+ /* NOTE(review): blockSize is assumed >= 1; the first element is read
+ unconditionally and blockSize - 1U would wrap for blockSize == 0. */
+ out = *pSrc++;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise base index of the unrolled loop. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = (blockSize - 1U) >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to next consecutive values one by one */
+ minVal = *pSrc++;
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and it's index */
+ out = minVal;
+ outIndex = index + 1U;
+ }
+
+ minVal = *pSrc++;
+ if (out > minVal)
+ {
+ out = minVal;
+ outIndex = index + 2U;
+ }
+
+ minVal = *pSrc++;
+ if (out > minVal)
+ {
+ out = minVal;
+ outIndex = index + 3U;
+ }
+
+ minVal = *pSrc++;
+ if (out > minVal)
+ {
+ out = minVal;
+ outIndex = index + 4U;
+ }
+
+ index += 4U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = (blockSize - 1U) % 4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal = *pSrc++;
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and it's index
+ (blkCnt counts remaining samples, so blockSize - blkCnt is the
+ index of the element just read) */
+ out = minVal;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the minimum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Min group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q31.c
new file mode 100644
index 0000000000..c993004cbe
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q31.c
@@ -0,0 +1,203 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_min_q31.c
+ * Description: Minimum value of a Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+
+/**
+ @addtogroup Min
+ @{
+ */
+
+/**
+ @brief Minimum value of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_min_q31(
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult,
+ uint32_t * pIndex)
+{
+ int32_t blkCnt; /* loop counters */
+ q31x4_t extremValVec = vdupq_n_s32(Q31_MAX); /* per-lane running minima */
+ q31_t minValue = Q31_MAX;
+ uint32x4_t indexVec; /* per-lane element indices */
+ uint32x4_t extremIdxVec;
+ mve_pred16_t p0;
+ uint32_t extremIdxArr[4]; /* latest min index per lane, spilled to memory */
+
+ indexVec = vidupq_u32(0U, 1);
+
+ /* NOTE(review): do-while assumes blockSize >= 1. Lanes of extremIdxArr
+ that never take a minimum stay uninitialized; they are only selected
+ below when their value ties the global minimum (degenerate all-Q31_MAX
+ inputs shorter than 4) -- confirm this edge case is acceptable. */
+ blkCnt = blockSize;
+ do {
+ mve_pred16_t p = vctp32q(blkCnt);
+ q31x4_t extremIdxVal = vld1q_z_s32(pSrc, p);
+ /*
+ * Get current min per lane and current index per lane
+ * when a min is selected
+ */
+ p0 = vcmpleq_m(extremIdxVal, extremValVec, p);
+
+ /* Predicated OR of the new value with itself acts as a per-lane select:
+ lanes in p0 take the new (smaller) value, others are unchanged. */
+ extremValVec = vorrq_m(extremValVec, extremIdxVal, extremIdxVal, p0);
+ /* store per-lane extrema indexes */
+ vst1q_p_u32(extremIdxArr, indexVec, p0);
+
+ indexVec += 4;
+ pSrc += 4;
+ blkCnt -= 4;
+ }
+ while (blkCnt > 0);
+
+
+ /* Get min value across the vector */
+ minValue = vminvq(minValue, extremValVec);
+
+ /* set index for lower values to min possible index */
+ p0 = vcmpleq(extremValVec, minValue);
+ extremIdxVec = vld1q_u32(extremIdxArr);
+
+ indexVec = vpselq(extremIdxVec, vdupq_n_u32(blockSize - 1), p0);
+ *pIndex = vminvq(blockSize - 1, indexVec);
+ *pResult = minValue;
+}
+
+#else
+void arm_min_q31(
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult,
+ uint32_t * pIndex)
+{
+ q31_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* base index of the current unrolled group */
+#endif
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+ /* Load first input value that act as reference value for comparison */
+ /* NOTE(review): blockSize is assumed >= 1; the first element is read
+ unconditionally and blockSize - 1U would wrap for blockSize == 0. */
+ out = *pSrc++;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise base index of the unrolled loop. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = (blockSize - 1U) >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to next consecutive values one by one */
+ minVal = *pSrc++;
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and it's index */
+ out = minVal;
+ outIndex = index + 1U;
+ }
+
+ minVal = *pSrc++;
+ if (out > minVal)
+ {
+ out = minVal;
+ outIndex = index + 2U;
+ }
+
+ minVal = *pSrc++;
+ if (out > minVal)
+ {
+ out = minVal;
+ outIndex = index + 3U;
+ }
+
+ minVal = *pSrc++;
+ if (out > minVal)
+ {
+ out = minVal;
+ outIndex = index + 4U;
+ }
+
+ index += 4U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = (blockSize - 1U) % 4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal = *pSrc++;
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and it's index
+ (blkCnt counts remaining samples, so blockSize - blkCnt is the
+ index of the element just read) */
+ out = minVal;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the minimum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Min group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q7.c
new file mode 100644
index 0000000000..990693d586
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_min_q7.c
@@ -0,0 +1,284 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_min_q7.c
+ * Description: Minimum value of a Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+
+/**
+ @addtogroup Min
+ @{
+ */
+
+/**
+ @brief Minimum value of a Q7 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult minimum value returned here
+ @param[out] pIndex index of minimum value returned here
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+static void arm_small_blk_min_q7(
+ const q7_t * pSrc,
+ uint8_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
+{
+ uint32_t blkCnt; /* loop counters */
+ q7x16_t vecSrc;
+ q7x16_t curExtremValVec = vdupq_n_s8(Q7_MAX); /* per-lane running minima */
+ q7_t minValue = Q7_MAX,temp;
+ uint32_t idx = blockSize;
+ uint8x16_t indexVec; /* per-lane element indices (8-bit: blockSize <= 255,
+ enforced by the arm_min_q7 wrapper) */
+ uint8x16_t curExtremIdxVec;
+ mve_pred16_t p0;
+
+
+ indexVec = vidupq_u8((uint32_t)0, 1);
+ curExtremIdxVec = vdupq_n_u8(0);
+
+ blkCnt = blockSize >> 4;
+ while (blkCnt > 0U)
+ {
+ vecSrc = vldrbq_s8(pSrc);
+ pSrc += 16;
+ /*
+ * Get current min per lane and current index per lane
+ * when a min is selected
+ */
+ p0 = vcmpleq(vecSrc, curExtremValVec);
+ curExtremValVec = vpselq(vecSrc, curExtremValVec, p0);
+ curExtremIdxVec = vpselq(indexVec, curExtremIdxVec, p0);
+
+ indexVec = indexVec + 16;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ /*
+ * Get min value across the vector
+ */
+ minValue = vminvq(minValue, curExtremValVec);
+ /*
+ * set index for lower values to min possible index
+ */
+ p0 = vcmpleq(curExtremValVec, minValue);
+ indexVec = vpselq(curExtremIdxVec, vdupq_n_u8(blockSize), p0);
+ /*
+ * Get min index which is thus for a min value
+ */
+ idx = vminvq(idx, indexVec);
+
+ /* Scalar tail: pSrc was advanced by the vector loop, so it now points at
+ the first unprocessed element. */
+ blkCnt = blockSize & 0xF;
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ temp = *pSrc++;
+
+ /* compare for the minimum value */
+ if (minValue > temp)
+ {
+ /* Update the minimum value and it's index */
+ minValue = temp;
+ idx = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+ /*
+ * Save result
+ */
+ *pIndex = idx;
+ *pResult = minValue;
+}
+
+void arm_min_q7(
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
+{
+ /* The MVE helper uses 8-bit lane indices, so it can only address 255
+ elements at a time; larger inputs are processed in chunks of 255 and
+ the partial extrema are merged here. */
+ int32_t totalSize = blockSize;
+
+ if (totalSize <= UINT8_MAX)
+ {
+ arm_small_blk_min_q7(pSrc, blockSize, pResult, pIndex);
+ }
+ else
+ {
+ uint32_t curIdx = 0; /* index of the current 255-element chunk */
+ q7_t curBlkExtr = Q7_MAX; /* best (smallest) value seen so far */
+ uint32_t curBlkPos = 0; /* its position within its chunk */
+ uint32_t curBlkIdx = 0; /* chunk that produced it */
+ /*
+ * process blocks of 255 elts
+ */
+ while (totalSize >= UINT8_MAX)
+ {
+ const q7_t *curSrc = pSrc;
+
+ arm_small_blk_min_q7(curSrc, UINT8_MAX, pResult, pIndex);
+ /* Strict < keeps the FIRST occurrence on ties across chunks. */
+ if (*pResult < curBlkExtr)
+ {
+ /*
+ * update partial extrema
+ */
+ curBlkExtr = *pResult;
+ curBlkPos = *pIndex;
+ curBlkIdx = curIdx;
+ }
+ curIdx++;
+ pSrc += UINT8_MAX;
+ totalSize -= UINT8_MAX;
+ }
+ /*
+ * remainder
+ */
+ arm_small_blk_min_q7(pSrc, totalSize, pResult, pIndex);
+ if (*pResult < curBlkExtr)
+ {
+ curBlkExtr = *pResult;
+ curBlkPos = *pIndex;
+ curBlkIdx = curIdx;
+ }
+ /* Reconstruct the global index from (chunk, offset-within-chunk). */
+ *pIndex = curBlkIdx * UINT8_MAX + curBlkPos;
+ *pResult = curBlkExtr;
+ }
+}
+#else
+void arm_min_q7(
+ const q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
+{
+ q7_t minVal, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ uint32_t index; /* base index of the current unrolled group */
+#endif
+
+ /* Initialise index value to zero. */
+ outIndex = 0U;
+ /* Load first input value that act as reference value for comparison */
+ /* NOTE(review): blockSize is assumed >= 1; the first element is read
+ unconditionally and blockSize - 1U would wrap for blockSize == 0. */
+ out = *pSrc++;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ /* Initialise base index of the unrolled loop. */
+ index = 0U;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = (blockSize - 1U) >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to next consecutive values one by one */
+ minVal = *pSrc++;
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and it's index */
+ out = minVal;
+ outIndex = index + 1U;
+ }
+
+ minVal = *pSrc++;
+ if (out > minVal)
+ {
+ out = minVal;
+ outIndex = index + 2U;
+ }
+
+ minVal = *pSrc++;
+ if (out > minVal)
+ {
+ out = minVal;
+ outIndex = index + 3U;
+ }
+
+ minVal = *pSrc++;
+ if (out > minVal)
+ {
+ out = minVal;
+ outIndex = index + 4U;
+ }
+
+ index += 4U;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = (blockSize - 1U) % 4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (blockSize - 1U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal = *pSrc++;
+
+ /* compare for the minimum value */
+ if (out > minVal)
+ {
+ /* Update the minimum value and it's index
+ (blkCnt counts remaining samples, so blockSize - blkCnt is the
+ index of the element just read) */
+ out = minVal;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Store the minimum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Min group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_power_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_power_f16.c
new file mode 100644
index 0000000000..27e25a3798
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_power_f16.c
@@ -0,0 +1,152 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_power_f16.c
+ * Description: Sum of the squares of the elements of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupStats
+ */
+
+
+
+/**
+ @addtogroup power
+ @{
+ */
+
+/**
+ @brief Sum of the squares of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult sum of the squares value returned here
+ @return none
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_power_f16(
+  const float16_t * pSrc,
+  uint32_t blockSize,
+  float16_t * pResult)
+{
+    int32_t blkCnt;           /* loop counters */
+    f16x8_t vecSrc;
+    f16x8_t sumVec = vdupq_n_f16(0.0f);
+
+
+    /* Tail-predicated loop: eight f16 lanes per iteration; the predicate
+       generated by vctp16q disables the surplus lanes of the final,
+       possibly partial, iteration. */
+    blkCnt = blockSize;
+    do {
+        mve_pred16_t p = vctp16q(blkCnt);
+
+        /* Zeroing (_z) load keeps inactive lanes at 0 so they do not
+           disturb the accumulator. */
+        vecSrc = vldrhq_z_f16((float16_t const *) pSrc, p);
+        /*
+         * sum lanes: sumVec += vecSrc * vecSrc (predicated)
+         */
+        sumVec = vfmaq_m(sumVec, vecSrc, vecSrc, p);
+
+        blkCnt -= 8;
+        pSrc += 8;
+    }
+    while (blkCnt > 0);
+
+    /* Horizontal reduction of the eight partial sums into the scalar result */
+    *pResult = vecAddAcrossF16Mve(sumVec);
+}
+#else
+
+void arm_power_f16(
+  const float16_t * pSrc,
+  uint32_t blockSize,
+  float16_t * pResult)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        _Float16 sum = 0.0f16;                          /* Temporary result storage */
+        _Float16 in;                                   /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+    /* Compute Power and store result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += in * in;
+
+    in = *pSrc++;
+    sum += in * in;
+
+    in = *pSrc++;
+    sum += in * in;
+
+    in = *pSrc++;
+    sum += in * in;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs (0..3 samples left over by the
+     4-way unrolled loop above) */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+    /* Compute Power and store result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += in * in;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store result to destination */
+  *pResult = sum;
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of power group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_power_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_power_f32.c
new file mode 100644
index 0000000000..82fabbaf97
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_power_f32.c
@@ -0,0 +1,229 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_power_f32.c
+ * Description: Sum of the squares of the elements of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @defgroup power Power
+
+ Calculates the sum of the squares of the elements in the input vector.
+ The underlying algorithm is used:
+
+
+ Result = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + pSrc[2] * pSrc[2] + ... + pSrc[blockSize-1] * pSrc[blockSize-1];
+
+
+ There are separate functions for floating point, Q31, Q15, and Q7 data types.
+
+ Since the result is not divided by the length, those functions are in fact computing
+ something which is more an energy than a power.
+
+ */
+
+/**
+ @addtogroup power
+ @{
+ */
+
+/**
+ @brief Sum of the squares of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult sum of the squares value returned here
+ @return none
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_power_f32(
+  const float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+    uint32_t blkCnt;           /* loop counters */
+    f32x4_t vecSrc;
+    f32x4_t sumVec = vdupq_n_f32(0.0f);
+    float32_t sum = 0.0f;
+    float32_t in;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
+        vecSrc = vldrwq_f32(pSrc);
+        /*
+         * sum lanes: sumVec += vecSrc * vecSrc
+         */
+        sumVec = vfmaq(sumVec, vecSrc, vecSrc);
+
+        blkCnt --;
+        pSrc += 4;
+    }
+    /* Horizontal reduction of the four partial sums */
+    sum = vecAddAcrossF32Mve(sumVec);
+
+    /*
+     * tail: process the 0..3 remaining samples with scalar code
+     */
+    blkCnt = blockSize & 0x3;
+    while (blkCnt > 0U)
+    {
+        /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+        /* Compute Power and store result in a temporary variable, sum. */
+        in = *pSrc++;
+        sum += in * in;
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+    *pResult = sum;
+}
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_power_f32(
+  const float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+  float32_t sum = 0.0f;                          /* accumulator */
+  float32_t in;                                  /* Temporary variable to store input value */
+  uint32_t blkCnt;                               /* loop counter */
+
+  float32x4_t sumV = vdupq_n_f32(0.0f);          /* Temporary result storage */
+  float32x2_t sumV2;
+  float32x4_t inV;
+
+  blkCnt = blockSize >> 2U;
+
+  /* Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* Compute Power and then store the result in a temporary variable, sum. */
+    inV = vld1q_f32(pSrc);
+    sumV = vmlaq_f32(sumV, inV, inV);
+    pSrc += 4;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+  /* Pairwise-add reduces the 4-lane accumulator to 2 lanes, then the two
+     remaining lanes are summed into the scalar accumulator. */
+  sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
+  sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* compute power and then store the result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += in * in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Store the result to the destination */
+  *pResult = sum;
+}
+#else
+void arm_power_f32(
+  const float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        float32_t sum = 0.0f;                          /* Temporary result storage */
+        float32_t in;                                  /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+    /* Compute Power and store result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += in * in;
+
+    in = *pSrc++;
+    sum += in * in;
+
+    in = *pSrc++;
+    sum += in * in;
+
+    in = *pSrc++;
+    sum += in * in;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs (0..3 samples left over by the
+     4-way unrolled loop above) */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+    /* Compute Power and store result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += in * in;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store result to destination */
+  *pResult = sum;
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of power group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q15.c
new file mode 100644
index 0000000000..2ebbc9d3b7
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q15.c
@@ -0,0 +1,177 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_power_q15.c
+ * Description: Sum of the squares of the elements of a Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup power
+ @{
+ */
+
+/**
+ @brief Sum of the squares of the elements of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult sum of the squares value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.15 format.
+ Intermediate multiplication yields a 2.30 format, and this
+ result is added without saturation to a 64-bit accumulator in 34.30 format.
+ With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the return result is in 34.30 format.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_power_q15(
+  const q15_t * pSrc,
+  uint32_t blockSize,
+  q63_t * pResult)
+{
+    uint32_t blkCnt;           /* loop counters */
+    q15x8_t vecSrc;
+    q63_t sum = 0LL;           /* 64-bit accumulator, 34.30 format */
+    q15_t in;
+
+    /* Compute 8 outputs at a time */
+    blkCnt = blockSize >> 3U;
+    while (blkCnt > 0U)
+    {
+        vecSrc = vldrhq_s16(pSrc);
+        /*
+         * sum lanes: widening multiply-accumulate of vecSrc*vecSrc
+         * into the 64-bit accumulator
+         */
+        sum = vmlaldavaq(sum, vecSrc, vecSrc);
+
+        blkCnt --;
+        pSrc += 8;
+    }
+
+    /*
+     * tail: process the 0..7 remaining samples with scalar code
+     */
+    blkCnt = blockSize & 0x7;
+    while (blkCnt > 0U)
+    {
+        /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+        /* Compute Power and store result in a temporary variable, sum. */
+        in = *pSrc++;
+        sum += ((q31_t) in * in);
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+    /* Result is in 34.30 format */
+    *pResult = sum;
+}
+#else
+void arm_power_q15(
+  const q15_t * pSrc,
+  uint32_t blockSize,
+  q63_t * pResult)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        q63_t sum = 0;                                 /* Temporary result storage */
+        q15_t in;                                      /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+        q31_t in32;                                    /* Temporary variable to store packed input value */
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+    /* Compute Power and store result in a temporary variable, sum. */
+#if defined (ARM_MATH_DSP)
+    /* With DSP extensions, read two q15 samples as one packed 32-bit word
+       and use the dual multiply-accumulate __SMLALD to square both at once. */
+    in32 = read_q15x2_ia ((q15_t **) &pSrc);
+    sum = __SMLALD(in32, in32, sum);
+
+    in32 = read_q15x2_ia ((q15_t **) &pSrc);
+    sum = __SMLALD(in32, in32, sum);
+#else
+    in = *pSrc++;
+    sum += ((q31_t) in * in);
+
+    in = *pSrc++;
+    sum += ((q31_t) in * in);
+
+    in = *pSrc++;
+    sum += ((q31_t) in * in);
+
+    in = *pSrc++;
+    sum += ((q31_t) in * in);
+#endif /* #if defined (ARM_MATH_DSP) */
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs (0..3 samples left over by the
+     4-way unrolled loop above) */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+    /* Compute Power and store result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += ((q31_t) in * in);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store result in 34.30 format */
+  *pResult = sum;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of power group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q31.c
new file mode 100644
index 0000000000..a39b3a709c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q31.c
@@ -0,0 +1,165 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_power_q31.c
+ * Description: Sum of the squares of the elements of a Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup power
+ @{
+ */
+
+/**
+ @brief Sum of the squares of the elements of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult sum of the squares value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.31 format.
+ Intermediate multiplication yields a 2.62 format, and this
+ result is truncated to 2.48 format by discarding the lower 14 bits.
+ The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
+ With 15 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the return result is in 16.48 format.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_power_q31(
+  const q31_t * pSrc,
+  uint32_t blockSize,
+  q63_t * pResult)
+{
+    uint32_t blkCnt;           /* loop counters */
+    q31x4_t vecSrc;
+    q63_t sum = 0LL;
+    q31_t in;
+
+    /* Compute 4 outputs at a time */
+    blkCnt = blockSize >> 2U;
+    while (blkCnt > 0U)
+    {
+        vecSrc = vldrwq_s32(pSrc);
+        /*
+         * sum lanes: vrmlaldavhaq accumulates the high part of each
+         * 2.62 product, i.e. the products shifted right by 8 bits
+         */
+        sum = vrmlaldavhaq(sum, vecSrc, vecSrc);
+
+        blkCnt --;
+        pSrc += 4;
+    }
+
+    /*
+     * tail: scalar products are shifted by 8 to match the scaling of the
+     * vrmlaldavhaq accumulator above
+     */
+    blkCnt = blockSize & 0x3;
+    while (blkCnt > 0U)
+    {
+        /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+        /* Compute Power and store result in a temporary variable, sum. */
+        in = *pSrc++;
+        sum += ((q63_t) in * in) >> 8;
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+    /* Final shift by 6 brings the total product shift to 8 + 6 = 14 bits,
+       giving the same 16.48 output format as the scalar implementation. */
+    *pResult = asrl(sum, 6);
+}
+#else
+void arm_power_q31(
+  const q31_t * pSrc,
+  uint32_t blockSize,
+  q63_t * pResult)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        q63_t sum = 0;                                 /* Temporary result storage */
+        q31_t in;                                      /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+    /* Compute Power then shift intermediate results by 14 bits to maintain 16.48 format and store result in a temporary variable sum, providing 15 guard bits. */
+    in = *pSrc++;
+    sum += ((q63_t) in * in) >> 14U;
+
+    in = *pSrc++;
+    sum += ((q63_t) in * in) >> 14U;
+
+    in = *pSrc++;
+    sum += ((q63_t) in * in) >> 14U;
+
+    in = *pSrc++;
+    sum += ((q63_t) in * in) >> 14U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs (0..3 samples left over by the
+     4-way unrolled loop above) */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+    /* Compute Power and store result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += ((q63_t) in * in) >> 14U;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store results in 16.48 format */
+  *pResult = sum;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of power group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q7.c
new file mode 100644
index 0000000000..1f2f6628c0
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_power_q7.c
@@ -0,0 +1,180 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_power_q7.c
+ * Description: Sum of the squares of the elements of a Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup power
+ @{
+ */
+
+/**
+ @brief Sum of the squares of the elements of a Q7 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult sum of the squares value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 32-bit internal accumulator.
+ The input is represented in 1.7 format.
+ Intermediate multiplication yields a 2.14 format, and this
+ result is added without saturation to an accumulator in 18.14 format.
+ With 17 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the return result is in 18.14 format.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_power_q7(
+  const q7_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult)
+{
+    uint32_t blkCnt;           /* loop counters */
+    q7x16_t vecSrc;
+    q31_t sum = 0LL;           /* 32-bit accumulator, 18.14 format */
+    q7_t in;
+
+    /* Compute 16 outputs at a time */
+    blkCnt = blockSize >> 4U;
+    while (blkCnt > 0U)
+    {
+        vecSrc = vldrbq_s8(pSrc);
+        /*
+         * sum lanes: widening multiply-accumulate of vecSrc*vecSrc
+         */
+        sum = vmladavaq(sum, vecSrc, vecSrc);
+
+        blkCnt--;
+        pSrc += 16;
+    }
+
+    /*
+     * tail: process the 0..15 remaining samples with scalar code
+     */
+    blkCnt = blockSize & 0xF;
+    while (blkCnt > 0U)
+    {
+        /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+        /* Compute Power and store result in a temporary variable, sum. */
+        in = *pSrc++;
+        sum += ((q15_t) in * in);
+
+        /* Decrement loop counter */
+        blkCnt--;
+    }
+
+    /* Result is in 18.14 format */
+    *pResult = sum;
+}
+#else
+void arm_power_q7(
+  const q7_t * pSrc,
+  uint32_t blockSize,
+  q31_t * pResult)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        q31_t sum = 0;                                 /* Temporary result storage */
+        q7_t in;                                       /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+        q31_t in32;                                    /* Temporary variable to store packed input value */
+        q31_t in1, in2;                                /* Temporary variables to store input value */
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+    /* Compute Power and store result in a temporary variable, sum. */
+#if defined (ARM_MATH_DSP)
+    /* With DSP extensions, read four q7 samples as one packed word and
+       sign-extend them into two packed q15 pairs with __SXTB16. */
+    in32 = read_q7x4_ia ((q7_t **) &pSrc);
+
+    in1 = __SXTB16(__ROR(in32, 8));
+    in2 = __SXTB16(in32);
+
+    /* calculate power and accumulate to accumulator via dual
+       multiply-accumulate (__SMLAD squares two samples per call) */
+    sum = __SMLAD(in1, in1, sum);
+    sum = __SMLAD(in2, in2, sum);
+#else
+    in = *pSrc++;
+    sum += ((q15_t) in * in);
+
+    in = *pSrc++;
+    sum += ((q15_t) in * in);
+
+    in = *pSrc++;
+    sum += ((q15_t) in * in);
+
+    in = *pSrc++;
+    sum += ((q15_t) in * in);
+#endif /* #if defined (ARM_MATH_DSP) */
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs (0..3 samples left over by the
+     4-way unrolled loop above) */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+    /* Compute Power and store result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += ((q15_t) in * in);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Store result in 18.14 format */
+  *pResult = sum;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of power group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_f16.c
new file mode 100644
index 0000000000..0310ef71d0
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_f16.c
@@ -0,0 +1,147 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rms_f16.c
+ * Description: Root mean square value of the elements of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @defgroup RMS Root mean square (RMS)
+
+ Calculates the Root Mean Square of the elements in the input vector.
+ The underlying algorithm is used:
+
+
+ Result = sqrt(((pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]) / blockSize));
+
+
+ There are separate functions for floating point, Q31, and Q15 data types.
+ */
+
+/**
+ @addtogroup RMS
+ @{
+ */
+
+/**
+ @brief Root Mean Square of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult root mean square value returned here
+ @return none
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_rms_f16(
+  const float16_t * pSrc,
+  uint32_t blockSize,
+  float16_t * pResult)
+{
+    /* NOTE(review): local name 'pow' shadows the C library pow(); harmless
+       here since pow() is not called, but consider renaming. */
+    float16_t pow = 0.0f;
+
+    /* Sum of squares via the (MVE-accelerated) power function */
+    arm_power_f16(pSrc, blockSize, &pow);
+
+    /* Compute Rms and store the result in the destination */
+    arm_sqrt_f16(pow / (float16_t) blockSize, pResult);
+}
+#else
+
+void arm_rms_f16(
+  const float16_t * pSrc,
+  uint32_t blockSize,
+  float16_t * pResult)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        _Float16 sum = 0.0f16;                          /* Temporary result storage */
+        _Float16 in;                                   /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+    in = *pSrc++;
+    /* Compute sum of squares and store result in a temporary variable, sum. */
+    sum += in * in;
+
+    in = *pSrc++;
+    sum += in * in;
+
+    in = *pSrc++;
+    sum += in * in;
+
+    in = *pSrc++;
+    sum += in * in;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs (0..3 samples left over by the
+     4-way unrolled loop above) */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+    in = *pSrc++;
+    /* Compute sum of squares and store result in a temporary variable. */
+    sum += ( in * in);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Compute Rms (sqrt of the mean square) and store result in destination */
+  arm_sqrt_f16(sum / (float16_t) blockSize, pResult);
+}
+#endif /* defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of RMS group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_f32.c
new file mode 100644
index 0000000000..cb45752bca
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_f32.c
@@ -0,0 +1,192 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rms_f32.c
+ * Description: Root mean square value of the elements of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @defgroup RMS Root mean square (RMS)
+
+ Calculates the Root Mean Square of the elements in the input vector.
+ The underlying algorithm is used:
+
+
+ Result = sqrt(((pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]) / blockSize));
+
+
+ There are separate functions for floating point, Q31, and Q15 data types.
+ */
+
+/**
+ @addtogroup RMS
+ @{
+ */
+
+/**
+ @brief Root Mean Square of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult root mean square value returned here
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_rms_f32(
+  const float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+    /* NOTE(review): local name 'pow' shadows the C library pow(); harmless
+       here since pow() is not called, but consider renaming. */
+    float32_t pow = 0.0f;
+
+    /* Sum of squares via the (MVE-accelerated) power function */
+    arm_power_f32(pSrc, blockSize, &pow);
+
+    /* Compute Rms and store the result in the destination */
+    arm_sqrt_f32(pow / (float32_t) blockSize, pResult);
+}
+#else
+#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_rms_f32(
+  const float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+  float32_t sum = 0.0f;                          /* accumulator */
+  float32_t in;                                  /* Temporary variable to store input value */
+  uint32_t blkCnt;                               /* loop counter */
+
+  float32x4_t sumV = vdupq_n_f32(0.0f);          /* Temporary result storage */
+  float32x2_t sumV2;
+  float32x4_t inV;
+
+  blkCnt = blockSize >> 2U;
+
+  /* Compute 4 outputs at a time.
+   ** a second loop below computes the remaining 1 to 3 samples. */
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* Compute Power and then store the result in a temporary variable, sum. */
+    inV = vld1q_f32(pSrc);
+    sumV = vmlaq_f32(sumV, inV, inV);
+    pSrc += 4;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Pairwise-add reduces the 4-lane accumulator to 2 lanes, then the two
+     remaining lanes are summed into the scalar accumulator. */
+  sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV));
+  sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
+
+  /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+   ** No loop unrolling is used. */
+  blkCnt = blockSize % 0x4U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+    /* compute power and then store the result in a temporary variable, sum. */
+    in = *pSrc++;
+    sum += in * in;
+
+    /* Decrement the loop counter */
+    blkCnt--;
+  }
+
+  /* Compute Rms (sqrt of the mean square) and store the result in the destination */
+  arm_sqrt_f32(sum / (float32_t) blockSize, pResult);
+}
+#else
+void arm_rms_f32(
+  const float32_t * pSrc,
+  uint32_t blockSize,
+  float32_t * pResult)
+{
+        uint32_t blkCnt;                               /* Loop counter */
+        float32_t sum = 0.0f;                          /* Temporary result storage */
+        float32_t in;                                  /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+  /* Loop unrolling: Compute 4 outputs at a time */
+  blkCnt = blockSize >> 2U;
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+    in = *pSrc++;
+    /* Compute sum of squares and store result in a temporary variable, sum. */
+    sum += in * in;
+
+    in = *pSrc++;
+    sum += in * in;
+
+    in = *pSrc++;
+    sum += in * in;
+
+    in = *pSrc++;
+    sum += in * in;
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Loop unrolling: Compute remaining outputs (0..3 samples left over by the
+     4-way unrolled loop above) */
+  blkCnt = blockSize % 0x4U;
+
+#else
+
+  /* Initialize blkCnt with number of samples */
+  blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+  while (blkCnt > 0U)
+  {
+    /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+    in = *pSrc++;
+    /* Compute sum of squares and store result in a temporary variable. */
+    sum += ( in * in);
+
+    /* Decrement loop counter */
+    blkCnt--;
+  }
+
+  /* Compute Rms (sqrt of the mean square) and store result in destination */
+  arm_sqrt_f32(sum / (float32_t) blockSize, pResult);
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of RMS group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_q15.c
new file mode 100644
index 0000000000..da925eb063
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_q15.c
@@ -0,0 +1,149 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rms_q15.c
+ * Description: Root Mean Square of the elements of a Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup RMS
+ @{
+ */
+
+/**
+ @brief Root Mean Square of the elements of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult root mean square value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.15 format.
+ Intermediate multiplication yields a 2.30 format, and this
+ result is added without saturation to a 64-bit accumulator in 34.30 format.
+ With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
+ 15 bits, and then saturated to yield a result in 1.15 format.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_rms_q15(
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
+{
+ q63_t pow = 0; /* 64-bit fixed-point accumulator; integer initializer (was a misleading 0.0f float literal) */
+ q15_t normalizedPower;
+
+ /* Sum of squares of the input in 34.30 format */
+ arm_power_q15(pSrc, blockSize, &pow);
+
+ /* Mean of squares, truncated from 34.30 to 1.15 and saturated; then square root */
+ normalizedPower=__SSAT((pow / (q63_t) blockSize) >> 15,16);
+ arm_sqrt_q15(normalizedPower, pResult);
+}
+#else
+void arm_rms_q15(
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Temporary result storage: 1.15 * 1.15 products (2.30) accumulated in 34.30 */
+ q15_t in; /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in32; /* Temporary variable to store input value */
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ /* Compute sum of squares and store result in a temporary variable. */
+#if defined (ARM_MATH_DSP)
+ /* SIMD path: read two packed q15 samples and dual multiply-accumulate them into the 64-bit accumulator */
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sum = __SMLALD(in32, in32, sum);
+
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sum = __SMLALD(in32, in32, sum);
+#else
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs (0..3 samples left over) */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable. */
+ sum += ((q31_t) in * in);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Truncating and saturating the accumulator to 1.15 format */
+ /* Store result in destination */
+ /* NOTE(review): assumes blockSize > 0 -- division by blockSize; confirm caller contract */
+ arm_sqrt_q15(__SSAT((sum / (q63_t)blockSize) >> 15, 16), pResult);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of RMS group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_q31.c
new file mode 100644
index 0000000000..ba39b7b722
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_rms_q31.c
@@ -0,0 +1,141 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rms_q31.c
+ * Description: Root Mean Square of the elements of a Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup RMS
+ @{
+ */
+
+/**
+ @brief Root Mean Square of the elements of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult root mean square value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The input is represented in 1.31 format, and intermediate multiplication
+ yields a 2.62 format.
+ The accumulator maintains full precision of the intermediate multiplication results,
+ but provides only a single guard bit.
+ There is no saturation on intermediate additions.
+ If the accumulator overflows, it wraps around and distorts the result.
+ In order to avoid overflows completely, the input signal must be scaled down by
+ log2(blockSize) bits, as a total of blockSize additions are performed internally.
+ Finally, the 2.62 accumulator is right shifted by 31 bits to yield a 1.31 format value.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_rms_q31(
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
+{
+ q63_t pow = 0; /* 64-bit fixed-point accumulator; integer initializer (was a misleading 0.0f float literal) */
+ q31_t normalizedPower;
+ /* Sum of squares of the input */
+ arm_power_q31(pSrc, blockSize, &pow);
+
+ /* Mean of squares converted to 1.31 (>> 17 matches arm_power_q31's accumulator format) and saturated */
+ normalizedPower=clip_q63_to_q31((pow / (q63_t) blockSize) >> 17);
+ arm_sqrt_q31(normalizedPower, pResult);
+
+}
+
+#else
+void arm_rms_q31(
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ uint64_t sum = 0; /* Temporary result storage (a sum of squares can never be negative, hence uint64_t instead of q63_t) */
+ q31_t in; /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable, sum. */
+ sum += ((q63_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q63_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q63_t) in * in);
+
+ in = *pSrc++;
+ sum += ((q63_t) in * in);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs (0..3 samples left over) */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable. */
+ sum += ((q63_t) in * in);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Convert data in 2.62 to 1.31 by 31 right shifts and saturate */
+ /* Compute Rms and store result in destination vector */
+ /* NOTE(review): assumes blockSize > 0 -- division by blockSize; confirm caller contract */
+ arm_sqrt_q31(clip_q63_to_q31((sum / (q63_t) blockSize) >> 31), pResult);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of RMS group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_std_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_std_f16.c
new file mode 100644
index 0000000000..813de351c2
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_std_f16.c
@@ -0,0 +1,67 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_std_f16.c
+ * Description: Standard deviation of the elements of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupStats
+ */
+
+
+
+/**
+ @addtogroup STD
+ @{
+ */
+
+/**
+ @brief Standard deviation of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult standard deviation value returned here
+ @return none
+ */
+void arm_std_f16(
+ const float16_t * pSrc,
+ uint32_t blockSize,
+ float16_t * pResult)
+{
+ /* std = sqrt(variance): delegate to the variance and square-root kernels */
+ float16_t var;
+ arm_var_f16(pSrc,blockSize,&var);
+ arm_sqrt_f16(var, pResult);
+}
+
+/**
+ @} end of STD group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_std_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_std_f32.c
new file mode 100644
index 0000000000..682443d911
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_std_f32.c
@@ -0,0 +1,83 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_std_f32.c
+ * Description: Standard deviation of the elements of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @defgroup STD Standard deviation
+
+ Calculates the standard deviation of the elements in the input vector.
+
+ The float implementation is relying on arm_var_f32 which is using a two-pass algorithm
+ to avoid problem of numerical instabilities and cancellation errors.
+
+ Fixed point versions are using the standard textbook algorithm since the fixed point
+ numerical behavior is different from the float one.
+
+ Algorithm for fixed point versions is summarized below:
+
+
+
+  Result = sqrt((sumOfSquares - (sum * sum) / blockSize) / (blockSize - 1))
+
+ sumOfSquares = pSrc[0] * pSrc[0] + pSrc[1] * pSrc[1] + ... + pSrc[blockSize-1] * pSrc[blockSize-1]
+ sum = pSrc[0] + pSrc[1] + pSrc[2] + ... + pSrc[blockSize-1]
+
+
+ There are separate functions for floating point, Q31, and Q15 data types.
+ */
+
+/**
+ @addtogroup STD
+ @{
+ */
+
+/**
+ @brief Standard deviation of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult standard deviation value returned here
+ @return none
+ */
+void arm_std_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ /* std = sqrt(variance): delegate to the two-pass variance kernel, then take the square root */
+ float32_t var;
+ arm_var_f32(pSrc,blockSize,&var);
+ arm_sqrt_f32(var, pResult);
+}
+
+/**
+ @} end of STD group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_std_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_std_q15.c
new file mode 100644
index 0000000000..74ee4f12ce
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_std_q15.c
@@ -0,0 +1,173 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_std_q15.c
+ * Description: Standard deviation of an array of Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup STD
+ @{
+ */
+
+/**
+ @brief Standard deviation of the elements of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult standard deviation value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.15 format.
+ Intermediate multiplication yields a 2.30 format, and this
+ result is added without saturation to a 64-bit accumulator in 34.30 format.
+ With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
+ 15 bits, and then saturated to yield a result in 1.15 format.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_std_q15(
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
+{
+ /* std = sqrt(variance): delegate to the Q15 variance and square-root kernels */
+ q15_t var=0;
+
+ arm_var_q15(pSrc, blockSize, &var);
+ arm_sqrt_q15(var,pResult);
+}
+#else
+void arm_std_q15(
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ q31_t sum = 0; /* Accumulator for the plain sum of samples */
+ q31_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
+ q63_t sumOfSquares = 0; /* Sum of squares (2.30 products accumulated in 34.30) */
+ q15_t in; /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in32; /* Temporary variable to store input value */
+#endif
+
+ /* The sample variance is undefined for fewer than two samples */
+ if (blockSize <= 1U)
+ {
+ *pResult = 0;
+ return;
+ }
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ /* Compute sum and store result in a temporary variable, sum. */
+#if defined (ARM_MATH_DSP)
+ /* SIMD path: (in32 << 16) >> 16 sign-extends the low q15 half; in32 >> 16 yields the (signed) high half */
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
+ sum += ((in32 << 16U) >> 16U);
+ sum += (in32 >> 16U);
+
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
+ sum += ((in32 << 16U) >> 16U);
+ sum += (in32 >> 16U);
+#else
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs (0..3 samples left over) */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ sumOfSquares += (in * in);
+ /* Compute sum and store result in a temporary variable, sum. */
+ sum += in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
+ /* Unbiased estimator: divide by (blockSize - 1), not blockSize */
+ meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1U));
+
+ /* Compute square of mean */
+ squareOfMean = (q31_t) ((q63_t) sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
+
+ /* mean of squares minus the square of mean. */
+ /* Compute standard deviation and store result in destination */
+ arm_sqrt_q15(__SSAT((meanOfSquares - squareOfMean) >> 15U, 16U), pResult);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of STD group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_std_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_std_q31.c
new file mode 100644
index 0000000000..63170e7988
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_std_q31.c
@@ -0,0 +1,159 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_std_q31.c
+ * Description: Standard deviation of the elements of a Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup STD
+ @{
+ */
+
+/**
+ @brief Standard deviation of the elements of a Q31 vector.
+ @param[in] pSrc points to the input vector.
+ @param[in] blockSize number of samples in input vector.
+ @param[out] pResult standard deviation value returned here.
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The input is represented in 1.31 format, which is then downshifted by 8 bits
+ which yields 1.23, and intermediate multiplication yields a 2.46 format.
+ The accumulator maintains full precision of the intermediate multiplication results,
+ but provides only a 16 guard bits.
+ There is no saturation on intermediate additions.
+ If the accumulator overflows it wraps around and distorts the result.
+ In order to avoid overflows completely the input signal must be scaled down by
+ log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
+ After division, internal variables should be Q18.46
+ Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_std_q31(
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
+{
+ /* std = sqrt(variance): delegate to the Q31 variance and square-root kernels */
+ q31_t var=0;
+
+ arm_var_q31(pSrc, blockSize, &var);
+ arm_sqrt_q31(var, pResult);
+}
+#else
+void arm_std_q31(
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Accumulator for the plain sum of (downshifted) samples */
+ q63_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
+ q63_t sumOfSquares = 0; /* Sum of squares (2.46 products accumulated in 18.46) */
+ q31_t in; /* Temporary variable to store input value */
+
+ /* The sample variance is undefined for fewer than two samples */
+ if (blockSize <= 1U)
+ {
+ *pResult = 0;
+ return;
+ }
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
+ /* Each sample is downshifted by 8 bits (1.31 -> 1.23) to gain accumulator guard bits */
+ in = *pSrc++ >> 8U;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ sumOfSquares += ((q63_t) (in) * (in));
+ /* Compute sum and store result in a temporary variable, sum. */
+ sum += in;
+
+ in = *pSrc++ >> 8U;
+ sumOfSquares += ((q63_t) (in) * (in));
+ sum += in;
+
+ in = *pSrc++ >> 8U;
+ sumOfSquares += ((q63_t) (in) * (in));
+ sum += in;
+
+ in = *pSrc++ >> 8U;
+ sumOfSquares += ((q63_t) (in) * (in));
+ sum += in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs (0..3 samples left over) */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
+ in = *pSrc++ >> 8U;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ sumOfSquares += ((q63_t) (in) * (in));
+ /* Compute sum and store result in a temporary variable, sum. */
+ sum += in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
+ /* Unbiased estimator: divide by (blockSize - 1), not blockSize */
+ meanOfSquares = (sumOfSquares / (q63_t)(blockSize - 1U));
+
+ /* Compute square of mean */
+ squareOfMean = ( sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
+
+ /* Compute standard deviation and store result in destination */
+ /* Variance is in 2.46 of the 1.23 inputs; >> 15 converts it back to 1.31 before the square root */
+ arm_sqrt_q31((meanOfSquares - squareOfMean) >> 15U, pResult);
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of STD group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_var_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_var_f16.c
new file mode 100644
index 0000000000..750acf28da
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_var_f16.c
@@ -0,0 +1,218 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_var_f16.c
+ * Description: Variance of the elements of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions_f16.h"
+
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupStats
+ */
+
+
+/**
+ @addtogroup variance
+ @{
+ */
+
+/**
+ @brief Variance of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult variance value returned here
+ @return none
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+
+void arm_var_f16(
+ const float16_t * pSrc,
+ uint32_t blockSize,
+ float16_t * pResult)
+{
+ int32_t blkCnt; /* loop counters */
+ f16x8_t vecSrc;
+ f16x8_t sumVec = vdupq_n_f16((float16_t) 0.0);
+ float16_t fMean;
+
+ /* The sample variance is undefined for fewer than two samples */
+ if (blockSize <= 1U) {
+ *pResult = 0;
+ return;
+ }
+
+ /* First pass: mean of the input (two-pass algorithm for numerical stability) */
+ arm_mean_f16(pSrc, blockSize, &fMean);
+
+/* Workaround for an Arm Compiler 6.14 codegen bug: force-clear the vector accumulator with inline asm */
+#if defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100) && (__ARMCC_VERSION < 6150001)
+ __asm volatile(
+ " vmov.i32 %[acc], #0 \n"
+ : [acc] "+t"(sumVec)
+ :
+ : );
+#endif
+
+ /* Second pass: predicated accumulation of (x - mean)^2; vctp16q masks off tail lanes */
+ blkCnt = blockSize;
+ do {
+ mve_pred16_t p = vctp16q(blkCnt);
+
+ vecSrc = vldrhq_z_f16((float16_t const *) pSrc, p);
+ /*
+ * sum lanes
+ */
+ vecSrc = vsubq_m(vuninitializedq_f16(), vecSrc, fMean, p);
+ sumVec = vfmaq_m(sumVec, vecSrc, vecSrc, p);
+
+ blkCnt -= 8;
+ pSrc += 8;
+ }
+ while (blkCnt > 0);
+
+ /* Variance (unbiased estimator: divide by blockSize - 1) */
+ *pResult = vecAddAcrossF16Mve(sumVec) / (float16_t) (blockSize - 1.0f);
+}
+#else
+
+void arm_var_f16(
+ const float16_t * pSrc,
+ uint32_t blockSize,
+ float16_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ _Float16 sum = 0.0f; /* Temporary result storage (sum of samples, first pass) */
+ _Float16 fSum = 0.0f; /* Sum of squared deviations (second pass) */
+ _Float16 fMean, fValue;
+ const float16_t * pInput = pSrc;
+
+ /* The sample variance is undefined for fewer than two samples */
+ if (blockSize <= 1U)
+ {
+ *pResult = 0;
+ return;
+ }
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+
+ sum += *pInput++;
+ sum += *pInput++;
+ sum += *pInput++;
+ sum += *pInput++;
+
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs (0..3 samples left over) */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+
+ sum += *pInput++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+ fMean = sum / (float16_t) blockSize;
+
+ /* Second pass over the same data: accumulate squared deviations from the mean */
+ pInput = pSrc;
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs (0..3 samples left over) */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Variance (unbiased estimator: divide by blockSize - 1) */
+ *pResult = fSum / (float16_t)(blockSize - 1.0f);
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of variance group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_var_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_var_f32.c
new file mode 100644
index 0000000000..8129bcaa24
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_var_f32.c
@@ -0,0 +1,293 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_var_f32.c
+ * Description: Variance of the elements of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @defgroup variance Variance
+
+ Calculates the variance of the elements in the input vector.
+ The underlying algorithm used is the direct method sometimes referred to as the two-pass method:
+
+
+ Result = sum((element - meanOfElements)^2) / (numElement - 1)
+
+ meanOfElements = ( pSrc[0] + pSrc[1] + ... + pSrc[blockSize-1] ) / blockSize
+
+
+ There are separate functions for floating point, Q31, and Q15 data types.
+ */
+
+/**
+ @addtogroup variance
+ @{
+ */
+
+/**
+ @brief Variance of the elements of a floating-point vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult variance value returned here
+ @return none
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+void arm_var_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ uint32_t blkCnt; /* loop counters */
+ f32x4_t vecSrc;
+ f32x4_t sumVec = vdupq_n_f32(0.0f);
+ float32_t fMean;
+ float32_t sum = 0.0f; /* accumulator */
+ float32_t in; /* Temporary variable to store input value */
+
+ if (blockSize <= 1U) {
+ *pResult = 0;
+ return;
+ }
+
+ arm_mean_f32(pSrc, blockSize, &fMean);
+
+ /* Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+ while (blkCnt > 0U)
+ {
+
+ vecSrc = vldrwq_f32(pSrc);
+ /*
+ * subtract the mean, then accumulate the squared deviations lane-wise
+ */
+ vecSrc = vsubq(vecSrc, fMean);
+ sumVec = vfmaq(sumVec, vecSrc, vecSrc);
+
+ blkCnt --;
+ pSrc += 4;
+ }
+
+ sum = vecAddAcrossF32Mve(sumVec); /* horizontal add of the 4 partial sums */
+
+ /*
+ * tail
+ */
+ blkCnt = blockSize & 0x3;
+ while (blkCnt > 0U)
+ {
+ in = *pSrc++ - fMean;
+ sum += in * in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Variance */
+ *pResult = sum / (float32_t) (blockSize - 1);
+}
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_var_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ float32_t mean;
+
+ float32_t sum = 0.0f; /* accumulator */
+ float32_t in; /* Temporary variable to store input value */
+ uint32_t blkCnt; /* loop counter */
+
+ float32x4_t sumV = vdupq_n_f32(0.0f); /* Temporary result storage */
+ float32x2_t sumV2;
+ float32x4_t inV;
+ float32x4_t avg;
+
+ arm_mean_f32(pSrc,blockSize,&mean);
+ avg = vdupq_n_f32(mean);
+
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0]-mean)^2 + (A[1]-mean)^2 + ... + (A[blockSize-1]-mean)^2 */
+ /* Subtract the mean, square, and accumulate into the vector accumulator sumV. */
+ inV = vld1q_f32(pSrc);
+ inV = vsubq_f32(inV, avg);
+ sumV = vmlaq_f32(sumV, inV, inV);
+ pSrc += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ sumV2 = vpadd_f32(vget_low_f32(sumV),vget_high_f32(sumV)); /* pairwise add: 4 lanes -> 2 lanes */
+ sum = vget_lane_f32(sumV2, 0) + vget_lane_f32(sumV2, 1);
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0]-mean)^2 + (A[1]-mean)^2 + ... + (A[blockSize-1]-mean)^2 */
+ /* Subtract the mean, square, and accumulate into sum. */
+ in = *pSrc++;
+ in = in - mean;
+ sum += in * in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Variance */
+ *pResult = sum / (float32_t)(blockSize - 1.0f); /* NOTE(review): no blockSize<=1 guard here, unlike the other variants — confirm callers guarantee blockSize > 1 */
+
+}
+
+#else
+void arm_var_f32(
+ const float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ float32_t sum = 0.0f; /* Temporary result storage */
+ float32_t fSum = 0.0f;
+ float32_t fMean, fValue;
+ const float32_t * pInput = pSrc;
+
+ if (blockSize <= 1U)
+ {
+ *pResult = 0;
+ return;
+ }
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+
+ sum += *pInput++;
+ sum += *pInput++;
+ sum += *pInput++;
+ sum += *pInput++;
+
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+
+ sum += *pInput++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+ fMean = sum / (float32_t) blockSize;
+
+ pInput = pSrc;
+
+#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+ while (blkCnt > 0U)
+ {
+ fValue = *pInput++ - fMean;
+ fSum += fValue * fValue;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Variance */
+ *pResult = fSum / (float32_t)(blockSize - 1.0f);
+}
+#endif /* #if defined(ARM_MATH_NEON_EXPERIMENTAL) && !defined(ARM_MATH_AUTOVECTORIZE) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of variance group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_var_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_var_q15.c
new file mode 100644
index 0000000000..4e6e6a5a1c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_var_q15.c
@@ -0,0 +1,230 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_var_q15.c
+ * Description: Variance of an array of Q15 type
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup variance
+ @{
+ */
+
+/**
+ @brief Variance of the elements of a Q15 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult variance value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using a 64-bit internal accumulator.
+ The input is represented in 1.15 format.
+ Intermediate multiplication yields a 2.30 format, and this
+ result is added without saturation to a 64-bit accumulator in 34.30 format.
+ With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ full precision of the intermediate multiplication is preserved.
+ Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
+ 15 bits, and then saturated to yield a result in 1.15 format.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_var_q15(
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
+{
+ uint32_t blkCnt; /* loop counters */
+ q15x8_t vecSrc;
+ q63_t sumOfSquares = 0LL;
+ q63_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
+ q63_t sum = 0LL;
+ q15_t in;
+
+ if (blockSize <= 1U) {
+ *pResult = 0;
+ return;
+ }
+
+
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0U)
+ {
+ vecSrc = vldrhq_s16(pSrc);
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sumOfSquares. */
+
+ sumOfSquares = vmlaldavaq_s16(sumOfSquares, vecSrc, vecSrc);
+ sum = vaddvaq_s16(sum, vecSrc);
+
+ blkCnt --;
+ pSrc += 8;
+ }
+
+ /* Tail: process the remaining (blockSize & 7) samples one at a time */
+ blkCnt = blockSize & 7;
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+#if defined (ARM_MATH_DSP)
+ sumOfSquares = __SMLALD(in, in, sumOfSquares);
+#else
+ sumOfSquares += (in * in);
+#endif /* #if defined (ARM_MATH_DSP) */
+ /* Compute sum and store result in a temporary variable, sum. */
+ sum += in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Mean of squares of the input samples
+ * and then store the result in a temporary variable, meanOfSquares. */
+ meanOfSquares = arm_div_q63_to_q31(sumOfSquares, (blockSize - 1U));
+
+ /* Compute square of mean */
+ squareOfMean = arm_div_q63_to_q31((q63_t)sum * sum, (q31_t)(blockSize * (blockSize - 1U)));
+
+ /* mean of the squares minus the square of the mean. */
+ *pResult = (meanOfSquares - squareOfMean) >> 15;
+}
+#else
+void arm_var_q15(
+ const q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ q31_t sum = 0; /* Accumulator */
+ q31_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
+ q63_t sumOfSquares = 0; /* Sum of squares */
+ q15_t in; /* Temporary variable to store input value */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in32; /* Temporary variable to store input value */
+#endif
+
+ if (blockSize <= 1U)
+ {
+ *pResult = 0;
+ return;
+ }
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ /* Compute sum and store result in a temporary variable, sum. */
+#if defined (ARM_MATH_DSP)
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
+ sum += ((in32 << 16U) >> 16U); /* sign-extend the low half-word before adding */
+ sum += (in32 >> 16U); /* arithmetic shift adds the (signed) high half-word */
+
+ in32 = read_q15x2_ia ((q15_t **) &pSrc);
+ sumOfSquares = __SMLALD(in32, in32, sumOfSquares);
+ sum += ((in32 << 16U) >> 16U);
+ sum += (in32 >> 16U);
+#else
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+ sum += in;
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
+ in = *pSrc++;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+#if defined (ARM_MATH_DSP)
+ sumOfSquares = __SMLALD(in, in, sumOfSquares);
+#else
+ sumOfSquares += (in * in);
+#endif /* #if defined (ARM_MATH_DSP) */
+ /* Compute sum and store result in a temporary variable, sum. */
+ sum += in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
+ meanOfSquares = (q31_t) (sumOfSquares / (q63_t)(blockSize - 1U));
+
+ /* Compute square of mean */
+ squareOfMean = (q31_t) ((q63_t) sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
+
+ /* mean of squares minus the square of mean. */
+ *pResult = (meanOfSquares - squareOfMean) >> 15U;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of variance group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_var_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_var_q31.c
new file mode 100644
index 0000000000..e9fc453033
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/StatisticsFunctions/arm_var_q31.c
@@ -0,0 +1,214 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_var_q31.c
+ * Description: Variance of an array of Q31 type
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/statistics_functions.h"
+
+/**
+ @ingroup groupStats
+ */
+
+/**
+ @addtogroup variance
+ @{
+ */
+
+/**
+ @brief Variance of the elements of a Q31 vector.
+ @param[in] pSrc points to the input vector
+ @param[in] blockSize number of samples in input vector
+ @param[out] pResult variance value returned here
+ @return none
+
+ @par Scaling and Overflow Behavior
+ The function is implemented using an internal 64-bit accumulator.
+ The input is represented in 1.31 format, which is then downshifted by 8 bits
+ which yields 1.23, and intermediate multiplication yields a 2.46 format.
+ The accumulator maintains full precision of the intermediate multiplication results,
+ and as a consequence has only 16 guard bits.
+ There is no saturation on intermediate additions.
+ If the accumulator overflows it wraps around and distorts the result.
+ In order to avoid overflows completely the input signal must be scaled down by
+ log2(blockSize)-8 bits, as a total of blockSize additions are performed internally.
+ After division, internal variables should be Q18.46
+ Finally, the 18.46 accumulator is right shifted by 15 bits to yield a 1.31 format value.
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_var_q31(
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
+{
+ uint32_t blkCnt; /* loop counters */
+ q31x4_t vecSrc;
+ q63_t sumOfSquares = 0LL;
+ q63_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
+ q63_t sum = 0LL;
+ q31_t in;
+
+ if (blockSize <= 1U) {
+ *pResult = 0;
+ return;
+ }
+
+
+ /* Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+ while (blkCnt > 0U)
+ {
+ vecSrc = vldrwq_s32(pSrc);
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sumOfSquares. */
+
+ /* downscale to 1.23 to gain guard bits in the accumulator */
+ vecSrc = vshrq(vecSrc, 8);
+ sumOfSquares = vmlaldavaq(sumOfSquares, vecSrc, vecSrc);
+ sum = vaddlvaq(sum, vecSrc);
+
+ blkCnt --;
+ pSrc += 4;
+ }
+
+
+ /*
+ * tail
+ */
+ blkCnt = blockSize & 0x3;
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
+ in = *pSrc++ >> 8U;
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ sumOfSquares += ((q63_t) (in) * (in));
+ /* Compute sum and store result in a temporary variable, sum. */
+ sum += in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Mean of squares of the input samples
+ * and then store the result in a temporary variable, meanOfSquares. */
+ meanOfSquares = sumOfSquares / (q63_t) (blockSize - 1U);
+
+ /* Compute square of mean */
+ squareOfMean = sum * sum / (q63_t) (blockSize * (blockSize - 1U));
+
+ /* Compute the variance and then store the result to the destination */
+ *pResult = asrl(meanOfSquares - squareOfMean, 15U);
+}
+#else
+void arm_var_q31(
+ const q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
+{
+ uint32_t blkCnt; /* Loop counter */
+ q63_t sum = 0; /* Temporary result storage */
+ q63_t meanOfSquares, squareOfMean; /* Square of mean and mean of square */
+ q63_t sumOfSquares = 0; /* Sum of squares */
+ q31_t in; /* Temporary variable to store input value */
+
+ if (blockSize <= 1U)
+ {
+ *pResult = 0;
+ return;
+ }
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
+ in = *pSrc++ >> 8U; /* downshift to 1.23 so the 64-bit accumulator keeps 16 guard bits */
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ sumOfSquares += ((q63_t) (in) * (in));
+ /* Compute sum and store result in a temporary variable, sum. */
+ sum += in;
+
+ in = *pSrc++ >> 8U;
+ sumOfSquares += ((q63_t) (in) * (in));
+ sum += in;
+
+ in = *pSrc++ >> 8U;
+ sumOfSquares += ((q63_t) (in) * (in));
+ sum += in;
+
+ in = *pSrc++ >> 8U;
+ sumOfSquares += ((q63_t) (in) * (in));
+ sum += in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* C = A[0] + A[1] + ... + A[blockSize-1] */
+
+ in = *pSrc++ >> 8U; /* downshift to 1.23 so the 64-bit accumulator keeps 16 guard bits */
+ /* Compute sum of squares and store result in a temporary variable, sumOfSquares. */
+ sumOfSquares += ((q63_t) (in) * (in));
+ /* Compute sum and store result in a temporary variable, sum. */
+ sum += in;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Mean of squares and store result in a temporary variable, meanOfSquares. */
+ meanOfSquares = (sumOfSquares / (q63_t)(blockSize - 1U));
+
+ /* Compute square of mean */
+ squareOfMean = ( sum * sum / (q63_t)(blockSize * (blockSize - 1U)));
+
+ /* Compute variance and store result in destination */
+ *pResult = (meanOfSquares - squareOfMean) >> 15U;
+}
+#endif
+/**
+ @} end of variance group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/SupportFunctions.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/SupportFunctions.c
new file mode 100644
index 0000000000..6beee5d482
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/SupportFunctions.c
@@ -0,0 +1,61 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: SupportFunctions.c
+ * Description: Combination of all support function source files.
+ *
+ * $Date: 16. March 2020
+ * $Revision: V1.1.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_barycenter_f32.c"
+#include "arm_bitonic_sort_f32.c"
+#include "arm_bubble_sort_f32.c"
+#include "arm_copy_f32.c"
+#include "arm_copy_q15.c"
+#include "arm_copy_q31.c"
+#include "arm_copy_q7.c"
+#include "arm_fill_f32.c"
+#include "arm_fill_q15.c"
+#include "arm_fill_q31.c"
+#include "arm_fill_q7.c"
+#include "arm_heap_sort_f32.c"
+#include "arm_insertion_sort_f32.c"
+#include "arm_merge_sort_f32.c"
+#include "arm_merge_sort_init_f32.c"
+#include "arm_quick_sort_f32.c"
+#include "arm_selection_sort_f32.c"
+#include "arm_sort_f32.c"
+#include "arm_sort_init_f32.c"
+#include "arm_weighted_sum_f32.c"
+
+#include "arm_float_to_q15.c"
+#include "arm_float_to_q31.c"
+#include "arm_float_to_q7.c"
+#include "arm_q15_to_float.c"
+#include "arm_q15_to_q31.c"
+#include "arm_q15_to_q7.c"
+#include "arm_q31_to_float.c"
+#include "arm_q31_to_q15.c"
+#include "arm_q31_to_q7.c"
+#include "arm_q7_to_float.c"
+#include "arm_q7_to_q15.c"
+#include "arm_q7_to_q31.c"
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/SupportFunctionsF16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/SupportFunctionsF16.c
new file mode 100644
index 0000000000..0e39d8d1a3
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/SupportFunctionsF16.c
@@ -0,0 +1,36 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: SupportFunctions.c
+ * Description: Combination of all support function source files.
+ *
+ * $Date: 16. March 2020
+ * $Revision: V1.1.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019-2020 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_copy_f16.c"
+#include "arm_fill_f16.c"
+#include "arm_f16_to_q15.c"
+#include "arm_f16_to_float.c"
+#include "arm_q15_to_f16.c"
+#include "arm_float_to_f16.c"
+#include "arm_weighted_sum_f16.c"
+#include "arm_barycenter_f16.c"
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f16.c
new file mode 100644
index 0000000000..0d0bb41c5b
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f16.c
@@ -0,0 +1,274 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_barycenter_f16.c
+ * Description: Barycenter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+#include <limits.h>
+#include <math.h>
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @defgroup barycenter Barycenter
+
+ Barycenter of weighted vectors
+ */
+
+/**
+ @addtogroup barycenter
+ @{
+ */
+
+
+/**
+ * @brief Barycenter
+ *
+ *
+ * @param[in] *in List of vectors
+ * @param[in] *weights Weights of the vectors
+ * @param[out] *out Barycenter
+ * @param[in] nbVectors Number of vectors
+ * @param[in] vecDim Dimension of space (vector dimension)
+ * @return None
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_barycenter_f16(const float16_t *in,
+ const float16_t *weights,
+ float16_t *out,
+ uint32_t nbVectors,
+ uint32_t vecDim)
+{
+ const float16_t *pIn, *pW;
+ const float16_t *pIn1, *pIn2, *pIn3, *pIn4;
+ float16_t *pOut;
+ uint32_t blkCntVector, blkCntSample;
+ float16_t accum, w;
+
+ blkCntVector = nbVectors;
+ blkCntSample = vecDim;
+
+ accum = 0.0f;
+
+ pW = weights;
+ pIn = in;
+
+
+ arm_fill_f16(0.0f, out, vecDim); /* zero the output accumulator vector */
+
+
+ /* Sum: pIn1..pIn4 walk four consecutive input vectors in parallel */
+ pIn1 = pIn;
+ pIn2 = pIn1 + vecDim;
+ pIn3 = pIn2 + vecDim;
+ pIn4 = pIn3 + vecDim;
+
+ blkCntVector = nbVectors >> 2;
+ while (blkCntVector > 0)
+ {
+ f16x8_t outV, inV1, inV2, inV3, inV4;
+ float16_t w1, w2, w3, w4;
+
+ pOut = out;
+ w1 = *pW++;
+ w2 = *pW++;
+ w3 = *pW++;
+ w4 = *pW++;
+ accum += w1 + w2 + w3 + w4; /* running total of the weights for final normalization */
+
+ blkCntSample = vecDim >> 3;
+ while (blkCntSample > 0) {
+ outV = vld1q((const float16_t *) pOut);
+ inV1 = vld1q(pIn1);
+ inV2 = vld1q(pIn2);
+ inV3 = vld1q(pIn3);
+ inV4 = vld1q(pIn4);
+ outV = vfmaq(outV, inV1, w1);
+ outV = vfmaq(outV, inV2, w2);
+ outV = vfmaq(outV, inV3, w3);
+ outV = vfmaq(outV, inV4, w4);
+ vst1q(pOut, outV);
+
+ pOut += 8;
+ pIn1 += 8;
+ pIn2 += 8;
+ pIn3 += 8;
+ pIn4 += 8;
+
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 7;
+ while (blkCntSample > 0) {
+ *pOut = *pOut + *pIn1++ * w1;
+ *pOut = *pOut + *pIn2++ * w2;
+ *pOut = *pOut + *pIn3++ * w3;
+ *pOut = *pOut + *pIn4++ * w4;
+ pOut++;
+ blkCntSample--;
+ }
+
+ pIn1 += 3 * vecDim; /* jump over the three vectors already consumed via pIn2..pIn4 */
+ pIn2 += 3 * vecDim;
+ pIn3 += 3 * vecDim;
+ pIn4 += 3 * vecDim;
+
+ blkCntVector--;
+ }
+
+ pIn = pIn1; /* pIn1 now points at the first vector not yet processed */
+
+ blkCntVector = nbVectors & 3;
+ while (blkCntVector > 0)
+ {
+ f16x8_t inV, outV;
+
+ pOut = out;
+ w = *pW++;
+ accum += w;
+
+ blkCntSample = vecDim >> 3;
+ while (blkCntSample > 0)
+ {
+ outV = vld1q_f16(pOut);
+ inV = vld1q_f16(pIn);
+ outV = vfmaq(outV, inV, w);
+ vst1q_f16(pOut, outV);
+ pOut += 8;
+ pIn += 8;
+
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 7;
+ while (blkCntSample > 0)
+ {
+ *pOut = *pOut + *pIn++ * w;
+ pOut++;
+ blkCntSample--;
+ }
+
+ blkCntVector--;
+ }
+
+ /* Normalize */
+ pOut = out;
+ accum = 1.0f / accum; /* reciprocal of the total weight; multiply is cheaper than per-element divide */
+
+ blkCntSample = vecDim >> 3;
+ while (blkCntSample > 0)
+ {
+ f16x8_t tmp;
+
+ tmp = vld1q((const float16_t *) pOut);
+ tmp = vmulq(tmp, accum);
+ vst1q(pOut, tmp);
+ pOut += 8;
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 7;
+ while (blkCntSample > 0)
+ {
+ *pOut = *pOut * accum;
+ pOut++;
+ blkCntSample--;
+ }
+}
+#else
+void arm_barycenter_f16(const float16_t *in, const float16_t *weights, float16_t *out, uint32_t nbVectors,uint32_t vecDim)
+{
+
+ const float16_t *pIn,*pW;
+ float16_t *pOut;
+ uint32_t blkCntVector,blkCntSample;
+ float16_t accum, w;
+
+ blkCntVector = nbVectors;
+ blkCntSample = vecDim;
+
+ accum = 0.0f;
+
+ pW = weights;
+ pIn = in;
+
+ /* Zero the output accumulator vector */
+ blkCntSample = vecDim;
+ pOut = out;
+
+ while(blkCntSample > 0)
+ {
+ *pOut = 0.0f;
+ pOut++;
+ blkCntSample--;
+ }
+
+ /* Sum: accumulate each weighted input vector into out, and the weights into accum */
+ while(blkCntVector > 0)
+ {
+ pOut = out;
+ w = *pW++;
+ accum += w;
+
+ blkCntSample = vecDim;
+ while(blkCntSample > 0)
+ {
+ *pOut = *pOut + *pIn++ * w;
+ pOut++;
+ blkCntSample--;
+ }
+
+ blkCntVector--;
+ }
+
+ /* Normalize by the total weight */
+ blkCntSample = vecDim;
+ pOut = out;
+
+ while(blkCntSample > 0)
+ {
+ *pOut = *pOut / accum;
+ pOut++;
+ blkCntSample--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of barycenter group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c
new file mode 100644
index 0000000000..108f470798
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_barycenter_f32.c
@@ -0,0 +1,414 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_barycenter_f32.c
+ * Description: Barycenter
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include <limits.h>
+#include <math.h>
+
+
+/**
+ @ingroup barycenter
+ */
+
+
+/**
+ * @brief Barycenter
+ *
+ *
+ * @param[in] *in List of vectors
+ * @param[in] *weights Weights of the vectors
+ * @param[out] *out Barycenter
+ * @param[in] nbVectors Number of vectors
+ * @param[in] vecDim Dimension of space (vector dimension)
+ * @return None
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* Helium (MVE) barycenter: out = sum_i(w_i * in_i) / sum_i(w_i).
+ * Processes 4 input vectors per outer iteration and 4 float32 lanes per
+ * inner iteration, with scalar tails for the remainders. */
+void arm_barycenter_f32(const float32_t *in,
+ const float32_t *weights,
+ float32_t *out,
+ uint32_t nbVectors,
+ uint32_t vecDim)
+{
+ const float32_t *pIn, *pW;
+ const float32_t *pIn1, *pIn2, *pIn3, *pIn4;
+ float32_t *pOut;
+ uint32_t blkCntVector, blkCntSample;
+ float32_t accum, w;
+
+ blkCntVector = nbVectors;
+ blkCntSample = vecDim;
+
+ accum = 0.0f;
+
+ pW = weights;
+ pIn = in;
+
+
+ /* Zero the output accumulator */
+ arm_fill_f32(0.0f, out, vecDim);
+
+
+ /* Sum */
+ /* Four pointers walk four consecutive input vectors in lockstep */
+ pIn1 = pIn;
+ pIn2 = pIn1 + vecDim;
+ pIn3 = pIn2 + vecDim;
+ pIn4 = pIn3 + vecDim;
+
+ blkCntVector = nbVectors >> 2;
+ while (blkCntVector > 0)
+ {
+ f32x4_t outV, inV1, inV2, inV3, inV4;
+ float32_t w1, w2, w3, w4;
+
+ pOut = out;
+ w1 = *pW++;
+ w2 = *pW++;
+ w3 = *pW++;
+ w4 = *pW++;
+ accum += w1 + w2 + w3 + w4;
+
+ blkCntSample = vecDim >> 2;
+ while (blkCntSample > 0) {
+ outV = vld1q((const float32_t *) pOut);
+ inV1 = vld1q(pIn1);
+ inV2 = vld1q(pIn2);
+ inV3 = vld1q(pIn3);
+ inV4 = vld1q(pIn4);
+ outV = vfmaq(outV, inV1, w1);
+ outV = vfmaq(outV, inV2, w2);
+ outV = vfmaq(outV, inV3, w3);
+ outV = vfmaq(outV, inV4, w4);
+ vst1q(pOut, outV);
+
+ pOut += 4;
+ pIn1 += 4;
+ pIn2 += 4;
+ pIn3 += 4;
+ pIn4 += 4;
+
+ blkCntSample--;
+ }
+
+ /* Scalar tail for vecDim not a multiple of 4 */
+ blkCntSample = vecDim & 3;
+ while (blkCntSample > 0) {
+ *pOut = *pOut + *pIn1++ * w1;
+ *pOut = *pOut + *pIn2++ * w2;
+ *pOut = *pOut + *pIn3++ * w3;
+ *pOut = *pOut + *pIn4++ * w4;
+ pOut++;
+ blkCntSample--;
+ }
+
+ /* Each pointer already advanced one vector; skip the other three */
+ pIn1 += 3 * vecDim;
+ pIn2 += 3 * vecDim;
+ pIn3 += 3 * vecDim;
+ pIn4 += 3 * vecDim;
+
+ blkCntVector--;
+ }
+
+ pIn = pIn1;
+
+ /* Remaining 0..3 input vectors, one at a time */
+ blkCntVector = nbVectors & 3;
+ while (blkCntVector > 0)
+ {
+ f32x4_t inV, outV;
+
+ pOut = out;
+ w = *pW++;
+ accum += w;
+
+ blkCntSample = vecDim >> 2;
+ while (blkCntSample > 0)
+ {
+ outV = vld1q_f32(pOut);
+ inV = vld1q_f32(pIn);
+ outV = vfmaq(outV, inV, w);
+ vst1q_f32(pOut, outV);
+ pOut += 4;
+ pIn += 4;
+
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 3;
+ while (blkCntSample > 0)
+ {
+ *pOut = *pOut + *pIn++ * w;
+ pOut++;
+ blkCntSample--;
+ }
+
+ blkCntVector--;
+ }
+
+ /* Normalize */
+ /* Multiply by the reciprocal of the total weight (one division total) */
+ pOut = out;
+ accum = 1.0f / accum;
+
+ blkCntSample = vecDim >> 2;
+ while (blkCntSample > 0)
+ {
+ f32x4_t tmp;
+
+ tmp = vld1q((const float32_t *) pOut);
+ tmp = vmulq(tmp, accum);
+ vst1q(pOut, tmp);
+ pOut += 4;
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 3;
+ while (blkCntSample > 0)
+ {
+ *pOut = *pOut * accum;
+ pOut++;
+ blkCntSample--;
+ }
+}
+#else
+#if defined(ARM_MATH_NEON)
+
+#include "NEMath.h"
+/* NEON barycenter: out = sum_i(w_i * in_i) / sum_i(w_i).
+ * Mirrors the MVE variant: 4 vectors per outer pass, 4 lanes per inner pass,
+ * scalar tails for remainders, single reciprocal for normalization. */
+void arm_barycenter_f32(const float32_t *in, const float32_t *weights, float32_t *out, uint32_t nbVectors,uint32_t vecDim)
+{
+
+ const float32_t *pIn,*pW, *pIn1, *pIn2, *pIn3, *pIn4;
+ float32_t *pOut;
+ uint32_t blkCntVector,blkCntSample;
+ float32_t accum, w,w1,w2,w3,w4;
+
+ float32x4_t tmp, inV,outV, inV1, inV2, inV3, inV4;
+
+ blkCntVector = nbVectors;
+ blkCntSample = vecDim;
+
+ accum = 0.0f;
+
+ pW = weights;
+ pIn = in;
+
+ /* Set counters to 0 */
+ /* Vectorized clear of the output accumulator */
+ tmp = vdupq_n_f32(0.0f);
+ pOut = out;
+
+ blkCntSample = vecDim >> 2;
+ while(blkCntSample > 0)
+ {
+ vst1q_f32(pOut, tmp);
+ pOut += 4;
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 3;
+ while(blkCntSample > 0)
+ {
+ *pOut = 0.0f;
+ pOut++;
+ blkCntSample--;
+ }
+
+ /* Sum */
+
+ /* Four pointers walk four consecutive input vectors in lockstep */
+ pIn1 = pIn;
+ pIn2 = pIn1 + vecDim;
+ pIn3 = pIn2 + vecDim;
+ pIn4 = pIn3 + vecDim;
+
+ blkCntVector = nbVectors >> 2;
+ while(blkCntVector > 0)
+ {
+ pOut = out;
+ w1 = *pW++;
+ w2 = *pW++;
+ w3 = *pW++;
+ w4 = *pW++;
+ accum += w1 + w2 + w3 + w4;
+
+ blkCntSample = vecDim >> 2;
+ while(blkCntSample > 0)
+ {
+ outV = vld1q_f32(pOut);
+ inV1 = vld1q_f32(pIn1);
+ inV2 = vld1q_f32(pIn2);
+ inV3 = vld1q_f32(pIn3);
+ inV4 = vld1q_f32(pIn4);
+ outV = vmlaq_n_f32(outV,inV1,w1);
+ outV = vmlaq_n_f32(outV,inV2,w2);
+ outV = vmlaq_n_f32(outV,inV3,w3);
+ outV = vmlaq_n_f32(outV,inV4,w4);
+ vst1q_f32(pOut, outV);
+ pOut += 4;
+ pIn1 += 4;
+ pIn2 += 4;
+ pIn3 += 4;
+ pIn4 += 4;
+
+ blkCntSample--;
+ }
+
+ /* Scalar tail for vecDim not a multiple of 4 */
+ blkCntSample = vecDim & 3;
+ while(blkCntSample > 0)
+ {
+ *pOut = *pOut + *pIn1++ * w1;
+ *pOut = *pOut + *pIn2++ * w2;
+ *pOut = *pOut + *pIn3++ * w3;
+ *pOut = *pOut + *pIn4++ * w4;
+ pOut++;
+ blkCntSample--;
+ }
+
+ /* Each pointer already advanced one vector; skip the other three */
+ pIn1 += 3*vecDim;
+ pIn2 += 3*vecDim;
+ pIn3 += 3*vecDim;
+ pIn4 += 3*vecDim;
+
+ blkCntVector--;
+ }
+
+ pIn = pIn1;
+
+ /* Remaining 0..3 input vectors, one at a time */
+ blkCntVector = nbVectors & 3;
+ while(blkCntVector > 0)
+ {
+ pOut = out;
+ w = *pW++;
+ accum += w;
+
+ blkCntSample = vecDim >> 2;
+ while(blkCntSample > 0)
+ {
+ outV = vld1q_f32(pOut);
+ inV = vld1q_f32(pIn);
+ outV = vmlaq_n_f32(outV,inV,w);
+ vst1q_f32(pOut, outV);
+ pOut += 4;
+ pIn += 4;
+
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 3;
+ while(blkCntSample > 0)
+ {
+ *pOut = *pOut + *pIn++ * w;
+ pOut++;
+ blkCntSample--;
+ }
+
+ blkCntVector--;
+ }
+
+ /* Normalize */
+ /* Multiply by the reciprocal of the total weight (one division total) */
+ pOut = out;
+ accum = 1.0f / accum;
+
+ blkCntSample = vecDim >> 2;
+ while(blkCntSample > 0)
+ {
+ tmp = vld1q_f32(pOut);
+ tmp = vmulq_n_f32(tmp,accum);
+ vst1q_f32(pOut, tmp);
+ pOut += 4;
+ blkCntSample--;
+ }
+
+ blkCntSample = vecDim & 3;
+ while(blkCntSample > 0)
+ {
+ *pOut = *pOut * accum;
+ pOut++;
+ blkCntSample--;
+ }
+
+}
+#else
+/* Portable scalar barycenter: out[k] = sum_i(weights[i]*in[i][k]) / sum(weights).
+ * Reference implementation used when neither MVE nor NEON is available. */
+void arm_barycenter_f32(const float32_t *in, const float32_t *weights, float32_t *out, uint32_t nbVectors,uint32_t vecDim)
+{
+
+ const float32_t *pIn,*pW;
+ float32_t *pOut;
+ uint32_t blkCntVector,blkCntSample;
+ float32_t accum, w;
+
+ blkCntVector = nbVectors;
+ blkCntSample = vecDim;
+
+ accum = 0.0f;
+
+ pW = weights;
+ pIn = in;
+
+ /* Set counters to 0 */
+ /* Clear the output accumulator */
+ blkCntSample = vecDim;
+ pOut = out;
+
+ while(blkCntSample > 0)
+ {
+ *pOut = 0.0f;
+ pOut++;
+ blkCntSample--;
+ }
+
+ /* Sum */
+ /* Weighted accumulation of each input vector; accum = total weight */
+ while(blkCntVector > 0)
+ {
+ pOut = out;
+ w = *pW++;
+ accum += w;
+
+ blkCntSample = vecDim;
+ while(blkCntSample > 0)
+ {
+ *pOut = *pOut + *pIn++ * w;
+ pOut++;
+ blkCntSample--;
+ }
+
+ blkCntVector--;
+ }
+
+ /* Normalize */
+ /* Per-element division by the total weight (no zero-weight guard) */
+ blkCntSample = vecDim;
+ pOut = out;
+
+ while(blkCntSample > 0)
+ {
+ *pOut = *pOut / accum;
+ pOut++;
+ blkCntSample--;
+ }
+
+}
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of barycenter group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c
new file mode 100644
index 0000000000..f316a4cc9d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_bitonic_sort_f32.c
@@ -0,0 +1,1039 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_bitonic_sort_f32.c
+ * Description: Floating point bitonic sort
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include "arm_sorting.h"
+
+
+#if !defined(ARM_MATH_NEON)
+
+/* Core bitonic merge step for a bitonic sequence of n elements (n a power of 2).
+ * First pass compares mirrored pairs (leftPtr from the front, rightPtr from the
+ * back); subsequent passes halve the stride and compare/swap within sub-blocks.
+ * dir selects ascending (1) vs descending (0) order.
+ * NOTE(review): the vendored patch text had the `<` comparisons stripped by a
+ * tag-like extraction bug, fusing loop headers into neighboring lines; the
+ * loop structure below is restored to match the upstream CMSIS-DSP source. */
+static void arm_bitonic_sort_core_f32(float32_t *pSrc, uint32_t n, uint8_t dir)
+{
+ uint32_t step;
+ uint32_t k, j;
+ float32_t *leftPtr, *rightPtr;
+ float32_t temp;
+
+ step = n>>1;
+ leftPtr = pSrc;
+ rightPtr = pSrc+n-1;
+
+ for(k=0; k<step; k++)
+ {
+ if(dir == (*leftPtr > *rightPtr))
+ {
+ // Swap
+ temp=*leftPtr;
+ *leftPtr=*rightPtr;
+ *rightPtr=temp;
+ }
+
+ leftPtr++; // Move right
+ rightPtr--; // Move left
+ }
+
+ // Merge
+ for(step=(n>>2); step>0; step/=2)
+ {
+ for(j=0; j<n; j=j+step*2)
+ {
+ leftPtr = pSrc+j;
+ rightPtr = pSrc+j+step;
+
+ for(k=0; k<step; k++)
+ {
+ if(*leftPtr > *rightPtr)
+ {
+ // Swap
+ temp=*leftPtr;
+ *leftPtr=*rightPtr;
+ *rightPtr=temp;
+ }
+
+ leftPtr++;
+ rightPtr++;
+ }
+ }
+ }
+}
+#endif
+
+#if defined(ARM_MATH_NEON)
+
+
+/* Final sorting passes for two 4-lane vectors whose concatenation is bitonic
+ * with a <= b elementwise per the preceding merge. Applies two transpose +
+ * min/max rounds, then interleaves the results into sorted order.
+ * NOTE(review): vtrn128_*q / vminmaxq are project macros from arm_sorting.h
+ * and appear to modify their arguments in place — confirm against that header. */
+static float32x4x2_t arm_bitonic_resort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
+{
+ /* Start with two vectors:
+ * +---+---+---+---+
+ * | a | b | c | d |
+ * +---+---+---+---+
+ * +---+---+---+---+
+ * | e | f | g | h |
+ * +---+---+---+---+
+ * All the elements of the first are guaranteed to be less than or equal to
+ * all of the elements in the second, and both vectors are bitonic.
+ * We need to perform these operations to completely sort both lists:
+ * vminmax([abcd],[efgh])
+ * vminmax([acbd],[egfh])
+ */
+ vtrn128_64q(a, b);
+ /* +---+---+---+---+
+ * | a | b | e | f |
+ * +---+---+---+---+
+ * +---+---+---+---+
+ * | c | d | g | h |
+ * +---+---+---+---+
+ */
+ if(dir)
+ vminmaxq(a, b);
+ else
+ vminmaxq(b, a);
+
+ vtrn128_32q(a, b);
+ /* +---+---+---+---+
+ * | a | c | e | g |
+ * +---+---+---+---+
+ * +---+---+---+---+
+ * | b | d | f | h |
+ * +---+---+---+---+
+ */
+ if(dir)
+ vminmaxq(a, b);
+ else
+ vminmaxq(b, a);
+
+ /* Interleave lanes back into sequential sorted order */
+ return vzipq_f32(a, b);
+}
+
+
+/* Merge two sorted 4-element vectors into one sorted 8-element result:
+ * reverse b to form a bitonic sequence, split it with one min/max round,
+ * then finish with arm_bitonic_resort_8_f32. */
+static float32x4x2_t arm_bitonic_merge_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
+{
+ /* a and b are guaranteed to be bitonic */
+ // Reverse the element of the second vector
+ b = vrev128q_f32(b);
+
+ // Compare the two vectors
+ if(dir)
+ vminmaxq(a, b);
+ else
+ vminmaxq(b, a);
+
+ // Merge the two vectors
+ float32x4x2_t ab = arm_bitonic_resort_8_f32(a, b, dir);
+
+ return ab;
+}
+
+/* Final sorting passes for a 16-element bitonic sequence held in two
+ * float32x4x2_t "256-bit" registers; writes the fully sorted result to pOut.
+ * Uses three transpose + min/max rounds at 128/64/32-bit granularity. */
+static void arm_bitonic_resort_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
+{
+ /* Start with two vectors:
+ * +---+---+---+---+---+---+---+---+
+ * | a | b | c | d | e | f | g | h |
+ * +---+---+---+---+---+---+---+---+
+ * +---+---+---+---+---+---+---+---+
+ * | i | j | k | l | m | n | o | p |
+ * +---+---+---+---+---+---+---+---+
+ * All the elements of the first are guaranteed to be less than or equal to
+ * all of the elements in the second, and both vectors are bitonic.
+ * We need to perform these operations to completely sort both lists:
+ * vminmax([abcd],[efgh]) vminmax([ijkl],[mnop])
+ * vminmax([abef],[cdgh]) vminmax([ijmn],[klop])
+ * vminmax([acef],[bdfh]) vminmax([ikmo],[jlmp])
+ */
+
+ vtrn256_128q(a, b);
+ /* +---+---+---+---+---+---+---+---+
+ * | a | b | c | d | i | j | k | l |
+ * +---+---+---+---+---+---+---+---+
+ * +---+---+---+---+---+---+---+---+
+ * | e | f | g | h | m | n | o | p |
+ * +---+---+---+---+---+---+---+---+
+ */
+ if(dir)
+ vminmax256q(a, b);
+ else
+ vminmax256q(b, a);
+
+ vtrn256_64q(a, b);
+
+ /* +---+---+---+---+---+---+---+---+
+ * | a | b | e | f | i | j | m | n |
+ * +---+---+---+---+---+---+---+---+
+ * +---+---+---+---+---+---+---+---+
+ * | c | d | g | h | k | l | o | p |
+ * +---+---+---+---+---+---+---+---+
+ */
+ if(dir)
+ vminmax256q(a, b);
+ else
+ vminmax256q(b, a);
+
+ vtrn256_32q(a, b);
+ /* We now have:
+ * +---+---+---+---+---+---+---+---+
+ * | a | c | e | g | i | k | m | o |
+ * +---+---+---+---+---+---+---+---+
+ * +---+---+---+---+---+---+---+---+
+ * | b | d | f | h | j | l | n | p |
+ * +---+---+---+---+---+---+---+---+
+ */
+ if(dir)
+ vminmax256q(a, b);
+ else
+ vminmax256q(b, a);
+
+ /* Interleave lanes back into sequential order and store 16 samples */
+ float32x4x2_t out1 = vzipq_f32(a.val[0], b.val[0]);
+ float32x4x2_t out2 = vzipq_f32(a.val[1], b.val[1]);
+
+ vst1q_f32(pOut, out1.val[0]);
+ vst1q_f32(pOut+4, out1.val[1]);
+ vst1q_f32(pOut+8, out2.val[0]);
+ vst1q_f32(pOut+12, out2.val[1]);
+}
+
+/* Merge two sorted 8-element sequences (a, b) into a sorted 16-element
+ * sequence at pOut: reverse b, one 256-bit min/max round, then resort. */
+static void arm_bitonic_merge_16_f32(float32_t * pOut, float32x4x2_t a, float32x4x2_t b, uint8_t dir)
+{
+ // Merge two preordered float32x4x2_t
+ vrev256q_f32(b);
+
+ if(dir)
+ vminmax256q(a, b);
+ else
+ vminmax256q(b, a);
+
+ arm_bitonic_resort_16_f32(pOut, a, b, dir);
+}
+
+/* Fully sort 16 contiguous samples from pSrc into pDst.
+ * Sorts four 4-element columns with a 6-comparator network, transposes so
+ * each vector holds one sorted run, then merges 4->8->16. */
+static void arm_bitonic_sort_16_f32(float32_t *pSrc, float32_t *pDst, uint8_t dir)
+{
+ float32x4_t a;
+ float32x4_t b;
+ float32x4_t c;
+ float32x4_t d;
+
+ // Load 16 samples
+ a = vld1q_f32(pSrc);
+ b = vld1q_f32(pSrc+4);
+ c = vld1q_f32(pSrc+8);
+ d = vld1q_f32(pSrc+12);
+
+ // Bitonic sorting network for 4 samples x 4 times
+ /* Operand order encodes direction: swapped operands sort descending */
+ if(dir)
+ {
+ vminmaxq(a, b);
+ vminmaxq(c, d);
+
+ vminmaxq(a, d);
+ vminmaxq(b, c);
+
+ vminmaxq(a, b);
+ vminmaxq(c, d);
+ }
+ else
+ {
+ vminmaxq(b, a);
+ vminmaxq(d, c);
+
+ vminmaxq(d, a);
+ vminmaxq(c, b);
+
+ vminmaxq(b, a);
+ vminmaxq(d, c);
+ }
+
+ float32x4x2_t ab = vtrnq_f32 (a, b);
+ float32x4x2_t cd = vtrnq_f32 (c, d);
+
+ // Transpose 4 ordered arrays of 4 samples
+ a = vcombine_f32(vget_low_f32(ab.val[0]), vget_low_f32(cd.val[0]));
+ b = vcombine_f32(vget_low_f32(ab.val[1]), vget_low_f32(cd.val[1]));
+ c = vcombine_f32(vget_high_f32(ab.val[0]), vget_high_f32(cd.val[0]));
+ d = vcombine_f32(vget_high_f32(ab.val[1]), vget_high_f32(cd.val[1]));
+
+ // Merge pairs of arrays of 4 samples
+ ab = arm_bitonic_merge_8_f32(a, b, dir);
+ cd = arm_bitonic_merge_8_f32(c, d, dir);
+
+ // Merge arrays of 8 samples
+ arm_bitonic_merge_16_f32(pDst, ab, cd, dir);
+}
+
+
+
+
+
+/* Merge stage for 32 elements held in four 8-lane register pairs.
+ * Callers pass the second half pre-reversed, so the whole is bitonic;
+ * two compare/transpose rounds reduce it to two 16-element merges. */
+static void arm_bitonic_merge_32_f32(float32_t * pSrc, float32x4x2_t ab1, float32x4x2_t ab2, float32x4x2_t cd1, float32x4x2_t cd2, uint8_t dir)
+{
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ }
+ //Transpose 256
+ /* Swap the middle register pairs (ab2 <-> cd1) to regroup halves */
+ float32x4_t temp;
+
+ temp = ab2.val[0];
+ ab2.val[0] = cd1.val[0];
+ cd1.val[0] = temp;
+ temp = ab2.val[1];
+ ab2.val[1] = cd1.val[1];
+ cd1.val[1] = temp;
+
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ }
+
+ //Transpose 128
+ /* Finish each 16-element half independently */
+ arm_bitonic_merge_16_f32(pSrc+0, ab1, cd1, dir);
+ arm_bitonic_merge_16_f32(pSrc+16, ab2, cd2, dir);
+}
+
+/* In-place merge of two sorted 32-element halves at pSrc into a sorted
+ * 64-element run. The second half is loaded reversed (vldrev128q_f32) to
+ * form a bitonic sequence; two compare/transpose rounds then delegate to
+ * two 32-element merges. */
+static void arm_bitonic_merge_64_f32(float32_t * pSrc, uint8_t dir)
+{
+ float32x4x2_t ab1, ab2, ab3, ab4;
+ float32x4x2_t cd1, cd2, cd3, cd4;
+
+ //Load and reverse second array
+ ab1.val[0] = vld1q_f32(pSrc+0 );
+ ab1.val[1] = vld1q_f32(pSrc+4 );
+ ab2.val[0] = vld1q_f32(pSrc+8 );
+ ab2.val[1] = vld1q_f32(pSrc+12);
+ ab3.val[0] = vld1q_f32(pSrc+16);
+ ab3.val[1] = vld1q_f32(pSrc+20);
+ ab4.val[0] = vld1q_f32(pSrc+24);
+ ab4.val[1] = vld1q_f32(pSrc+28);
+
+ /* cd* are filled back-to-front so the upper half is element-reversed */
+ vldrev128q_f32(cd4.val[1], pSrc+32);
+ vldrev128q_f32(cd4.val[0], pSrc+36);
+ vldrev128q_f32(cd3.val[1], pSrc+40);
+ vldrev128q_f32(cd3.val[0], pSrc+44);
+ vldrev128q_f32(cd2.val[1], pSrc+48);
+ vldrev128q_f32(cd2.val[0], pSrc+52);
+ vldrev128q_f32(cd1.val[1], pSrc+56);
+ vldrev128q_f32(cd1.val[0], pSrc+60);
+
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ vminmax256q(ab3, cd3);
+ vminmax256q(ab4, cd4);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ vminmax256q(cd3, ab3);
+ vminmax256q(cd4, ab4);
+ }
+
+ //Transpose 512
+ /* Swap quarter blocks (ab3/ab4 <-> cd1/cd2) to regroup halves */
+ float32x4_t temp;
+
+ temp = ab3.val[0];
+ ab3.val[0] = cd1.val[0];
+ cd1.val[0] = temp;
+ temp = ab3.val[1];
+ ab3.val[1] = cd1.val[1];
+ cd1.val[1] = temp;
+ temp = ab4.val[0];
+ ab4.val[0] = cd2.val[0];
+ cd2.val[0] = temp;
+ temp = ab4.val[1];
+ ab4.val[1] = cd2.val[1];
+ cd2.val[1] = temp;
+
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ vminmax256q(ab3, cd3);
+ vminmax256q(ab4, cd4);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ vminmax256q(cd3, ab3);
+ vminmax256q(cd4, ab4);
+ }
+
+ //Transpose 256
+ arm_bitonic_merge_32_f32(pSrc+0, ab1, ab2, cd1, cd2, dir);
+ arm_bitonic_merge_32_f32(pSrc+32, ab3, ab4, cd3, cd4, dir);
+}
+
+/* In-place merge of two sorted 64-element halves at pSrc into a sorted
+ * 128-element run. Same scheme as arm_bitonic_merge_64_f32 scaled up:
+ * load upper half reversed, two compare/swap rounds across register groups,
+ * store, then recurse into two 64-element merges. */
+static void arm_bitonic_merge_128_f32(float32_t * pSrc, uint8_t dir)
+{
+ float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
+ float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;
+
+ //Load and reverse second array
+ ab1.val[0] = vld1q_f32(pSrc+0 );
+ ab1.val[1] = vld1q_f32(pSrc+4 );
+ ab2.val[0] = vld1q_f32(pSrc+8 );
+ ab2.val[1] = vld1q_f32(pSrc+12);
+ ab3.val[0] = vld1q_f32(pSrc+16);
+ ab3.val[1] = vld1q_f32(pSrc+20);
+ ab4.val[0] = vld1q_f32(pSrc+24);
+ ab4.val[1] = vld1q_f32(pSrc+28);
+ ab5.val[0] = vld1q_f32(pSrc+32);
+ ab5.val[1] = vld1q_f32(pSrc+36);
+ ab6.val[0] = vld1q_f32(pSrc+40);
+ ab6.val[1] = vld1q_f32(pSrc+44);
+ ab7.val[0] = vld1q_f32(pSrc+48);
+ ab7.val[1] = vld1q_f32(pSrc+52);
+ ab8.val[0] = vld1q_f32(pSrc+56);
+ ab8.val[1] = vld1q_f32(pSrc+60);
+
+ /* Upper half loaded back-to-front, element-reversed -> bitonic overall */
+ vldrev128q_f32(cd8.val[1], pSrc+64);
+ vldrev128q_f32(cd8.val[0], pSrc+68);
+ vldrev128q_f32(cd7.val[1], pSrc+72);
+ vldrev128q_f32(cd7.val[0], pSrc+76);
+ vldrev128q_f32(cd6.val[1], pSrc+80);
+ vldrev128q_f32(cd6.val[0], pSrc+84);
+ vldrev128q_f32(cd5.val[1], pSrc+88);
+ vldrev128q_f32(cd5.val[0], pSrc+92);
+ vldrev128q_f32(cd4.val[1], pSrc+96);
+ vldrev128q_f32(cd4.val[0], pSrc+100);
+ vldrev128q_f32(cd3.val[1], pSrc+104);
+ vldrev128q_f32(cd3.val[0], pSrc+108);
+ vldrev128q_f32(cd2.val[1], pSrc+112);
+ vldrev128q_f32(cd2.val[0], pSrc+116);
+ vldrev128q_f32(cd1.val[1], pSrc+120);
+ vldrev128q_f32(cd1.val[0], pSrc+124);
+
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ vminmax256q(ab3, cd3);
+ vminmax256q(ab4, cd4);
+ vminmax256q(ab5, cd5);
+ vminmax256q(ab6, cd6);
+ vminmax256q(ab7, cd7);
+ vminmax256q(ab8, cd8);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ vminmax256q(cd3, ab3);
+ vminmax256q(cd4, ab4);
+ vminmax256q(cd5, ab5);
+ vminmax256q(cd6, ab6);
+ vminmax256q(cd7, ab7);
+ vminmax256q(cd8, ab8);
+ }
+
+ //Transpose
+ /* Swap quarter blocks (ab5..ab8 <-> cd1..cd4) to regroup halves */
+ float32x4_t temp;
+
+ temp = ab5.val[0];
+ ab5.val[0] = cd1.val[0];
+ cd1.val[0] = temp;
+ temp = ab5.val[1];
+ ab5.val[1] = cd1.val[1];
+ cd1.val[1] = temp;
+ temp = ab6.val[0];
+ ab6.val[0] = cd2.val[0];
+ cd2.val[0] = temp;
+ temp = ab6.val[1];
+ ab6.val[1] = cd2.val[1];
+ cd2.val[1] = temp;
+ temp = ab7.val[0];
+ ab7.val[0] = cd3.val[0];
+ cd3.val[0] = temp;
+ temp = ab7.val[1];
+ ab7.val[1] = cd3.val[1];
+ cd3.val[1] = temp;
+ temp = ab8.val[0];
+ ab8.val[0] = cd4.val[0];
+ cd4.val[0] = temp;
+ temp = ab8.val[1];
+ ab8.val[1] = cd4.val[1];
+ cd4.val[1] = temp;
+
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1, cd1);
+ vminmax256q(ab2, cd2);
+ vminmax256q(ab3, cd3);
+ vminmax256q(ab4, cd4);
+ vminmax256q(ab5, cd5);
+ vminmax256q(ab6, cd6);
+ vminmax256q(ab7, cd7);
+ vminmax256q(ab8, cd8);
+ }
+ else
+ {
+ vminmax256q(cd1, ab1);
+ vminmax256q(cd2, ab2);
+ vminmax256q(cd3, ab3);
+ vminmax256q(cd4, ab4);
+ vminmax256q(cd5, ab5);
+ vminmax256q(cd6, ab6);
+ vminmax256q(cd7, ab7);
+ vminmax256q(cd8, ab8);
+ }
+
+ /* Store both 64-element halves before the recursive merges below */
+ vst1q_f32(pSrc, ab1.val[0]);
+ vst1q_f32(pSrc+4, ab1.val[1]);
+ vst1q_f32(pSrc+8, ab2.val[0]);
+ vst1q_f32(pSrc+12, ab2.val[1]);
+ vst1q_f32(pSrc+16, ab3.val[0]);
+ vst1q_f32(pSrc+20, ab3.val[1]);
+ vst1q_f32(pSrc+24, ab4.val[0]);
+ vst1q_f32(pSrc+28, ab4.val[1]);
+ vst1q_f32(pSrc+32, cd1.val[0]);
+ vst1q_f32(pSrc+36, cd1.val[1]);
+ vst1q_f32(pSrc+40, cd2.val[0]);
+ vst1q_f32(pSrc+44, cd2.val[1]);
+ vst1q_f32(pSrc+48, cd3.val[0]);
+ vst1q_f32(pSrc+52, cd3.val[1]);
+ vst1q_f32(pSrc+56, cd4.val[0]);
+ vst1q_f32(pSrc+60, cd4.val[1]);
+ vst1q_f32(pSrc+64, ab5.val[0]);
+ vst1q_f32(pSrc+68, ab5.val[1]);
+ vst1q_f32(pSrc+72, ab6.val[0]);
+ vst1q_f32(pSrc+76, ab6.val[1]);
+ vst1q_f32(pSrc+80, ab7.val[0]);
+ vst1q_f32(pSrc+84, ab7.val[1]);
+ vst1q_f32(pSrc+88, ab8.val[0]);
+ vst1q_f32(pSrc+92, ab8.val[1]);
+ vst1q_f32(pSrc+96, cd5.val[0]);
+ vst1q_f32(pSrc+100, cd5.val[1]);
+ vst1q_f32(pSrc+104, cd6.val[0]);
+ vst1q_f32(pSrc+108, cd6.val[1]);
+ vst1q_f32(pSrc+112, cd7.val[0]);
+ vst1q_f32(pSrc+116, cd7.val[1]);
+ vst1q_f32(pSrc+120, cd8.val[0]);
+ vst1q_f32(pSrc+124, cd8.val[1]);
+
+ //Transpose
+ arm_bitonic_merge_64_f32(pSrc+0 , dir);
+ arm_bitonic_merge_64_f32(pSrc+64, dir);
+}
+
+/* In-place merge of two sorted 128-element halves at pSrc into a sorted
+ * 256-element run. Same scheme as the 64/128 merges scaled up: load the
+ * upper half reversed, two compare/swap rounds over 16 register groups,
+ * store, then recurse into two 128-element merges. */
+static void arm_bitonic_merge_256_f32(float32_t * pSrc, uint8_t dir)
+{
+ float32x4x2_t ab1, ab2, ab3, ab4, ab5, ab6, ab7, ab8;
+ float32x4x2_t ab9, ab10, ab11, ab12, ab13, ab14, ab15, ab16;
+ float32x4x2_t cd1, cd2, cd3, cd4, cd5, cd6, cd7, cd8;
+ float32x4x2_t cd9, cd10, cd11, cd12, cd13, cd14, cd15, cd16;
+
+ //Load and reverse second array
+ ab1.val[0] = vld1q_f32(pSrc+0 );
+ ab1.val[1] = vld1q_f32(pSrc+4 );
+ ab2.val[0] = vld1q_f32(pSrc+8 );
+ ab2.val[1] = vld1q_f32(pSrc+12 );
+ ab3.val[0] = vld1q_f32(pSrc+16 );
+ ab3.val[1] = vld1q_f32(pSrc+20 );
+ ab4.val[0] = vld1q_f32(pSrc+24 );
+ ab4.val[1] = vld1q_f32(pSrc+28 );
+ ab5.val[0] = vld1q_f32(pSrc+32 );
+ ab5.val[1] = vld1q_f32(pSrc+36 );
+ ab6.val[0] = vld1q_f32(pSrc+40 );
+ ab6.val[1] = vld1q_f32(pSrc+44 );
+ ab7.val[0] = vld1q_f32(pSrc+48 );
+ ab7.val[1] = vld1q_f32(pSrc+52 );
+ ab8.val[0] = vld1q_f32(pSrc+56 );
+ ab8.val[1] = vld1q_f32(pSrc+60 );
+ ab9.val[0] = vld1q_f32(pSrc+64 );
+ ab9.val[1] = vld1q_f32(pSrc+68 );
+ ab10.val[0] = vld1q_f32(pSrc+72 );
+ ab10.val[1] = vld1q_f32(pSrc+76 );
+ ab11.val[0] = vld1q_f32(pSrc+80 );
+ ab11.val[1] = vld1q_f32(pSrc+84 );
+ ab12.val[0] = vld1q_f32(pSrc+88 );
+ ab12.val[1] = vld1q_f32(pSrc+92 );
+ ab13.val[0] = vld1q_f32(pSrc+96 );
+ ab13.val[1] = vld1q_f32(pSrc+100);
+ ab14.val[0] = vld1q_f32(pSrc+104);
+ ab14.val[1] = vld1q_f32(pSrc+108);
+ ab15.val[0] = vld1q_f32(pSrc+112);
+ ab15.val[1] = vld1q_f32(pSrc+116);
+ ab16.val[0] = vld1q_f32(pSrc+120);
+ ab16.val[1] = vld1q_f32(pSrc+124);
+
+ /* Upper half loaded back-to-front, element-reversed -> bitonic overall */
+ vldrev128q_f32(cd16.val[1], pSrc+128);
+ vldrev128q_f32(cd16.val[0], pSrc+132);
+ vldrev128q_f32(cd15.val[1], pSrc+136);
+ vldrev128q_f32(cd15.val[0], pSrc+140);
+ vldrev128q_f32(cd14.val[1], pSrc+144);
+ vldrev128q_f32(cd14.val[0], pSrc+148);
+ vldrev128q_f32(cd13.val[1], pSrc+152);
+ vldrev128q_f32(cd13.val[0], pSrc+156);
+ vldrev128q_f32(cd12.val[1], pSrc+160);
+ vldrev128q_f32(cd12.val[0], pSrc+164);
+ vldrev128q_f32(cd11.val[1], pSrc+168);
+ vldrev128q_f32(cd11.val[0], pSrc+172);
+ vldrev128q_f32(cd10.val[1], pSrc+176);
+ vldrev128q_f32(cd10.val[0], pSrc+180);
+ vldrev128q_f32(cd9.val[1] , pSrc+184);
+ vldrev128q_f32(cd9.val[0] , pSrc+188);
+ vldrev128q_f32(cd8.val[1] , pSrc+192);
+ vldrev128q_f32(cd8.val[0] , pSrc+196);
+ vldrev128q_f32(cd7.val[1] , pSrc+200);
+ vldrev128q_f32(cd7.val[0] , pSrc+204);
+ vldrev128q_f32(cd6.val[1] , pSrc+208);
+ vldrev128q_f32(cd6.val[0] , pSrc+212);
+ vldrev128q_f32(cd5.val[1] , pSrc+216);
+ vldrev128q_f32(cd5.val[0] , pSrc+220);
+ vldrev128q_f32(cd4.val[1] , pSrc+224);
+ vldrev128q_f32(cd4.val[0] , pSrc+228);
+ vldrev128q_f32(cd3.val[1] , pSrc+232);
+ vldrev128q_f32(cd3.val[0] , pSrc+236);
+ vldrev128q_f32(cd2.val[1] , pSrc+240);
+ vldrev128q_f32(cd2.val[0] , pSrc+244);
+ vldrev128q_f32(cd1.val[1] , pSrc+248);
+ vldrev128q_f32(cd1.val[0] , pSrc+252);
+
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1 , cd1 );
+ vminmax256q(ab2 , cd2 );
+ vminmax256q(ab3 , cd3 );
+ vminmax256q(ab4 , cd4 );
+ vminmax256q(ab5 , cd5 );
+ vminmax256q(ab6 , cd6 );
+ vminmax256q(ab7 , cd7 );
+ vminmax256q(ab8 , cd8 );
+ vminmax256q(ab9 , cd9 );
+ vminmax256q(ab10, cd10);
+ vminmax256q(ab11, cd11);
+ vminmax256q(ab12, cd12);
+ vminmax256q(ab13, cd13);
+ vminmax256q(ab14, cd14);
+ vminmax256q(ab15, cd15);
+ vminmax256q(ab16, cd16);
+ }
+ else
+ {
+ vminmax256q(cd1 , ab1 );
+ vminmax256q(cd2 , ab2 );
+ vminmax256q(cd3 , ab3 );
+ vminmax256q(cd4 , ab4 );
+ vminmax256q(cd5 , ab5 );
+ vminmax256q(cd6 , ab6 );
+ vminmax256q(cd7 , ab7 );
+ vminmax256q(cd8 , ab8 );
+ vminmax256q(cd9 , ab9 );
+ vminmax256q(cd10, ab10);
+ vminmax256q(cd11, ab11);
+ vminmax256q(cd12, ab12);
+ vminmax256q(cd13, ab13);
+ vminmax256q(cd14, ab14);
+ vminmax256q(cd15, ab15);
+ vminmax256q(cd16, ab16);
+ }
+
+ //Transpose
+ /* Swap quarter blocks (ab9..ab16 <-> cd1..cd8) to regroup halves */
+ float32x4_t temp;
+
+ temp = ab9.val[0];
+ ab9.val[0] = cd1.val[0];
+ cd1.val[0] = temp;
+ temp = ab9.val[1];
+ ab9.val[1] = cd1.val[1];
+ cd1.val[1] = temp;
+ temp = ab10.val[0];
+ ab10.val[0] = cd2.val[0];
+ cd2.val[0] = temp;
+ temp = ab10.val[1];
+ ab10.val[1] = cd2.val[1];
+ cd2.val[1] = temp;
+ temp = ab11.val[0];
+ ab11.val[0] = cd3.val[0];
+ cd3.val[0] = temp;
+ temp = ab11.val[1];
+ ab11.val[1] = cd3.val[1];
+ cd3.val[1] = temp;
+ temp = ab12.val[0];
+ ab12.val[0] = cd4.val[0];
+ cd4.val[0] = temp;
+ temp = ab12.val[1];
+ ab12.val[1] = cd4.val[1];
+ cd4.val[1] = temp;
+ temp = ab13.val[0];
+ ab13.val[0] = cd5.val[0];
+ cd5.val[0] = temp;
+ temp = ab13.val[1];
+ ab13.val[1] = cd5.val[1];
+ cd5.val[1] = temp;
+ temp = ab14.val[0];
+ ab14.val[0] = cd6.val[0];
+ cd6.val[0] = temp;
+ temp = ab14.val[1];
+ ab14.val[1] = cd6.val[1];
+ cd6.val[1] = temp;
+ temp = ab15.val[0];
+ ab15.val[0] = cd7.val[0];
+ cd7.val[0] = temp;
+ temp = ab15.val[1];
+ ab15.val[1] = cd7.val[1];
+ cd7.val[1] = temp;
+ temp = ab16.val[0];
+ ab16.val[0] = cd8.val[0];
+ cd8.val[0] = temp;
+ temp = ab16.val[1];
+ ab16.val[1] = cd8.val[1];
+ cd8.val[1] = temp;
+
+ //Compare
+ if(dir)
+ {
+ vminmax256q(ab1 , cd1 );
+ vminmax256q(ab2 , cd2 );
+ vminmax256q(ab3 , cd3 );
+ vminmax256q(ab4 , cd4 );
+ vminmax256q(ab5 , cd5 );
+ vminmax256q(ab6 , cd6 );
+ vminmax256q(ab7 , cd7 );
+ vminmax256q(ab8 , cd8 );
+ vminmax256q(ab9 , cd9 );
+ vminmax256q(ab10, cd10);
+ vminmax256q(ab11, cd11);
+ vminmax256q(ab12, cd12);
+ vminmax256q(ab13, cd13);
+ vminmax256q(ab14, cd14);
+ vminmax256q(ab15, cd15);
+ vminmax256q(ab16, cd16);
+ }
+ else
+ {
+ vminmax256q(cd1 , ab1 );
+ vminmax256q(cd2 , ab2 );
+ vminmax256q(cd3 , ab3 );
+ vminmax256q(cd4 , ab4 );
+ vminmax256q(cd5 , ab5 );
+ vminmax256q(cd6 , ab6 );
+ vminmax256q(cd7 , ab7 );
+ vminmax256q(cd8 , ab8 );
+ vminmax256q(cd9 , ab9 );
+ vminmax256q(cd10, ab10);
+ vminmax256q(cd11, ab11);
+ vminmax256q(cd12, ab12);
+ vminmax256q(cd13, ab13);
+ vminmax256q(cd14, ab14);
+ vminmax256q(cd15, ab15);
+ vminmax256q(cd16, ab16);
+ }
+
+ /* Store both 128-element halves before the recursive merges below */
+ vst1q_f32(pSrc, ab1.val[0] );
+ vst1q_f32(pSrc+4, ab1.val[1] );
+ vst1q_f32(pSrc+8, ab2.val[0] );
+ vst1q_f32(pSrc+12, ab2.val[1] );
+ vst1q_f32(pSrc+16, ab3.val[0] );
+ vst1q_f32(pSrc+20, ab3.val[1] );
+ vst1q_f32(pSrc+24, ab4.val[0] );
+ vst1q_f32(pSrc+28, ab4.val[1] );
+ vst1q_f32(pSrc+32, ab5.val[0] );
+ vst1q_f32(pSrc+36, ab5.val[1] );
+ vst1q_f32(pSrc+40, ab6.val[0] );
+ vst1q_f32(pSrc+44, ab6.val[1] );
+ vst1q_f32(pSrc+48, ab7.val[0] );
+ vst1q_f32(pSrc+52, ab7.val[1] );
+ vst1q_f32(pSrc+56, ab8.val[0] );
+ vst1q_f32(pSrc+60, ab8.val[1] );
+ vst1q_f32(pSrc+64, cd1.val[0] );
+ vst1q_f32(pSrc+68, cd1.val[1] );
+ vst1q_f32(pSrc+72, cd2.val[0] );
+ vst1q_f32(pSrc+76, cd2.val[1] );
+ vst1q_f32(pSrc+80, cd3.val[0] );
+ vst1q_f32(pSrc+84, cd3.val[1] );
+ vst1q_f32(pSrc+88, cd4.val[0] );
+ vst1q_f32(pSrc+92, cd4.val[1] );
+ vst1q_f32(pSrc+96, cd5.val[0] );
+ vst1q_f32(pSrc+100, cd5.val[1] );
+ vst1q_f32(pSrc+104, cd6.val[0] );
+ vst1q_f32(pSrc+108, cd6.val[1] );
+ vst1q_f32(pSrc+112, cd7.val[0] );
+ vst1q_f32(pSrc+116, cd7.val[1] );
+ vst1q_f32(pSrc+120, cd8.val[0] );
+ vst1q_f32(pSrc+124, cd8.val[1] );
+ vst1q_f32(pSrc+128, ab9.val[0] );
+ vst1q_f32(pSrc+132, ab9.val[1] );
+ vst1q_f32(pSrc+136, ab10.val[0]);
+ vst1q_f32(pSrc+140, ab10.val[1]);
+ vst1q_f32(pSrc+144, ab11.val[0]);
+ vst1q_f32(pSrc+148, ab11.val[1]);
+ vst1q_f32(pSrc+152, ab12.val[0]);
+ vst1q_f32(pSrc+156, ab12.val[1]);
+ vst1q_f32(pSrc+160, ab13.val[0]);
+ vst1q_f32(pSrc+164, ab13.val[1]);
+ vst1q_f32(pSrc+168, ab14.val[0]);
+ vst1q_f32(pSrc+172, ab14.val[1]);
+ vst1q_f32(pSrc+176, ab15.val[0]);
+ vst1q_f32(pSrc+180, ab15.val[1]);
+ vst1q_f32(pSrc+184, ab16.val[0]);
+ vst1q_f32(pSrc+188, ab16.val[1]);
+ vst1q_f32(pSrc+192, cd9.val[0] );
+ vst1q_f32(pSrc+196, cd9.val[1] );
+ vst1q_f32(pSrc+200, cd10.val[0]);
+ vst1q_f32(pSrc+204, cd10.val[1]);
+ vst1q_f32(pSrc+208, cd11.val[0]);
+ vst1q_f32(pSrc+212, cd11.val[1]);
+ vst1q_f32(pSrc+216, cd12.val[0]);
+ vst1q_f32(pSrc+220, cd12.val[1]);
+ vst1q_f32(pSrc+224, cd13.val[0]);
+ vst1q_f32(pSrc+228, cd13.val[1]);
+ vst1q_f32(pSrc+232, cd14.val[0]);
+ vst1q_f32(pSrc+236, cd14.val[1]);
+ vst1q_f32(pSrc+240, cd15.val[0]);
+ vst1q_f32(pSrc+244, cd15.val[1]);
+ vst1q_f32(pSrc+248, cd16.val[0]);
+ vst1q_f32(pSrc+252, cd16.val[1]);
+
+ //Transpose
+ arm_bitonic_merge_128_f32(pSrc+0 , dir);
+ arm_bitonic_merge_128_f32(pSrc+128, dir);
+}
+
+/* Exchange lanes i and j of float32x4_t a. Expands to three statements and
+ * requires a float32_t `temp` in scope; callers below always brace its use. */
+#define SWAP(a,i,j) \
+ temp = vgetq_lane_f32(a, j); \
+ a = vsetq_lane_f32(vgetq_lane_f32(a, i), a, j);\
+ a = vsetq_lane_f32(temp, a, i);
+
+/* Sort the 4 lanes of a with a 6-comparator bitonic network using scalar
+ * lane extraction. dir==1 sorts ascending, dir==0 descending: each pair is
+ * swapped when its current order disagrees with dir. */
+static float32x4_t arm_bitonic_sort_4_f32(float32x4_t a, uint8_t dir)
+{
+ float32_t temp;
+
+
+ /* Stage 1: sort lanes (0,1) and (2,3) */
+ if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
+ {
+ SWAP(a,0,1);
+ }
+ if( dir==(vgetq_lane_f32(a, 2) > vgetq_lane_f32(a, 3)) )
+ {
+ SWAP(a,2,3);
+ }
+
+ /* Stage 2: cross comparisons (0,3) and (1,2) */
+ if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 3)) )
+ {
+ SWAP(a,0,3);
+ }
+ if( dir==(vgetq_lane_f32(a, 1) > vgetq_lane_f32(a, 2)) )
+ {
+ SWAP(a,1,2);
+ }
+
+ /* Stage 3: final adjacent comparisons */
+ if( dir==(vgetq_lane_f32(a, 0) > vgetq_lane_f32(a, 1)) )
+ {
+ SWAP(a,0,1);
+ }
+ if( dir==(vgetq_lane_f32(a, 2)>vgetq_lane_f32(a, 3)) )
+ {
+ SWAP(a,2,3);
+ }
+
+ return a;
+}
+
+/* Sort 8 samples: sort each 4-lane vector independently, then merge the
+ * two sorted runs into one sorted 8-element result. */
+static float32x4x2_t arm_bitonic_sort_8_f32(float32x4_t a, float32x4_t b, uint8_t dir)
+{
+ a = arm_bitonic_sort_4_f32(a, dir);
+ b = arm_bitonic_sort_4_f32(b, dir);
+ return arm_bitonic_merge_8_f32(a, b, dir);
+}
+
+
+
+#endif
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @defgroup Sorting Vector sorting algorithms
+
+ Sort the elements of a vector
+
+ There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @private
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ */
+/* Public bitonic sort entry point. NEON path handles power-of-2 block sizes
+ * with specialized networks for 1/2/4/8/16+ elements.
+ * NOTE(review): the region below `blockSize >= 16` is corrupted in this
+ * vendored patch — an extraction bug stripped `<...>` spans, fusing the NEON
+ * merge loops and the non-NEON bubble-sort fallback together. The code is
+ * preserved byte-for-byte; it must be re-vendored from upstream CMSIS-DSP
+ * before this file can compile. */
+void arm_bitonic_sort_f32(
+const arm_sort_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint16_t s, i;
+ uint8_t dir = S->dir;
+
+#ifdef ARM_MATH_NEON
+ (void)s;
+
+ float32_t * pOut;
+ uint16_t counter = blockSize>>5;
+
+ if( (blockSize & (blockSize-1)) == 0 ) // Powers of 2 only
+ {
+ if(pSrc == pDst) // in-place
+ pOut = pSrc;
+ else
+ pOut = pDst;
+
+ float32x4x2_t ab1, ab2;
+ float32x4x2_t cd1, cd2;
+
+ if(blockSize == 1)
+ pOut = pSrc;
+ else if(blockSize == 2)
+ {
+ float32_t temp;
+
+ if( dir==(pSrc[0]>pSrc[1]) )
+ {
+ temp = pSrc[1];
+ pOut[1] = pSrc[0];
+ pOut[0] = temp;
+ }
+ else
+ pOut = pSrc;
+ }
+ else if(blockSize == 4)
+ {
+ float32x4_t a = vld1q_f32(pSrc);
+
+ a = arm_bitonic_sort_4_f32(a, dir);
+
+ vst1q_f32(pOut, a);
+ }
+ else if(blockSize == 8)
+ {
+ float32x4_t a;
+ float32x4_t b;
+ float32x4x2_t ab;
+
+ a = vld1q_f32(pSrc);
+ b = vld1q_f32(pSrc+4);
+
+ ab = arm_bitonic_sort_8_f32(a, b, dir);
+
+ vst1q_f32(pOut, ab.val[0]);
+ vst1q_f32(pOut+4, ab.val[1]);
+ }
+ else if(blockSize >=16)
+ {
+ // Order 16 bits long vectors
+ /* NOTE(review): the following lines are extraction-garbled remnants of
+  * the NEON merge loops and the generic fallback's declarations. */
+ for(i=0; i>1;
+ for(i=0; i>1;
+ for(i=0; i>1;
+ for(i=0; idir;
+ uint32_t i;
+ uint8_t swapped =1;
+ float32_t * pA;
+ float32_t temp;
+
+ if(pSrc != pDst) // out-of-place
+ {
+ memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+ pA = pDst;
+ }
+ else
+ pA = pSrc;
+
+ /* Generic fallback: bubble sort with early exit once a pass makes no swap */
+ while(swapped==1) // If nothing has been swapped after one loop stop
+ {
+ swapped=0;
+
+ for(i=0; ipA[i+1]))
+ {
+ // Swap
+ temp = pA[i];
+ pA[i] = pA[i+1];
+ pA[i+1] = temp;
+
+ // Update flag
+ swapped = 1;
+ }
+ }
+
+ /* Largest element is in place after each pass; shrink the scan range */
+ blockSize--;
+ }
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_copy_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_copy_f16.c
new file mode 100644
index 0000000000..82db245c69
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_copy_f16.c
@@ -0,0 +1,130 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_copy_f16.c
+ * Description: Copies the elements of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupSupport
+ */
+
+
+/**
+ @addtogroup copy
+ @{
+ */
+
+/**
+ @brief Copies the elements of a f16 vector.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_copy_f16(
+ const float16_t * pSrc,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ do {
+ mve_pred16_t p = vctp16q(blockSize);
+
+ vstrhq_p_f16(pDst,
+ vldrhq_z_f16((float16_t const *) pSrc, p), p);
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrc += 8;
+ pDst += 8;
+ blockSize -= 8;
+ }
+ while ((int32_t) blockSize > 0);
+}
+
+#else
+
+void arm_copy_f16(
+ const float16_t * pSrc,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of BasicCopy group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c
new file mode 100644
index 0000000000..8df7e8b250
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_copy_f32.c
@@ -0,0 +1,192 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_copy_f32.c
+ * Description: Copies the elements of a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @defgroup copy Vector Copy
+
+ Copies sample by sample from source vector to destination vector.
+
+
+ pDst[n] = pSrc[n]; 0 <= n < blockSize.
+
+
+ There are separate functions for floating point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup copy
+ @{
+ */
+
+/**
+ @brief Copies the elements of a floating-point vector.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_copy_f32(
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time */
+ while (blkCnt > 0U)
+ {
+ vstrwq_f32(pDst, vldrwq_f32(pSrc));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrc += 4;
+ pDst += 4;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 3;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_copy_f32(
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counter */
+
+ float32x4_t inV;
+
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+ /* Copy and then store the results in the destination buffer */
+ inV = vld1q_f32(pSrc);
+ vst1q_f32(pDst, inV);
+ pSrc += 4;
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 3;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+ /* Copy and then store the results in the destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_copy_f32(
+ const float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of BasicCopy group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c
new file mode 100644
index 0000000000..c657b54186
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_copy_q15.c
@@ -0,0 +1,130 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_copy_q15.c
+ * Description: Copies the elements of a Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup copy
+ @{
+ */
+
+/**
+ @brief Copies the elements of a Q15 vector.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_copy_q15(
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0U)
+ {
+ vstrhq_s16(pDst,vldrhq_s16(pSrc));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrc += 8;
+ pDst += 8;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 7;
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_copy_q15(
+ const q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* read 2 times 2 samples at a time */
+ write_q15x2_ia (&pDst, read_q15x2_ia ((q15_t **) &pSrc));
+ write_q15x2_ia (&pDst, read_q15x2_ia ((q15_t **) &pSrc));
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of BasicCopy group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c
new file mode 100644
index 0000000000..0e2ae43f3e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_copy_q31.c
@@ -0,0 +1,135 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_copy_q31.c
+ * Description: Copies the elements of a Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup copy
+ @{
+ */
+
+/**
+ @brief Copies the elements of a Q31 vector.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_copy_q31(
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time */
+ while (blkCnt > 0U)
+ {
+ vstrwq_s32(pDst,vldrwq_s32(pSrc));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrc += 4;
+ pDst += 4;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+
+#else
+void arm_copy_q31(
+ const q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of BasicCopy group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c
new file mode 100644
index 0000000000..150f1986f0
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_copy_q7.c
@@ -0,0 +1,132 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_copy_q7.c
+ * Description: Copies the elements of a Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup copy
+ @{
+ */
+
+/**
+ @brief Copies the elements of a Q7 vector.
+ @param[in] pSrc points to input vector
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_copy_q7(
+ const q7_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+
+ uint32_t blkCnt;
+
+ blkCnt = blockSize >> 4;
+ while (blkCnt > 0U)
+ {
+
+ vstrbq_s8(pDst,vldrbq_s8(pSrc));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrc += 16;
+ pDst += 16;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 0xF;
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+#else
+void arm_copy_q7(
+ const q7_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* read 4 samples at a time */
+ write_q7x4_ia (&pDst, read_q7x4_ia ((q7_t **) &pSrc));
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A */
+
+ /* Copy and store result in destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of BasicCopy group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_float.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_float.c
new file mode 100644
index 0000000000..c159dad4d5
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_float.c
@@ -0,0 +1,134 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title:        arm_f16_to_float.c
+ * Description:  Converts the elements of the f16 vector to f32 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ * @defgroup f16_to_x Convert 16-bit floating point value
+ */
+
+/**
+ @addtogroup f16_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the f16 vector to f32 vector.
+ @param[in] pSrc points to the f16 input vector
+ @param[out] pDst points to the f32 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_f16_to_float built. Helium version has build issues with gcc."
+#endif
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
+
+void arm_f16_to_float(
+ const float16_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ int32_t blkCnt; /* loop counters */
+ float16x8_t vecDst;
+ float32x4x2_t tmp;
+
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0)
+ {
+ vecDst = vldrhq_f16(pSrc);
+ pSrc += 8;
+
+ tmp.val[0] = vcvtbq_f32_f16(vecDst);
+ tmp.val[1] = vcvttq_f32_f16(vecDst);
+ vst2q(pDst,tmp);
+
+ pDst += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = blockSize & 7;
+ while (blkCnt > 0)
+ {
+
+ *pDst++ = (float32_t) *pSrc++;
+ /*
+ * Decrement the loop counter
+ */
+ blkCnt--;
+ }
+}
+
+#else
+void arm_f16_to_float(
+ const float16_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ const float16_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ /*
+ * Loop over blockSize number of values
+ */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+
+ *pDst++ = (float32_t) * pIn++;
+ /*
+ * Decrement the loop counter
+ */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of f16_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_q15.c
new file mode 100644
index 0000000000..8931dbbeeb
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_f16_to_q15.c
@@ -0,0 +1,157 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title:        arm_f16_to_q15.c
+ * Description:  Converts the elements of the f16 vector to Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup f16_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the f16 vector to Q15 vector.
+ @param[in] pSrc points to the f16 input vector
+ @param[out] pDst points to the Q15 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
+ pDst[n] = (q15_t)(pSrc[n] * 32768); 0 <= n < blockSize.
+
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+
+ @note
+ In order to apply rounding in scalar version, the library should be rebuilt with the ROUNDING macro
+ defined in the preprocessor section of project options.
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_f16_to_q15(
+ const float16_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ float16_t maxQ = (float16_t) Q15_MAX;
+ float16x8_t vecDst;
+
+
+ do {
+ mve_pred16_t p = vctp16q(blockSize);
+
+ vecDst = vldrhq_z_f16((float16_t const *) pSrc, p);
+ /* C = A * 32767 */
+ /* convert from float to Q15 and then store the results in the destination buffer */
+ vecDst = vmulq_m(vuninitializedq_f16(), vecDst, maxQ, p);
+
+ vstrhq_p_s16(pDst,
+ vcvtaq_m(vuninitializedq_s16(), vecDst, p), p);
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrc += 8;
+ pDst += 8;
+ blockSize -= 8;
+ }
+ while ((int32_t) blockSize > 0);
+}
+
+#else
+
+void arm_f16_to_q15(
+ const float16_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ const float16_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+#ifdef ARM_MATH_ROUNDING
+ float16_t in;
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /*
+ * Loop over blockSize number of values
+ */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+
+ /*
+ * C = A * 65536
+ */
+ /*
+ * convert from float to Q31 and then store the results in the destination buffer
+ */
+ in = *pIn++;
+ in = (in * 32768.0);
+ in += in > 0.0 ? 0.5 : -0.5;
+ *pDst++ = clip_q31_to_q15((q31_t) (in));
+
+#else
+
+ /*
+ * C = A * 32768
+ */
+ /*
+ * convert from float to Q31 and then store the results in the destination buffer
+ */
+ *pDst++ = clip_q31_to_q15((q31_t) (*pIn++ * 32768.0));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /*
+ * Decrement the loop counter
+ */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of f16_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_fill_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_fill_f16.c
new file mode 100644
index 0000000000..f359c6de1e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_fill_f16.c
@@ -0,0 +1,127 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fill_f16.c
+ * Description: Fills a constant value into a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupSupport
+ */
+
+
+/**
+ @addtogroup Fill
+ @{
+ */
+
+/**
+ @brief Fills a constant value into a f16 vector.
+ @param[in] value input value to be filled
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_fill_f16(
+ float16_t value,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ do {
+ mve_pred16_t p = vctp16q(blockSize);
+
+ vstrhq_p_f16(pDst,
+ vdupq_m_n_f16(vuninitializedq_f16(), value, p), p);
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pDst += 8;
+ blockSize -= 8;
+ }
+ while ((int32_t) blockSize > 0);
+}
+#else
+void arm_fill_f16(
+ float16_t value,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of Fill group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c
new file mode 100644
index 0000000000..50cdd8fc78
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_fill_f32.c
@@ -0,0 +1,189 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fill_f32.c
+ * Description: Fills a constant value into a floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @defgroup Fill Vector Fill
+
+ Fills the destination vector with a constant value.
+
+
+ pDst[n] = value; 0 <= n < blockSize.
+
+
+ There are separate functions for floating point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ @addtogroup Fill
+ @{
+ */
+
+/**
+ @brief Fills a constant value into a floating-point vector.
+ @param[in] value input value to be filled
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_fill_f32(
+ float32_t value,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time */
+ while (blkCnt > 0U)
+ {
+
+ vstrwq_f32(pDst,vdupq_n_f32(value));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pDst += 4;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 3;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_fill_f32(
+ float32_t value,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counter */
+
+
+ float32x4_t inV = vdupq_n_f32(value);
+
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+ /* Fill the value in the destination buffer */
+ vst1q_f32(pDst, inV);
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 3;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+ /* Fill the value in the destination buffer */
+ *pDst++ = value;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_fill_f32(
+ float32_t value,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of Fill group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c
new file mode 100644
index 0000000000..964c09fc0e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_fill_q15.c
@@ -0,0 +1,134 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fill_q15.c
+ * Description: Fills a constant value into a Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Fill
+ @{
+ */
+
+/**
+ @brief Fills a constant value into a Q15 vector.
+ @param[in] value input value to be filled
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* Helium/MVE implementation: stores 8 q15 lanes per iteration via a
+   splatted vector, then fills the 0..7 leftover samples scalar-wise. */
+void arm_fill_q15(
+ q15_t value,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0U)
+ {
+
+ /* Broadcast `value` into all 8 halfword lanes and store them at once */
+ vstrhq_s16(pDst,vdupq_n_s16(value));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pDst += 8;
+ blkCnt --;
+ }
+
+ /* Tail: remaining blockSize mod 8 samples */
+ blkCnt = blockSize & 7;
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+#else
+/* Portable implementation: fills blockSize q15 samples with `value`.
+   With ARM_MATH_LOOPUNROLL it duplicates the value into a 32-bit word
+   (two q15 halves) and emits 32-bit SIMD-friendly stores. */
+void arm_fill_q15(
+ q15_t value,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t packedValue; /* value packed to 32 bits */
+
+ /* Packing two 16 bit values to 32 bit value in order to use SIMD */
+ packedValue = __PKHBT(value, value, 16U);
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* fill 2 times 2 samples at a time */
+ write_q15x2_ia (&pDst, packedValue);
+ write_q15x2_ia (&pDst, packedValue);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs (tail of 0..3 samples) */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Fill group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c
new file mode 100644
index 0000000000..50e7b08a87
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_fill_q31.c
@@ -0,0 +1,135 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fill_q31.c
+ * Description: Fills a constant value into a Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Fill
+ @{
+ */
+
+/**
+ @brief Fills a constant value into a Q31 vector.
+ @param[in] value input value to be filled
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* Helium/MVE implementation: stores 4 q31 lanes per iteration via a
+   splatted vector, then fills the 0..3 leftover samples scalar-wise. */
+void arm_fill_q31(
+ q31_t value,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time */
+ while (blkCnt > 0U)
+ {
+
+ /* Broadcast `value` into all 4 word lanes and store them at once */
+ vstrwq_s32(pDst,vdupq_n_s32(value));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pDst += 4;
+ blkCnt --;
+ }
+
+ /* Tail: remaining blockSize mod 4 samples */
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+
+#else
+/* Portable scalar implementation: writes `value` into each of the
+   blockSize q31 elements of pDst. */
+void arm_fill_q31(
+ q31_t value,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs (tail of 0..3 samples) */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Fill group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c
new file mode 100644
index 0000000000..00d0a25c99
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_fill_q7.c
@@ -0,0 +1,133 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_fill_q7.c
+ * Description: Fills a constant value into a Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Fill
+ @{
+ */
+
+/**
+ @brief Fills a constant value into a Q7 vector.
+ @param[in] value input value to be filled
+ @param[out] pDst points to output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* Helium/MVE implementation: stores 16 q7 lanes per iteration via a
+   splatted vector, then fills the 0..15 leftover samples scalar-wise. */
+void arm_fill_q7(
+ q7_t value,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+
+ blkCnt = blockSize >> 4;
+ while (blkCnt > 0U)
+ {
+
+ /* Broadcast `value` into all 16 byte lanes and store them at once */
+ vstrbq_s8(pDst,vdupq_n_s8(value));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pDst += 16;
+ blkCnt --;
+ }
+
+ /* Tail: remaining blockSize mod 16 samples */
+ blkCnt = blockSize & 0xF;
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+/* Portable implementation: fills blockSize q7 samples with `value`.
+   With ARM_MATH_LOOPUNROLL it replicates the byte four times into a
+   32-bit word and emits one 32-bit store per 4 samples. */
+void arm_fill_q7(
+ q7_t value,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t packedValue; /* value packed to 32 bits */
+
+ /* Packing four 8 bit values to 32 bit value in order to use SIMD */
+ packedValue = __PACKq7(value, value, value, value);
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* fill 4 samples at a time */
+ write_q7x4_ia (&pDst, packedValue);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs (tail of 0..3 samples) */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = value */
+
+ /* Fill value in destination buffer */
+ *pDst++ = value;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of Fill group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_float_to_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_float_to_f16.c
new file mode 100644
index 0000000000..ca4f477ffc
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_float_to_f16.c
@@ -0,0 +1,131 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_float_to_f16.c
+ * Description: Converts the elements of the floating-point vector to f16 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup float_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the floating-point vector to f16 vector.
+ @param[in] pSrc points to the f32 input vector
+ @param[out] pDst points to the f16 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && defined(__CMSIS_GCC_H)
+#pragma GCC warning "Scalar version of arm_float_to_f16 built. Helium version has build issues with gcc."
+#endif
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) && !defined(__CMSIS_GCC_H)
+
+/* Helium/MVE implementation: converts 8 float32 samples per iteration.
+   vld2q de-interleaves the input into even-indexed (val[0]) and
+   odd-indexed (val[1]) elements; vcvtbq writes the bottom (even) f16
+   lanes and vcvttq the top (odd) lanes, so output ordering matches the
+   input. */
+void arm_float_to_f16(
+ const float32_t * pSrc,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ int32_t blkCnt; /* loop counters */
+ float32x4x2_t tmp;
+ float16x8_t vecDst;
+ float32_t const *pSrcVec;
+
+
+ pSrcVec = (float32_t const *) pSrc;
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0)
+ {
+ /* convert from float32 to float16 and then store the results in the destination buffer */
+ tmp = vld2q(pSrcVec); pSrcVec += 8;
+ /* narrow / merge */
+ vecDst = vcvtbq_f16_f32(vecDst, tmp.val[0]);
+ vecDst = vcvttq_f16_f32(vecDst, tmp.val[1]);
+ vst1q(pDst, vecDst); pDst += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ /*
+ * tail
+ */
+ blkCnt = blockSize & 7;
+ if (blkCnt > 0)
+ {
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ /* NOTE(review): only the store is predicated; vld2q still reads a
+    full 8 floats past the valid tail — confirm source buffers are
+    sized/aligned so this over-read is safe (upstream CMSIS behavior). */
+ tmp = vld2q(pSrcVec);
+ vecDst = vcvtbq_f16_f32(vecDst, tmp.val[0]);
+ vecDst = vcvttq_f16_f32(vecDst, tmp.val[1]);
+ vstrhq_p(pDst, vecDst, p0);
+ }
+}
+
+#else
+
+/* Portable scalar implementation: narrows each float32 sample to
+   float16 via a C cast, one sample per iteration. */
+void arm_float_to_f16(
+ const float32_t * pSrc,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ const float32_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ /*
+ * Loop over blockSize number of values
+ */
+ blkCnt = blockSize;
+
+ while (blkCnt > 0U)
+ {
+
+ /* Narrowing conversion float32 -> float16 */
+ *pDst++ = (float16_t) * pIn++;
+ /*
+ * Decrement the loop counter
+ */
+ blkCnt--;
+ }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of float_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c
new file mode 100644
index 0000000000..41d8d58cf9
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q15.c
@@ -0,0 +1,308 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_float_to_q15.c
+ * Description: Converts the elements of the floating-point vector to Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup float_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the floating-point vector to Q15 vector.
+ @param[in] pSrc points to the floating-point input vector
+ @param[out] pDst points to the Q15 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
+ pDst[n] = (q15_t)(pSrc[n] * 32768); 0 <= n < blockSize.
+
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q15 range [0x8000 0x7FFF] are saturated.
+
+ @note
+ In order to apply rounding, the library should be rebuilt with the ROUNDING macro
+ defined in the preprocessor section of project options.
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* Helium/MVE implementation: converts 8 samples per iteration.
+   vld2q splits even/odd elements; each half is scaled, converted with
+   round-to-nearest (vcvtaq) and saturating-narrowed into the bottom
+   (vqmovnbq) / top (vqmovntq) q15 lanes, preserving sample order. */
+void arm_float_to_q15(
+ const float32_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ /* NOTE(review): vector path scales by Q15_MAX (32767) while the scalar
+    tail below scales by 32768 — results can differ by 1 LSB between the
+    two paths; this matches upstream CMSIS but verify if bit-exactness
+    with the scalar build matters. */
+ float32_t maxQ = (float32_t) Q15_MAX;
+ f32x4x2_t tmp;
+ q15x8_t vecDst;
+#ifdef ARM_MATH_ROUNDING
+ float32_t in;
+#endif
+
+
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0U)
+ {
+ /* C = A * 32768 */
+ /* convert from float to q15 and then store the results in the destination buffer */
+ tmp = vld2q(pSrc);
+
+ tmp.val[0] = vmulq(tmp.val[0], maxQ);
+ tmp.val[1] = vmulq(tmp.val[1], maxQ);
+
+ vecDst = vqmovnbq(vecDst, vcvtaq_s32_f32(tmp.val[0]));
+ vecDst = vqmovntq(vecDst, vcvtaq_s32_f32(tmp.val[1]));
+ vst1q(pDst, vecDst);
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ pDst += 8;
+ pSrc += 8;
+ }
+
+ /* Tail: remaining blockSize mod 8 samples, converted scalar-wise */
+ blkCnt = blockSize & 7;
+ while (blkCnt > 0U)
+ {
+ /* C = A * 32768 */
+
+ /* convert from float to Q15 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pSrc++ * 32768.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+#else
+
+ /* C = A * 32768 */
+ /* Convert from float to q15 and then store the results in the destination buffer */
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pSrc++ * 32768.0f), 16);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+/* Neon implementation: converts 4 samples per iteration.
+   vcvtq_n_s32_f32(x, 15) is the fixed-point convert with 15 fractional
+   bits, i.e. multiply by 32768 then truncate to int32; vqmovn_s32 then
+   saturating-narrows to q15. */
+void arm_float_to_q15(
+ const float32_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ const float32_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ float32x4_t inV;
+ #ifdef ARM_MATH_ROUNDING
+ float32x4_t zeroV = vdupq_n_f32(0.0f);
+ /* +/- half an LSB expressed in the pre-scaled float domain */
+ float32x4_t pHalf = vdupq_n_f32(0.5f / 32768.0f);
+ float32x4_t mHalf = vdupq_n_f32(-0.5f / 32768.0f);
+ float32x4_t r;
+ uint32x4_t cmp;
+ float32_t in;
+ #endif
+
+ int32x4_t cvt;
+ int16x4_t outV;
+
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+ /* C = A * 32768 */
+ /* Convert from float to q15 and then store the results in the destination buffer */
+ inV = vld1q_f32(pIn);
+ /* Select +half for positive lanes, -half for the rest (round to nearest) */
+ cmp = vcgtq_f32(inV,zeroV);
+ r = vbslq_f32(cmp,pHalf,mHalf);
+ inV = vaddq_f32(inV, r);
+
+ pIn += 4;
+
+ cvt = vcvtq_n_s32_f32(inV,15);
+ outV = vqmovn_s32(cvt);
+
+ vst1_s16(pDst, outV);
+ pDst += 4;
+
+#else
+
+ /* C = A * 32768 */
+ /* Convert from float to q15 and then store the results in the destination buffer */
+ inV = vld1q_f32(pIn);
+
+ cvt = vcvtq_n_s32_f32(inV,15);
+ outV = vqmovn_s32(cvt);
+
+ vst1_s16(pDst, outV);
+ pDst += 4;
+ pIn += 4;
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 3;
+
+ while (blkCnt > 0U)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+ /* C = A * 32768 */
+ /* Convert from float to q15 and then store the results in the destination buffer */
+ in = *pIn++;
+ in = (in * 32768.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+#else
+
+ /* C = A * 32768 */
+ /* Convert from float to q15 and then store the results in the destination buffer */
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+#else
+/* Portable scalar implementation: out = __SSAT(in * 32768, 16), with
+   optional round-to-nearest when ARM_MATH_ROUNDING is defined. */
+void arm_float_to_q15(
+ const float32_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const float32_t *pIn = pSrc; /* Source pointer */
+
+#ifdef ARM_MATH_ROUNDING
+ float32_t in;
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * 32768 */
+
+ /* convert from float to Q15 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pIn++ * 32768.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+ in = (*pIn++ * 32768.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+ in = (*pIn++ * 32768.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+ in = (*pIn++ * 32768.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+#else
+
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs (tail of 0..3 samples) */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * 32768 */
+
+ /* convert from float to Q15 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pIn++ * 32768.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+#else
+
+ /* C = A * 32768 */
+ /* Convert from float to q15 and then store the results in the destination buffer */
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of float_to_x group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c
new file mode 100644
index 0000000000..a5a4fe43cf
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q31.c
@@ -0,0 +1,314 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_float_to_q31.c
+ * Description: Converts the elements of the floating-point vector to Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ * @defgroup float_to_x Convert 32-bit floating point value
+ */
+
+/**
+ @addtogroup float_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the floating-point vector to Q31 vector.
+ @param[in] pSrc points to the floating-point input vector
+ @param[out] pDst points to the Q31 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
+ pDst[n] = (q31_t)(pSrc[n] * 2147483648); 0 <= n < blockSize.
+
+
+ @par Scaling and Overflow Behavior
+ The function uses saturating arithmetic.
+ Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] are saturated.
+
+ @note
+ In order to apply rounding, the library should be rebuilt with the ROUNDING macro
+ defined in the preprocessor section of project options.
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* Helium/MVE implementation: converts 4 samples per iteration by
+   scaling with Q31_MAX and converting with round-to-nearest (vcvtaq);
+   the 0..3 leftover samples use the scalar clip_q63_to_q31 path. */
+void arm_float_to_q31(
+ const float32_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ /* NOTE(review): vector path scales by Q31_MAX (2^31 - 1) while the
+    scalar tail scales by 2147483648.0f (2^31) — can differ by 1 LSB
+    between paths; matches upstream CMSIS. */
+ float32_t maxQ = (float32_t) Q31_MAX;
+ f32x4_t vecDst;
+#ifdef ARM_MATH_ROUNDING
+ float32_t in;
+#endif
+
+
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time. */
+ while (blkCnt > 0U)
+ {
+
+ vecDst = vldrwq_f32(pSrc);
+ /* C = A * 2147483648 */
+ /* convert from float to Q31 and then store the results in the destination buffer */
+ vecDst = vmulq(vecDst, maxQ);
+
+ vstrwq_s32(pDst, vcvtaq_s32_f32(vecDst));
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pSrc += 4;
+ pDst += 4;
+ blkCnt --;
+ }
+
+ /* Tail: remaining blockSize mod 4 samples */
+ blkCnt = blockSize & 3;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * 2147483648 */
+
+ /* convert from float to Q31 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pSrc++ * 2147483648.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+#else
+
+ /* C = A * 2147483648 */
+ /* Convert from float to Q31 and then store the results in the destination buffer */
+ *pDst++ = clip_q63_to_q31((q63_t) (*pSrc++ * 2147483648.0f));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+#if defined(ARM_MATH_NEON)
+/* Neon implementation: converts 4 samples per iteration.
+   vcvtq_n_s32_f32(x, 31) is the fixed-point convert with 31 fractional
+   bits, i.e. multiply by 2^31 with saturation into int32. */
+void arm_float_to_q31(
+ const float32_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ const float32_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ float32x4_t inV;
+ #ifdef ARM_MATH_ROUNDING
+ float32_t in;
+ float32x4_t zeroV = vdupq_n_f32(0.0f);
+ /* +/- half an LSB expressed in the pre-scaled float domain */
+ float32x4_t pHalf = vdupq_n_f32(0.5f / 2147483648.0f);
+ float32x4_t mHalf = vdupq_n_f32(-0.5f / 2147483648.0f);
+ float32x4_t r;
+ uint32x4_t cmp;
+ #endif
+
+ int32x4_t outV;
+
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+
+ /* C = A * 2147483648 */
+ /* Convert from float to Q31 and then store the results in the destination buffer */
+ inV = vld1q_f32(pIn);
+ /* Select +half for positive lanes, -half for the rest (round to nearest) */
+ cmp = vcgtq_f32(inV,zeroV);
+ r = vbslq_f32(cmp,pHalf,mHalf);
+ inV = vaddq_f32(inV, r);
+
+ pIn += 4;
+
+ outV = vcvtq_n_s32_f32(inV,31);
+
+ vst1q_s32(pDst, outV);
+ pDst += 4;
+
+#else
+
+ /* C = A * 2147483648 */
+ /* Convert from float to Q31 and then store the results in the destination buffer */
+ inV = vld1q_f32(pIn);
+
+ outV = vcvtq_n_s32_f32(inV,31);
+
+ vst1q_s32(pDst, outV);
+ pDst += 4;
+ pIn += 4;
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 3;
+
+ while (blkCnt > 0U)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+
+ /* C = A * 2147483648 */
+ /* Convert from float to Q31 and then store the results in the destination buffer */
+ in = *pIn++;
+ in = (in * 2147483648.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+#else
+
+ /* C = A * 2147483648 */
+ /* Convert from float to Q31 and then store the results in the destination buffer */
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+
+}
+#else
+/* Portable scalar implementation: out = clip_q63_to_q31(in * 2^31),
+   with optional round-to-nearest when ARM_MATH_ROUNDING is defined. */
+void arm_float_to_q31(
+ const float32_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const float32_t *pIn = pSrc; /* Source pointer */
+
+#ifdef ARM_MATH_ROUNDING
+ float32_t in;
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * 2147483648 */
+
+ /* convert from float to Q31 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pIn++ * 2147483648.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+ in = (*pIn++ * 2147483648.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+ in = (*pIn++ * 2147483648.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+ in = (*pIn++ * 2147483648.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+#else
+
+ /* C = A * 2147483648 */
+ /* Convert from float to Q31 and then store the results in the destination buffer */
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs (tail of 0..3 samples) */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * 2147483648 */
+
+ /* convert from float to Q31 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pIn++ * 2147483648.0f);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+#else
+
+ /* C = A * 2147483648 */
+ /* Convert from float to Q31 and then store the results in the destination buffer */
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of float_to_x group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c
new file mode 100644
index 0000000000..a85ba639cf
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_float_to_q7.c
@@ -0,0 +1,330 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_float_to_q7.c
+ * Description: Converts the elements of the floating-point vector to Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup float_to_x
+ @{
+ */
+
+/**
+ * @brief Converts the elements of the floating-point vector to Q7 vector.
+ * @param[in] *pSrc points to the floating-point input vector
+ * @param[out] *pDst points to the Q7 output vector
+ * @param[in] blockSize length of the input vector
+ * @return none.
+ *
+ *\par Description:
+ * \par
+ * The equation used for the conversion process is:
+ *
+ * pDst[n] = (q7_t)(pSrc[n] * 128); 0 <= n < blockSize.
+ *
+ * \par Scaling and Overflow Behavior:
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
+ * \note
+ * In order to apply rounding, the library should be rebuilt with the ROUNDING macro
+ * defined in the preprocessor section of project options.
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_float_to_q7(
+ const float32_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counters */
+ /* Scale factor 2^7 = 128 mapping [-1, +1) float values onto the Q7 range */
+ float32_t maxQ = powf(2.0, 7);
+ f32x4x4_t tmp;
+ q15x8_t evVec, oddVec;
+ q7x16_t vecDst;
+ float32_t const *pSrcVec;
+#ifdef ARM_MATH_ROUNDING
+ float32_t in;
+#endif
+
+ pSrcVec = (float32_t const *) pSrc;
+ /* 16 samples per vectorized iteration */
+ blkCnt = blockSize >> 4;
+ while (blkCnt > 0U) {
+ /* de-interleaving load: val[k] holds samples k, k+4, k+8, k+12 */
+ tmp = vld4q(pSrcVec);
+ pSrcVec += 16;
+ /*
+ * C = A * 128.0
+ * convert from float to q7 and then store the results in the destination buffer
+ */
+ tmp.val[0] = vmulq(tmp.val[0], maxQ);
+ tmp.val[1] = vmulq(tmp.val[1], maxQ);
+ tmp.val[2] = vmulq(tmp.val[2], maxQ);
+ tmp.val[3] = vmulq(tmp.val[3], maxQ);
+
+ /*
+ * convert and pack evens
+ * (vqmovnbq writes the bottom half-lanes, vqmovntq the top ones;
+ * together they define every lane, so the initially-uninitialized
+ * evVec/oddVec/vecDst contents never reach the output)
+ */
+ evVec = vqmovnbq(evVec, vcvtaq_s32_f32(tmp.val[0]));
+ evVec = vqmovntq(evVec, vcvtaq_s32_f32(tmp.val[2]));
+ /*
+ * convert and pack odds
+ */
+ oddVec = vqmovnbq(oddVec, vcvtaq_s32_f32(tmp.val[1]));
+ oddVec = vqmovntq(oddVec, vcvtaq_s32_f32(tmp.val[3]));
+ /*
+ * merge
+ */
+ vecDst = vqmovnbq(vecDst, evVec);
+ vecDst = vqmovntq(vecDst, oddVec);
+
+ vst1q(pDst, vecDst);
+ pDst += 16;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ /* Residual 1..15 samples handled scalar */
+ blkCnt = blockSize & 0xF;
+ while (blkCnt > 0U)
+ {
+ /* C = A * 128 */
+
+ /* Convert from float to q7 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pSrcVec++ * 128);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+#else
+
+ *pDst++ = (q7_t) __SSAT((q31_t) (*pSrcVec++ * 128.0f), 8);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#else
+#if defined(ARM_MATH_NEON)
+void arm_float_to_q7(
+ const float32_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ const float32_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ float32x4_t inV;
+ #ifdef ARM_MATH_ROUNDING
+ float32_t in;
+ float32x4_t zeroV = vdupq_n_f32(0.0f);
+ /* +/-0.5/128 is half a Q7 LSB: adding it before the 7-fractional-bit
+ convert (vcvtq_n_s32_f32(.,7), i.e. scale by 128) rounds to nearest */
+ float32x4_t pHalf = vdupq_n_f32(0.5f / 128.0f);
+ float32x4_t mHalf = vdupq_n_f32(-0.5f / 128.0f);
+ float32x4_t r;
+ uint32x4_t cmp;
+ #endif
+
+ int16x4_t cvt1,cvt2;
+ int8x8_t outV;
+
+ blkCnt = blockSize >> 3U;
+
+ /* Compute 8 outputs at a time.
+ ** a second loop below computes the remaining 1 to 7 samples. */
+ while (blkCnt > 0U)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+ /* C = A * 128 */
+ /* Convert from float to q7 and then store the results in the destination buffer */
+ inV = vld1q_f32(pIn);
+ cmp = vcgtq_f32(inV,zeroV);
+ r = vbslq_f32(cmp,pHalf,mHalf);
+ inV = vaddq_f32(inV, r);
+ cvt1 = vqmovn_s32(vcvtq_n_s32_f32(inV,7));
+ pIn += 4;
+
+ inV = vld1q_f32(pIn);
+ cmp = vcgtq_f32(inV,zeroV);
+ r = vbslq_f32(cmp,pHalf,mHalf);
+ inV = vaddq_f32(inV, r);
+ cvt2 = vqmovn_s32(vcvtq_n_s32_f32(inV,7));
+ pIn += 4;
+
+ outV = vqmovn_s16(vcombine_s16(cvt1,cvt2));
+ vst1_s8(pDst, outV);
+ pDst += 8;
+
+#else
+
+ /* C = A * 128 */
+ /* Convert from float to q7 and then store the results in the destination buffer */
+ inV = vld1q_f32(pIn);
+ cvt1 = vqmovn_s32(vcvtq_n_s32_f32(inV,7));
+ pIn += 4;
+
+ inV = vld1q_f32(pIn);
+ cvt2 = vqmovn_s32(vcvtq_n_s32_f32(inV,7));
+ pIn += 4;
+
+ /* saturating narrow 32->16->8 yields the clipped Q7 results */
+ outV = vqmovn_s16(vcombine_s16(cvt1,cvt2));
+
+ vst1_s8(pDst, outV);
+ pDst += 8;
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 7;
+
+ while (blkCnt > 0U)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+ /* C = A * 128 */
+ /* Convert from float to q7 and then store the results in the destination buffer */
+ in = *pIn++;
+ in = (in * 128);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+#else
+
+ /* C = A * 128 */
+ /* Convert from float to q7 and then store the results in the destination buffer */
+ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+}
+#else
+void arm_float_to_q7(
+ const float32_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ /* Portable scalar fallback (optionally 4x unrolled); saturates via __SSAT to [-128, 127] */
+ uint32_t blkCnt; /* Loop counter */
+ const float32_t *pIn = pSrc; /* Source pointer */
+
+#ifdef ARM_MATH_ROUNDING
+ float32_t in;
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * 128 */
+
+ /* Convert from float to q7 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ /* add +/-0.5 before truncation => round-to-nearest */
+ in = (*pIn++ * 128);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+ in = (*pIn++ * 128);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+ in = (*pIn++ * 128);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+ in = (*pIn++ * 128);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+#else
+
+ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = A * 128 */
+
+ /* Convert from float to q7 and store result in destination buffer */
+#ifdef ARM_MATH_ROUNDING
+
+ in = (*pIn++ * 128);
+ in += in > 0.0f ? 0.5f : -0.5f;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+#else
+
+ *pDst++ = (q7_t) __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of float_to_x group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c
new file mode 100644
index 0000000000..b5173d2e78
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_heap_sort_f32.c
@@ -0,0 +1,119 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_heap_sort_f32.c
+ * Description: Floating point heap sort
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include "arm_sorting.h"
+
+
+
+static void arm_heapify(float32_t * pSrc, uint32_t n, uint32_t i, uint8_t dir)
+{
+ /* Put all the elements of pSrc in heap order */
+ /* Sift pSrc[i] down within the first n elements.
+ The comparison dir==(child > parent) selects the LARGER child
+ when dir is 1 (max-heap) and the SMALLER child when dir is 0
+ (min-heap), so one routine serves both sort directions. */
+ uint32_t k = i; // Initialize largest/smallest as root
+ uint32_t l = 2*i + 1; // left = 2*i + 1
+ uint32_t r = 2*i + 2; // right = 2*i + 2
+ float32_t temp;
+
+ if (l < n && dir==(pSrc[l] > pSrc[k]) )
+ k = l;
+
+ if (r < n && dir==(pSrc[r] > pSrc[k]) )
+ k = r;
+
+ if (k != i)
+ {
+ // Swap the root with the extremal child, then continue sifting
+ // down from the child's position (recursion depth is O(log n))
+ temp = pSrc[i];
+ pSrc[i]=pSrc[k];
+ pSrc[k]=temp;
+
+ arm_heapify(pSrc, n, k, dir);
+ }
+}
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @private
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The heap sort algorithm is a comparison algorithm that
+ * divides the input array into a sorted and an unsorted region,
+ * and shrinks the unsorted region by extracting the largest
+ * element and moving it to the sorted region. A heap data
+ * structure is used to find the maximum.
+ *
+ * @par It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed.
+ */
+void arm_heap_sort_f32(
+ const arm_sort_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ /* Heap sort: build a heap in place, then repeatedly swap the root
+ (current extremum) to the end of the unsorted region and re-heapify. */
+ float32_t * pA;
+ int32_t i;
+ float32_t temp;
+
+ if(pSrc != pDst) // out-of-place
+ {
+ memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+ pA = pDst;
+ }
+ else
+ pA = pSrc;
+
+ // Build the heap array so that the largest value is the root
+ for (i = blockSize/2 - 1; i >= 0; i--)
+ arm_heapify(pA, blockSize, i, S->dir);
+
+ // Extraction phase; the i == 0 iteration is a harmless self-swap
+ for (i = blockSize - 1; i >= 0; i--)
+ {
+ // Swap
+ temp = pA[i];
+ pA[i] = pA[0];
+ pA[0] = temp;
+
+ // Restore heap order over the remaining i elements
+ arm_heapify(pA, i, 0, S->dir);
+ }
+}
+/**
+ @} end of Sorting group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c
new file mode 100644
index 0000000000..91c0fc7bdd
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_insertion_sort_f32.c
@@ -0,0 +1,93 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_insertion_sort_f32.c
+ * Description: Floating point insertion sort
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @private
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The insertion sort is a simple sorting algorithm that
+ * reads all the element of the input array and removes one element
+ * at a time, finds the location it belongs in the final sorted list,
+ * and inserts it there.
+ *
+ * @par It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed.
+ */
+
+void arm_insertion_sort_f32(
+ const arm_sort_instance_f32 * S,
+ float32_t *pSrc,
+ float32_t* pDst,
+ uint32_t blockSize)
+{
+ float32_t * pA;
+ uint8_t dir = S->dir;
+ uint32_t i, j;
+ float32_t temp;
+
+ if(pSrc != pDst) // out-of-place
+ {
+ memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+ pA = pDst;
+ }
+ else
+ pA = pSrc;
+
+ // Read all the elements of the input array
+ for(i=0; i<blockSize; i++)
+ {
+ // Move the current element to its position in the sorted left part
+ temp = pA[i];
+
+ for (j = i; j>0 && dir==(pA[j-1] > temp); j--)
+ {
+ pA[j] = pA[j-1];
+ }
+ pA[j] = temp;
+ }
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c
new file mode 100644
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_f32.c
@@ -0,0 +1,119 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_merge_sort_f32.c
+ * Description: Floating point merge sort
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+#include "arm_sorting.h"
+
+static void topDownMerge(float32_t * pA, uint32_t begin, uint32_t middle, uint32_t end, float32_t * pB, uint8_t dir)
+{
+ uint32_t i = begin;
+ uint32_t j = middle;
+ uint32_t k;
+
+ for (k = begin; k < end; k++)
+ {
+ if (i < middle && (j >= end || dir==(pA[i] <= pA[j])) )
+ {
+ pB[k] = pA[i];
+ i++;
+ }
+ else
+ {
+ pB[k] = pA[j];
+ j++;
+ }
+ }
+}
+
+static void arm_merge_sort_core_f32(float32_t * pB, uint32_t begin, uint32_t end, float32_t * pA, uint8_t dir)
+{
+ /* Top-down merge sort over [begin, end). pA and pB swap roles at every
+ recursion level, so merges ping-pong between the work buffer and the
+ output buffer and no per-level copy is needed. */
+ if((int32_t)end - (int32_t)begin >= 2 ) // If run size != 1 divide
+ {
+ int32_t middle = (end + begin) / 2; // Take the middle point
+
+ arm_merge_sort_core_f32(pA, begin, middle, pB, dir); // Sort the left part
+ arm_merge_sort_core_f32(pA, middle, end, pB, dir); // Sort the right part
+
+ topDownMerge(pB, begin, middle, end, pA, dir);
+ }
+}
+
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The merge sort algorithm is a comparison algorithm that
+ * divide the input array in sublists and merge them to produce
+ * longer sorted sublists until there is only one list remaining.
+ *
+ * @par A work array is always needed. It must be allocated by the user
+ * linked to the instance at initialization time.
+ *
+ * @par It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed
+ */
+
+
+void arm_merge_sort_f32(
+ const arm_merge_sort_instance_f32 * S,
+ float32_t *pSrc,
+ float32_t *pDst,
+ uint32_t blockSize)
+{
+ float32_t * pA;
+
+ /* Out-of-place */
+ if(pSrc != pDst)
+ {
+ memcpy(pDst, pSrc, blockSize*sizeof(float32_t));
+ pA = pDst;
+ }
+ else
+ pA = pSrc;
+
+ /* A working buffer is needed: seed it with a copy of the input so the
+ recursion can ping-pong between S->buffer and the output array */
+ memcpy(S->buffer, pSrc, blockSize*sizeof(float32_t));
+
+ /* Sorted result ends up in pA (i.e. pDst, or pSrc when in-place) */
+ arm_merge_sort_core_f32(S->buffer, 0, blockSize, pA, S->dir);
+}
+/**
+ @} end of Sorting group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c
new file mode 100644
index 0000000000..bd93a00fb3
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_merge_sort_init_f32.c
@@ -0,0 +1,53 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_merge_sort_init_f32.c
+ * Description: Floating point merge sort initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+
+ /**
+ * @param[in,out] S points to an instance of the sorting structure.
+ * @param[in] dir Sorting order.
+ * @param[in] buffer Working buffer.
+ */
+void arm_merge_sort_init_f32(arm_merge_sort_instance_f32 * S, arm_sort_dir dir, float32_t * buffer)
+{
+ /* Record the sort direction and the caller-provided scratch buffer.
+ The buffer must hold at least blockSize float32_t elements and its
+ lifetime is owned by the caller, not by the instance. */
+ S->dir = dir;
+ S->buffer = buffer;
+}
+/**
+ @} end of Sorting group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_f16.c
new file mode 100644
index 0000000000..00a919ad24
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_f16.c
@@ -0,0 +1,155 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q15_to_f16.c
+ * Description: Converts the elements of the Q15 vector to f16 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ * @defgroup q15_to_x Convert 16-bit Integer value
+ */
+
+/**
+ @addtogroup q15_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q15 vector to f16 vector.
+ @param[in] pSrc points to the Q15 input vector
+ @param[out] pDst points to the f16 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
+ pDst[n] = (float16_t) pSrc[n] / 32768; 0 <= n < blockSize.
+
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_q15_to_f16(
+ const q15_t * pSrc,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ int32_t blkCnt; /* loop counters */
+ q15x8_t vecDst;
+ q15_t const *pSrcVec;
+
+ pSrcVec = (q15_t const *) pSrc;
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0)
+ {
+ /* C = (float16_t) A / 32768 */
+ /* convert from q15 to float and then store the results in the destination buffer */
+ /* vcvtq_n_f16_s16(.,15) divides by 2^15 during the conversion */
+ vecDst = vld1q(pSrcVec); pSrcVec += 8;
+ vstrhq(pDst, vcvtq_n_f16_s16(vecDst, 15)); pDst += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = blockSize & 7;
+ if (blkCnt > 0)
+ {
+ /* NOTE(review): only the store is predicated; the vld1q below may
+ read up to 7 q15 elements past the logical end of pSrc — confirm
+ source buffers tolerate this over-read (upstream CMSIS pattern). */
+ mve_pred16_t p0 = vctp16q(blkCnt);
+ vecDst = vld1q(pSrcVec); pSrcVec += 8;
+ vstrhq_p(pDst, vcvtq_n_f16_s16(vecDst, 15), p0);
+ }
+}
+#else
+
+void arm_q15_to_f16(
+ const q15_t * pSrc,
+ float16_t * pDst,
+ uint32_t blockSize)
+{
+ /* Portable scalar fallback (optionally 4x unrolled) */
+ uint32_t blkCnt; /* Loop counter */
+ const q15_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float16_t) A / 32768 */
+
+ /* Convert from q15 to float and store result in destination buffer */
+ *pDst++ = ((float16_t) * pIn++ / 32768.0f);
+ *pDst++ = ((float16_t) * pIn++ / 32768.0f);
+ *pDst++ = ((float16_t) * pIn++ / 32768.0f);
+ *pDst++ = ((float16_t) * pIn++ / 32768.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float16_t) A / 32768 */
+
+ /* Convert from q15 to float and store result in destination buffer */
+ *pDst++ = ((float16_t) *pIn++ / 32768.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of q15_to_x group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c
new file mode 100644
index 0000000000..a7966e2392
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_float.c
@@ -0,0 +1,207 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q15_to_float.c
+ * Description: Converts the elements of the Q15 vector to floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ * @defgroup q15_to_x Convert 16-bit Integer value
+ */
+
+/**
+ @addtogroup q15_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q15 vector to floating-point vector.
+ @param[in] pSrc points to the Q15 input vector
+ @param[out] pDst points to the floating-point output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
+ pDst[n] = (float32_t) pSrc[n] / 32768; 0 <= n < blockSize.
+
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q15_to_float(
+ const q15_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+
+ /* NOTE(review): vldrhq_s32 widens four q15 values into 32-bit lanes,
+ so a q31x4_t would be the natural type here — confirm this q15x8_t
+ declaration compiles/behaves as intended with the MVE intrinsics. */
+ q15x8_t vecDst;
+ q15_t const *pSrcVec;
+
+ pSrcVec = (q15_t const *) pSrc;
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 32768 */
+ /* convert from q15 to float and then store the results in the destination buffer */
+ /* widening load, then vcvtq_n_f32_s32(.,15) divides by 2^15 */
+ vecDst = vldrhq_s32(pSrcVec);
+ pSrcVec += 4;
+ vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 15));
+ pDst += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ /* Residual 1..3 samples handled scalar */
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 32768 */
+
+ /* Convert from q15 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) *pSrcVec++ / 32768.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
<doc_update>
+void arm_q15_to_float(
+ const q15_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ const q15_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ int16x8_t inV;
+ int32x4_t inV0, inV1;
+ float32x4_t outV;
+
+ blkCnt = blockSize >> 3U;
+
+ /* Compute 8 outputs at a time.
+ ** a second loop below computes the remaining 1 to 7 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 32768 */
+ /* convert from q15 to float and then store the results in the destination buffer */
+ inV = vld1q_s16(pIn);
+ pIn += 8;
+
+ /* sign-extend each half to 32 bits before the fixed-point convert */
+ inV0 = vmovl_s16(vget_low_s16(inV));
+ inV1 = vmovl_s16(vget_high_s16(inV));
+
+ /* vcvtq_n_f32_s32(.,15) divides by 2^15 during the conversion */
+ outV = vcvtq_n_f32_s32(inV0,15);
+ vst1q_f32(pDst, outV);
+ pDst += 4;
+
+ outV = vcvtq_n_f32_s32(inV1,15);
+ vst1q_f32(pDst, outV);
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 7;
+
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 32768 */
+ /* convert from q15 to float and then store the results in the destination buffer */
+ *pDst++ = ((float32_t) * pIn++ / 32768.0f);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_q15_to_float(
+ const q15_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ /* Portable scalar fallback (optionally 4x unrolled) */
+ uint32_t blkCnt; /* Loop counter */
+ const q15_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 32768 */
+
+ /* Convert from q15 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) * pIn++ / 32768.0f);
+ *pDst++ = ((float32_t) * pIn++ / 32768.0f);
+ *pDst++ = ((float32_t) * pIn++ / 32768.0f);
+ *pDst++ = ((float32_t) * pIn++ / 32768.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 32768 */
+
+ /* Convert from q15 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) *pIn++ / 32768.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of q15_to_x group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c
new file mode 100644
index 0000000000..468b997b29
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q31.c
@@ -0,0 +1,182 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q15_to_q31.c
+ * Description: Converts the elements of the Q15 vector to Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup q15_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q15 vector to Q31 vector.
+ @param[in] pSrc points to the Q15 input vector
+ @param[out] pDst points to the Q31 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
+ pDst[n] = (q31_t) pSrc[n] << 16; 0 <= n < blockSize.
+
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q15_to_q31(
+ const q15_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+
+ uint32_t blkCnt;
+
+ q31x4_t vecDst;
+
+ blkCnt = blockSize>> 2;
+ while (blkCnt > 0U)
+ {
+
+ /* C = (q31_t)A << 16 */
+ /* convert from q15 to q31 and then store the results in the destination buffer */
+ /* load q15 + 32-bit widening */
+ vecDst = vldrhq_s32((q15_t const *) pSrc);
+ /* shift the sign-extended sample into the top half => exact Q15->Q31 */
+ vecDst = vshlq_n(vecDst, 16);
+ vstrwq_s32(pDst, vecDst);
+
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pDst += 4;
+ pSrc += 4;
+ blkCnt --;
+ }
+
+ /* Residual 1..3 samples handled scalar */
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C = (q31_t) A << 16 */
+
+ /* Convert from q15 to q31 and store result in destination buffer */
+ *pDst++ = (q31_t) *pSrc++ << 16;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_q15_to_q31(
+ const q15_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ /* Portable fallback. The unrolled path reads two q15 samples per 32-bit
+ word (read_q15x2_ia) and places each sample in the top half of a q31
+ result: one via a left shift, the other by masking off the low half. */
+ uint32_t blkCnt; /* Loop counter */
+ const q15_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+ q31_t in1, in2;
+ q31_t out1, out2, out3, out4;
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q31_t)A << 16 */
+
+ /* Convert from q15 to q31 and store result in destination buffer */
+ in1 = read_q15x2_ia ((q15_t **) &pIn);
+ in2 = read_q15x2_ia ((q15_t **) &pIn);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ /* extract lower 16 bits to 32 bit result */
+ out1 = in1 << 16U;
+ /* extract upper 16 bits to 32 bit result */
+ out2 = in1 & 0xFFFF0000;
+ /* extract lower 16 bits to 32 bit result */
+ out3 = in2 << 16U;
+ /* extract upper 16 bits to 32 bit result */
+ out4 = in2 & 0xFFFF0000;
+
+#else
+
+ /* extract upper 16 bits to 32 bit result */
+ out1 = in1 & 0xFFFF0000;
+ /* extract lower 16 bits to 32 bit result */
+ out2 = in1 << 16U;
+ /* extract upper 16 bits to 32 bit result */
+ out3 = in2 & 0xFFFF0000;
+ /* extract lower 16 bits to 32 bit result */
+ out4 = in2 << 16U;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ *pDst++ = out1;
+ *pDst++ = out2;
+ *pDst++ = out3;
+ *pDst++ = out4;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q31_t) A << 16 */
+
+ /* Convert from q15 to q31 and store result in destination buffer */
+ *pDst++ = (q31_t) *pIn++ << 16;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of q15_to_x group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c
new file mode 100644
index 0000000000..a5abca8fe3
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q15_to_q7.c
@@ -0,0 +1,190 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q15_to_q7.c
+ * Description: Converts the elements of the Q15 vector to Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup q15_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q15 vector to Q7 vector.
+ @param[in] pSrc points to the Q15 input vector
+ @param[out] pDst points to the Q7 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
+ pDst[n] = (q7_t) pSrc[n] >> 8; 0 <= n < blockSize.
+
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q15_to_q7(
+ const q15_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+
+ uint32_t blkCnt; /* loop counters */
+ q15x8x2_t tmp;
+ q15_t const *pSrcVec;
+ q7x16_t vecDst;
+
+
+ pSrcVec = (q15_t const *) pSrc;
+ blkCnt = blockSize >> 4;
+ while (blkCnt > 0U)
+ {
+ /* C = (q7_t) (A >> 8); vqshrn saturates, but an arithmetic >>8 of any q15 already fits in q7 */
+ /* vld2q de-interleaves even/odd samples; the bottom/top narrows re-interleave them into vecDst */
+ tmp = vld2q(pSrcVec);
+ pSrcVec += 16;
+ vecDst = vqshrnbq_n_s16(vecDst, tmp.val[0], 8);
+ vecDst = vqshrntq_n_s16(vecDst, tmp.val[1], 8);
+ vst1q(pDst, vecDst);
+ pDst += 16;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ blkCnt = blockSize & 0xF;
+ while (blkCnt > 0U)
+ {
+ /* C = (q7_t) A >> 8 */
+
+ /* Convert from q15 to q7 and store result in destination buffer */
+ *pDst++ = (q7_t) (*pSrcVec++ >> 8);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_q15_to_q7(
+ const q15_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const q15_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in1, in2;
+ q31_t out1, out2;
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q7_t) A >> 8 */
+
+ /* Convert from q15 to q7 and store result in destination buffer */
+#if defined (ARM_MATH_DSP)
+
+ in1 = read_q15x2_ia ((q15_t **) &pIn);
+ in2 = read_q15x2_ia ((q15_t **) &pIn);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ out1 = __PKHTB(in2, in1, 16);
+ out2 = __PKHBT(in2, in1, 16);
+
+#else
+
+ out1 = __PKHTB(in1, in2, 16);
+ out2 = __PKHBT(in1, in2, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* rotate out2 left by 8 (equivalently right by 24) to byte-align with out1 */
+ out2 = ((uint32_t) out2 << 8) | ((uint32_t) out2 >> 24);
+
+ /* AND with 0xFF00FF00 to keep two 8 bit results */
+ out1 = out1 & 0xFF00FF00;
+ /* AND with 0x00FF00FF to keep two 8 bit results */
+ out2 = out2 & 0x00FF00FF;
+
+ /* OR the two values (two 8 bit results each) to get four packed 8 bit values */
+ out1 = out1 | out2;
+
+ /* store 4 samples at a time to destination buffer */
+ write_q7x4_ia (&pDst, out1);
+
+#else
+
+ *pDst++ = (q7_t) (*pIn++ >> 8);
+ *pDst++ = (q7_t) (*pIn++ >> 8);
+ *pDst++ = (q7_t) (*pIn++ >> 8);
+ *pDst++ = (q7_t) (*pIn++ >> 8);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q7_t) A >> 8 */
+
+ /* Convert from q15 to q7 and store result in destination buffer */
+ *pDst++ = (q7_t) (*pIn++ >> 8);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of q15_to_x group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c
new file mode 100644
index 0000000000..30d5eed683
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_float.c
@@ -0,0 +1,202 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q31_to_float.c
+ * Description: Converts the elements of the Q31 vector to floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ * @defgroup q31_to_x Convert 32-bit Integer value
+ */
+
+/**
+ @addtogroup q31_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q31 vector to floating-point vector.
+ @param[in] pSrc points to the Q31 input vector
+ @param[out] pDst points to the floating-point output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
+ pDst[n] = (float32_t) pSrc[n] / 2147483648; 0 <= n < blockSize.
+
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q31_to_float(
+ const q31_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counters */
+ q31x4_t vecDst;
+ q31_t const *pSrcVec;
+
+ pSrcVec = (q31_t const *) pSrc;
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 2147483648 */
+ /* convert from q31 to float (vcvtq_n: fixed-point to float, 31 fractional bits) and store */
+ vecDst = vld1q(pSrcVec);
+ pSrcVec += 4;
+ vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 31));
+ pDst += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ * (will be merged thru tail predication)
+ */
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 2147483648 */
+
+ /* Convert from q31 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) *pSrcVec++ / 2147483648.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+#else
+#if defined(ARM_MATH_NEON_EXPERIMENTAL)
+void arm_q31_to_float(
+ const q31_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ const q31_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ int32x4_t inV;
+ float32x4_t outV;
+
+ blkCnt = blockSize >> 2U;
+
+ /* Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 2147483648 */
+ /* Convert from q31 to float (vcvtq_n_f32_s32, 31 fractional bits) and store the results */
+ inV = vld1q_s32(pIn);
+ pIn += 4;
+
+ outV = vcvtq_n_f32_s32(inV,31);
+
+ vst1q_f32(pDst, outV);
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 3;
+
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 2147483648 */
+ /* Convert from q31 to float and then store the results in the destination buffer */
+ *pDst++ = ((float32_t) * pIn++ / 2147483648.0f);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_q31_to_float(
+ const q31_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ const q31_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 2147483648 */
+
+ /* Convert from q31 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) *pIn++ / 2147483648.0f);
+ *pDst++ = ((float32_t) *pIn++ / 2147483648.0f);
+ *pDst++ = ((float32_t) *pIn++ / 2147483648.0f);
+ *pDst++ = ((float32_t) *pIn++ / 2147483648.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 2147483648 */
+
+ /* Convert from q31 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) *pIn++ / 2147483648.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of q31_to_x group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c
new file mode 100644
index 0000000000..aa5b0a0496
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q15.c
@@ -0,0 +1,181 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q31_to_q15.c
+ * Description: Converts the elements of the Q31 vector to Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup q31_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q31 vector to Q15 vector.
+ @param[in] pSrc points to the Q31 input vector
+ @param[out] pDst points to the Q15 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
+ pDst[n] = (q15_t) pSrc[n] >> 16; 0 <= n < blockSize.
+
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q31_to_q15(
+ const q31_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counters */
+ q31x4x2_t tmp;
+ q15x8_t vecDst;
+ q31_t const *pSrcVec;
+
+
+ pSrcVec = (q31_t const *) pSrc;
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) (A >> 16) */
+ /* vld2q de-interleaves even/odd words; vshrnb/vshrnt narrow (truncating) and re-interleave */
+ tmp = vld2q(pSrcVec);
+ pSrcVec += 8;
+ vecDst = vshrnbq_n_s32(vecDst, tmp.val[0], 16);
+ vecDst = vshrntq_n_s32(vecDst, tmp.val[1], 16);
+ vst1q(pDst, vecDst);
+ pDst += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ /*
+ * tail
+ */
+ blkCnt = blockSize & 7;
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) (A >> 16) */
+
+ /* Convert from q31 to q15 and store result in destination buffer */
+ *pDst++ = (q15_t) (*pSrcVec++ >> 16);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+#else
+void arm_q31_to_q15(
+ const q31_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const q31_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in1, in2, in3, in4;
+ q31_t out1, out2;
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) (A >> 16) */
+
+ /* Convert from q31 to q15 and store result in destination buffer */
+#if defined (ARM_MATH_DSP)
+
+ in1 = *pIn++;
+ in2 = *pIn++;
+ in3 = *pIn++;
+ in4 = *pIn++;
+
+ /* pack the two high half-words from each pair of 32-bit inputs (__PKHTB) */
+#ifndef ARM_MATH_BIG_ENDIAN
+ out1 = __PKHTB(in2, in1, 16);
+ out2 = __PKHTB(in4, in3, 16);
+#else
+ out1 = __PKHTB(in1, in2, 16);
+ out2 = __PKHTB(in3, in4, 16);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ write_q15x2_ia (&pDst, out1);
+ write_q15x2_ia (&pDst, out2);
+
+#else
+
+ *pDst++ = (q15_t) (*pIn++ >> 16);
+ *pDst++ = (q15_t) (*pIn++ >> 16);
+ *pDst++ = (q15_t) (*pIn++ >> 16);
+ *pDst++ = (q15_t) (*pIn++ >> 16);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) (A >> 16) */
+
+ /* Convert from q31 to q15 and store result in destination buffer */
+ *pDst++ = (q15_t) (*pIn++ >> 16);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of q31_to_x group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c
new file mode 100644
index 0000000000..f9b005635a
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q31_to_q7.c
@@ -0,0 +1,169 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q31_to_q7.c
+ * Description: Converts the elements of the Q31 vector to Q7 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup q31_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q31 vector to Q7 vector.
+ @param[in] pSrc points to the Q31 input vector
+ @param[out] pDst points to the Q7 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
+ pDst[n] = (q7_t) pSrc[n] >> 24; 0 <= n < blockSize.
+
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q31_to_q7(
+ const q31_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counters */
+ q31x4x4_t tmp;
+ q15x8_t evVec, oddVec;
+ q7x16_t vecDst;
+ q31_t const *pSrcVec;
+
+ pSrcVec = (q31_t const *) pSrc;
+ blkCnt = blockSize >> 4;
+ while (blkCnt > 0U)
+ {
+ tmp = vld4q(pSrcVec);
+ pSrcVec += 16;
+ /* C = (q7_t) (A >> 24) */
+ /* two-stage narrow: >>16 down to q15, then >>8 down to q7 (total >>24) */
+ /*
+ * narrow and pack evens
+ */
+ evVec = vshrnbq_n_s32(evVec, tmp.val[0], 16);
+ evVec = vshrntq_n_s32(evVec, tmp.val[2], 16);
+ /*
+ * narrow and pack odds
+ */
+ oddVec = vshrnbq_n_s32(oddVec, tmp.val[1], 16);
+ oddVec = vshrntq_n_s32(oddVec, tmp.val[3], 16);
+ /*
+ * narrow & merge
+ */
+ vecDst = vshrnbq_n_s16(vecDst, evVec, 8);
+ vecDst = vshrntq_n_s16(vecDst, oddVec, 8);
+
+ vst1q(pDst, vecDst);
+ pDst += 16;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+ /*
+ * tail
+ */
+ blkCnt = blockSize & 0xF;
+ while (blkCnt > 0U)
+ {
+ /* C = (q7_t) (A >> 24) */
+
+ /* Convert from q31 to q7 and store result in destination buffer */
+ *pDst++ = (q7_t) (*pSrcVec++ >> 24);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_q31_to_q7(
+ const q31_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const q31_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ q7_t out1, out2, out3, out4;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q7_t) (A >> 24) */
+
+ /* Convert from q31 to q7 and store result in destination buffer */
+ /* __PACKq7 packs the four q7 results into one 32-bit word for a single store */
+ out1 = (q7_t) (*pIn++ >> 24);
+ out2 = (q7_t) (*pIn++ >> 24);
+ out3 = (q7_t) (*pIn++ >> 24);
+ out4 = (q7_t) (*pIn++ >> 24);
+ write_q7x4_ia (&pDst, __PACKq7(out1, out2, out3, out4));
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q7_t) (A >> 24) */
+
+ /* Convert from q31 to q7 and store result in destination buffer */
+ *pDst++ = (q7_t) (*pIn++ >> 24);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of q31_to_x group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c
new file mode 100644
index 0000000000..4846fdec42
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_float.c
@@ -0,0 +1,218 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q7_to_float.c
+ * Description: Converts the elements of the Q7 vector to floating-point vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ * @defgroup q7_to_x Convert 8-bit Integer value
+ */
+
+/**
+ @addtogroup q7_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q7 vector to floating-point vector.
+ @param[in] pSrc points to the Q7 input vector
+ @param[out] pDst points to the floating-point output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
+ pDst[n] = (float32_t) pSrc[n] / 128; 0 <= n < blockSize.
+
+ */
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q7_to_float(
+ const q7_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counters */
+ q31x4_t vecDst; /* vldrbq_s32 widens to 32-bit lanes, so a q31x4_t is required */
+ q7_t const *pSrcVec;
+
+ pSrcVec = (q7_t const *) pSrc;
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 128 */
+ /* widening byte load, then fixed-point to float (7 fractional bits) into the destination */
+ vecDst = vldrbq_s32(pSrcVec);
+ pSrcVec += 4;
+ vstrwq(pDst, vcvtq_n_f32_s32(vecDst, 7));
+ pDst += 4;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 128 */
+
+ /* Convert from q7 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) * pSrcVec++ / 128.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+#else
+#if defined(ARM_MATH_NEON)
+void arm_q7_to_float(
+ const q7_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ const q7_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+ int8x16_t inV;
+ int16x8_t inVLO, inVHI;
+ int32x4_t inVLL, inVLH, inVHL, inVHH;
+ float32x4_t outV;
+
+ blkCnt = blockSize >> 4U;
+
+ /* Compute 16 outputs at a time.
+ ** a second loop below computes the remaining 1 to 15 samples. */
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 128 */
+ /* Convert from q7 to float and then store the results in the destination buffer */
+ inV = vld1q_s8(pIn);
+ pIn += 16;
+ /* sign-extend s8 -> s16 -> s32 in two widening stages before the float convert */
+ inVLO = vmovl_s8(vget_low_s8(inV));
+ inVHI = vmovl_s8(vget_high_s8(inV));
+
+ inVLL = vmovl_s16(vget_low_s16(inVLO));
+ inVLH = vmovl_s16(vget_high_s16(inVLO));
+ inVHL = vmovl_s16(vget_low_s16(inVHI));
+ inVHH = vmovl_s16(vget_high_s16(inVHI));
+
+ outV = vcvtq_n_f32_s32(inVLL,7);
+ vst1q_f32(pDst, outV);
+ pDst += 4;
+
+ outV = vcvtq_n_f32_s32(inVLH,7);
+ vst1q_f32(pDst, outV);
+ pDst += 4;
+
+ outV = vcvtq_n_f32_s32(inVHL,7);
+ vst1q_f32(pDst, outV);
+ pDst += 4;
+
+ outV = vcvtq_n_f32_s32(inVHH,7);
+ vst1q_f32(pDst, outV);
+ pDst += 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 16, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize & 0xF;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 128 */
+ /* Convert from q7 to float and then store the results in the destination buffer */
+ *pDst++ = ((float32_t) * pIn++ / 128.0f);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+#else
+void arm_q7_to_float(
+ const q7_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const q7_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 128 (scale Q7 into [-1, 1)) */
+
+ /* Convert from q7 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) * pIn++ / 128.0f);
+ *pDst++ = ((float32_t) * pIn++ / 128.0f);
+ *pDst++ = ((float32_t) * pIn++ / 128.0f);
+ *pDst++ = ((float32_t) * pIn++ / 128.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (float32_t) A / 128 (scale Q7 into [-1, 1)) */
+
+ /* Convert from q7 to float and store result in destination buffer */
+ *pDst++ = ((float32_t) * pIn++ / 128.0f);
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* #if defined(ARM_MATH_NEON) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of q7_to_x group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c
new file mode 100644
index 0000000000..4985ccb0a5
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q15.c
@@ -0,0 +1,188 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q7_to_q15.c
+ * Description: Converts the elements of the Q7 vector to Q15 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup q7_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q7 vector to Q15 vector.
+ @param[in] pSrc points to the Q7 input vector
+ @param[out] pDst points to the Q15 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
+ pDst[n] = (q15_t) pSrc[n] << 8; 0 <= n < blockSize.
+
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q7_to_q15(
+ const q7_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+
+ uint32_t blkCnt; /* loop counters */
+ q15x8_t vecDst;
+ q7_t const *pSrcVec;
+
+
+ pSrcVec = (q7_t const *) pSrc;
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) A << 8 */
+ /* convert from q7 to q15 and then store the results in the destination buffer */
+ /* load q7 + 16-bit widening (vldrbq_s16) */
+ vecDst = vldrbq_s16(pSrcVec);
+ pSrcVec += 8;
+ vecDst = vecDst << 8;
+ vstrhq(pDst, vecDst);
+ pDst += 8;
+ /*
+ * Decrement the blockSize loop counter
+ */
+ blkCnt--;
+ }
+
+ blkCnt = blockSize & 7;
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) A << 8 */
+
+ /* Convert from q7 to q15 and store result in destination buffer */
+ *pDst++ = (q15_t) * pSrcVec++ << 8;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#else
+void arm_q7_to_q15(
+ const q7_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const q7_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
+ q31_t in;
+ q31_t in1, in2;
+ q31_t out1, out2;
+#endif
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) A << 8 */
+
+ /* Convert from q7 to q15 and store result in destination buffer */
+#if defined (ARM_MATH_DSP)
+
+ in = read_q7x4_ia ((q7_t **) &pIn);
+
+ /* rotate in by 8 and sign-extend two q7_t values to q15_t values */
+ in1 = __SXTB16(__ROR(in, 8));
+
+ /* sign-extend the remaining two q7_t values to q15_t values */
+ in2 = __SXTB16(in);
+
+ in1 = in1 << 8U;
+ in2 = in2 << 8U;
+
+ in1 = in1 & 0xFF00FF00;
+ in2 = in2 & 0xFF00FF00;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ out2 = __PKHTB(in1, in2, 16);
+ out1 = __PKHBT(in2, in1, 16);
+#else
+ out1 = __PKHTB(in1, in2, 16);
+ out2 = __PKHBT(in2, in1, 16);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ write_q15x2_ia (&pDst, out1);
+ write_q15x2_ia (&pDst, out2);
+
+#else
+
+ *pDst++ = (q15_t) *pIn++ << 8;
+ *pDst++ = (q15_t) *pIn++ << 8;
+ *pDst++ = (q15_t) *pIn++ << 8;
+ *pDst++ = (q15_t) *pIn++ << 8;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q15_t) A << 8 */
+
+ /* Convert from q7 to q15 and store result in destination buffer */
+ *pDst++ = (q15_t) * pIn++ << 8;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of q7_to_x group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c
new file mode 100644
index 0000000000..a5373cedfe
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_q7_to_q31.c
@@ -0,0 +1,164 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_q7_to_q31.c
+ * Description: Converts the elements of the Q7 vector to Q31 vector
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/support_functions.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup q7_to_x
+ @{
+ */
+
+/**
+ @brief Converts the elements of the Q7 vector to Q31 vector.
+ @param[in] pSrc points to the Q7 input vector
+ @param[out] pDst points to the Q31 output vector
+ @param[in] blockSize number of samples in each vector
+ @return none
+
+ @par Details
+ The equation used for the conversion process is:
+
+ pDst[n] = (q31_t) pSrc[n] << 24; 0 <= n < blockSize.
+
+ */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+void arm_q7_to_q31(
+ const q7_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt;
+ q31x4_t vecDst;
+
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0U)
+ {
+
+ /* C = (q31_t) A << 24 */
+ /* convert from q7 to q31 and then store the results in the destination buffer */
+ /* load q7 + 32-bit widening */
+ vecDst = vldrbq_s32((q7_t const *) pSrc);
+ vecDst = vshlq_n(vecDst, 24);
+ vstrwq_s32(pDst, vecDst);
+
+ /*
+ * Decrement the blockSize loop counter
+ * Advance vector source and destination pointers
+ */
+ pDst += 4;
+ pSrc += 4;
+ blkCnt --;
+ }
+
+ blkCnt = blockSize & 3;
+ while (blkCnt > 0U)
+ {
+ /* C = (q31_t) A << 24 */
+
+ /* Convert from q7 to q31 and store result in destination buffer */
+ *pDst++ = (q31_t) *pSrc++ << 24;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+}
+
+#else
+void arm_q7_to_q31(
+ const q7_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* Loop counter */
+ const q7_t *pIn = pSrc; /* Source pointer */
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ q31_t in;
+
+ /* Loop unrolling: Compute 4 outputs at a time */
+ blkCnt = blockSize >> 2U;
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q31_t) A << 24 */
+
+ /* Convert from q7 to q31 and store result in destination buffer:
+ * read 4 packed q7 samples at once, then isolate each byte in the
+ * top 8 bits of a 32-bit word via rotate + mask */
+ in = read_q7x4_ia ((q7_t **) &pIn);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *pDst++ = (__ROR(in, 8)) & 0xFF000000; /* sample 0 -> bits 31:24 */
+ *pDst++ = (__ROR(in, 16)) & 0xFF000000; /* sample 1 -> bits 31:24 */
+ *pDst++ = (__ROR(in, 24)) & 0xFF000000; /* sample 2 -> bits 31:24 */
+ *pDst++ = (in & 0xFF000000); /* sample 3 already in bits 31:24 */
+
+#else
+
+ *pDst++ = (in & 0xFF000000); /* sample 0 already in bits 31:24 */
+ *pDst++ = (__ROR(in, 24)) & 0xFF000000; /* sample 1 -> bits 31:24 */
+ *pDst++ = (__ROR(in, 16)) & 0xFF000000; /* sample 2 -> bits 31:24 */
+ *pDst++ = (__ROR(in, 8)) & 0xFF000000; /* sample 3 -> bits 31:24 */
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+ /* Loop unrolling: Compute remaining outputs */
+ blkCnt = blockSize % 0x4U;
+
+#else
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = blockSize;
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+ while (blkCnt > 0U)
+ {
+ /* C = (q31_t) A << 24 */
+
+ /* Convert from q7 to q31 and store result in destination buffer */
+ *pDst++ = (q31_t) * pIn++ << 24;
+
+ /* Decrement loop counter */
+ blkCnt--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @} end of q7_to_x group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c
new file mode 100644
index 0000000000..a3773b8571
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_quick_sort_f32.c
@@ -0,0 +1,181 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_quick_sort_f32.c
+ * Description: Floating point quick sort
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_sorting.h"
+
+static uint32_t arm_quick_sort_partition_f32(float32_t *pSrc, int32_t first, int32_t last, uint8_t dir)
+{
+ /* Hoare partition scheme: the first element is used as the pivot */
+ int32_t i, j, pivot_index;
+ float32_t pivot;
+ float32_t temp;
+
+ /* The first element is the pivot */
+ pivot_index = first;
+ pivot = pSrc[pivot_index];
+
+ /* Initialize indices for do-while loops */
+ i = first - 1;
+ j = last + 1;
+
+ while(i < j)
+ {
+ /* The loop will stop as soon as the indices i and j cross each other.
+ *
+ * This event will happen surely since the values of the indices are incremented and
+ * decrement in the do-while loops that are executed at least once.
+ * It is impossible to loop forever inside the do-while loops since the pivot is
+ * always an element of the array and the conditions cannot be always true (at least
+ * the i-th or the j-th element will be equal to the pivot-th element).
+ * For example, in the extreme case of an ordered array the do-while loop related to i will stop
+ * at the first iteration (because pSrc[i]=pSrc[pivot] already), and the loop related to j
+ * will stop after (last-first) iterations (when j=pivot=i=first). j is returned and
+ * j+1 is going to be used as pivot by other calls of the function, until j=pivot=last. */
+
+ /* Move indices to the right and to the left */
+ if(dir)
+ {
+ /* Ascending order: advance i over elements already below the pivot */
+ do
+ {
+ i++;
+ } while (pSrc[i] < pivot && i < last);
+
+ /* Move j back over elements already above the pivot */
+ do
+ {
+ j--;
+ } while (pSrc[j] > pivot);
+ }
+ else
+ {
+ /* Descending order: advance i over elements already above the pivot */
+ do
+ {
+ i++;
+ } while (pSrc[i] > pivot && i < last);
+
+ /* Move j back over elements already below the pivot */
+ do
+ {
+ j--;
+ } while (pSrc[j] < pivot);
+ }
+
+ /* If the indices have not crossed yet, the two elements are on the
+ * wrong sides and must be exchanged */
+ if (i < j)
+ {
+ /* Swap */
+ temp=pSrc[i];
+ pSrc[i]=pSrc[j];
+ pSrc[j]=temp;
+ }
+ }
+
+ /* j is the partition boundary: [first..j] and [j+1..last] */
+ return j;
+}
+
+static void arm_quick_sort_core_f32(float32_t *pSrc, int32_t first, int32_t last, uint8_t dir)
+{
+ /* If the array [first ... last] has more than one element */
+ if (first < last)
+ {
+ int32_t pivot;
+
+ /* Compute the position of the pivot: after this call, elements on
+ * each side of the boundary are on the correct side for 'dir' */
+ pivot = arm_quick_sort_partition_f32(pSrc, first, last, dir);
+
+ /* Sort the two partitions around the pivot recursively */
+ arm_quick_sort_core_f32(pSrc, first, pivot, dir);
+ arm_quick_sort_core_f32(pSrc, pivot + 1, last, dir);
+ }
+}
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @private
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed.
+ */
+void arm_quick_sort_f32(
+ const arm_sort_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t * pA;
+
+ /* Out-of-place: sort a copy so the source vector is left untouched */
+ if (pSrc != pDst)
+ {
+ memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+ pA = pDst;
+ }
+ else
+ pA = pSrc;
+
+ arm_quick_sort_core_f32(pA, 0, blockSize-1, S->dir);
+ /* The previous function could be called recursively a maximum
+ * of (blockSize-1) times, generating a stack consumption of 4*(blockSize-1) bytes. */
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c
new file mode 100644
index 0000000000..a125028d02
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_selection_sort_f32.c
@@ -0,0 +1,107 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_selection_sort_f32.c
+ * Description: Floating point selection sort
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @private
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ *
+ * @par Algorithm
+ * The Selection sort algorithm is a comparison algorithm that
+ * divides the input array into a sorted and an unsorted sublist
+ * (initially the sorted sublist is empty and the unsorted sublist
+ * is the input array), looks for the smallest (or biggest)
+ * element in the unsorted sublist, swapping it with the leftmost
+ * one, and moving the sublists boundary one element to the right.
+ *
+ * @par It's an in-place algorithm. In order to obtain an out-of-place
+ * function, a memcpy of the source vector is performed.
+ */
+
+void arm_selection_sort_f32(
+ const arm_sort_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t i, j, k;
+ uint8_t dir = S->dir;
+ float32_t temp;
+
+ float32_t * pA;
+
+ if(pSrc != pDst) // out-of-place
+ {
+ memcpy(pDst, pSrc, blockSize*sizeof(float32_t) );
+ pA = pDst;
+ }
+ else
+ pA = pSrc;
+
+ /* Move the boundary one element to the right */
+ for (i=0; i<blockSize-1; i++)
+ {
+ /* Initialize the minimum/maximum as the first element of the unsorted sublist */
+ k = i;
+
+ /* Look for the smallest (or biggest) element in the unsorted sublist */
+ for (j=i+1; j<blockSize; j++)
+ {
+ /* dir!=0: ascending (track the minimum); dir==0: descending (track the maximum) */
+ if (dir==(pA[j] < pA[k]))
+ k = j;
+ }
+
+ /* Swap it with the leftmost element of the unsorted sublist */
+ if (k != i)
+ {
+ temp=pA[i];
+ pA[i]=pA[k];
+ pA[k]=temp;
+ }
+ }
+}
+
+/**
+ @} end of Sorting group
+ */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c
new file mode 100644
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_sort_f32.c
@@ -0,0 +1,82 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_sort_f32.c
+ * Description: Floating point sort
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+/**
+ * @param[in] S points to an instance of the sorting structure.
+ * @param[in] pSrc points to the block of input data.
+ * @param[out] pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process.
+ */
+void arm_sort_f32(
+ const arm_sort_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ switch(S->alg)
+ {
+ case ARM_SORT_BITONIC:
+ arm_bitonic_sort_f32(S, pSrc, pDst, blockSize);
+ break;
+
+ case ARM_SORT_BUBBLE:
+ arm_bubble_sort_f32(S, pSrc, pDst, blockSize);
+ break;
+
+ case ARM_SORT_HEAP:
+ arm_heap_sort_f32(S, pSrc, pDst, blockSize);
+ break;
+
+ case ARM_SORT_INSERTION:
+ arm_insertion_sort_f32(S, pSrc, pDst, blockSize);
+ break;
+
+ case ARM_SORT_QUICK:
+ arm_quick_sort_f32(S, pSrc, pDst, blockSize);
+ break;
+
+ case ARM_SORT_SELECTION:
+ arm_selection_sort_f32(S, pSrc, pDst, blockSize);
+ break;
+ }
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c
new file mode 100644
index 0000000000..72ad9c5575
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_sort_init_f32.c
@@ -0,0 +1,54 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_sort_init_f32.c
+ * Description: Floating point sort initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_sorting.h"
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @addtogroup Sorting
+ @{
+ */
+
+
+ /**
+ * @param[in,out] S points to an instance of the sorting structure.
+ * @param[in] alg Selected algorithm.
+ * @param[in] dir Sorting order.
+ */
+void arm_sort_init_f32(arm_sort_instance_f32 * S, arm_sort_alg alg, arm_sort_dir dir)
+{
+ S->alg = alg; /* sorting algorithm dispatched by arm_sort_f32() */
+ S->dir = dir; /* sorting order (ascending/descending) */
+}
+
+/**
+ @} end of Sorting group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f16.c
new file mode 100644
index 0000000000..68a7d94353
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f16.c
@@ -0,0 +1,146 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_weighted_sum_f16.c
+ * Description: Weighted Sum
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "dsp/support_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+ @ingroup groupSupport
+ */
+
+/**
+ @defgroup weightedsum Weighted Sum
+
+ Weighted sum of values
+ */
+
+
+/**
+ * @addtogroup weightedsum
+ * @{
+ */
+
+
+/**
+ * @brief Weighted sum
+ *
+ *
+ * @param[in] *in Array of input values.
+ * @param[in] *weigths Weights
+ * @param[in] blockSize Number of samples in the input array.
+ * @return Weighted sum
+ *
+ */
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+float16_t arm_weighted_sum_f16(const float16_t *in,const float16_t *weigths, uint32_t blockSize)
+{
+ _Float16 accum1, accum2;
+ float16x8_t accum1V, accum2V;
+ float16x8_t inV, wV;
+ const float16_t *pIn, *pW;
+ uint32_t blkCnt;
+
+
+ pIn = in;
+ pW = weigths;
+
+ /* Vector accumulators: accum1V = sum(w*x), accum2V = sum(w) */
+ accum1V = vdupq_n_f16(0.0f16);
+ accum2V = vdupq_n_f16(0.0f16);
+
+ blkCnt = blockSize >> 3;
+ while (blkCnt > 0)
+ {
+ inV = vld1q(pIn);
+ wV = vld1q(pW);
+
+ /* vld1q consumed 8 halfwords per vector: advance by 8, not 4 */
+ pIn += 8;
+ pW += 8;
+
+ accum1V = vfmaq(accum1V, inV, wV);
+ accum2V = vaddq(accum2V, wV);
+ blkCnt--;
+ }
+
+ accum1 = vecAddAcrossF16Mve(accum1V);
+ accum2 = vecAddAcrossF16Mve(accum2V);
+
+ blkCnt = blockSize & 7;
+ while(blkCnt > 0)
+ {
+ accum1 += (_Float16)*pIn++ * (_Float16)*pW;
+ accum2 += (_Float16)*pW++;
+ blkCnt--;
+ }
+
+
+ return (accum1 / accum2);
+}
+
+#else
+
+float16_t arm_weighted_sum_f16(const float16_t *in, const float16_t *weigths, uint32_t blockSize)
+{
+
+ _Float16 accum1, accum2;
+ const float16_t *pIn, *pW;
+ uint32_t blkCnt;
+
+
+ pIn = in;
+ pW = weigths;
+
+ accum1=0.0f16; /* sum(w*x) */
+ accum2=0.0f16; /* sum(w) */
+
+ blkCnt = blockSize;
+ while(blkCnt > 0)
+ {
+ accum1 += (_Float16)*pIn++ * (_Float16)*pW;
+ accum2 += (_Float16)*pW++;
+ blkCnt--;
+ }
+
+ return(accum1 / accum2); /* NOTE(review): no guard for sum(w)==0 — matches other variants */
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of weightedsum group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c
new file mode 100644
index 0000000000..11ebfc779c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/SupportFunctions/arm_weighted_sum_f32.c
@@ -0,0 +1,187 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_weighted_sum_f32.c
+ * Description: Weighted Sum
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <limits.h>
+#include <math.h>
+
+#include "dsp/support_functions.h"
+
+/**
+ * @addtogroup weightedsum
+ * @{
+ */
+
+
+/**
+ * @brief Weighted sum
+ *
+ *
+ * @param[in] *in Array of input values.
+ * @param[in] *weigths Weights
+ * @param[in] blockSize Number of samples in the input array.
+ * @return Weighted sum
+ *
+ */
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+
+float32_t arm_weighted_sum_f32(const float32_t *in,const float32_t *weigths, uint32_t blockSize)
+{
+ float32_t accum1, accum2;
+ f32x4_t accum1V, accum2V;
+ f32x4_t inV, wV;
+ const float32_t *pIn, *pW;
+ uint32_t blkCnt;
+
+
+ pIn = in;
+ pW = weigths;
+
+ /* Vector accumulators: accum1V = sum(w*x), accum2V = sum(w) */
+ accum1V = vdupq_n_f32(0.0);
+ accum2V = vdupq_n_f32(0.0);
+
+ blkCnt = blockSize >> 2;
+ while (blkCnt > 0)
+ {
+ inV = vld1q(pIn);
+ wV = vld1q(pW);
+
+ pIn += 4;
+ pW += 4;
+
+ accum1V = vfmaq(accum1V, inV, wV);
+ accum2V = vaddq(accum2V, wV);
+ blkCnt--;
+ }
+
+ accum1 = vecAddAcrossF32Mve(accum1V);
+ accum2 = vecAddAcrossF32Mve(accum2V);
+
+ /* Tail: 0..3 remaining samples processed scalar */
+ blkCnt = blockSize & 3;
+ while(blkCnt > 0)
+ {
+ accum1 += *pIn++ * *pW;
+ accum2 += *pW++;
+ blkCnt--;
+ }
+
+
+ return (accum1 / accum2);
+}
+
+#else
+#if defined(ARM_MATH_NEON)
+
+#include "NEMath.h"
+float32_t arm_weighted_sum_f32(const float32_t *in,const float32_t *weigths, uint32_t blockSize)
+{
+
+ float32_t accum1, accum2;
+ float32x4_t accum1V, accum2V;
+ float32x2_t tempV;
+
+ float32x4_t inV,wV;
+
+ const float32_t *pIn, *pW;
+ uint32_t blkCnt;
+
+
+ pIn = in;
+ pW = weigths;
+
+ accum1=0.0f; /* sum(w*x) */
+ accum2=0.0f; /* sum(w) */
+
+ accum1V = vdupq_n_f32(0.0f);
+ accum2V = vdupq_n_f32(0.0f);
+
+ blkCnt = blockSize >> 2;
+ while(blkCnt > 0)
+ {
+ inV = vld1q_f32(pIn);
+ wV = vld1q_f32(pW);
+
+ pIn += 4;
+ pW += 4;
+
+ accum1V = vmlaq_f32(accum1V,inV,wV); /* accum1V += x*w per lane */
+ accum2V = vaddq_f32(accum2V,wV);
+ blkCnt--;
+ }
+
+ /* Pairwise adds reduce each 4-lane accumulator to a scalar */
+ tempV = vpadd_f32(vget_low_f32(accum1V),vget_high_f32(accum1V));
+ accum1 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1);
+
+ tempV = vpadd_f32(vget_low_f32(accum2V),vget_high_f32(accum2V));
+ accum2 = vget_lane_f32(tempV, 0) + vget_lane_f32(tempV, 1);
+
+ /* Tail: 0..3 remaining samples processed scalar */
+ blkCnt = blockSize & 3;
+ while(blkCnt > 0)
+ {
+ accum1 += *pIn++ * *pW;
+ accum2 += *pW++;
+ blkCnt--;
+ }
+
+
+ return(accum1 / accum2);
+}
+#else
+float32_t arm_weighted_sum_f32(const float32_t *in, const float32_t *weigths, uint32_t blockSize)
+{
+
+ float32_t accum1, accum2;
+ const float32_t *pIn, *pW;
+ uint32_t blkCnt;
+
+
+ pIn = in;
+ pW = weigths;
+
+ accum1=0.0f; /* sum(w*x) */
+ accum2=0.0f; /* sum(w) */
+
+ blkCnt = blockSize;
+ while(blkCnt > 0)
+ {
+ accum1 += *pIn++ * *pW;
+ accum2 += *pW++;
+ blkCnt--;
+ }
+
+ return(accum1 / accum2); /* NOTE(review): no guard for sum(w)==0 — matches other variants */
+}
+#endif
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ * @} end of weightedsum group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/TransformFunctions.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/TransformFunctions.c
new file mode 100644
index 0000000000..5f43a7af37
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/TransformFunctions.c
@@ -0,0 +1,74 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: TransformFunctions.c
+ * Description: Combination of all transform function source files.
+ *
+ * $Date: 18. March 2019
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_bitreversal.c"
+#include "arm_bitreversal2.c"
+#include "arm_cfft_f32.c"
+#include "arm_cfft_f64.c"
+#include "arm_cfft_q15.c"
+#include "arm_cfft_q31.c"
+#include "arm_cfft_init_f32.c"
+#include "arm_cfft_init_f64.c"
+#include "arm_cfft_init_q15.c"
+#include "arm_cfft_init_q31.c"
+#include "arm_cfft_radix2_f32.c"
+#include "arm_cfft_radix2_q15.c"
+#include "arm_cfft_radix2_q31.c"
+#include "arm_cfft_radix4_f32.c"
+#include "arm_cfft_radix4_q15.c"
+#include "arm_cfft_radix4_q31.c"
+#include "arm_cfft_radix8_f32.c"
+#include "arm_rfft_fast_f32.c"
+#include "arm_rfft_fast_f64.c"
+#include "arm_rfft_fast_init_f32.c"
+#include "arm_rfft_fast_init_f64.c"
+
+/* Deprecated */
+
+#include "arm_dct4_f32.c"
+#include "arm_dct4_init_f32.c"
+#include "arm_dct4_init_q15.c"
+#include "arm_dct4_init_q31.c"
+#include "arm_dct4_q15.c"
+#include "arm_dct4_q31.c"
+
+#include "arm_rfft_f32.c"
+#include "arm_rfft_q15.c"
+#include "arm_rfft_q31.c"
+
+#include "arm_rfft_init_f32.c"
+#include "arm_rfft_init_q15.c"
+#include "arm_rfft_init_q31.c"
+
+#include "arm_cfft_radix4_init_f32.c"
+#include "arm_cfft_radix4_init_q15.c"
+#include "arm_cfft_radix4_init_q31.c"
+
+#include "arm_cfft_radix2_init_f32.c"
+#include "arm_cfft_radix2_init_q15.c"
+#include "arm_cfft_radix2_init_q31.c"
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/TransformFunctionsF16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/TransformFunctionsF16.c
new file mode 100644
index 0000000000..76e610e681
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/TransformFunctionsF16.c
@@ -0,0 +1,41 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: TransformFunctionsF16.c
+ * Description: Combination of all transform function f16 source files.
+ *
+ * $Date: 20. April 2020
+ * $Revision: V1.0.0
+ *
+ * Target Processor: Cortex-M cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "arm_cfft_f16.c"
+#include "arm_cfft_init_f16.c"
+#include "arm_cfft_radix2_f16.c"
+#include "arm_cfft_radix4_f16.c"
+#include "arm_rfft_fast_init_f16.c"
+#include "arm_rfft_fast_f16.c"
+#include "arm_cfft_radix8_f16.c"
+
+#include "arm_bitreversal_f16.c"
+
+/* Deprecated */
+#include "arm_cfft_radix2_init_f16.c"
+#include "arm_cfft_radix4_init_f16.c"
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c
new file mode 100644
index 0000000000..687a9e86a7
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal.c
@@ -0,0 +1,230 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_bitreversal.c
+ * Description: Bitreversal functions
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+
+
+/**
+ @brief In-place floating-point bit reversal function.
+ @param[in,out] pSrc points to in-place floating-point data buffer
+ @param[in] fftSize length of FFT
+ @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
+ @param[in] pBitRevTab points to bit reversal table
+ @return none
+ */
+
+void arm_bitreversal_f32(
+ float32_t * pSrc,
+ uint16_t fftSize,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab)
+{
+ uint16_t fftLenBy2, fftLenBy2p1;
+ uint16_t i, j;
+ float32_t in;
+
+ /* Initializations */
+ j = 0U;
+ fftLenBy2 = fftSize >> 1U;
+ fftLenBy2p1 = (fftSize >> 1U) + 1U;
+
+ /* Bit Reversal Implementation: complex samples are stored interleaved,
+ * so sample n occupies pSrc[2n] (real) and pSrc[2n+1] (imaginary) */
+ for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U)
+ {
+ if (i < j)
+ {
+ /* pSrc[i] <-> pSrc[j]; */
+ in = pSrc[2U * i];
+ pSrc[2U * i] = pSrc[2U * j];
+ pSrc[2U * j] = in;
+
+ /* pSrc[i+1U] <-> pSrc[j+1U] */
+ in = pSrc[(2U * i) + 1U];
+ pSrc[(2U * i) + 1U] = pSrc[(2U * j) + 1U];
+ pSrc[(2U * j) + 1U] = in;
+
+ /* pSrc[i+fftLenBy2p1] <-> pSrc[j+fftLenBy2p1] */
+ in = pSrc[2U * (i + fftLenBy2p1)];
+ pSrc[2U * (i + fftLenBy2p1)] = pSrc[2U * (j + fftLenBy2p1)];
+ pSrc[2U * (j + fftLenBy2p1)] = in;
+
+ /* pSrc[i+fftLenBy2p1+1U] <-> pSrc[j+fftLenBy2p1+1U] */
+ in = pSrc[(2U * (i + fftLenBy2p1)) + 1U];
+ pSrc[(2U * (i + fftLenBy2p1)) + 1U] =
+ pSrc[(2U * (j + fftLenBy2p1)) + 1U];
+ pSrc[(2U * (j + fftLenBy2p1)) + 1U] = in;
+
+ }
+
+ /* pSrc[i+1U] <-> pSrc[j+fftLenBy2] (real parts) */
+ in = pSrc[2U * (i + 1U)];
+ pSrc[2U * (i + 1U)] = pSrc[2U * (j + fftLenBy2)];
+ pSrc[2U * (j + fftLenBy2)] = in;
+
+ /* pSrc[(i+1U)+1U] <-> pSrc[(j+fftLenBy2)+1U] (imaginary parts) */
+ in = pSrc[(2U * (i + 1U)) + 1U];
+ pSrc[(2U * (i + 1U)) + 1U] = pSrc[(2U * (j + fftLenBy2)) + 1U];
+ pSrc[(2U * (j + fftLenBy2)) + 1U] = in;
+
+ /* Reading the index for the bit reversal */
+ j = *pBitRevTab;
+
+ /* Updating the bit reversal index depending on the fft length */
+ pBitRevTab += bitRevFactor;
+ }
+}
+
+
+/**
+ @brief In-place Q31 bit reversal function.
+ @param[in,out] pSrc points to in-place Q31 data buffer.
+ @param[in] fftLen length of FFT.
+ @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
+ @param[in] pBitRevTab points to bit reversal table
+ @return none
+*/
+
+void arm_bitreversal_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab)
+{
+ uint32_t fftLenBy2, fftLenBy2p1, i, j;
+ q31_t in;
+
+ /* Initializations */
+ j = 0U;
+ fftLenBy2 = fftLen / 2U;
+ fftLenBy2p1 = (fftLen / 2U) + 1U;
+
+ /* Bit Reversal Implementation: complex samples are stored interleaved,
+ * so sample n occupies pSrc[2n] (real) and pSrc[2n+1] (imaginary) */
+ for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U)
+ {
+ if (i < j)
+ {
+ /* pSrc[i] <-> pSrc[j]; */
+ in = pSrc[2U * i];
+ pSrc[2U * i] = pSrc[2U * j];
+ pSrc[2U * j] = in;
+
+ /* pSrc[i+1U] <-> pSrc[j+1U] */
+ in = pSrc[(2U * i) + 1U];
+ pSrc[(2U * i) + 1U] = pSrc[(2U * j) + 1U];
+ pSrc[(2U * j) + 1U] = in;
+
+ /* pSrc[i+fftLenBy2p1] <-> pSrc[j+fftLenBy2p1] */
+ in = pSrc[2U * (i + fftLenBy2p1)];
+ pSrc[2U * (i + fftLenBy2p1)] = pSrc[2U * (j + fftLenBy2p1)];
+ pSrc[2U * (j + fftLenBy2p1)] = in;
+
+ /* pSrc[i+fftLenBy2p1+1U] <-> pSrc[j+fftLenBy2p1+1U] */
+ in = pSrc[(2U * (i + fftLenBy2p1)) + 1U];
+ pSrc[(2U * (i + fftLenBy2p1)) + 1U] =
+ pSrc[(2U * (j + fftLenBy2p1)) + 1U];
+ pSrc[(2U * (j + fftLenBy2p1)) + 1U] = in;
+
+ }
+
+ /* pSrc[i+1U] <-> pSrc[j+fftLenBy2] (real parts) */
+ in = pSrc[2U * (i + 1U)];
+ pSrc[2U * (i + 1U)] = pSrc[2U * (j + fftLenBy2)];
+ pSrc[2U * (j + fftLenBy2)] = in;
+
+ /* pSrc[(i+1U)+1U] <-> pSrc[(j+fftLenBy2)+1U] (imaginary parts) */
+ in = pSrc[(2U * (i + 1U)) + 1U];
+ pSrc[(2U * (i + 1U)) + 1U] = pSrc[(2U * (j + fftLenBy2)) + 1U];
+ pSrc[(2U * (j + fftLenBy2)) + 1U] = in;
+
+ /* Reading the index for the bit reversal */
+ j = *pBitRevTab;
+
+ /* Updating the bit reversal index depending on the fft length */
+ pBitRevTab += bitRevFactor;
+ }
+}
+
+
+
+/**
+ @brief In-place Q15 bit reversal function.
+ @param[in,out] pSrc16 points to in-place Q15 data buffer
+ @param[in] fftLen length of FFT
+ @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
+ @param[in] pBitRevTab points to bit reversal table
+ @return none
+*/
+
+void arm_bitreversal_q15(
+ q15_t * pSrc16,
+ uint32_t fftLen,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab)
+{
+ q31_t *pSrc = (q31_t *) pSrc16; /* each 32-bit word holds two adjacent q15 values, so one swap moves a whole pair */
+ q31_t in;
+ uint32_t fftLenBy2, fftLenBy2p1;
+ uint32_t i, j;
+
+ /* Initializations */
+ j = 0U;
+ fftLenBy2 = fftLen / 2U;
+ fftLenBy2p1 = (fftLen / 2U) + 1U;
+
+ /* Bit Reversal Implementation */
+ for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U)
+ {
+ if (i < j)
+ {
+ /* pSrc[i] <-> pSrc[j]; */
+ /* pSrc[i+1U] <-> pSrc[j+1U] */
+ in = pSrc[i];
+ pSrc[i] = pSrc[j];
+ pSrc[j] = in;
+
+ /* pSrc[i + fftLenBy2p1] <-> pSrc[j + fftLenBy2p1]; */
+ /* pSrc[i + fftLenBy2p1+1U] <-> pSrc[j + fftLenBy2p1+1U] */
+ in = pSrc[i + fftLenBy2p1];
+ pSrc[i + fftLenBy2p1] = pSrc[j + fftLenBy2p1];
+ pSrc[j + fftLenBy2p1] = in;
+ }
+
+ /* pSrc[i+1U] <-> pSrc[j+fftLenBy2]; */
+ /* pSrc[i+2] <-> pSrc[j+fftLenBy2+1U] */
+ in = pSrc[i + 1U];
+ pSrc[i + 1U] = pSrc[j + fftLenBy2];
+ pSrc[j + fftLenBy2] = in;
+
+ /* Reading the index for the bit reversal */
+ j = *pBitRevTab;
+
+ /* Updating the bit reversal index depending on the fft length */
+ pBitRevTab += bitRevFactor;
+ }
+}
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S
new file mode 100644
index 0000000000..c16091b15d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.S
@@ -0,0 +1,216 @@
+;/* ----------------------------------------------------------------------
+; * Project: CMSIS DSP Library
+; * Title: arm_bitreversal2.S
+; * Description: arm_bitreversal_32 function done in assembly for maximum speed.
+; * Called after doing an fft to reorder the output.
+; * The function is loop unrolled by 2. arm_bitreversal_16 as well.
+; *
+; * $Date: 18. March 2019
+; * $Revision: V1.5.2
+; *
+; * Target Processor: Cortex-M cores
+; * -------------------------------------------------------------------- */
+;/*
+; * Copyright (C) 2010-2019 ARM Limited or its affiliates. All rights reserved.
+; *
+; * SPDX-License-Identifier: Apache-2.0
+; *
+; * Licensed under the Apache License, Version 2.0 (the License); you may
+; * not use this file except in compliance with the License.
+; * You may obtain a copy of the License at
+; *
+; * www.apache.org/licenses/LICENSE-2.0
+; *
+; * Unless required by applicable law or agreed to in writing, software
+; * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+; * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the License for the specific language governing permissions and
+; * limitations under the License.
+; */
+
+#if defined ( __CC_ARM ) /* Keil */
+ #define CODESECT AREA ||.text||, CODE, READONLY, ALIGN=2
+ #define LABEL
+#elif defined ( __IASMARM__ ) /* IAR */
+ #define CODESECT SECTION `.text`:CODE
+ #define PROC
+ #define LABEL
+ #define ENDP
+ #define EXPORT PUBLIC
+#elif defined ( __CSMC__ ) /* Cosmic */
+ #define CODESECT switch .text
+ #define THUMB
+ #define EXPORT xdef
+ #define PROC :
+ #define LABEL :
+ #define ENDP
+ #define arm_bitreversal_32 _arm_bitreversal_32
+#elif defined ( __TI_ARM__ ) /* TI ARM */
+ #define THUMB .thumb
+ #define CODESECT .text
+ #define EXPORT .global
+ #define PROC : .asmfunc
+ #define LABEL :
+ #define ENDP .endasmfunc
+ #define END
+#elif defined ( __GNUC__ ) /* GCC */
+ #define THUMB .thumb
+ #define CODESECT .section .text
+ #define EXPORT .global
+ #define PROC :
+ #define LABEL :
+ #define ENDP
+ #define END
+
+ .syntax unified
+#endif
+
+ CODESECT
+ THUMB
+
+;/**
+; @brief In-place bit reversal function.
+; @param[in,out] pSrc points to the in-place buffer of unknown 32-bit data type
+; @param[in] bitRevLen bit reversal table length
+; @param[in] pBitRevTab points to bit reversal table
+; @return none
+; */
+ EXPORT arm_bitreversal_32
+ EXPORT arm_bitreversal_16
+
+#if defined ( __CC_ARM ) /* Keil */
+#elif defined ( __IASMARM__ ) /* IAR */
+#elif defined ( __CSMC__ ) /* Cosmic */
+#elif defined ( __TI_ARM__ ) /* TI ARM */
+#elif defined ( __GNUC__ ) /* GCC */
+ .type arm_bitreversal_16, %function
+ .type arm_bitreversal_32, %function
+#endif
+
+#if defined (ARM_MATH_CM0_FAMILY)
+
+;/* Cortex-M0 variant.                                                 */
+;/* Entry: r0 = pSrc, r1 = bitRevLen, r2 = pBitRevTab.                 */
+;/* r3 = (bitRevLen + 1) / 2 = number of table pairs; each pair holds  */
+;/* two offsets (NOTE(review): presumably byte offsets pre-scaled by   */
+;/* the table generator -- confirm) of the complex elements to swap.   */
+arm_bitreversal_32 PROC
+ ADDS r3,r1,#1
+ PUSH {r4-r6}
+ ADDS r1,r2,#0
+ LSRS r3,r3,#1
+arm_bitreversal_32_0 LABEL
+;/* Fetch one pair of offsets and form absolute element addresses.     */
+ LDRH r2,[r1,#2]
+ LDRH r6,[r1,#0]
+ ADD r2,r0,r2
+ ADD r6,r0,r6
+;/* Swap the two 32-bit real parts, then the two imaginary parts.      */
+ LDR r5,[r2,#0]
+ LDR r4,[r6,#0]
+ STR r5,[r6,#0]
+ STR r4,[r2,#0]
+ LDR r5,[r2,#4]
+ LDR r4,[r6,#4]
+ STR r5,[r6,#4]
+ STR r4,[r2,#4]
+;/* Advance past the two consumed halfword table entries.              */
+ ADDS r1,r1,#4
+ SUBS r3,r3,#1
+ BNE arm_bitreversal_32_0
+ POP {r4-r6}
+ BX lr
+ ENDP
+
+;/* Cortex-M0 variant, 16-bit data.                                    */
+;/* Entry: r0 = pSrc, r1 = bitRevLen, r2 = pBitRevTab.                 */
+;/* Table offsets are halved (LSRS #1) relative to the 32-bit variant  */
+;/* since each complex element occupies 4 bytes instead of 8; a single */
+;/* 32-bit load/store then swaps a whole complex q15 element.          */
+arm_bitreversal_16 PROC
+ ADDS r3,r1,#1
+ PUSH {r4-r6}
+ ADDS r1,r2,#0
+ LSRS r3,r3,#1
+arm_bitreversal_16_0 LABEL
+ LDRH r2,[r1,#2]
+ LDRH r6,[r1,#0]
+;/* Scale the table offsets down for 16-bit elements.                  */
+ LSRS r2,r2,#1
+ LSRS r6,r6,#1
+ ADD r2,r0,r2
+ ADD r6,r0,r6
+;/* One word swap exchanges the complex pair in a single transfer.     */
+ LDR r5,[r2,#0]
+ LDR r4,[r6,#0]
+ STR r5,[r6,#0]
+ STR r4,[r2,#0]
+ ADDS r1,r1,#4
+ SUBS r3,r3,#1
+ BNE arm_bitreversal_16_0
+ POP {r4-r6}
+ BX lr
+ ENDP
+
+#else
+
+;/* Thumb-2 variant, loop unrolled by 2 (two swaps per iteration).     */
+;/* Entry: r0 = pSrc, r1 = bitRevLen, r2 = pBitRevTab.                 */
+;/* Early-exits via BXLS when the pair count is zero.                  */
+arm_bitreversal_32 PROC
+ ADDS r3,r1,#1
+ CMP r3,#1
+ IT LS
+ BXLS lr
+ PUSH {r4-r9}
+ ADDS r1,r2,#2
+ LSRS r3,r3,#2
+arm_bitreversal_32_0 LABEL ;/* loop unrolled by 2 */
+;/* Load two offset pairs (r1 points into the middle of them).         */
+ LDRH r8,[r1,#4]
+ LDRH r9,[r1,#2]
+ LDRH r2,[r1,#0]
+ LDRH r12,[r1,#-2]
+ ADD r8,r0,r8
+ ADD r9,r0,r9
+ ADD r2,r0,r2
+ ADD r12,r0,r12
+;/* Swap real parts of both element pairs.                             */
+ LDR r7,[r9,#0]
+ LDR r6,[r8,#0]
+ LDR r5,[r2,#0]
+ LDR r4,[r12,#0]
+ STR r6,[r9,#0]
+ STR r7,[r8,#0]
+ STR r5,[r12,#0]
+ STR r4,[r2,#0]
+;/* Swap imaginary parts of both element pairs.                        */
+ LDR r7,[r9,#4]
+ LDR r6,[r8,#4]
+ LDR r5,[r2,#4]
+ LDR r4,[r12,#4]
+ STR r6,[r9,#4]
+ STR r7,[r8,#4]
+ STR r5,[r12,#4]
+ STR r4,[r2,#4]
+ ADDS r1,r1,#8
+ SUBS r3,r3,#1
+ BNE arm_bitreversal_32_0
+ POP {r4-r9}
+ BX lr
+ ENDP
+
+;/* Thumb-2 variant, 16-bit data, loop unrolled by 2.                  */
+;/* Entry: r0 = pSrc, r1 = bitRevLen, r2 = pBitRevTab.                 */
+;/* Offsets are halved inline via the LSR #1 in the ADD shift operand; */
+;/* one 32-bit transfer swaps a whole complex q15 element.             */
+arm_bitreversal_16 PROC
+ ADDS r3,r1,#1
+ CMP r3,#1
+ IT LS
+ BXLS lr
+ PUSH {r4-r9}
+ ADDS r1,r2,#2
+ LSRS r3,r3,#2
+arm_bitreversal_16_0 LABEL ;/* loop unrolled by 2 */
+ LDRH r8,[r1,#4]
+ LDRH r9,[r1,#2]
+ LDRH r2,[r1,#0]
+ LDRH r12,[r1,#-2]
+;/* Address = base + (offset >> 1), folding the 16-bit scaling in.     */
+ ADD r8,r0,r8,LSR #1
+ ADD r9,r0,r9,LSR #1
+ ADD r2,r0,r2,LSR #1
+ ADD r12,r0,r12,LSR #1
+ LDR r7,[r9,#0]
+ LDR r6,[r8,#0]
+ LDR r5,[r2,#0]
+ LDR r4,[r12,#0]
+ STR r6,[r9,#0]
+ STR r7,[r8,#0]
+ STR r5,[r12,#0]
+ STR r4,[r2,#0]
+ ADDS r1,r1,#8
+ SUBS r3,r3,#1
+ BNE arm_bitreversal_16_0
+ POP {r4-r9}
+ BX lr
+ ENDP
+
+#endif
+
+ END
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.c
new file mode 100644
index 0000000000..77fac1f834
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal2.c
@@ -0,0 +1,134 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_bitreversal2.c
+ * Description: Bitreversal functions
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2019 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+
+
+/**
+ @brief In-place 64 bit reversal function.
+ @param[in,out] pSrc points to in-place buffer of unknown 64-bit data type
+ @param[in] bitRevLen bit reversal table length
+ @param[in] pBitRevTab points to bit reversal table
+ @return none
+*/
+
+void arm_bitreversal_64(
+ uint64_t *pSrc,
+ const uint16_t bitRevLen,
+ const uint16_t *pBitRevTab)
+{
+ uint64_t a, b, i, tmp;
+
+ /* The table is consumed two entries at a time: each pair names the two
+ complex elements to exchange. NOTE(review): the >> 2 suggests table
+ entries are pre-scaled byte offsets -- confirm against the table
+ generator before relying on this. */
+ for (i = 0; i < bitRevLen; )
+ {
+ a = pBitRevTab[i ] >> 2;
+ b = pBitRevTab[i + 1] >> 2;
+
+ //real
+ tmp = pSrc[a];
+ pSrc[a] = pSrc[b];
+ pSrc[b] = tmp;
+
+ //complex
+ tmp = pSrc[a+1];
+ pSrc[a+1] = pSrc[b+1];
+ pSrc[b+1] = tmp;
+
+ /* advance to the next offset pair */
+ i += 2;
+ }
+}
+
+/**
+ @brief In-place 32 bit reversal function.
+ @param[in,out] pSrc points to in-place buffer of unknown 32-bit data type
+ @param[in] bitRevLen bit reversal table length
+ @param[in] pBitRevTab points to bit reversal table
+ @return none
+*/
+
+void arm_bitreversal_32(
+ uint32_t *pSrc,
+ const uint16_t bitRevLen,
+ const uint16_t *pBitRevTab)
+{
+ uint32_t a, b, i, tmp;
+
+ /* Two table entries per swap: the pair (a, b) selects the two complex
+ elements to exchange. NOTE(review): >> 2 implies the entries are
+ pre-scaled byte offsets -- confirm with the table generator. */
+ for (i = 0; i < bitRevLen; )
+ {
+ a = pBitRevTab[i ] >> 2;
+ b = pBitRevTab[i + 1] >> 2;
+
+ //real
+ tmp = pSrc[a];
+ pSrc[a] = pSrc[b];
+ pSrc[b] = tmp;
+
+ //complex
+ tmp = pSrc[a+1];
+ pSrc[a+1] = pSrc[b+1];
+ pSrc[b+1] = tmp;
+
+ /* advance to the next offset pair */
+ i += 2;
+ }
+}
+
+
+/**
+ @brief In-place 16 bit reversal function.
+ @param[in,out] pSrc points to in-place buffer of unknown 16-bit data type
+ @param[in] bitRevLen bit reversal table length
+ @param[in] pBitRevTab points to bit reversal table
+ @return none
+*/
+
+void arm_bitreversal_16(
+ uint16_t *pSrc,
+ const uint16_t bitRevLen,
+ const uint16_t *pBitRevTab)
+{
+ uint16_t a, b, i, tmp;
+
+ /* Same pairing scheme as the 32/64-bit variants; the indices from the
+ table address 16-bit elements here. NOTE(review): >> 2 implies
+ pre-scaled table offsets -- confirm with the table generator. */
+ for (i = 0; i < bitRevLen; )
+ {
+ a = pBitRevTab[i ] >> 2;
+ b = pBitRevTab[i + 1] >> 2;
+
+ //real
+ tmp = pSrc[a];
+ pSrc[a] = pSrc[b];
+ pSrc[b] = tmp;
+
+ //complex
+ tmp = pSrc[a+1];
+ pSrc[a+1] = pSrc[b+1];
+ pSrc[b+1] = tmp;
+
+ /* advance to the next offset pair */
+ i += 2;
+ }
+}
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal_f16.c
new file mode 100644
index 0000000000..2e6463682e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_bitreversal_f16.c
@@ -0,0 +1,102 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_bitreversal_f16.c
+ * Description: Bitreversal functions
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+
+/*
+* @brief In-place bit reversal function.
+* @param[in, out] *pSrc points to the in-place buffer of floating-point data type.
+* @param[in] fftSize length of the FFT.
+* @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table.
+* @param[in] *pBitRevTab points to the bit reversal table.
+* @return none.
+*/
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+void arm_bitreversal_f16(
+float16_t * pSrc,
+uint16_t fftSize,
+uint16_t bitRevFactor,
+const uint16_t * pBitRevTab)
+{
+ uint16_t fftLenBy2, fftLenBy2p1;
+ uint16_t i, j; /* i: linear complex index, j: bit-reversed partner */
+ float16_t in; /* scratch for the element swaps */
+
+ /* Initializations */
+ j = 0U;
+ fftLenBy2 = fftSize >> 1U;
+ fftLenBy2p1 = (fftSize >> 1U) + 1U;
+
+ /* Bit Reversal Implementation.
+ Indices i/j count complex elements; the 2U* factors map them onto the
+ interleaved {real, imag} float16 storage. */
+ for (i = 0U; i <= (fftLenBy2 - 2U); i += 2U)
+ {
+ /* Swap only when i < j so each pair is exchanged exactly once. */
+ if (i < j)
+ {
+ /* pSrc[i] <-> pSrc[j]; */
+ in = pSrc[2U * i];
+ pSrc[2U * i] = pSrc[2U * j];
+ pSrc[2U * j] = in;
+
+ /* pSrc[i+1U] <-> pSrc[j+1U] */
+ in = pSrc[(2U * i) + 1U];
+ pSrc[(2U * i) + 1U] = pSrc[(2U * j) + 1U];
+ pSrc[(2U * j) + 1U] = in;
+
+ /* pSrc[i+fftLenBy2p1] <-> pSrc[j+fftLenBy2p1] */
+ in = pSrc[2U * (i + fftLenBy2p1)];
+ pSrc[2U * (i + fftLenBy2p1)] = pSrc[2U * (j + fftLenBy2p1)];
+ pSrc[2U * (j + fftLenBy2p1)] = in;
+
+ /* pSrc[i+fftLenBy2p1+1U] <-> pSrc[j+fftLenBy2p1+1U] */
+ in = pSrc[(2U * (i + fftLenBy2p1)) + 1U];
+ pSrc[(2U * (i + fftLenBy2p1)) + 1U] =
+ pSrc[(2U * (j + fftLenBy2p1)) + 1U];
+ pSrc[(2U * (j + fftLenBy2p1)) + 1U] = in;
+
+ }
+
+ /* real parts: pSrc[i+1U] <-> pSrc[j+fftLenBy2] (unconditional swap) */
+ in = pSrc[2U * (i + 1U)];
+ pSrc[2U * (i + 1U)] = pSrc[2U * (j + fftLenBy2)];
+ pSrc[2U * (j + fftLenBy2)] = in;
+
+ /* imaginary parts: pSrc[i+1U] <-> pSrc[j+fftLenBy2] */
+ in = pSrc[(2U * (i + 1U)) + 1U];
+ pSrc[(2U * (i + 1U)) + 1U] = pSrc[(2U * (j + fftLenBy2)) + 1U];
+ pSrc[(2U * (j + fftLenBy2)) + 1U] = in;
+
+ /* Reading the index for the bit reversal */
+ /* (the 'j' for the next iteration is fetched one step ahead) */
+ j = *pBitRevTab;
+
+ /* Updating the bit reversal index depending on the fft length */
+ pBitRevTab += bitRevFactor;
+ }
+}
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f16.c
new file mode 100644
index 0000000000..239e0031ed
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f16.c
@@ -0,0 +1,842 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title:        arm_cfft_f16.c
+ * Description: Combined Radix Decimation in Frequency CFFT Floating point processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+#include "arm_common_tables_f16.h"
+
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_fft.h"
+#include "arm_mve_tables_f16.h"
+
+
+/* Return the 1/fftLen scale factor applied by the inverse transform.
+ Supported lengths are the powers of two from 16 to 4096; any other
+ input silently yields 1.0 (i.e. no scaling) via the default case. */
+static float16_t arm_inverse_fft_length_f16(uint16_t fftLen)
+{
+ float16_t retValue=1.0;
+
+ switch (fftLen)
+ {
+
+ case 4096U:
+ retValue = (float16_t)0.000244140625f;
+ break;
+
+ case 2048U:
+ retValue = (float16_t)0.00048828125f;
+ break;
+
+ case 1024U:
+ retValue = (float16_t)0.0009765625f;
+ break;
+
+ case 512U:
+ retValue = (float16_t)0.001953125f;
+ break;
+
+ case 256U:
+ retValue = (float16_t)0.00390625f;
+ break;
+
+ case 128U:
+ retValue = (float16_t)0.0078125f;
+ break;
+
+ case 64U:
+ retValue = (float16_t)0.015625f;
+ break;
+
+ case 32U:
+ retValue = (float16_t)0.03125f;
+ break;
+
+ case 16U:
+ retValue = (float16_t)0.0625f;
+ break;
+
+
+ default:
+ /* unsupported length: fall back to unity scaling */
+ break;
+ }
+ return(retValue);
+}
+
+
+/* Helium (MVE) radix-4 decimation-in-frequency core for the forward f16
+ CFFT. Runs the cascaded radix-4 stages with per-stage rearranged twiddle
+ tables, then a final twiddle-free radix-4 stage using gather/scatter
+ addressing. Bit reversal, if requested, is applied by the caller. */
+static void _arm_radix4_butterfly_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen)
+{
+ f16x8_t vecTmp0, vecTmp1;
+ f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
+ f16x8_t vecA, vecB, vecC, vecD;
+ uint32_t blkCnt;
+ uint32_t n1, n2;
+ uint32_t stage = 0;
+ int32_t iter = 1;
+ /* Relative byte offsets for the last-stage gather/scatter; they are
+ applied against a base pointer that is post-incremented by 64 each
+ iteration (hence the -16-element bias). NOTE(review): the scaling by
+ sizeof(float16_t *) (pointer size) looks deliberate for 32-bit lane
+ pairs -- confirm against the f32 reference implementation. */
+ static const uint32_t strides[4] =
+ {(0 - 16) * sizeof(float16_t *)
+ , (4 - 16) * sizeof(float16_t *)
+ , (8 - 16) * sizeof(float16_t *)
+ , (12 - 16) * sizeof(float16_t *)};
+
+ n2 = fftLen;
+ n1 = n2;
+ n2 >>= 2u;
+ /* All radix-4 stages except the last one (k counts remaining length/4);
+ each stage runs 'iter' independent sub-transforms of length n1. */
+ for (int k = fftLen / 4u; k > 1; k >>= 2)
+ {
+ for (int i = 0; i < iter; i++)
+ {
+ float16_t const *p_rearranged_twiddle_tab_stride1 =
+ &S->rearranged_twiddle_stride1[
+ S->rearranged_twiddle_tab_stride1_arr[stage]];
+ float16_t const *p_rearranged_twiddle_tab_stride2 =
+ &S->rearranged_twiddle_stride2[
+ S->rearranged_twiddle_tab_stride2_arr[stage]];
+ float16_t const *p_rearranged_twiddle_tab_stride3 =
+ &S->rearranged_twiddle_stride3[
+ S->rearranged_twiddle_tab_stride3_arr[stage]];
+ float16_t const *pW1, *pW2, *pW3;
+ /* Four quarter-length input legs of this sub-transform. */
+ float16_t *inA = pSrc + CMPLX_DIM * i * n1;
+ float16_t *inB = inA + n2 * CMPLX_DIM;
+ float16_t *inC = inB + n2 * CMPLX_DIM;
+ float16_t *inD = inC + n2 * CMPLX_DIM;
+ f16x8_t vecW;
+
+
+ pW1 = p_rearranged_twiddle_tab_stride1;
+ pW2 = p_rearranged_twiddle_tab_stride2;
+ pW3 = p_rearranged_twiddle_tab_stride3;
+
+ blkCnt = n2 / 4;
+ /*
+ * load 2 f16 complex pair
+ */
+ vecA = vldrhq_f16(inA);
+ vecC = vldrhq_f16(inC);
+ while (blkCnt > 0U)
+ {
+ vecB = vldrhq_f16(inB);
+ vecD = vldrhq_f16(inD);
+
+ vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
+ vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
+
+ vecSum1 = vecB + vecD;
+ vecDiff1 = vecB - vecD;
+ /*
+ * [ 1 1 1 1 ] * [ A B C D ]' .* 1
+ */
+ vecTmp0 = vecSum0 + vecSum1;
+ vst1q(inA, vecTmp0);
+ inA += 8;
+
+ /*
+ * [ 1 -1 1 -1 ] * [ A B C D ]'
+ */
+ vecTmp0 = vecSum0 - vecSum1;
+ /*
+ * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
+ */
+ vecW = vld1q(pW2);
+ pW2 += 8;
+ vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
+ vst1q(inB, vecTmp1);
+ inB += 8;
+
+ /*
+ * [ 1 -i -1 +i ] * [ A B C D ]'
+ */
+ vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
+ /*
+ * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
+ */
+ vecW = vld1q(pW1);
+ pW1 +=8;
+ vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
+ vst1q(inC, vecTmp1);
+ inC += 8;
+
+ /*
+ * [ 1 +i -1 -i ] * [ A B C D ]'
+ */
+ vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
+ /*
+ * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
+ */
+ vecW = vld1q(pW3);
+ pW3 += 8;
+ vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
+ vst1q(inD, vecTmp1);
+ inD += 8;
+
+ /* software-pipelined reload for the next iteration */
+ vecA = vldrhq_f16(inA);
+ vecC = vldrhq_f16(inC);
+
+ blkCnt--;
+ }
+ }
+ n1 = n2;
+ n2 >>= 2u;
+ iter = iter << 2;
+ stage++;
+ }
+
+ /*
+ * start of Last stage process
+ */
+ uint32x4_t vecScGathAddr = vld1q_u32(strides);
+ vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
+
+ /* load scheduling */
+ vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
+ vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
+
+ blkCnt = (fftLen >> 4);
+ while (blkCnt > 0U)
+ {
+ vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
+ vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
+
+ vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4);
+ vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12);
+
+ vecSum1 = vecB + vecD;
+ vecDiff1 = vecB - vecD;
+
+ /* pre-load for next iteration */
+ vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
+ vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
+
+ /* scatter the four butterfly outputs back (base already advanced,
+ hence the -64 rewind in the offsets) */
+ vecTmp0 = vecSum0 + vecSum1;
+ vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0);
+
+ vecTmp0 = vecSum0 - vecSum1;
+ vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0);
+
+ vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
+ vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0);
+
+ vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
+ vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0);
+
+ blkCnt--;
+ }
+
+ /*
+ * End of last stage process
+ */
+}
+
+/* Forward f16 CFFT for lengths that are not powers of 4 (32, 128, ...):
+ one vectorized radix-2 stage splits the signal in half, then each half
+ is finished with the radix-4 MVE core. */
+static void arm_cfft_radix4by2_f16_mve(const arm_cfft_instance_f16 * S, float16_t *pSrc, uint32_t fftLen)
+{
+ float16_t const *pCoefVec;
+ float16_t const *pCoef = S->pTwiddle;
+ float16_t *pIn0, *pIn1;
+ uint32_t n2;
+ uint32_t blkCnt;
+ f16x8_t vecIn0, vecIn1, vecSum, vecDiff;
+ f16x8_t vecCmplxTmp, vecTw;
+
+
+ n2 = fftLen >> 1;
+ pIn0 = pSrc;
+ pIn1 = pSrc + fftLen; /* second half (fftLen float16 = n2 complex) */
+ pCoefVec = pCoef;
+
+ /* radix-2 stage: 4 complex f16 lanes per vector iteration */
+ blkCnt = n2 / 4;
+ while (blkCnt > 0U)
+ {
+ vecIn0 = *(f16x8_t *) pIn0;
+ vecIn1 = *(f16x8_t *) pIn1;
+ vecTw = vld1q(pCoefVec);
+ pCoefVec += 8;
+
+ vecSum = vaddq(vecIn0, vecIn1);
+ vecDiff = vsubq(vecIn0, vecIn1);
+
+ /* lower leg gets (in0 - in1) * conj-style twiddle */
+ vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);
+
+ vst1q(pIn0, vecSum);
+ pIn0 += 8;
+ vst1q(pIn1, vecCmplxTmp);
+ pIn1 += 8;
+
+ blkCnt--;
+ }
+
+ /* finish each half-length transform with the radix-4 core */
+ _arm_radix4_butterfly_f16_mve(S, pSrc, n2);
+
+ _arm_radix4_butterfly_f16_mve(S, pSrc + fftLen, n2);
+
+ /* NOTE(review): dead store -- pIn0 is not read after this point. */
+ pIn0 = pSrc;
+}
+
+/* Helium (MVE) radix-4 core for the INVERSE f16 CFFT. Mirrors the forward
+ core but uses non-conjugate complex multiplies, swaps the +i/-i legs,
+ and folds the 1/fftLen scaling (onebyfftLen) into the final stage. */
+static void _arm_radix4_butterfly_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t * pSrc, uint32_t fftLen, float16_t onebyfftLen)
+{
+ f16x8_t vecTmp0, vecTmp1;
+ f16x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
+ f16x8_t vecA, vecB, vecC, vecD;
+ f16x8_t vecW;
+ uint32_t blkCnt;
+ uint32_t n1, n2;
+ uint32_t stage = 0;
+ int32_t iter = 1;
+ /* Relative byte offsets for the last-stage gather/scatter (base is
+ post-incremented by 64 per iteration, hence the -16-element bias). */
+ static const uint32_t strides[4] = {
+ (0 - 16) * sizeof(q31_t *),
+ (4 - 16) * sizeof(q31_t *),
+ (8 - 16) * sizeof(q31_t *),
+ (12 - 16) * sizeof(q31_t *)
+ };
+
+ n2 = fftLen;
+ n1 = n2;
+ n2 >>= 2u;
+ /* all radix-4 stages except the final one */
+ for (int k = fftLen / 4; k > 1; k >>= 2)
+ {
+ for (int i = 0; i < iter; i++)
+ {
+ float16_t const *p_rearranged_twiddle_tab_stride1 =
+ &S->rearranged_twiddle_stride1[
+ S->rearranged_twiddle_tab_stride1_arr[stage]];
+ float16_t const *p_rearranged_twiddle_tab_stride2 =
+ &S->rearranged_twiddle_stride2[
+ S->rearranged_twiddle_tab_stride2_arr[stage]];
+ float16_t const *p_rearranged_twiddle_tab_stride3 =
+ &S->rearranged_twiddle_stride3[
+ S->rearranged_twiddle_tab_stride3_arr[stage]];
+ float16_t const *pW1, *pW2, *pW3;
+ /* four quarter-length legs of this sub-transform */
+ float16_t *inA = pSrc + CMPLX_DIM * i * n1;
+ float16_t *inB = inA + n2 * CMPLX_DIM;
+ float16_t *inC = inB + n2 * CMPLX_DIM;
+ float16_t *inD = inC + n2 * CMPLX_DIM;
+
+ pW1 = p_rearranged_twiddle_tab_stride1;
+ pW2 = p_rearranged_twiddle_tab_stride2;
+ pW3 = p_rearranged_twiddle_tab_stride3;
+
+ blkCnt = n2 / 4;
+ /*
+ * load 2 f32 complex pair
+ */
+ vecA = vldrhq_f16(inA);
+ vecC = vldrhq_f16(inC);
+ while (blkCnt > 0U)
+ {
+ vecB = vldrhq_f16(inB);
+ vecD = vldrhq_f16(inD);
+
+ vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
+ vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
+
+ vecSum1 = vecB + vecD;
+ vecDiff1 = vecB - vecD;
+ /*
+ * [ 1 1 1 1 ] * [ A B C D ]' .* 1
+ */
+ vecTmp0 = vecSum0 + vecSum1;
+ vst1q(inA, vecTmp0);
+ inA += 8;
+ /*
+ * [ 1 -1 1 -1 ] * [ A B C D ]'
+ */
+ vecTmp0 = vecSum0 - vecSum1;
+ /*
+ * [ 1 -1 1 -1 ] * [ A B C D ]'.* W1
+ */
+ vecW = vld1q(pW2);
+ pW2 += 8;
+ vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
+ vst1q(inB, vecTmp1);
+ inB += 8;
+
+ /*
+ * [ 1 -i -1 +i ] * [ A B C D ]'
+ */
+ vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
+ /*
+ * [ 1 -i -1 +i ] * [ A B C D ]'.* W2
+ */
+ vecW = vld1q(pW1);
+ pW1 += 8;
+ vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
+ vst1q(inC, vecTmp1);
+ inC += 8;
+
+ /*
+ * [ 1 +i -1 -i ] * [ A B C D ]'
+ */
+ vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
+ /*
+ * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
+ */
+ vecW = vld1q(pW3);
+ pW3 += 8;
+ vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
+ vst1q(inD, vecTmp1);
+ inD += 8;
+
+ /* software-pipelined reload for the next iteration */
+ vecA = vldrhq_f16(inA);
+ vecC = vldrhq_f16(inC);
+
+ blkCnt--;
+ }
+ }
+ n1 = n2;
+ n2 >>= 2u;
+ iter = iter << 2;
+ stage++;
+ }
+
+ /*
+ * start of Last stage process
+ */
+ uint32x4_t vecScGathAddr = vld1q_u32(strides);
+ vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
+
+ /*
+ * load scheduling
+ */
+ vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
+ vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
+
+ blkCnt = (fftLen >> 4);
+ while (blkCnt > 0U)
+ {
+ vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
+ vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
+
+ vecB = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 4);
+ vecD = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 12);
+
+ vecSum1 = vecB + vecD;
+ vecDiff1 = vecB - vecD;
+
+ /* pre-load for the next iteration */
+ vecA = (f16x8_t)vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
+ vecC = (f16x8_t)vldrwq_gather_base_f32(vecScGathAddr, 8);
+
+ /* each output leg is scaled by 1/fftLen before the scatter-store */
+ vecTmp0 = vecSum0 + vecSum1;
+ vecTmp0 = vecTmp0 * onebyfftLen;
+ vstrwq_scatter_base_f32(vecScGathAddr, -64, (f32x4_t)vecTmp0);
+
+ vecTmp0 = vecSum0 - vecSum1;
+ vecTmp0 = vecTmp0 * onebyfftLen;
+ vstrwq_scatter_base_f32(vecScGathAddr, -64 + 4, (f32x4_t)vecTmp0);
+
+ vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
+ vecTmp0 = vecTmp0 * onebyfftLen;
+ vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, (f32x4_t)vecTmp0);
+
+ vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
+ vecTmp0 = vecTmp0 * onebyfftLen;
+ vstrwq_scatter_base_f32(vecScGathAddr, -64 + 12, (f32x4_t)vecTmp0);
+
+ blkCnt--;
+ }
+
+ /*
+ * End of last stage process
+ */
+}
+
+/* Inverse f16 CFFT for lengths that are not powers of 4 (32, 128, ...):
+ one vectorized radix-2 stage (non-conjugate twiddle multiply), then each
+ half is finished by the inverse radix-4 core, which also applies the
+ 1/fftLen scaling. */
+static void arm_cfft_radix4by2_inverse_f16_mve(const arm_cfft_instance_f16 * S,float16_t *pSrc, uint32_t fftLen)
+{
+ float16_t const *pCoefVec;
+ float16_t const *pCoef = S->pTwiddle;
+ float16_t *pIn0, *pIn1;
+ uint32_t n2;
+ float16_t onebyfftLen = arm_inverse_fft_length_f16(fftLen);
+ uint32_t blkCnt;
+ f16x8_t vecIn0, vecIn1, vecSum, vecDiff;
+ f16x8_t vecCmplxTmp, vecTw;
+
+
+ n2 = fftLen >> 1;
+ pIn0 = pSrc;
+ pIn1 = pSrc + fftLen; /* second half (fftLen float16 = n2 complex) */
+ pCoefVec = pCoef;
+
+ /* radix-2 stage: 4 complex f16 lanes per vector iteration */
+ blkCnt = n2 / 4;
+ while (blkCnt > 0U)
+ {
+ vecIn0 = *(f16x8_t *) pIn0;
+ vecIn1 = *(f16x8_t *) pIn1;
+ vecTw = vld1q(pCoefVec);
+ pCoefVec += 8;
+
+ vecSum = vaddq(vecIn0, vecIn1);
+ vecDiff = vsubq(vecIn0, vecIn1);
+
+ /* inverse direction: plain (non-conjugate) complex multiply */
+ vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff);
+
+ vst1q(pIn0, vecSum);
+ pIn0 += 8;
+ vst1q(pIn1, vecCmplxTmp);
+ pIn1 += 8;
+
+ blkCnt--;
+ }
+
+ _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, n2, onebyfftLen);
+
+ _arm_radix4_butterfly_inverse_f16_mve(S, pSrc + fftLen, n2, onebyfftLen);
+}
+
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Processing function for the floating-point complex FFT.
+ @param[in] S points to an instance of the floating-point CFFT structure
+ @param[in,out] p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return none
+ */
+
+
+void arm_cfft_f16(
+ const arm_cfft_instance_f16 * S,
+ float16_t * pSrc,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ uint32_t fftLen = S->fftLen;
+
+ /* Dispatch by transform direction and length class: powers of 4 go
+ straight to the radix-4 core, other powers of 2 through the
+ radix-4-by-2 wrapper. NOTE(review): lengths outside 16..4096 fall
+ through both switches silently (no default) and leave pSrc
+ unprocessed -- callers must use a valid initialized instance. */
+ if (ifftFlag == 1U) {
+
+ switch (fftLen) {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ _arm_radix4_butterfly_inverse_f16_mve(S, pSrc, fftLen, arm_inverse_fft_length_f16(S->fftLen));
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_inverse_f16_mve(S, pSrc, fftLen);
+ break;
+ }
+ } else {
+ switch (fftLen) {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ _arm_radix4_butterfly_f16_mve(S, pSrc, fftLen);
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_f16_mve(S, pSrc, fftLen);
+ break;
+ }
+ }
+
+
+ /* The butterfly cores leave data in bit-reversed order; undo it here
+ when the caller asks for natural-order output. */
+ if (bitReverseFlag)
+ {
+
+ arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
+
+ }
+}
+
+#else
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+extern void arm_bitreversal_16(
+ uint16_t * pSrc,
+ const uint16_t bitRevLen,
+ const uint16_t * pBitRevTable);
+
+
+extern void arm_cfft_radix4by2_f16(
+ float16_t * pSrc,
+ uint32_t fftLen,
+ const float16_t * pCoef);
+
+extern void arm_radix4_butterfly_f16(
+ float16_t * pSrc,
+ uint16_t fftLen,
+ const float16_t * pCoef,
+ uint16_t twidCoefModifier);
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @defgroup ComplexFFT Complex FFT Functions
+
+ @par
+ The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
+ Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
+ than the DFT, especially for long lengths.
+ The algorithms described in this section
+ operate on complex data. A separate set of functions is devoted to handling
+ of real sequences.
+ @par
+ There are separate algorithms for handling floating-point, Q15, and Q31 data
+ types. The algorithms available for each data type are described next.
+ @par
+ The FFT functions operate in-place. That is, the array holding the input data
+ will also be used to hold the corresponding result. The input data is complex
+ and contains 2*fftLen interleaved values as shown below.
+ {real[0], imag[0], real[1], imag[1], ...}
+ The FFT result will be contained in the same array and the frequency domain
+ values will have the same interleaving.
+
+ @par Floating-point
+ The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
+ stages are performed along with a single radix-2 or radix-4 stage, as needed.
+ The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
+ a different twiddle factor table.
+ @par
+ The function uses the standard FFT definition and output values may grow by a
+ factor of fftLen when computing the forward transform. The
+ inverse transform includes a scale of 1/fftLen as part of the
+ calculation and this matches the textbook definition of the inverse FFT.
+ @par
+ For the MVE version, the new arm_cfft_init_f32 initialization function is
+ mandatory. Compilation flags are available to include only the required tables for the
+ needed FFTs. Other FFT versions can continue to be initialized as
+ explained below.
+ @par
+ For not MVE versions, pre-initialized data structures containing twiddle factors
+ and bit reversal tables are provided and defined in arm_const_structs.h. Include
+ this header in your function and then pass one of the constant structures as
+ an argument to arm_cfft_f32. For example:
+ @par
+ arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)
+ @par
+ computes a 64-point inverse complex FFT including bit reversal.
+ The data structures are treated as constant data and not modified during the
+ calculation. The same data structure can be reused for multiple transforms
+ including mixing forward and inverse transforms.
+ @par
+ Earlier releases of the library provided separate radix-2 and radix-4
+ algorithms that operated on floating-point data. These functions are still
+ provided but are deprecated. The older functions are slower and less general
+ than the new functions.
+ @par
+ An example of initialization of the constants for the arm_cfft_f32 function follows:
+ @code
+ const static arm_cfft_instance_f32 *S;
+ ...
+ switch (length) {
+ case 16:
+ S = &arm_cfft_sR_f32_len16;
+ break;
+ case 32:
+ S = &arm_cfft_sR_f32_len32;
+ break;
+ case 64:
+ S = &arm_cfft_sR_f32_len64;
+ break;
+ case 128:
+ S = &arm_cfft_sR_f32_len128;
+ break;
+ case 256:
+ S = &arm_cfft_sR_f32_len256;
+ break;
+ case 512:
+ S = &arm_cfft_sR_f32_len512;
+ break;
+ case 1024:
+ S = &arm_cfft_sR_f32_len1024;
+ break;
+ case 2048:
+ S = &arm_cfft_sR_f32_len2048;
+ break;
+ case 4096:
+ S = &arm_cfft_sR_f32_len4096;
+ break;
+ }
+ @endcode
+ @par
+ The new arm_cfft_init_f32 can also be used.
+ @par Q15 and Q31
+ The fixed-point complex FFT uses a mixed-radix algorithm. Multiple radix-4
+ stages are performed along with a single radix-2 stage, as needed.
+ The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
+ a different twiddle factor table.
+ @par
+ The function uses the standard FFT definition and output values may grow by a
+ factor of fftLen when computing the forward transform. The
+ inverse transform includes a scale of 1/fftLen as part of the
+ calculation and this matches the textbook definition of the inverse FFT.
+ @par
+ Pre-initialized data structures containing twiddle factors and bit reversal
+ tables are provided and defined in arm_const_structs.h. Include
+ this header in your function and then pass one of the constant structures as
+ an argument to arm_cfft_q31. For example:
+ @par
+ arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)
+ @par
+ computes a 64-point inverse complex FFT including bit reversal.
+ The data structures are treated as constant data and not modified during the
+ calculation. The same data structure can be reused for multiple transforms
+ including mixing forward and inverse transforms.
+ @par
+ Earlier releases of the library provided separate radix-2 and radix-4
+ algorithms that operated on fixed-point data. These functions are still
+ provided but are deprecated. The older functions are slower and less general
+ than the new functions.
+ @par
+ An example of initialization of the constants for the arm_cfft_q31 function follows:
+ @code
+ const static arm_cfft_instance_q31 *S;
+ ...
+ switch (length) {
+ case 16:
+ S = &arm_cfft_sR_q31_len16;
+ break;
+ case 32:
+ S = &arm_cfft_sR_q31_len32;
+ break;
+ case 64:
+ S = &arm_cfft_sR_q31_len64;
+ break;
+ case 128:
+ S = &arm_cfft_sR_q31_len128;
+ break;
+ case 256:
+ S = &arm_cfft_sR_q31_len256;
+ break;
+ case 512:
+ S = &arm_cfft_sR_q31_len512;
+ break;
+ case 1024:
+ S = &arm_cfft_sR_q31_len1024;
+ break;
+ case 2048:
+ S = &arm_cfft_sR_q31_len2048;
+ break;
+ case 4096:
+ S = &arm_cfft_sR_q31_len4096;
+ break;
+ }
+ @endcode
+
+ */
+
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Processing function for the floating-point complex FFT.
+ @param[in] S points to an instance of the floating-point CFFT structure
+ @param[in,out] p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return none
+ */
+
+void arm_cfft_f16(
+ const arm_cfft_instance_f16 * S,
+ float16_t * p1,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ uint32_t L = S->fftLen, l;
+ float16_t invL, * pSrc;
+
+ if (ifftFlag == 1U)
+ {
+ /* Conjugate input data */
+ pSrc = p1 + 1;
+ for(l=0; lpTwiddle, 1U);
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_f16 ( p1, L, (float16_t*)S->pTwiddle);
+ break;
+
+ }
+
+ if ( bitReverseFlag )
+ arm_bitreversal_16((uint16_t*)p1, S->bitRevLength,(uint16_t*)S->pBitRevTable);
+
+ if (ifftFlag == 1U)
+ {
+ invL = 1.0f/(float16_t)L;
+ /* Conjugate and scale output data */
+ pSrc = p1;
+ for(l=0; l>= 2u;
+ for (int k = fftLen / 4u; k > 1; k >>= 2)
+ {
+ for (int i = 0; i < iter; i++)
+ {
+ float32_t const *p_rearranged_twiddle_tab_stride1 =
+ &S->rearranged_twiddle_stride1[
+ S->rearranged_twiddle_tab_stride1_arr[stage]];
+ float32_t const *p_rearranged_twiddle_tab_stride2 =
+ &S->rearranged_twiddle_stride2[
+ S->rearranged_twiddle_tab_stride2_arr[stage]];
+ float32_t const *p_rearranged_twiddle_tab_stride3 =
+ &S->rearranged_twiddle_stride3[
+ S->rearranged_twiddle_tab_stride3_arr[stage]];
+ float32_t const *pW1, *pW2, *pW3;
+ float32_t *inA = pSrc + CMPLX_DIM * i * n1;
+ float32_t *inB = inA + n2 * CMPLX_DIM;
+ float32_t *inC = inB + n2 * CMPLX_DIM;
+ float32_t *inD = inC + n2 * CMPLX_DIM;
+ f32x4_t vecW;
+
+
+ pW1 = p_rearranged_twiddle_tab_stride1;
+ pW2 = p_rearranged_twiddle_tab_stride2;
+ pW3 = p_rearranged_twiddle_tab_stride3;
+
+ blkCnt = n2 / 2;
+ /*
+ * load 2 f32 complex pair
+ */
+ vecA = vldrwq_f32(inA);
+ vecC = vldrwq_f32(inC);
+ while (blkCnt > 0U)
+ {
+ vecB = vldrwq_f32(inB);
+ vecD = vldrwq_f32(inD);
+
+ vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
+ vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
+
+ vecSum1 = vecB + vecD;
+ vecDiff1 = vecB - vecD;
+ /*
+ * [ 1 1 1 1 ] * [ A B C D ]' .* 1
+ */
+ vecTmp0 = vecSum0 + vecSum1;
+ vst1q(inA, vecTmp0);
+ inA += 4;
+
+ /*
+ * [ 1 -1 1 -1 ] * [ A B C D ]'
+ */
+ vecTmp0 = vecSum0 - vecSum1;
+ /*
+ * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
+ */
+ vecW = vld1q(pW2);
+ pW2 += 4;
+ vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
+ vst1q(inB, vecTmp1);
+ inB += 4;
+
+ /*
+ * [ 1 -i -1 +i ] * [ A B C D ]'
+ */
+ vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
+ /*
+ * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
+ */
+ vecW = vld1q(pW1);
+ pW1 +=4;
+ vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
+ vst1q(inC, vecTmp1);
+ inC += 4;
+
+ /*
+ * [ 1 +i -1 -i ] * [ A B C D ]'
+ */
+ vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
+ /*
+ * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
+ */
+ vecW = vld1q(pW3);
+ pW3 += 4;
+ vecTmp1 = MVE_CMPLX_MULT_FLT_Conj_AxB(vecW, vecTmp0);
+ vst1q(inD, vecTmp1);
+ inD += 4;
+
+ vecA = vldrwq_f32(inA);
+ vecC = vldrwq_f32(inC);
+
+ blkCnt--;
+ }
+ }
+ n1 = n2;
+ n2 >>= 2u;
+ iter = iter << 2;
+ stage++;
+ }
+
+ /*
+ * start of Last stage process
+ */
+ uint32x4_t vecScGathAddr = vld1q_u32(strides);
+ vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
+
+ /* load scheduling */
+ vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
+ vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);
+
+ blkCnt = (fftLen >> 3);
+ while (blkCnt > 0U)
+ {
+ vecSum0 = vecA + vecC; /* vecSum0 = vaddq(vecA, vecC) */
+ vecDiff0 = vecA - vecC; /* vecSum0 = vsubq(vecA, vecC) */
+
+ vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
+ vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);
+
+ vecSum1 = vecB + vecD;
+ vecDiff1 = vecB - vecD;
+
+ /* pre-load for next iteration */
+ vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
+ vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);
+
+ vecTmp0 = vecSum0 + vecSum1;
+ vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);
+
+ vecTmp0 = vecSum0 - vecSum1;
+ vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);
+
+ vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
+ vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);
+
+ vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
+ vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);
+
+ blkCnt--;
+ }
+
+ /*
+ * End of last stage process
+ */
+}
+
+static void arm_cfft_radix4by2_f32_mve(const arm_cfft_instance_f32 * S, float32_t *pSrc, uint32_t fftLen)
+{
+ float32_t const *pCoefVec;
+ float32_t const *pCoef = S->pTwiddle;
+ float32_t *pIn0, *pIn1;
+ uint32_t n2;
+ uint32_t blkCnt;
+ f32x4_t vecIn0, vecIn1, vecSum, vecDiff;
+ f32x4_t vecCmplxTmp, vecTw;
+
+
+ n2 = fftLen >> 1;
+ pIn0 = pSrc;
+ pIn1 = pSrc + fftLen;
+ pCoefVec = pCoef;
+
+ blkCnt = n2 / 2;
+ while (blkCnt > 0U)
+ {
+ vecIn0 = *(f32x4_t *) pIn0;
+ vecIn1 = *(f32x4_t *) pIn1;
+ vecTw = vld1q(pCoefVec);
+ pCoefVec += 4;
+
+ vecSum = vecIn0 + vecIn1;
+ vecDiff = vecIn0 - vecIn1;
+
+ vecCmplxTmp = MVE_CMPLX_MULT_FLT_Conj_AxB(vecTw, vecDiff);
+
+ vst1q(pIn0, vecSum);
+ pIn0 += 4;
+ vst1q(pIn1, vecCmplxTmp);
+ pIn1 += 4;
+
+ blkCnt--;
+ }
+
+ _arm_radix4_butterfly_f32_mve(S, pSrc, n2);
+
+ _arm_radix4_butterfly_f32_mve(S, pSrc + fftLen, n2);
+
+ pIn0 = pSrc;
+}
+
/**
  @brief         In-place radix-4 decimation-in-frequency inverse CFFT core (Helium/MVE).
  @param[in]     S            points to the CFFT instance holding the rearranged twiddle tables
  @param[in,out] pSrc         points to the interleaved complex buffer (2*fftLen float32 values)
  @param[in]     fftLen       transform length handled by this call
  @param[in]     onebyfftLen  1/fftLen scale factor applied to the outputs in the last stage
  @return        none

  NOTE(review): output appears to be left in bit-reversed order (callers invoke a
  bit-reversal pass afterwards) — confirm against arm_cfft_f32 below.
 */
static void _arm_radix4_butterfly_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t * pSrc, uint32_t fftLen, float32_t onebyfftLen)
{
    f32x4_t vecTmp0, vecTmp1;
    f32x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
    f32x4_t vecA, vecB, vecC, vecD;
    f32x4_t vecW;
    uint32_t blkCnt;
    uint32_t n1, n2;        /* group size / quarter-group size of the current stage */
    uint32_t stage = 0;     /* index into the per-stage twiddle offset arrays */
    int32_t iter = 1;       /* number of butterfly groups in the current stage */
    /* Byte offsets for the gather/scatter last stage. They are biased so the
       +64-byte write-back on the loads is compensated by the -64 offsets used
       in the scatter stores below. */
    static const uint32_t strides[4] = {
        (0 - 16) * sizeof(q31_t *),
        (1 - 16) * sizeof(q31_t *),
        (8 - 16) * sizeof(q31_t *),
        (9 - 16) * sizeof(q31_t *)
    };

    n2 = fftLen;
    n1 = n2;
    n2 >>= 2u;
    /* all stages except the final one */
    for (int k = fftLen / 4; k > 1; k >>= 2)
    {
        for (int i = 0; i < iter; i++)
        {
            /* per-stage twiddle sub-tables, pre-rearranged for linear vector loads */
            float32_t const *p_rearranged_twiddle_tab_stride1 =
                &S->rearranged_twiddle_stride1[
                S->rearranged_twiddle_tab_stride1_arr[stage]];
            float32_t const *p_rearranged_twiddle_tab_stride2 =
                &S->rearranged_twiddle_stride2[
                S->rearranged_twiddle_tab_stride2_arr[stage]];
            float32_t const *p_rearranged_twiddle_tab_stride3 =
                &S->rearranged_twiddle_stride3[
                S->rearranged_twiddle_tab_stride3_arr[stage]];
            float32_t const *pW1, *pW2, *pW3;
            /* four quarter sub-blocks of the current butterfly group */
            float32_t *inA = pSrc + CMPLX_DIM * i * n1;
            float32_t *inB = inA + n2 * CMPLX_DIM;
            float32_t *inC = inB + n2 * CMPLX_DIM;
            float32_t *inD = inC + n2 * CMPLX_DIM;

            pW1 = p_rearranged_twiddle_tab_stride1;
            pW2 = p_rearranged_twiddle_tab_stride2;
            pW3 = p_rearranged_twiddle_tab_stride3;

            blkCnt = n2 / 2;
            /*
             * load 2 f32 complex pair
             */
            vecA = vldrwq_f32(inA);
            vecC = vldrwq_f32(inC);
            while (blkCnt > 0U)
            {
                vecB = vldrwq_f32(inB);
                vecD = vldrwq_f32(inD);

                vecSum0 = vecA + vecC;    /* vecSum0 = vaddq(vecA, vecC) */
                vecDiff0 = vecA - vecC;   /* vecDiff0 = vsubq(vecA, vecC) */

                vecSum1 = vecB + vecD;
                vecDiff1 = vecB - vecD;
                /*
                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
                 */
                vecTmp0 = vecSum0 + vecSum1;
                vst1q(inA, vecTmp0);
                inA += 4;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]'
                 */
                vecTmp0 = vecSum0 - vecSum1;
                /*
                 * [ 1 -1 1 -1 ] * [ A B C D ]' .* W2
                 * (inverse kernel: non-conjugated complex multiply)
                 */
                vecW = vld1q(pW2);
                pW2 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inB, vecTmp1);
                inB += 4;

                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 +i -1 -i ] * [ A B C D ]' .* W1
                 */
                vecW = vld1q(pW1);
                pW1 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inC, vecTmp1);
                inC += 4;

                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]'
                 */
                vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
                /*
                 * [ 1 -i -1 +i ] * [ A B C D ]' .* W3
                 */
                vecW = vld1q(pW3);
                pW3 += 4;
                vecTmp1 = MVE_CMPLX_MULT_FLT_AxB(vecW, vecTmp0);
                vst1q(inD, vecTmp1);
                inD += 4;

                /* software pipelining: pre-load A/C for the next iteration */
                vecA = vldrwq_f32(inA);
                vecC = vldrwq_f32(inC);

                blkCnt--;
            }
        }
        n1 = n2;
        n2 >>= 2u;
        iter = iter << 2;   /* 4x more (smaller) groups in the next stage */
        stage++;
    }

    /*
     * start of Last stage process
     * (unit twiddles; uses gather loads / scatter stores, and applies the
     *  1/fftLen inverse-FFT scaling)
     */
    uint32x4_t vecScGathAddr = vld1q_u32 (strides);
    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;

    /*
     * load scheduling
     */
    vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
    vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

    blkCnt = (fftLen >> 3);   /* 2 radix-4 butterflies (8 complex samples) per pass */
    while (blkCnt > 0U)
    {
        vecSum0 = vecA + vecC;    /* vecSum0 = vaddq(vecA, vecC) */
        vecDiff0 = vecA - vecC;   /* vecDiff0 = vsubq(vecA, vecC) */

        vecB = vldrwq_gather_base_f32(vecScGathAddr, 8);
        vecD = vldrwq_gather_base_f32(vecScGathAddr, 24);

        vecSum1 = vecB + vecD;
        vecDiff1 = vecB - vecD;

        /* pre-load for next iteration */
        vecA = vldrwq_gather_base_wb_f32(&vecScGathAddr, 64);
        vecC = vldrwq_gather_base_f32(vecScGathAddr, 16);

        vecTmp0 = vecSum0 + vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64, vecTmp0);

        vecTmp0 = vecSum0 - vecSum1;
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 8, vecTmp0);

        vecTmp0 = MVE_CMPLX_ADD_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 16, vecTmp0);

        vecTmp0 = MVE_CMPLX_SUB_A_ixB(vecDiff0, vecDiff1);
        vecTmp0 = vecTmp0 * onebyfftLen;
        vstrwq_scatter_base_f32(vecScGathAddr, -64 + 24, vecTmp0);

        blkCnt--;
    }

    /*
     * End of last stage process
     */
}
+
+static void arm_cfft_radix4by2_inverse_f32_mve(const arm_cfft_instance_f32 * S,float32_t *pSrc, uint32_t fftLen)
+{
+ float32_t const *pCoefVec;
+ float32_t const *pCoef = S->pTwiddle;
+ float32_t *pIn0, *pIn1;
+ uint32_t n2;
+ float32_t onebyfftLen = arm_inverse_fft_length_f32(fftLen);
+ uint32_t blkCnt;
+ f32x4_t vecIn0, vecIn1, vecSum, vecDiff;
+ f32x4_t vecCmplxTmp, vecTw;
+
+
+ n2 = fftLen >> 1;
+ pIn0 = pSrc;
+ pIn1 = pSrc + fftLen;
+ pCoefVec = pCoef;
+
+ blkCnt = n2 / 2;
+ while (blkCnt > 0U)
+ {
+ vecIn0 = *(f32x4_t *) pIn0;
+ vecIn1 = *(f32x4_t *) pIn1;
+ vecTw = vld1q(pCoefVec);
+ pCoefVec += 4;
+
+ vecSum = vecIn0 + vecIn1;
+ vecDiff = vecIn0 - vecIn1;
+
+ vecCmplxTmp = MVE_CMPLX_MULT_FLT_AxB(vecTw, vecDiff);
+
+ vst1q(pIn0, vecSum);
+ pIn0 += 4;
+ vst1q(pIn1, vecCmplxTmp);
+ pIn1 += 4;
+
+ blkCnt--;
+ }
+
+ _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, n2, onebyfftLen);
+
+ _arm_radix4_butterfly_inverse_f32_mve(S, pSrc + fftLen, n2, onebyfftLen);
+}
+
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Processing function for the floating-point complex FFT.
+ @param[in] S points to an instance of the floating-point CFFT structure
+ @param[in,out] p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return none
+ */
+
+
+void arm_cfft_f32(
+ const arm_cfft_instance_f32 * S,
+ float32_t * pSrc,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ uint32_t fftLen = S->fftLen;
+
+ if (ifftFlag == 1U) {
+
+ switch (fftLen) {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ _arm_radix4_butterfly_inverse_f32_mve(S, pSrc, fftLen, arm_inverse_fft_length_f32(S->fftLen));
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_inverse_f32_mve(S, pSrc, fftLen);
+ break;
+ }
+ } else {
+ switch (fftLen) {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ _arm_radix4_butterfly_f32_mve(S, pSrc, fftLen);
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_f32_mve(S, pSrc, fftLen);
+ break;
+ }
+ }
+
+
+ if (bitReverseFlag)
+ {
+
+ arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
+
+ }
+}
+
+
+#else
+extern void arm_radix8_butterfly_f32(
+ float32_t * pSrc,
+ uint16_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier);
+
+extern void arm_bitreversal_32(
+ uint32_t * pSrc,
+ const uint16_t bitRevLen,
+ const uint16_t * pBitRevTable);
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @defgroup ComplexFFT Complex FFT Functions
+
+ @par
+ The Fast Fourier Transform (FFT) is an efficient algorithm for computing the
+ Discrete Fourier Transform (DFT). The FFT can be orders of magnitude faster
+ than the DFT, especially for long lengths.
+ The algorithms described in this section
+ operate on complex data. A separate set of functions is devoted to handling
+ of real sequences.
+ @par
+ There are separate algorithms for handling floating-point, Q15, and Q31 data
+ types. The algorithms available for each data type are described next.
+ @par
+ The FFT functions operate in-place. That is, the array holding the input data
+ will also be used to hold the corresponding result. The input data is complex
+ and contains 2*fftLen interleaved values as shown below.
+ {real[0], imag[0], real[1], imag[1], ...}
+ The FFT result will be contained in the same array and the frequency domain
+ values will have the same interleaving.
+
+ @par Floating-point
+ The floating-point complex FFT uses a mixed-radix algorithm. Multiple radix-8
+ stages are performed along with a single radix-2 or radix-4 stage, as needed.
+ The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
+ a different twiddle factor table.
+ @par
+ The function uses the standard FFT definition and output values may grow by a
+ factor of fftLen when computing the forward transform. The
+ inverse transform includes a scale of 1/fftLen as part of the
+ calculation and this matches the textbook definition of the inverse FFT.
+ @par
+ For the MVE version, the new arm_cfft_init_f32 initialization function is
+ mandatory. Compilation flags are available to include only the required tables for the
+ needed FFTs. Other FFT versions can continue to be initialized as
+ explained below.
+ @par
+ For not MVE versions, pre-initialized data structures containing twiddle factors
+ and bit reversal tables are provided and defined in arm_const_structs.h. Include
+ this header in your function and then pass one of the constant structures as
+ an argument to arm_cfft_f32. For example:
+ @par
+ arm_cfft_f32(arm_cfft_sR_f32_len64, pSrc, 1, 1)
+ @par
+ computes a 64-point inverse complex FFT including bit reversal.
+ The data structures are treated as constant data and not modified during the
+ calculation. The same data structure can be reused for multiple transforms
+ including mixing forward and inverse transforms.
+ @par
+ Earlier releases of the library provided separate radix-2 and radix-4
+ algorithms that operated on floating-point data. These functions are still
+ provided but are deprecated. The older functions are slower and less general
+ than the new functions.
+ @par
+ An example of initialization of the constants for the arm_cfft_f32 function follows:
+ @code
+ const static arm_cfft_instance_f32 *S;
+ ...
+ switch (length) {
+ case 16:
+ S = &arm_cfft_sR_f32_len16;
+ break;
+ case 32:
+ S = &arm_cfft_sR_f32_len32;
+ break;
+ case 64:
+ S = &arm_cfft_sR_f32_len64;
+ break;
+ case 128:
+ S = &arm_cfft_sR_f32_len128;
+ break;
+ case 256:
+ S = &arm_cfft_sR_f32_len256;
+ break;
+ case 512:
+ S = &arm_cfft_sR_f32_len512;
+ break;
+ case 1024:
+ S = &arm_cfft_sR_f32_len1024;
+ break;
+ case 2048:
+ S = &arm_cfft_sR_f32_len2048;
+ break;
+ case 4096:
+ S = &arm_cfft_sR_f32_len4096;
+ break;
+ }
+ @endcode
+ @par
+ The new arm_cfft_init_f32 can also be used.
+ @par Q15 and Q31
 The fixed-point (Q15 and Q31) complex FFT uses a mixed-radix algorithm. Multiple radix-4
 stages are performed along with a single radix-2 stage, as needed.
+ The algorithm supports lengths of [16, 32, 64, ..., 4096] and each length uses
+ a different twiddle factor table.
+ @par
+ The function uses the standard FFT definition and output values may grow by a
+ factor of fftLen when computing the forward transform. The
+ inverse transform includes a scale of 1/fftLen as part of the
+ calculation and this matches the textbook definition of the inverse FFT.
+ @par
+ Pre-initialized data structures containing twiddle factors and bit reversal
+ tables are provided and defined in arm_const_structs.h. Include
+ this header in your function and then pass one of the constant structures as
+ an argument to arm_cfft_q31. For example:
+ @par
+ arm_cfft_q31(arm_cfft_sR_q31_len64, pSrc, 1, 1)
+ @par
+ computes a 64-point inverse complex FFT including bit reversal.
+ The data structures are treated as constant data and not modified during the
+ calculation. The same data structure can be reused for multiple transforms
+ including mixing forward and inverse transforms.
+ @par
 Earlier releases of the library provided separate radix-2 and radix-4
 algorithms that operated on fixed-point data. These functions are still
+ provided but are deprecated. The older functions are slower and less general
+ than the new functions.
+ @par
+ An example of initialization of the constants for the arm_cfft_q31 function follows:
+ @code
+ const static arm_cfft_instance_q31 *S;
+ ...
+ switch (length) {
+ case 16:
+ S = &arm_cfft_sR_q31_len16;
+ break;
+ case 32:
+ S = &arm_cfft_sR_q31_len32;
+ break;
+ case 64:
+ S = &arm_cfft_sR_q31_len64;
+ break;
+ case 128:
+ S = &arm_cfft_sR_q31_len128;
+ break;
+ case 256:
+ S = &arm_cfft_sR_q31_len256;
+ break;
+ case 512:
+ S = &arm_cfft_sR_q31_len512;
+ break;
+ case 1024:
+ S = &arm_cfft_sR_q31_len1024;
+ break;
+ case 2048:
+ S = &arm_cfft_sR_q31_len2048;
+ break;
+ case 4096:
+ S = &arm_cfft_sR_q31_len4096;
+ break;
+ }
+ @endcode
+
+ */
+
/**
  @brief         Radix-8 CFFT with a radix-2 first stage, for lengths 2 * 8^n.
  @param[in]     S    points to the floating-point CFFT instance (twiddle table)
  @param[in,out] p1   points to the in-place interleaved complex buffer (2*fftLen float32 values)
  @return        none

  Splits the transform into two half-length columns, applies the radix-2
  butterflies with twiddles, then runs the radix-8 kernel on each column.
 */
void arm_cfft_radix8by2_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
{
    uint32_t L = S->fftLen;
    float32_t * pCol1, * pCol2, * pMid1, * pMid2;
    float32_t * p2 = p1 + L;                       /* second half (L floats = L/2 complex) */
    const float32_t * tw = (float32_t *) S->pTwiddle;
    float32_t t1[4], t2[4], t3[4], t4[4], twR, twI;
    float32_t m0, m1, m2, m3;
    uint32_t l;

    pCol1 = p1;
    pCol2 = p2;

    /* Define new length */
    L >>= 1;

    /* Initialize mid pointers (quarter points of the original buffer) */
    pMid1 = p1 + L;
    pMid2 = p2 + L;

    /* do two dot Fourier transform: each pass handles 2 complex samples from
       the top quarter and 2 from the mid quarter, reusing the same twiddle
       reads via vertical symmetry */
    for (l = L >> 2; l > 0; l-- )
    {
        t1[0] = p1[0];
        t1[1] = p1[1];
        t1[2] = p1[2];
        t1[3] = p1[3];

        t2[0] = p2[0];
        t2[1] = p2[1];
        t2[2] = p2[2];
        t2[3] = p2[3];

        t3[0] = pMid1[0];
        t3[1] = pMid1[1];
        t3[2] = pMid1[2];
        t3[3] = pMid1[3];

        t4[0] = pMid2[0];
        t4[1] = pMid2[1];
        t4[2] = pMid2[2];
        t4[3] = pMid2[3];

        *p1++ = t1[0] + t2[0];
        *p1++ = t1[1] + t2[1];
        *p1++ = t1[2] + t2[2];
        *p1++ = t1[3] + t2[3];          /* col 1 */

        t2[0] = t1[0] - t2[0];
        t2[1] = t1[1] - t2[1];
        t2[2] = t1[2] - t2[2];
        t2[3] = t1[3] - t2[3];          /* for col 2 */

        *pMid1++ = t3[0] + t4[0];
        *pMid1++ = t3[1] + t4[1];
        *pMid1++ = t3[2] + t4[2];
        *pMid1++ = t3[3] + t4[3];       /* col 1 */

        t4[0] = t4[0] - t3[0];
        t4[1] = t4[1] - t3[1];
        t4[2] = t4[2] - t3[2];
        t4[3] = t4[3] - t3[3];          /* for col 2 */

        twR = *tw++;
        twI = *tw++;

        /* multiply by twiddle factors (applied conjugated: W = Tr - i*Ti) */
        m0 = t2[0] * twR;
        m1 = t2[1] * twI;
        m2 = t2[1] * twR;
        m3 = t2[0] * twI;

        /* Re' = Re * Tr + Im * Ti */
        *p2++ = m0 + m1;
        /* Im' = Im * Tr - Re * Ti */
        *p2++ = m2 - m3;

        /* use vertical symmetry */
        /* 0.9988 - 0.0491i <==> -0.0491 - 0.9988i */
        m0 = t4[0] * twI;
        m1 = t4[1] * twR;
        m2 = t4[1] * twI;
        m3 = t4[0] * twR;

        *pMid2++ = m0 - m1;
        *pMid2++ = m2 + m3;

        twR = *tw++;
        twI = *tw++;

        /* second complex sample of this pass, same scheme */
        m0 = t2[2] * twR;
        m1 = t2[3] * twI;
        m2 = t2[3] * twR;
        m3 = t2[2] * twI;

        *p2++ = m0 + m1;
        *p2++ = m2 - m3;

        m0 = t4[2] * twI;
        m1 = t4[3] * twR;
        m2 = t4[3] * twI;
        m3 = t4[2] * twR;

        *pMid2++ = m0 - m1;
        *pMid2++ = m2 + m3;
    }

    /* first col: radix-8 kernel with twiddle stride 2 */
    arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 2U);

    /* second col */
    arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 2U);
}
+
/**
  @brief         Radix-8 CFFT with a radix-4 first stage, for lengths 4 * 8^n.
  @param[in]     S    points to the floating-point CFFT instance (twiddle table)
  @param[in,out] p1   points to the in-place interleaved complex buffer (2*fftLen float32 values)
  @return        none

  Splits the transform into four quarter-length columns, applies the radix-4
  butterflies with twiddles (walking the first half forward and the second half
  backward to exploit twiddle symmetry), then runs the radix-8 kernel on each
  column. Twiddles are applied conjugated, as in arm_cfft_radix8by2_f32.
 */
void arm_cfft_radix8by4_f32 (arm_cfft_instance_f32 * S, float32_t * p1)
{
    uint32_t L = S->fftLen >> 1;
    float32_t * pCol1, *pCol2, *pCol3, *pCol4, *pEnd1, *pEnd2, *pEnd3, *pEnd4;
    const float32_t *tw2, *tw3, *tw4;
    float32_t * p2 = p1 + L;
    float32_t * p3 = p2 + L;
    float32_t * p4 = p3 + L;
    float32_t t2[4], t3[4], t4[4], twR, twI;
    float32_t p1ap3_0, p1sp3_0, p1ap3_1, p1sp3_1;
    float32_t m0, m1, m2, m3;
    uint32_t l, twMod2, twMod3, twMod4;

    pCol1 = p1;         /* points to real values by default */
    pCol2 = p2;
    pCol3 = p3;
    pCol4 = p4;
    pEnd1 = p2 - 1;     /* points to imaginary values by default */
    pEnd2 = p3 - 1;
    pEnd3 = p4 - 1;
    pEnd4 = pEnd3 + L;

    tw2 = tw3 = tw4 = (float32_t *) S->pTwiddle;

    L >>= 1;

    /* do four dot Fourier transform */

    /* twiddle strides for columns 2, 3, 4 (W^k, W^2k, W^3k) */
    twMod2 = 2;
    twMod3 = 4;
    twMod4 = 6;

    /* TOP: first sample, twiddles are all 1 */
    p1ap3_0 = p1[0] + p3[0];
    p1sp3_0 = p1[0] - p3[0];
    p1ap3_1 = p1[1] + p3[1];
    p1sp3_1 = p1[1] - p3[1];

    /* col 2 */
    t2[0] = p1sp3_0 + p2[1] - p4[1];
    t2[1] = p1sp3_1 - p2[0] + p4[0];
    /* col 3 */
    t3[0] = p1ap3_0 - p2[0] - p4[0];
    t3[1] = p1ap3_1 - p2[1] - p4[1];
    /* col 4 */
    t4[0] = p1sp3_0 - p2[1] + p4[1];
    t4[1] = p1sp3_1 + p2[0] - p4[0];
    /* col 1 */
    *p1++ = p1ap3_0 + p2[0] + p4[0];
    *p1++ = p1ap3_1 + p2[1] + p4[1];

    /* Twiddle factors are ones */
    *p2++ = t2[0];
    *p2++ = t2[1];
    *p3++ = t3[0];
    *p3++ = t3[1];
    *p4++ = t4[0];
    *p4++ = t4[1];

    tw2 += twMod2;
    tw3 += twMod3;
    tw4 += twMod4;

    /* middle samples: process one from the top half and its mirror from the
       bottom half per pass, sharing the same twiddle reads */
    for (l = (L - 2) >> 1; l > 0; l-- )
    {
        /* TOP */
        p1ap3_0 = p1[0] + p3[0];
        p1sp3_0 = p1[0] - p3[0];
        p1ap3_1 = p1[1] + p3[1];
        p1sp3_1 = p1[1] - p3[1];
        /* col 2 */
        t2[0] = p1sp3_0 + p2[1] - p4[1];
        t2[1] = p1sp3_1 - p2[0] + p4[0];
        /* col 3 */
        t3[0] = p1ap3_0 - p2[0] - p4[0];
        t3[1] = p1ap3_1 - p2[1] - p4[1];
        /* col 4 */
        t4[0] = p1sp3_0 - p2[1] + p4[1];
        t4[1] = p1sp3_1 + p2[0] - p4[0];
        /* col 1 - top */
        *p1++ = p1ap3_0 + p2[0] + p4[0];
        *p1++ = p1ap3_1 + p2[1] + p4[1];

        /* BOTTOM (mirror sample; pEnd* walk backwards, [-1]=real, [0]=imag) */
        p1ap3_1 = pEnd1[-1] + pEnd3[-1];
        p1sp3_1 = pEnd1[-1] - pEnd3[-1];
        p1ap3_0 = pEnd1[ 0] + pEnd3[0];
        p1sp3_0 = pEnd1[ 0] - pEnd3[0];
        /* col 2 */
        t2[2] = pEnd2[0] - pEnd4[0] + p1sp3_1;
        t2[3] = pEnd1[0] - pEnd3[0] - pEnd2[-1] + pEnd4[-1];
        /* col 3 */
        t3[2] = p1ap3_1 - pEnd2[-1] - pEnd4[-1];
        t3[3] = p1ap3_0 - pEnd2[ 0] - pEnd4[ 0];
        /* col 4 */
        t4[2] = pEnd2[ 0] - pEnd4[ 0] - p1sp3_1;
        t4[3] = pEnd4[-1] - pEnd2[-1] - p1sp3_0;
        /* col 1 - Bottom */
        *pEnd1-- = p1ap3_0 + pEnd2[ 0] + pEnd4[ 0];
        *pEnd1-- = p1ap3_1 + pEnd2[-1] + pEnd4[-1];

        /* COL 2 */
        /* read twiddle factors */
        twR = *tw2++;
        twI = *tw2++;
        /* multiply by twiddle factors */
        /* NOTE(review): the twiddle is applied conjugated here:
           Re' = a*c + b*d, Im' = b*c - a*d for Z = a+ib, W = c+id */

        /* Top */
        m0 = t2[0] * twR;
        m1 = t2[1] * twI;
        m2 = t2[1] * twR;
        m3 = t2[0] * twI;

        *p2++ = m0 + m1;
        *p2++ = m2 - m3;
        /* use vertical symmetry col 2 */
        /* 0.9997 - 0.0245i <==> 0.0245 - 0.9997i */
        /* Bottom */
        m0 = t2[3] * twI;
        m1 = t2[2] * twR;
        m2 = t2[2] * twI;
        m3 = t2[3] * twR;

        *pEnd2-- = m0 - m1;
        *pEnd2-- = m2 + m3;

        /* COL 3 */
        twR = tw3[0];
        twI = tw3[1];
        tw3 += twMod3;
        /* Top */
        m0 = t3[0] * twR;
        m1 = t3[1] * twI;
        m2 = t3[1] * twR;
        m3 = t3[0] * twI;

        *p3++ = m0 + m1;
        *p3++ = m2 - m3;
        /* use vertical symmetry col 3 */
        /* 0.9988 - 0.0491i <==> -0.9988 - 0.0491i */
        /* Bottom */
        m0 = -t3[3] * twR;
        m1 = t3[2] * twI;
        m2 = t3[2] * twR;
        m3 = t3[3] * twI;

        *pEnd3-- = m0 - m1;
        *pEnd3-- = m3 - m2;

        /* COL 4 */
        twR = tw4[0];
        twI = tw4[1];
        tw4 += twMod4;
        /* Top */
        m0 = t4[0] * twR;
        m1 = t4[1] * twI;
        m2 = t4[1] * twR;
        m3 = t4[0] * twI;

        *p4++ = m0 + m1;
        *p4++ = m2 - m3;
        /* use vertical symmetry col 4 */
        /* 0.9973 - 0.0736i <==> -0.0736 + 0.9973i */
        /* Bottom */
        m0 = t4[3] * twI;
        m1 = t4[2] * twR;
        m2 = t4[2] * twI;
        m3 = t4[3] * twR;

        *pEnd4-- = m0 - m1;
        *pEnd4-- = m2 + m3;
    }

    /* MIDDLE */
    /* Twiddle factors are */
    /*  1.0000  0.7071-0.7071i  -1.0000i  -0.7071-0.7071i */
    p1ap3_0 = p1[0] + p3[0];
    p1sp3_0 = p1[0] - p3[0];
    p1ap3_1 = p1[1] + p3[1];
    p1sp3_1 = p1[1] - p3[1];

    /* col 2 */
    t2[0] = p1sp3_0 + p2[1] - p4[1];
    t2[1] = p1sp3_1 - p2[0] + p4[0];
    /* col 3 */
    t3[0] = p1ap3_0 - p2[0] - p4[0];
    t3[1] = p1ap3_1 - p2[1] - p4[1];
    /* col 4 */
    t4[0] = p1sp3_0 - p2[1] + p4[1];
    t4[1] = p1sp3_1 + p2[0] - p4[0];
    /* col 1 - Top */
    *p1++ = p1ap3_0 + p2[0] + p4[0];
    *p1++ = p1ap3_1 + p2[1] + p4[1];

    /* COL 2 */
    twR = tw2[0];
    twI = tw2[1];

    m0 = t2[0] * twR;
    m1 = t2[1] * twI;
    m2 = t2[1] * twR;
    m3 = t2[0] * twI;

    *p2++ = m0 + m1;
    *p2++ = m2 - m3;
    /* COL 3 */
    twR = tw3[0];
    twI = tw3[1];

    m0 = t3[0] * twR;
    m1 = t3[1] * twI;
    m2 = t3[1] * twR;
    m3 = t3[0] * twI;

    *p3++ = m0 + m1;
    *p3++ = m2 - m3;
    /* COL 4 */
    twR = tw4[0];
    twI = tw4[1];

    m0 = t4[0] * twR;
    m1 = t4[1] * twI;
    m2 = t4[1] * twR;
    m3 = t4[0] * twI;

    *p4++ = m0 + m1;
    *p4++ = m2 - m3;

    /* first col: radix-8 kernel with twiddle stride 4 */
    arm_radix8_butterfly_f32 (pCol1, L, (float32_t *) S->pTwiddle, 4U);

    /* second col */
    arm_radix8_butterfly_f32 (pCol2, L, (float32_t *) S->pTwiddle, 4U);

    /* third col */
    arm_radix8_butterfly_f32 (pCol3, L, (float32_t *) S->pTwiddle, 4U);

    /* fourth col */
    arm_radix8_butterfly_f32 (pCol4, L, (float32_t *) S->pTwiddle, 4U);
}
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Processing function for the floating-point complex FFT.
+ @param[in] S points to an instance of the floating-point CFFT structure
+ @param[in,out] p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return none
+ */
+
+void arm_cfft_f32(
+ const arm_cfft_instance_f32 * S,
+ float32_t * p1,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ uint32_t L = S->fftLen, l;
+ float32_t invL, * pSrc;
+
+ if (ifftFlag == 1U)
+ {
+ /* Conjugate input data */
+ pSrc = p1 + 1;
+ for (l = 0; l < L; l++)
+ {
+ *pSrc = -*pSrc;
+ pSrc += 2;
+ }
+ }
+
+ switch (L)
+ {
+ case 16:
+ case 128:
+ case 1024:
+ arm_cfft_radix8by2_f32 ( (arm_cfft_instance_f32 *) S, p1);
+ break;
+ case 32:
+ case 256:
+ case 2048:
+ arm_cfft_radix8by4_f32 ( (arm_cfft_instance_f32 *) S, p1);
+ break;
+ case 64:
+ case 512:
+ case 4096:
+ arm_radix8_butterfly_f32 ( p1, L, (float32_t *) S->pTwiddle, 1);
+ break;
+ }
+
+ if ( bitReverseFlag )
+ arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);
+
+ if (ifftFlag == 1U)
+ {
+ invL = 1.0f / (float32_t)L;
+
+ /* Conjugate and scale output data */
+ pSrc = p1;
+ for (l= 0; l < L; l++)
+ {
+ *pSrc++ *= invL ;
+ *pSrc = -(*pSrc) * invL;
+ pSrc++;
+ }
+ }
+}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of ComplexFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f64.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f64.c
new file mode 100644
index 0000000000..d686eb8c49
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_f64.c
@@ -0,0 +1,318 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_f64.c
+ * Description: Combined Radix Decimation in Frequency CFFT Double Precision Floating point processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+
+
+extern void arm_radix4_butterfly_f64(
+ float64_t * pSrc,
+ uint16_t fftLen,
+ const float64_t * pCoef,
+ uint16_t twidCoefModifier);
+
+extern void arm_bitreversal_64(
+ uint64_t * pSrc,
+ const uint16_t bitRevLen,
+ const uint16_t * pBitRevTable);
+
+/**
+* @} end of ComplexFFT group
+*/
+
+/* ----------------------------------------------------------------------
+ * Internal helper function used by the FFTs
+ * ---------------------------------------------------------------------- */
+
+/*
+* @brief Core function for the Double Precision floating-point CFFT butterfly process.
+* @param[in, out] *pSrc points to the in-place buffer of F64 data type.
+* @param[in] fftLen length of the FFT.
+* @param[in] *pCoef points to the twiddle coefficient buffer.
+* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+* @return none.
+*/
+
void arm_radix4_butterfly_f64(
 float64_t * pSrc,
 uint16_t fftLen,
 const float64_t * pCoef,
 uint16_t twidCoefModifier)
{

 float64_t co1, co2, co3, si1, si2, si3;
 uint32_t ia1, ia2, ia3;
 uint32_t i0, i1, i2, i3;
 uint32_t n1, n2, j, k;

 float64_t t1, t2, r1, r2, s1, s2;


 /* Initializations for the fft calculation */
 n2 = fftLen;
 n1 = n2;
 /* One pass per radix-4 stage: n1 is the butterfly span of the current
    stage and n2 = n1/4 is the distance between its four inputs.
    NOTE(review): the k >>= 2 loop assumes fftLen is a power of 4 --
    confirm at the call sites (true for the lengths dispatched here). */
 for (k = fftLen; k > 1U; k >>= 2U)
 {
 /* Initializations for the fft calculation */
 n1 = n2;
 n2 >>= 2U;
 ia1 = 0U;

 /* FFT Calculation */
 /* j walks the n2 twiddle groups of this stage; each group uses the
    three twiddles W^ia1, W^2*ia1, W^3*ia1 for the second, third and
    fourth butterfly outputs. */
 j = 0;
 do
 {
 /* index calculation for the coefficients */
 ia2 = ia1 + ia1;
 ia3 = ia2 + ia1;
 co1 = pCoef[ia1 * 2U];
 si1 = pCoef[(ia1 * 2U) + 1U];
 co2 = pCoef[ia2 * 2U];
 si2 = pCoef[(ia2 * 2U) + 1U];
 co3 = pCoef[ia3 * 2U];
 si3 = pCoef[(ia3 * 2U) + 1U];

 /* Twiddle coefficients index modifier */
 ia1 = ia1 + twidCoefModifier;

 /* i0 visits every butterfly of this twiddle group, stepping one full
    butterfly span (n1) at a time; the transform is done in place. */
 i0 = j;
 do
 {
 /* index calculation for the input as, */
 /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
 i1 = i0 + n2;
 i2 = i1 + n2;
 i3 = i2 + n2;

 /* xa + xc */
 r1 = pSrc[(2U * i0)] + pSrc[(2U * i2)];

 /* xa - xc */
 r2 = pSrc[(2U * i0)] - pSrc[(2U * i2)];

 /* ya + yc */
 s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];

 /* ya - yc */
 s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];

 /* xb + xd */
 t1 = pSrc[2U * i1] + pSrc[2U * i3];

 /* xa' = xa + xb + xc + xd */
 pSrc[2U * i0] = r1 + t1;

 /* xa + xc -(xb + xd) */
 r1 = r1 - t1;

 /* yb + yd */
 t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];

 /* ya' = ya + yb + yc + yd */
 pSrc[(2U * i0) + 1U] = s1 + t2;

 /* (ya + yc) - (yb + yd) */
 s1 = s1 - t2;

 /* (yb - yd) */
 t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];

 /* (xb - xd) */
 t2 = pSrc[2U * i1] - pSrc[2U * i3];

 /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
 pSrc[2U * i1] = (r1 * co2) + (s1 * si2);

 /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
 pSrc[(2U * i1) + 1U] = (s1 * co2) - (r1 * si2);

 /* (xa - xc) + (yb - yd) */
 r1 = r2 + t1;

 /* (xa - xc) - (yb - yd) */
 r2 = r2 - t1;

 /* (ya - yc) - (xb - xd) */
 s1 = s2 - t2;

 /* (ya - yc) + (xb - xd) */
 s2 = s2 + t2;

 /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
 pSrc[2U * i2] = (r1 * co1) + (s1 * si1);

 /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
 pSrc[(2U * i2) + 1U] = (s1 * co1) - (r1 * si1);

 /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
 pSrc[2U * i3] = (r2 * co3) + (s2 * si3);

 /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
 pSrc[(2U * i3) + 1U] = (s2 * co3) - (r2 * si3);

 i0 += n1;
 } while ( i0 < fftLen);
 j++;
 } while (j <= (n2 - 1U));
 /* The next (shorter) stage uses every 4th twiddle of this one. */
 twidCoefModifier <<= 2U;
 }
}
+
+/*
+* @brief Core function for the Double Precision floating-point CFFT butterfly process.
+* @param[in, out] *pSrc points to the in-place buffer of F64 data type.
+* @param[in] fftLen length of the FFT.
+* @param[in] *pCoef points to the twiddle coefficient buffer.
+* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+* @return none.
+*/
+
+void arm_cfft_radix4by2_f64(
+ float64_t * pSrc,
+ uint32_t fftLen,
+ const float64_t * pCoef)
+{
+ uint32_t i, l;
+ uint32_t n2, ia;
+ float64_t xt, yt, cosVal, sinVal;
+ float64_t p0, p1,p2,p3,a0,a1;
+
+ n2 = fftLen >> 1;
+ ia = 0;
+ for (i = 0; i < n2; i++)
+ {
+ cosVal = pCoef[2*ia];
+ sinVal = pCoef[2*ia + 1];
+ ia++;
+
+ l = i + n2;
+
+ /* Butterfly implementation */
+ a0 = pSrc[2 * i] + pSrc[2 * l];
+ xt = pSrc[2 * i] - pSrc[2 * l];
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+ p0 = xt * cosVal;
+ p1 = yt * sinVal;
+ p2 = yt * cosVal;
+ p3 = xt * sinVal;
+
+ pSrc[2 * i] = a0;
+ pSrc[2 * i + 1] = a1;
+
+ pSrc[2 * l] = p0 + p1;
+ pSrc[2 * l + 1] = p2 - p3;
+
+ }
+
+ // first col
+ arm_radix4_butterfly_f64( pSrc, n2, (float64_t*)pCoef, 2U);
+ // second col
+ arm_radix4_butterfly_f64( pSrc + fftLen, n2, (float64_t*)pCoef, 2U);
+
+}
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Processing function for the Double Precision floating-point complex FFT.
+ @param[in] S points to an instance of the Double Precision floating-point CFFT structure
+ @param[in,out] p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return none
+ */
+
+void arm_cfft_f64(
+ const arm_cfft_instance_f64 * S,
+ float64_t * p1,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ uint32_t L = S->fftLen, l;
+ float64_t invL, * pSrc;
+
+ if (ifftFlag == 1U)
+ {
+ /* Conjugate input data */
+ pSrc = p1 + 1;
+ for(l=0; l<L; l++)
+ {
+ *pSrc = -*pSrc;
+ pSrc += 2;
+ }
+ }
+
+ switch (L)
+ {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ arm_radix4_butterfly_f64( p1, L, (float64_t*)S->pTwiddle, 1U);
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_f64 ( p1, L, (float64_t*)S->pTwiddle);
+ break;
+
+ }
+
+ if ( bitReverseFlag )
+ arm_bitreversal_64((uint64_t*)p1, S->bitRevLength,S->pBitRevTable);
+
+ if (ifftFlag == 1U)
+ {
+ invL = 1.0 / (float64_t)L;
+ /* Conjugate and scale output data */
+ pSrc = p1;
+ for(l=0; l<L; l++)
+ {
+ *pSrc++ *= invL;
+ *pSrc = -(*pSrc) * invL;
+ pSrc++;
+ }
+ }
+}
+
+/**
+ @} end of ComplexFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f16.c
new file mode 100644
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f16.c
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_init_f16.c
+ * Description: Initialization function for cfft f16 instance
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define FFTINIT(EXT,SIZE) \
+ S->bitRevLength = arm_cfft_sR_##EXT##_len##SIZE.bitRevLength; \
+ S->pBitRevTable = arm_cfft_sR_##EXT##_len##SIZE.pBitRevTable; \
+ S->pTwiddle = arm_cfft_sR_##EXT##_len##SIZE.pTwiddle;
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Initialization function for the cfft f16 function
+ @param[in,out] S points to an instance of the floating-point CFFT structure
+ @param[in] fftLen fft length (number of complex samples)
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+
+ @par Use of this function is mandatory only for the MVE version of the FFT.
+ Other versions can still initialize directly the data structure using
+ variables declared in arm_const_structs.h
+ */
+
+#include "dsp/transform_functions_f16.h"
+#include "arm_common_tables_f16.h"
+#include "arm_const_structs_f16.h"
+
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_vec_fft.h"
+#include "arm_mve_tables_f16.h"
+
/**
  Installs the MVE "rearranged" twiddle tables matching the FFT length.
  twidCoefModifier is 1 when fftLen is a power of 4 and 2 when it is
  2 * (power of 4), so the switch key is the length of the embedded
  radix-4 transform. Returns ARM_MATH_ARGUMENT_ERROR when no table for
  that length was compiled in.
 */
arm_status arm_cfft_radix4by2_rearrange_twiddles_f16(arm_cfft_instance_f16 *S, int twidCoefModifier)
{

 /* Key: fftLen for a pure radix-4 length, fftLen/2 after a radix-2 split. */
 switch (S->fftLen >> (twidCoefModifier - 1)) {

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
 || defined(ARM_TABLE_TWIDDLECOEF_F16_4096)
 case 4096U:
 S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_4096_f16;
 S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_4096_f16;

 S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_4096_f16;
 S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_4096_f16;

 S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_4096_f16;
 S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_4096_f16;
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
 || defined(ARM_TABLE_TWIDDLECOEF_F16_1024) || defined(ARM_TABLE_TWIDDLECOEF_F16_2048)
 case 1024U:
 S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_1024_f16;
 S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_1024_f16;

 S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_1024_f16;
 S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_1024_f16;

 S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_1024_f16;
 S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_1024_f16;
 break;
 #endif

 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
 || defined(ARM_TABLE_TWIDDLECOEF_F16_256) || defined(ARM_TABLE_TWIDDLECOEF_F16_512)
 case 256U:
 S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_256_f16;
 S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_256_f16;

 S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_256_f16;
 S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_256_f16;

 S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_256_f16;
 S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_256_f16;

 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
 || defined(ARM_TABLE_TWIDDLECOEF_F16_64) || defined(ARM_TABLE_TWIDDLECOEF_F16_128)
 case 64U:
 S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_64_f16;
 S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_64_f16;

 S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_64_f16;
 S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_64_f16;

 S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_64_f16;
 S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_64_f16;
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
 || defined(ARM_TABLE_TWIDDLECOEF_F16_16) || defined(ARM_TABLE_TWIDDLECOEF_F16_32)
 case 16U:
 S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_16_f16;
 S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_16_f16;

 S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_16_f16;
 S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_16_f16;

 S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_16_f16;
 S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_16_f16;
 break;
#endif

 default:
 return(ARM_MATH_ARGUMENT_ERROR);
 break;
 /* invalid sizes already filtered */
 }

 return(ARM_MATH_SUCCESS);

}
+
/* MVE (Helium) variant of the f16 CFFT initialization: besides the bit
   reversal table and classic twiddle table, each supported length also
   needs the rearranged twiddle tables installed via
   arm_cfft_radix4by2_rearrange_twiddles_f16() -- modifier 1 for
   power-of-4 lengths, 2 for 2*(power-of-4) lengths. */
arm_status arm_cfft_init_f16(
 arm_cfft_instance_f16 * S,
 uint16_t fftLen)
{

 /* Initialise the default arm status */
 arm_status status = ARM_MATH_SUCCESS;

 /* Initialise the FFT length */
 S->fftLen = fftLen;

 /* Initialise the Twiddle coefficient pointer */
 S->pTwiddle = NULL;


 /* Initializations of Instance structure depending on the FFT length */
 switch (S->fftLen) {
 /* Initializations of structure parameters for 4096 point FFT */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_F16_4096))
 case 4096U:
 /* Initialise the bit reversal table modifier */
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_4096;
 S->pTwiddle = (float16_t *)twiddleCoefF16_4096;
 status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_F16_2048))
 /* Initializations of structure parameters for 2048 point FFT */
 case 2048U:
 /* Initialise the bit reversal table modifier */
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_2048;
 S->pTwiddle = (float16_t *)twiddleCoefF16_2048;
 status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_F16_1024))
 /* Initializations of structure parameters for 1024 point FFT */
 case 1024U:
 /* Initialise the bit reversal table modifier */
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_1024;
 S->pTwiddle = (float16_t *)twiddleCoefF16_1024;
 status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_F16_512))
 /* Initializations of structure parameters for 512 point FFT */
 case 512U:
 /* Initialise the bit reversal table modifier */
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_512;
 S->pTwiddle = (float16_t *)twiddleCoefF16_512;
 status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_F16_256))
 case 256U:
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_256;
 S->pTwiddle = (float16_t *)twiddleCoefF16_256;
 status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_F16_128))
 case 128U:
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_128;
 S->pTwiddle = (float16_t *)twiddleCoefF16_128;
 status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_F16_64))
 case 64U:
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_64;
 S->pTwiddle = (float16_t *)twiddleCoefF16_64;
 status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_F16_32))
 case 32U:
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_32;
 S->pTwiddle = (float16_t *)twiddleCoefF16_32;
 status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 2);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_F16_16))
 case 16U:
 /* Initializations of structure parameters for 16 point FFT */
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_16;
 S->pTwiddle = (float16_t *)twiddleCoefF16_16;
 status=arm_cfft_radix4by2_rearrange_twiddles_f16(S, 1);
 break;
#endif

 default:
 /* Reporting argument error if fftSize is not valid value */
 status = ARM_MATH_ARGUMENT_ERROR;
 break;
 }


 return (status);
}
+#else
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
/* Scalar (non-MVE) variant of the f16 CFFT initialization: the FFTINIT
   macro copies the table pointers from the matching constant instance
   structure (arm_cfft_sR_f16_len<N>) declared in arm_const_structs_f16.h. */
arm_status arm_cfft_init_f16(
 arm_cfft_instance_f16 * S,
 uint16_t fftLen)
{
 /* Initialise the default arm status */
 arm_status status = ARM_MATH_SUCCESS;

 /* Initialise the FFT length */
 S->fftLen = fftLen;

 /* Initialise the Twiddle coefficient pointer */
 S->pTwiddle = NULL;


 /* Initializations of Instance structure depending on the FFT length */
 switch (S->fftLen) {
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
 /* Initializations of structure parameters for 4096 point FFT */
 case 4096U:
 /* Initialise the bit reversal table modifier */
 FFTINIT(f16,4096);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
 /* Initializations of structure parameters for 2048 point FFT */
 case 2048U:
 /* Initialise the bit reversal table modifier */
 FFTINIT(f16,2048);

 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
 /* Initializations of structure parameters for 1024 point FFT */
 case 1024U:
 /* Initialise the bit reversal table modifier */
 FFTINIT(f16,1024);

 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
 /* Initializations of structure parameters for 512 point FFT */
 case 512U:
 /* Initialise the bit reversal table modifier */
 FFTINIT(f16,512);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
 case 256U:
 FFTINIT(f16,256);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
 case 128U:
 FFTINIT(f16,128);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
 case 64U:
 FFTINIT(f16,64);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
 case 32U:
 FFTINIT(f16,32);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
 case 16U:
 /* Initializations of structure parameters for 16 point FFT */
 FFTINIT(f16,16);
 break;
#endif

 default:
 /* Reporting argument error if fftSize is not valid value */
 status = ARM_MATH_ARGUMENT_ERROR;
 break;
 }


 return (status);
}
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of ComplexFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f32.c
new file mode 100644
index 0000000000..bd35ef302e
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f32.c
@@ -0,0 +1,358 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_init_f32.c
+ * Description: Initialization function for cfft f32 instance
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define FFTINIT(EXT,SIZE) \
+ S->bitRevLength = arm_cfft_sR_##EXT##_len##SIZE.bitRevLength; \
+ S->pBitRevTable = arm_cfft_sR_##EXT##_len##SIZE.pBitRevTable; \
+ S->pTwiddle = arm_cfft_sR_##EXT##_len##SIZE.pTwiddle;
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Initialization function for the cfft f32 function
+ @param[in,out] S points to an instance of the floating-point CFFT structure
+ @param[in] fftLen fft length (number of complex samples)
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+
+ @par Use of this function is mandatory only for the MVE version of the FFT.
+ Other versions can still initialize directly the data structure using
+ variables declared in arm_const_structs.h
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+#include "arm_const_structs.h"
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_vec_fft.h"
+#include "arm_mve_tables.h"
+
/**
  Installs the MVE "rearranged" twiddle tables matching the FFT length.
  twidCoefModifier is 1 when fftLen is a power of 4 and 2 when it is
  2 * (power of 4), so the switch key is the length of the embedded
  radix-4 transform. Returns ARM_MATH_ARGUMENT_ERROR when no table for
  that length was compiled in.
 */
arm_status arm_cfft_radix4by2_rearrange_twiddles_f32(arm_cfft_instance_f32 *S, int twidCoefModifier)
{

 /* Key: fftLen for a pure radix-4 length, fftLen/2 after a radix-2 split. */
 switch (S->fftLen >> (twidCoefModifier - 1)) {

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
 || defined(ARM_TABLE_TWIDDLECOEF_F32_4096)
 case 4096U:
 S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_4096_f32;
 S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_4096_f32;

 S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_4096_f32;
 S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_4096_f32;

 S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_4096_f32;
 S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_4096_f32;
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
 || defined(ARM_TABLE_TWIDDLECOEF_F32_1024) || defined(ARM_TABLE_TWIDDLECOEF_F32_2048)
 case 1024U:
 S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_1024_f32;
 S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_1024_f32;

 S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_1024_f32;
 S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_1024_f32;

 S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_1024_f32;
 S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_1024_f32;
 break;
 #endif

 #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
 || defined(ARM_TABLE_TWIDDLECOEF_F32_256) || defined(ARM_TABLE_TWIDDLECOEF_F32_512)
 case 256U:
 S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_256_f32;
 S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_256_f32;

 S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_256_f32;
 S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_256_f32;

 S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_256_f32;
 S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_256_f32;

 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
 || defined(ARM_TABLE_TWIDDLECOEF_F32_64) || defined(ARM_TABLE_TWIDDLECOEF_F32_128)
 case 64U:
 S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_64_f32;
 S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_64_f32;

 S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_64_f32;
 S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_64_f32;

 S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_64_f32;
 S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_64_f32;
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) \
 || defined(ARM_TABLE_TWIDDLECOEF_F32_16) || defined(ARM_TABLE_TWIDDLECOEF_F32_32)
 case 16U:
 S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_16_f32;
 S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_16_f32;

 S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_16_f32;
 S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_16_f32;

 S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_16_f32;
 S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_16_f32;
 break;
#endif

 default:
 return(ARM_MATH_ARGUMENT_ERROR);
 break;
 /* invalid sizes already filtered */
 }

 return(ARM_MATH_SUCCESS);

}
+
/* MVE (Helium) variant of the f32 CFFT initialization: besides the bit
   reversal table and classic twiddle table, each supported length also
   needs the rearranged twiddle tables installed via
   arm_cfft_radix4by2_rearrange_twiddles_f32() -- modifier 1 for
   power-of-4 lengths, 2 for 2*(power-of-4) lengths. */
arm_status arm_cfft_init_f32(
 arm_cfft_instance_f32 * S,
 uint16_t fftLen)
{

 /* Initialise the default arm status */
 arm_status status = ARM_MATH_SUCCESS;

 /* Initialise the FFT length */
 S->fftLen = fftLen;

 /* Initialise the Twiddle coefficient pointer */
 S->pTwiddle = NULL;


 /* Initializations of Instance structure depending on the FFT length */
 switch (S->fftLen) {
 /* Initializations of structure parameters for 4096 point FFT */
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_F32_4096))
 case 4096U:
 /* Initialise the bit reversal table modifier */
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_4096;
 S->pTwiddle = (float32_t *)twiddleCoef_4096;
 status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 1);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_F32_2048))
 /* Initializations of structure parameters for 2048 point FFT */
 case 2048U:
 /* Initialise the bit reversal table modifier */
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_2048;
 S->pTwiddle = (float32_t *)twiddleCoef_2048;
 status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 2);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_F32_1024))
 /* Initializations of structure parameters for 1024 point FFT */
 case 1024U:
 /* Initialise the bit reversal table modifier */
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_1024;
 S->pTwiddle = (float32_t *)twiddleCoef_1024;
 status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 1);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_F32_512))
 /* Initializations of structure parameters for 512 point FFT */
 case 512U:
 /* Initialise the bit reversal table modifier */
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_512;
 S->pTwiddle = (float32_t *)twiddleCoef_512;
 status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 2);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_F32_256))
 case 256U:
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_256;
 S->pTwiddle = (float32_t *)twiddleCoef_256;
 status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 1);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_F32_128))
 case 128U:
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_128;
 S->pTwiddle = (float32_t *)twiddleCoef_128;
 status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 2);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_F32_64))
 case 64U:
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_64;
 S->pTwiddle = (float32_t *)twiddleCoef_64;
 status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 1);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_F32_32))
 case 32U:
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_32;
 S->pTwiddle = (float32_t *)twiddleCoef_32;
 status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 2);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_F32_16))
 case 16U:
 /* Initializations of structure parameters for 16 point FFT */
 S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH;
 S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_16;
 S->pTwiddle = (float32_t *)twiddleCoef_16;
 status=arm_cfft_radix4by2_rearrange_twiddles_f32(S, 1);
 break;
#endif

 default:
 /* Reporting argument error if fftSize is not valid value */
 status = ARM_MATH_ARGUMENT_ERROR;
 break;
 }


 return (status);
}
+#else
/* Scalar (non-MVE) variant of the f32 CFFT initialization: the FFTINIT
   macro copies the table pointers from the matching constant instance
   structure (arm_cfft_sR_f32_len<N>) declared in arm_const_structs.h. */
arm_status arm_cfft_init_f32(
 arm_cfft_instance_f32 * S,
 uint16_t fftLen)
{
 /* Initialise the default arm status */
 arm_status status = ARM_MATH_SUCCESS;

 /* Initialise the FFT length */
 S->fftLen = fftLen;

 /* Initialise the Twiddle coefficient pointer */
 S->pTwiddle = NULL;


 /* Initializations of Instance structure depending on the FFT length */
 switch (S->fftLen) {
#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
 /* Initializations of structure parameters for 4096 point FFT */
 case 4096U:
 /* Initialise the bit reversal table modifier */
 FFTINIT(f32,4096);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
 /* Initializations of structure parameters for 2048 point FFT */
 case 2048U:
 /* Initialise the bit reversal table modifier */
 FFTINIT(f32,2048);

 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
 /* Initializations of structure parameters for 1024 point FFT */
 case 1024U:
 /* Initialise the bit reversal table modifier */
 FFTINIT(f32,1024);

 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
 /* Initializations of structure parameters for 512 point FFT */
 case 512U:
 /* Initialise the bit reversal table modifier */
 FFTINIT(f32,512);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
 case 256U:
 FFTINIT(f32,256);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
 case 128U:
 FFTINIT(f32,128);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
 case 64U:
 FFTINIT(f32,64);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
 case 32U:
 FFTINIT(f32,32);
 break;
#endif

#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
 case 16U:
 /* Initializations of structure parameters for 16 point FFT */
 FFTINIT(f32,16);
 break;
#endif

 default:
 /* Reporting argument error if fftSize is not valid value */
 status = ARM_MATH_ARGUMENT_ERROR;
 break;
 }


 return (status);
}
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of ComplexFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f64.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f64.c
new file mode 100644
index 0000000000..ba0e392abf
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_f64.c
@@ -0,0 +1,150 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_init_f64.c
+ * Description: Initialization function for cfft f64 instance
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define FFTINIT(EXT,SIZE) \
+ S->bitRevLength = arm_cfft_sR_##EXT##_len##SIZE.bitRevLength; \
+ S->pBitRevTable = arm_cfft_sR_##EXT##_len##SIZE.pBitRevTable; \
+ S->pTwiddle = arm_cfft_sR_##EXT##_len##SIZE.pTwiddle;
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Initialization function for the cfft f64 function
+ @param[in,out] S points to an instance of the floating-point CFFT structure
+ @param[in] fftLen fft length (number of complex samples)
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+
+ @par Use of this function is mandatory only for the MVE version of the FFT.
+ Other versions can still initialize directly the data structure using
+ variables declared in arm_const_structs.h
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+#include "arm_const_structs.h"
+
+
+arm_status arm_cfft_init_f64(
+ arm_cfft_instance_f64 * S,
+ uint16_t fftLen)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_SUCCESS;
+
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+
+ /* Initialise the Twiddle coefficient pointer */
+ S->pTwiddle = NULL;
+
+
+ /* Initializations of Instance structure depending on the FFT length */
+ switch (S->fftLen) {
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_4096) && defined(ARM_TABLE_BITREVIDX_FLT_4096))
+ /* Initializations of structure parameters for 4096 point FFT */
+ case 4096U:
+ /* Initialise the bit reversal table modifier */
+ FFTINIT(f64,4096);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048))
+ /* Initializations of structure parameters for 2048 point FFT */
+ case 2048U:
+ /* Initialise the bit reversal table modifier */
+ FFTINIT(f64,2048);
+
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024))
+ /* Initializations of structure parameters for 1024 point FFT */
+ case 1024U:
+ /* Initialise the bit reversal table modifier */
+ FFTINIT(f64,1024);
+
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_BITREVIDX_FLT_512))
+ /* Initializations of structure parameters for 512 point FFT */
+ case 512U:
+ /* Initialise the bit reversal table modifier */
+ FFTINIT(f64,512);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_BITREVIDX_FLT_256))
+ case 256U:
+ FFTINIT(f64,256);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_BITREVIDX_FLT_128))
+ case 128U:
+ FFTINIT(f64,128);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_BITREVIDX_FLT_64))
+ case 64U:
+ FFTINIT(f64,64);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_BITREVIDX_FLT_32))
+ case 32U:
+ FFTINIT(f64,32);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_BITREVIDX_FLT_16))
+ case 16U:
+ /* Initializations of structure parameters for 16 point FFT */
+ FFTINIT(f64,16);
+ break;
+#endif
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+
+ return (status);
+}
+
+/**
+ @} end of ComplexFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q15.c
new file mode 100644
index 0000000000..44c09a70af
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q15.c
@@ -0,0 +1,356 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_init_q15.c
+ * Description: Initialization function for cfft q15 instance
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define FFTINIT(EXT,SIZE) \
+ S->bitRevLength = arm_cfft_sR_##EXT##_len##SIZE.bitRevLength; \
+ S->pBitRevTable = arm_cfft_sR_##EXT##_len##SIZE.pBitRevTable; \
+ S->pTwiddle = arm_cfft_sR_##EXT##_len##SIZE.pTwiddle;
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Initialization function for the cfft q15 function
+ @param[in,out] S points to an instance of the floating-point CFFT structure
+ @param[in] fftLen fft length (number of complex samples)
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+
+ @par Use of this function is mandatory only for the MVE version of the FFT.
+ Other versions can still initialize directly the data structure using
+ variables declared in arm_const_structs.h
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+#include "arm_const_structs.h"
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_vec_fft.h"
+#include "arm_mve_tables.h"
+
+
+arm_status arm_cfft_radix4by2_rearrange_twiddles_q15(arm_cfft_instance_q15 *S, int twidCoefModifier)
+{
+
+ switch (S->fftLen >> (twidCoefModifier - 1)) {
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_Q15_4096))
+ case 4096U:
+ S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_4096_q15;
+ S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_4096_q15;
+
+ S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_4096_q15;
+ S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_4096_q15;
+
+ S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_4096_q15;
+ S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_4096_q15;
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_Q15_1024)) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_Q15_2048))
+ case 1024U:
+ S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_1024_q15;
+ S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_1024_q15;
+
+ S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_1024_q15;
+ S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_1024_q15;
+
+ S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_1024_q15;
+ S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_1024_q15;
+ break;
+ #endif
+
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_Q15_256)) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_Q15_512))
+ case 256U:
+ S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_256_q15;
+ S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_256_q15;
+
+ S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_256_q15;
+ S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_256_q15;
+
+ S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_256_q15;
+ S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_256_q15;
+
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_Q15_64)) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_Q15_128))
+ case 64U:
+ S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_64_q15;
+ S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_64_q15;
+
+ S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_64_q15;
+ S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_64_q15;
+
+ S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_64_q15;
+ S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_64_q15;
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_Q15_16)) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_Q15_32))
+ case 16U:
+ S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_16_q15;
+ S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_16_q15;
+
+ S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_16_q15;
+ S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_16_q15;
+
+ S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_16_q15;
+ S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_16_q15;
+ break;
+#endif
+
+ default:
+ return(ARM_MATH_ARGUMENT_ERROR);
+ break;
+ /* invalid sizes already filtered */
+ }
+
+ return(ARM_MATH_SUCCESS);
+
+}
+
+
+
+arm_status arm_cfft_init_q15(
+ arm_cfft_instance_q15 * S,
+ uint16_t fftLen)
+{
+
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_SUCCESS;
+
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+
+ /* Initialise the Twiddle coefficient pointer */
+ S->pTwiddle = NULL;
+
+
+ /* Initializations of Instance structure depending on the FFT length */
+ switch (S->fftLen) {
+ /* Initializations of structure parameters for 4096 point FFT */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_Q15_4096))
+ case 4096U:
+ /* Initialise the bit reversal table modifier */
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_4096;
+ S->pTwiddle = (q15_t *)twiddleCoef_4096_q15;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 1);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_Q15_2048))
+ /* Initializations of structure parameters for 2048 point FFT */
+ case 2048U:
+ /* Initialise the bit reversal table modifier */
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_2048;
+ S->pTwiddle = (q15_t *)twiddleCoef_2048_q15;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 2);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_Q15_1024))
+ /* Initializations of structure parameters for 1024 point FFT */
+ case 1024U:
+ /* Initialise the bit reversal table modifier */
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_1024;
+ S->pTwiddle = (q15_t *)twiddleCoef_1024_q15;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 1);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_Q15_512))
+ /* Initializations of structure parameters for 512 point FFT */
+ case 512U:
+ /* Initialise the bit reversal table modifier */
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_512;
+ S->pTwiddle = (q15_t *)twiddleCoef_512_q15;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 2);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_Q15_256))
+ case 256U:
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_256;
+ S->pTwiddle = (q15_t *)twiddleCoef_256_q15;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 1);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_Q15_128))
+ case 128U:
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_128;
+ S->pTwiddle = (q15_t *)twiddleCoef_128_q15;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 2);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_Q15_64))
+ case 64U:
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_64;
+ S->pTwiddle = (q15_t *)twiddleCoef_64_q15;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 1);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_Q15_32))
+ case 32U:
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_32;
+ S->pTwiddle = (q15_t *)twiddleCoef_32_q15;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 2);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_Q15_16))
+ case 16U:
+ /* Initializations of structure parameters for 16 point FFT */
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_16;
+ S->pTwiddle = (q15_t *)twiddleCoef_16_q15;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q15(S, 1);
+ break;
+#endif
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+
+ return (status);
+}
+#else
+arm_status arm_cfft_init_q15(
+ arm_cfft_instance_q15 * S,
+ uint16_t fftLen)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_SUCCESS;
+
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+
+ /* Initialise the Twiddle coefficient pointer */
+ S->pTwiddle = NULL;
+
+
+ /* Initializations of Instance structure depending on the FFT length */
+ switch (S->fftLen) {
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
+ /* Initializations of structure parameters for 4096 point FFT */
+ case 4096U:
+ /* Initialise the bit reversal table modifier */
+ FFTINIT(q15,4096);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
+ /* Initializations of structure parameters for 2048 point FFT */
+ case 2048U:
+ /* Initialise the bit reversal table modifier */
+ FFTINIT(q15,2048);
+
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
+ /* Initializations of structure parameters for 1024 point FFT */
+ case 1024U:
+ /* Initialise the bit reversal table modifier */
+ FFTINIT(q15,1024);
+
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
+ /* Initializations of structure parameters for 512 point FFT */
+ case 512U:
+ /* Initialise the bit reversal table modifier */
+ FFTINIT(q15,512);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
+ case 256U:
+ FFTINIT(q15,256);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
+ case 128U:
+ FFTINIT(q15,128);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
+ case 64U:
+ FFTINIT(q15,64);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
+ case 32U:
+ FFTINIT(q15,32);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
+ case 16U:
+ /* Initializations of structure parameters for 16 point FFT */
+ FFTINIT(q15,16);
+ break;
+#endif
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of ComplexFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q31.c
new file mode 100644
index 0000000000..9fe4a09342
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_init_q31.c
@@ -0,0 +1,356 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_init_q31.c
+ * Description: Initialization function for cfft q31 instance
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#define FFTINIT(EXT,SIZE) \
+ S->bitRevLength = arm_cfft_sR_##EXT##_len##SIZE.bitRevLength; \
+ S->pBitRevTable = arm_cfft_sR_##EXT##_len##SIZE.pBitRevTable; \
+ S->pTwiddle = arm_cfft_sR_##EXT##_len##SIZE.pTwiddle;
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Initialization function for the cfft q31 function
+ @param[in,out] S points to an instance of the floating-point CFFT structure
+ @param[in] fftLen fft length (number of complex samples)
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+
+ @par Use of this function is mandatory only for the MVE version of the FFT.
+ Other versions can still initialize directly the data structure using
+ variables declared in arm_const_structs.h
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+#include "arm_const_structs.h"
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_vec_fft.h"
+#include "arm_mve_tables.h"
+
+
+arm_status arm_cfft_radix4by2_rearrange_twiddles_q31(arm_cfft_instance_q31 *S, int twidCoefModifier)
+{
+
+ switch (S->fftLen >> (twidCoefModifier - 1)) {
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_Q31_4096))
+ case 4096U:
+ S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_4096_q31;
+ S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_4096_q31;
+
+ S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_4096_q31;
+ S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_4096_q31;
+
+ S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_4096_q31;
+ S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_4096_q31;
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_Q31_1024)) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_Q31_2048))
+ case 1024U:
+ S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_1024_q31;
+ S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_1024_q31;
+
+ S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_1024_q31;
+ S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_1024_q31;
+
+ S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_1024_q31;
+ S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_1024_q31;
+ break;
+ #endif
+
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_Q31_256)) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_Q31_512))
+ case 256U:
+ S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_256_q31;
+ S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_256_q31;
+
+ S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_256_q31;
+ S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_256_q31;
+
+ S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_256_q31;
+ S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_256_q31;
+
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_Q31_64)) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_Q31_128))
+ case 64U:
+ S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_64_q31;
+ S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_64_q31;
+
+ S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_64_q31;
+ S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_64_q31;
+
+ S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_64_q31;
+ S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_64_q31;
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_Q31_16)) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_Q31_32))
+ case 16U:
+ S->rearranged_twiddle_tab_stride1_arr = rearranged_twiddle_tab_stride1_arr_16_q31;
+ S->rearranged_twiddle_stride1 = rearranged_twiddle_stride1_16_q31;
+
+ S->rearranged_twiddle_tab_stride2_arr = rearranged_twiddle_tab_stride2_arr_16_q31;
+ S->rearranged_twiddle_stride2 = rearranged_twiddle_stride2_16_q31;
+
+ S->rearranged_twiddle_tab_stride3_arr = rearranged_twiddle_tab_stride3_arr_16_q31;
+ S->rearranged_twiddle_stride3 = rearranged_twiddle_stride3_16_q31;
+ break;
+#endif
+
+ default:
+ return(ARM_MATH_ARGUMENT_ERROR);
+ break;
+ /* invalid sizes already filtered */
+ }
+
+ return(ARM_MATH_SUCCESS);
+
+}
+
+
+
+arm_status arm_cfft_init_q31(
+ arm_cfft_instance_q31 * S,
+ uint16_t fftLen)
+{
+
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_SUCCESS;
+
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+
+ /* Initialise the Twiddle coefficient pointer */
+ S->pTwiddle = NULL;
+
+
+ /* Initializations of Instance structure depending on the FFT length */
+ switch (S->fftLen) {
+ /* Initializations of structure parameters for 4096 point FFT */
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_4096) && defined(ARM_TABLE_TWIDDLECOEF_Q31_4096))
+ case 4096U:
+ /* Initialise the bit reversal table modifier */
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_4096_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_4096;
+ S->pTwiddle = (q31_t *)twiddleCoef_4096_q31;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 1);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_2048) && defined(ARM_TABLE_TWIDDLECOEF_Q31_2048))
+ /* Initializations of structure parameters for 2048 point FFT */
+ case 2048U:
+ /* Initialise the bit reversal table modifier */
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_2048_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_2048;
+ S->pTwiddle = (q31_t *)twiddleCoef_2048_q31;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 2);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_1024) && defined(ARM_TABLE_TWIDDLECOEF_Q31_1024))
+ /* Initializations of structure parameters for 1024 point FFT */
+ case 1024U:
+ /* Initialise the bit reversal table modifier */
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_1024_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_1024;
+ S->pTwiddle = (q31_t *)twiddleCoef_1024_q31;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 1);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_512) && defined(ARM_TABLE_TWIDDLECOEF_Q31_512))
+ /* Initializations of structure parameters for 512 point FFT */
+ case 512U:
+ /* Initialise the bit reversal table modifier */
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_512_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_512;
+ S->pTwiddle = (q31_t *)twiddleCoef_512_q31;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 2);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_256) && defined(ARM_TABLE_TWIDDLECOEF_Q31_256))
+ case 256U:
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_256_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_256;
+ S->pTwiddle = (q31_t *)twiddleCoef_256_q31;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 1);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_128) && defined(ARM_TABLE_TWIDDLECOEF_Q31_128))
+ case 128U:
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_128_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_128;
+ S->pTwiddle = (q31_t *)twiddleCoef_128_q31;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 2);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_64) && defined(ARM_TABLE_TWIDDLECOEF_Q31_64))
+ case 64U:
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_64_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_64;
+ S->pTwiddle = (q31_t *)twiddleCoef_64_q31;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 1);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_32) && defined(ARM_TABLE_TWIDDLECOEF_Q31_32))
+ case 32U:
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_32_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_32;
+ S->pTwiddle = (q31_t *)twiddleCoef_32_q31;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 2);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_BITREVIDX_FXT_16) && defined(ARM_TABLE_TWIDDLECOEF_Q31_16))
+ case 16U:
+ /* Initializations of structure parameters for 16 point FFT */
+ S->bitRevLength = ARMBITREVINDEXTABLE_FIXED_16_TABLE_LENGTH;
+ S->pBitRevTable = (uint16_t *)armBitRevIndexTable_fixed_16;
+ S->pTwiddle = (q31_t *)twiddleCoef_16_q31;
+ status=arm_cfft_radix4by2_rearrange_twiddles_q31(S, 1);
+ break;
+#endif
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+
+ return (status);
+}
+#else
+arm_status arm_cfft_init_q31(
+ arm_cfft_instance_q31 * S,
+ uint16_t fftLen)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_SUCCESS;
+
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+
+ /* Initialise the Twiddle coefficient pointer */
+ S->pTwiddle = NULL;
+
+
+ /* Initializations of Instance structure depending on the FFT length */
+ switch (S->fftLen) {
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
+ /* Initializations of structure parameters for 4096 point FFT */
+ case 4096U:
+ /* Initialise the bit reversal table modifier */
+ FFTINIT(q31,4096);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
+ /* Initializations of structure parameters for 2048 point FFT */
+ case 2048U:
+ /* Initialise the bit reversal table modifier */
+ FFTINIT(q31,2048);
+
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
+ /* Initializations of structure parameters for 1024 point FFT */
+ case 1024U:
+ /* Initialise the bit reversal table modifier */
+ FFTINIT(q31,1024);
+
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
+ /* Initializations of structure parameters for 512 point FFT */
+ case 512U:
+ /* Initialise the bit reversal table modifier */
+ FFTINIT(q31,512);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
+ case 256U:
+ FFTINIT(q31,256);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
+ case 128U:
+ FFTINIT(q31,128);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
+ case 64U:
+ FFTINIT(q31,64);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
+ case 32U:
+ FFTINIT(q31,32);
+ break;
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
+ case 16U:
+ /* Initializations of structure parameters for 16 point FFT */
+ FFTINIT(q31,16);
+ break;
+#endif
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+
+ return (status);
+}
+#endif /* defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @} end of ComplexFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c
new file mode 100644
index 0000000000..9d4eb96cc4
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q15.c
@@ -0,0 +1,890 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_q15.c
+ * Description: Combined Radix Decimation in Q15 Frequency CFFT processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_vec_fft.h"
+
+
+/*
+ * Forward radix-4 decimation-in-frequency butterfly stages for the Q15
+ * CFFT, vectorized with Helium/MVE intrinsics.  All leading radix-4
+ * stages run in-place on pSrc using the instance's rearranged twiddle
+ * tables; the twiddle-free last stage is then done with gather loads /
+ * scatter stores.  Halving add/sub intrinsics (vhaddq/vhsubq) scale the
+ * data down at every stage, as usual for fixed-point FFTs.
+ *
+ * S      : CFFT instance (rearranged twiddle tables + per-stage offsets)
+ * pSrc   : in/out interleaved complex Q15 buffer
+ * fftLen : transform length; a power of 4 on this path
+ */
+static void _arm_radix4_butterfly_q15_mve(
+    const arm_cfft_instance_q15 * S,
+    q15_t *pSrc,
+    uint32_t fftLen)
+{
+    q15x8_t vecTmp0, vecTmp1;
+    q15x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
+    q15x8_t vecA, vecB, vecC, vecD;
+    q15x8_t vecW;
+    uint32_t blkCnt;
+    uint32_t n1, n2;
+    uint32_t stage = 0;
+    int32_t iter = 1;
+    /*
+     * Byte offsets of the four butterfly legs for the last-stage 32-bit
+     * gathers/scatters, relative to the base address after its 64-byte
+     * post-increment.  NOTE(review): scaled by sizeof(q15_t *) (pointer
+     * size, 4 bytes on these 32-bit targets) to match the 32-bit gather
+     * element size -- confirm if ever built for another pointer width.
+     */
+    static const uint32_t strides[4] = {
+        (0 - 16) * sizeof(q15_t *), (4 - 16) * sizeof(q15_t *),
+        (8 - 16) * sizeof(q15_t *), (12 - 16) * sizeof(q15_t *)
+    };
+
+    /*
+     * Process first stages
+     * Each stage in middle stages provides two down scaling of the input
+     */
+    n2 = fftLen;
+    n1 = n2;
+    n2 >>= 2u;
+
+    for (int k = fftLen / 4u; k > 1; k >>= 2u)
+    {
+        /* 'iter' butterfly groups per stage; quadruples after each stage */
+        for (int i = 0; i < iter; i++)
+        {
+            q15_t const *p_rearranged_twiddle_tab_stride2 =
+                &S->rearranged_twiddle_stride2[
+                S->rearranged_twiddle_tab_stride2_arr[stage]];
+            q15_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
+                S->rearranged_twiddle_tab_stride3_arr[stage]];
+            q15_t const *p_rearranged_twiddle_tab_stride1 =
+                &S->rearranged_twiddle_stride1[
+                S->rearranged_twiddle_tab_stride1_arr[stage]];
+            q15_t const *pW1, *pW2, *pW3;
+            q15_t *inA = pSrc + CMPLX_DIM * i * n1;
+            q15_t *inB = inA + n2 * CMPLX_DIM;
+            q15_t *inC = inB + n2 * CMPLX_DIM;
+            q15_t *inD = inC + n2 * CMPLX_DIM;
+
+            pW1 = p_rearranged_twiddle_tab_stride1;
+            pW2 = p_rearranged_twiddle_tab_stride2;
+            pW3 = p_rearranged_twiddle_tab_stride3;
+
+            blkCnt = n2 / 4;
+            /*
+             * load 4 x q15 complex pair
+             */
+            vecA = vldrhq_s16(inA);
+            vecC = vldrhq_s16(inC);
+            while (blkCnt > 0U)
+            {
+                vecB = vldrhq_s16(inB);
+                vecD = vldrhq_s16(inD);
+
+                vecSum0 = vhaddq(vecA, vecC);
+                vecDiff0 = vhsubq(vecA, vecC);
+
+                vecSum1 = vhaddq(vecB, vecD);
+                vecDiff1 = vhsubq(vecB, vecD);
+                /*
+                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
+                 */
+                vecTmp0 = vhaddq(vecSum0, vecSum1);
+                vst1q(inA, vecTmp0);
+                inA += 8;
+                /*
+                 * [ 1 -1 1 -1 ] * [ A B C D ]'
+                 */
+                vecTmp0 = vhsubq(vecSum0, vecSum1);
+                /*
+                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
+                 */
+                vecW = vld1q(pW2);
+                pW2 += 8;
+                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);
+
+                vst1q(inB, vecTmp1);
+                inB += 8;
+                /*
+                 * [ 1 -i -1 +i ] * [ A B C D ]'
+                 */
+                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
+                /*
+                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
+                 */
+                vecW = vld1q(pW1);
+                pW1 += 8;
+                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);
+                vst1q(inC, vecTmp1);
+                inC += 8;
+
+                /*
+                 * [ 1 +i -1 -i ] * [ A B C D ]'
+                 */
+                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
+                /*
+                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
+                 */
+                vecW = vld1q(pW3);
+                pW3 += 8;
+                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);
+                vst1q(inD, vecTmp1);
+                inD += 8;
+
+                vecA = vldrhq_s16(inA);
+                vecC = vldrhq_s16(inC);
+
+                blkCnt--;
+            }
+        }
+        n1 = n2;
+        n2 >>= 2u;
+        iter = iter << 2;
+        stage++;
+    }
+
+    /*
+     * start of Last stage process
+     */
+    uint32x4_t vecScGathAddr = vld1q_u32 (strides);
+    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
+
+    /*
+     * load scheduling
+     */
+    vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
+    vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
+
+    /* each iteration consumes 4 complex samples on each of the 4 legs,
+       i.e. 16 complex samples -> fftLen / 16 iterations */
+    blkCnt = (fftLen >> 4);
+    while (blkCnt > 0U)
+    {
+        vecSum0 = vhaddq(vecA, vecC);
+        vecDiff0 = vhsubq(vecA, vecC);
+
+        vecB = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 4);
+        vecD = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 12);
+
+        vecSum1 = vhaddq(vecB, vecD);
+        vecDiff1 = vhsubq(vecB, vecD);
+        /*
+         * pre-load for next iteration
+         */
+        vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
+        vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
+
+        vecTmp0 = vhaddq(vecSum0, vecSum1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64, (q15x8_t) vecTmp0);
+
+        vecTmp0 = vhsubq(vecSum0, vecSum1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (q15x8_t) vecTmp0);
+
+        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (q15x8_t) vecTmp0);
+
+        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (q15x8_t) vecTmp0);
+
+        blkCnt--;
+    }
+
+}
+
+/*
+ * Radix-4-by-2 decomposition for forward Q15 CFFT lengths of the form
+ * 2 * 4^n (Helium/MVE).  Stage 1 performs the radix-2 split, rotating
+ * the difference leg by the conjugated twiddle
+ * (MVE_CMPLX_MULT_FX_AxConjB); two half-length radix-4 forward
+ * butterflies follow, and a final pass rescales every sample by <<1.
+ *
+ * S      : CFFT instance (twiddle table in S->pTwiddle)
+ * pSrc   : in/out interleaved complex Q15 buffer
+ * fftLen : transform length (2 * a power of 4)
+ */
+static void arm_cfft_radix4by2_q15_mve(const arm_cfft_instance_q15 *S, q15_t *pSrc, uint32_t fftLen)
+{
+    uint32_t n2;
+    q15_t *pIn0;
+    q15_t *pIn1;
+    const q15_t *pCoef = S->pTwiddle;
+    uint32_t blkCnt;
+    q15x8_t vecIn0, vecIn1, vecSum, vecDiff;
+    q15x8_t vecCmplxTmp, vecTw;
+    q15_t const *pCoefVec;
+
+    n2 = fftLen >> 1;
+
+    pIn0 = pSrc;
+    pIn1 = pSrc + fftLen;
+    pCoefVec = pCoef;
+
+    blkCnt = n2 / 4;
+
+    while (blkCnt > 0U)
+    {
+        vecIn0 = *(q15x8_t *) pIn0;
+        vecIn1 = *(q15x8_t *) pIn1;
+
+        /* pre-scale both halves by 1/2 for butterfly headroom */
+        vecIn0 = vecIn0 >> 1;
+        vecIn1 = vecIn1 >> 1;
+        vecSum = vhaddq(vecIn0, vecIn1);
+        vst1q(pIn0, vecSum);
+        pIn0 += 8;
+
+        vecTw = vld1q(pCoefVec);
+        pCoefVec += 8;
+
+        vecDiff = vhsubq(vecIn0, vecIn1);
+        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxConjB(vecDiff, vecTw);
+        vst1q(pIn1, vecCmplxTmp);
+        pIn1 += 8;
+
+        blkCnt--;
+    }
+
+    /* two half-length radix-4 forward transforms: lower and upper halves */
+    _arm_radix4_butterfly_q15_mve(S, pSrc, n2);
+
+    _arm_radix4_butterfly_q15_mve(S, pSrc + fftLen, n2);
+
+
+    /* rescale the whole 2*fftLen sample buffer by one left shift */
+    pIn0 = pSrc;
+    blkCnt = (fftLen << 1) >> 3;
+    while (blkCnt > 0U)
+    {
+        vecIn0 = *(q15x8_t *) pIn0;
+        vecIn0 = vecIn0 << 1;
+        vst1q(pIn0, vecIn0);
+        pIn0 += 8;
+        blkCnt--;
+    }
+    /*
+     * tail
+     * (will be merged thru tail predication)
+     */
+    blkCnt = (fftLen << 1) & 7;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+
+        vecIn0 = *(q15x8_t *) pIn0;
+        vecIn0 = vecIn0 << 1;
+        vstrhq_p(pIn0, vecIn0, p0);
+    }
+}
+
+/*
+ * Inverse radix-4 decimation-in-frequency butterfly stages for the Q15
+ * CFFT (Helium/MVE).  Mirrors _arm_radix4_butterfly_q15_mve but
+ * multiplies by conjugated twiddles (MVE_CMPLX_MULT_FX_AxConjB) and
+ * swaps the +/-i legs, as required for the inverse transform.
+ *
+ * S      : CFFT instance (rearranged twiddle tables + per-stage offsets)
+ * pSrc   : in/out interleaved complex Q15 buffer
+ * fftLen : transform length; a power of 4 on this path
+ */
+static void _arm_radix4_butterfly_inverse_q15_mve(const arm_cfft_instance_q15 *S,q15_t *pSrc, uint32_t fftLen)
+{
+    q15x8_t vecTmp0, vecTmp1;
+    q15x8_t vecSum0, vecDiff0, vecSum1, vecDiff1;
+    q15x8_t vecA, vecB, vecC, vecD;
+    q15x8_t vecW;
+    uint32_t blkCnt;
+    uint32_t n1, n2;
+    uint32_t stage = 0;
+    int32_t iter = 1;
+    /* last-stage gather/scatter leg offsets; see forward variant for the
+       NOTE on the sizeof(q15_t *) scaling */
+    static const uint32_t strides[4] = {
+        (0 - 16) * sizeof(q15_t *), (4 - 16) * sizeof(q15_t *),
+        (8 - 16) * sizeof(q15_t *), (12 - 16) * sizeof(q15_t *)
+    };
+
+
+    /*
+     * Process first stages
+     * Each stage in middle stages provides two down scaling of the input
+     */
+    n2 = fftLen;
+    n1 = n2;
+    n2 >>= 2u;
+
+    for (int k = fftLen / 4u; k > 1; k >>= 2u)
+    {
+        for (int i = 0; i < iter; i++)
+        {
+            q15_t const *p_rearranged_twiddle_tab_stride2 =
+                &S->rearranged_twiddle_stride2[
+                S->rearranged_twiddle_tab_stride2_arr[stage]];
+            q15_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
+                S->rearranged_twiddle_tab_stride3_arr[stage]];
+            q15_t const *p_rearranged_twiddle_tab_stride1 =
+                &S->rearranged_twiddle_stride1[
+                S->rearranged_twiddle_tab_stride1_arr[stage]];
+            q15_t const *pW1, *pW2, *pW3;
+            q15_t *inA = pSrc + CMPLX_DIM * i * n1;
+            q15_t *inB = inA + n2 * CMPLX_DIM;
+            q15_t *inC = inB + n2 * CMPLX_DIM;
+            q15_t *inD = inC + n2 * CMPLX_DIM;
+
+            pW1 = p_rearranged_twiddle_tab_stride1;
+            pW2 = p_rearranged_twiddle_tab_stride2;
+            pW3 = p_rearranged_twiddle_tab_stride3;
+
+            blkCnt = n2 / 4;
+            /*
+             * load 4 x q15 complex pair
+             */
+            vecA = vldrhq_s16(inA);
+            vecC = vldrhq_s16(inC);
+            while (blkCnt > 0U)
+            {
+                vecB = vldrhq_s16(inB);
+                vecD = vldrhq_s16(inD);
+
+                vecSum0 = vhaddq(vecA, vecC);
+                vecDiff0 = vhsubq(vecA, vecC);
+
+                vecSum1 = vhaddq(vecB, vecD);
+                vecDiff1 = vhsubq(vecB, vecD);
+                /*
+                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
+                 */
+                vecTmp0 = vhaddq(vecSum0, vecSum1);
+                vst1q(inA, vecTmp0);
+                inA += 8;
+                /*
+                 * [ 1 -1 1 -1 ] * [ A B C D ]'
+                 */
+                vecTmp0 = vhsubq(vecSum0, vecSum1);
+                /*
+                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
+                 */
+                vecW = vld1q(pW2);
+                pW2 += 8;
+                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);
+
+                vst1q(inB, vecTmp1);
+                inB += 8;
+                /*
+                 * [ 1 -i -1 +i ] * [ A B C D ]'
+                 */
+                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
+                /*
+                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
+                 */
+                vecW = vld1q(pW1);
+                pW1 += 8;
+                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);
+                vst1q(inC, vecTmp1);
+                inC += 8;
+                /*
+                 * [ 1 +i -1 -i ] * [ A B C D ]'
+                 */
+                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
+                /*
+                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
+                 */
+                vecW = vld1q(pW3);
+                pW3 += 8;
+                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);
+                vst1q(inD, vecTmp1);
+                inD += 8;
+
+                vecA = vldrhq_s16(inA);
+                vecC = vldrhq_s16(inC);
+
+                blkCnt--;
+            }
+        }
+        n1 = n2;
+        n2 >>= 2u;
+        iter = iter << 2;
+        stage++;
+    }
+
+    /*
+     * start of Last stage process
+     */
+    uint32x4_t vecScGathAddr = vld1q_u32(strides);
+    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
+
+    /*
+     * load scheduling
+     */
+    vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
+    vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
+
+    /* 16 complex samples (4 per leg) per iteration */
+    blkCnt = (fftLen >> 4);
+    while (blkCnt > 0U)
+    {
+        vecSum0 = vhaddq(vecA, vecC);
+        vecDiff0 = vhsubq(vecA, vecC);
+
+        vecB = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 4);
+        vecD = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 12);
+
+        vecSum1 = vhaddq(vecB, vecD);
+        vecDiff1 = vhsubq(vecB, vecD);
+        /*
+         * pre-load for next iteration
+         */
+        vecA = (q15x8_t) vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
+        vecC = (q15x8_t) vldrwq_gather_base_s32(vecScGathAddr, 8);
+
+        vecTmp0 = vhaddq(vecSum0, vecSum1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64, (q15x8_t) vecTmp0);
+
+        vecTmp0 = vhsubq(vecSum0, vecSum1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 4, (q15x8_t) vecTmp0);
+
+        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, (q15x8_t) vecTmp0);
+
+        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 12, (q15x8_t) vecTmp0);
+
+        blkCnt--;
+    }
+}
+
+/*
+ * Radix-4-by-2 decomposition for inverse Q15 CFFT lengths of the form
+ * 2 * 4^n (Helium/MVE).  Stage 1 splits the data radix-2 style while
+ * applying the twiddle rotation with a vqrdmlsdhq/vqrdmladhxq complex
+ * multiply pair; two half-length radix-4 inverse butterflies follow,
+ * and a final pass rescales every sample by <<1.
+ *
+ * Fix: the predicated tail used 'while (blkCnt > 0U)' without ever
+ * decrementing blkCnt -- an infinite loop whenever a tail existed.
+ * The forward variant (arm_cfft_radix4by2_q15_mve) uses 'if'; this
+ * now matches it.
+ *
+ * S      : CFFT instance (twiddle table in S->pTwiddle)
+ * pSrc   : in/out interleaved complex Q15 buffer
+ * fftLen : transform length (2 * a power of 4)
+ */
+static void arm_cfft_radix4by2_inverse_q15_mve(const arm_cfft_instance_q15 *S, q15_t *pSrc, uint32_t fftLen)
+{
+    uint32_t n2;
+    q15_t *pIn0;
+    q15_t *pIn1;
+    const q15_t *pCoef = S->pTwiddle;
+    uint32_t blkCnt;
+    q15x8_t vecIn0, vecIn1, vecSum, vecDiff;
+    q15x8_t vecCmplxTmp, vecTw;
+    q15_t const *pCoefVec;
+
+    n2 = fftLen >> 1;
+
+    pIn0 = pSrc;
+    pIn1 = pSrc + fftLen;
+    pCoefVec = pCoef;
+
+    blkCnt = n2 / 4;
+
+    while (blkCnt > 0U)
+    {
+        vecIn0 = *(q15x8_t *) pIn0;
+        vecIn1 = *(q15x8_t *) pIn1;
+
+        /* pre-scale both halves by 1/2 for butterfly headroom */
+        vecIn0 = vecIn0 >> 1;
+        vecIn1 = vecIn1 >> 1;
+        vecSum = vhaddq(vecIn0, vecIn1);
+        vst1q(pIn0, vecSum);
+        pIn0 += 8;
+
+        vecTw = vld1q(pCoefVec);
+        pCoefVec += 8;
+
+        vecDiff = vhsubq(vecIn0, vecIn1);
+        /* complex multiply of vecDiff by the (non-conjugated) twiddle,
+           built from rounding-doubling-halving MAC intrinsics */
+        vecCmplxTmp = vqrdmlsdhq(vuninitializedq_s16() , vecDiff, vecTw);
+        vecCmplxTmp = vqrdmladhxq(vecCmplxTmp, vecDiff, vecTw);
+        vst1q(pIn1, vecCmplxTmp);
+        pIn1 += 8;
+
+        blkCnt--;
+    }
+
+    /* two half-length radix-4 inverse transforms: lower and upper halves */
+    _arm_radix4_butterfly_inverse_q15_mve(S, pSrc, n2);
+
+    _arm_radix4_butterfly_inverse_q15_mve(S, pSrc + fftLen, n2);
+
+    /* rescale the whole 2*fftLen sample buffer by one left shift */
+    pIn0 = pSrc;
+    blkCnt = (fftLen << 1) >> 3;
+    while (blkCnt > 0U)
+    {
+        vecIn0 = *(q15x8_t *) pIn0;
+        vecIn0 = vecIn0 << 1;
+        vst1q(pIn0, vecIn0);
+        pIn0 += 8;
+        blkCnt--;
+    }
+    /*
+     * tail
+     * (will be merged thru tail predication)
+     */
+    blkCnt = (fftLen << 1) & 7;
+    if (blkCnt > 0U)  /* 'if', not 'while': blkCnt is never decremented here */
+    {
+        mve_pred16_t p0 = vctp16q(blkCnt);
+
+        vecIn0 = *(q15x8_t *) pIn0;
+        vecIn0 = vecIn0 << 1;
+        vstrhq_p(pIn0, vecIn0, p0);
+    }
+}
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Processing function for Q15 complex FFT.
+ @param[in] S points to an instance of Q15 CFFT structure
+ @param[in,out] p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return none
+ */
+void arm_cfft_q15(
+    const arm_cfft_instance_q15 * S,
+    q15_t * pSrc,
+    uint8_t ifftFlag,
+    uint8_t bitReverseFlag)
+{
+    const uint32_t len = S->fftLen;
+    /* Lengths 16/64/256/1024/4096 are pure powers of 4 and use the plain
+       radix-4 kernel; 32/128/512/2048 need the extra radix-2 split.
+       Select the kernel once instead of duplicating the switch. */
+    void (*kernel)(const arm_cfft_instance_q15 *, q15_t *, uint32_t) = NULL;
+
+    switch (len) {
+        case 16:
+        case 64:
+        case 256:
+        case 1024:
+        case 4096:
+            kernel = (ifftFlag == 1U) ? _arm_radix4_butterfly_inverse_q15_mve
+                                      : _arm_radix4_butterfly_q15_mve;
+            break;
+
+        case 32:
+        case 128:
+        case 512:
+        case 2048:
+            kernel = (ifftFlag == 1U) ? arm_cfft_radix4by2_inverse_q15_mve
+                                      : arm_cfft_radix4by2_q15_mve;
+            break;
+
+        default:
+            /* unsupported length: no transform (same as the original switch) */
+            break;
+    }
+
+    if (kernel != NULL)
+    {
+        kernel(S, pSrc, len);
+    }
+
+    if (bitReverseFlag)
+    {
+        arm_bitreversal_16_inpl_mve((uint16_t*)pSrc, S->bitRevLength, S->pBitRevTable);
+    }
+}
+
+#else
+
+extern void arm_radix4_butterfly_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pCoef,
+ uint32_t twidCoefModifier);
+
+extern void arm_radix4_butterfly_inverse_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pCoef,
+ uint32_t twidCoefModifier);
+
+extern void arm_bitreversal_16(
+ uint16_t * pSrc,
+ const uint16_t bitRevLen,
+ const uint16_t * pBitRevTable);
+
+void arm_cfft_radix4by2_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pCoef);
+
+void arm_cfft_radix4by2_inverse_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pCoef);
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Processing function for Q15 complex FFT.
+ @param[in] S points to an instance of Q15 CFFT structure
+ @param[in,out] p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return none
+ */
+
+void arm_cfft_q15(
+    const arm_cfft_instance_q15 * S,
+    q15_t * p1,
+    uint8_t ifftFlag,
+    uint8_t bitReverseFlag)
+{
+    uint32_t L = S->fftLen;
+
+    /* Dispatch on length first, then on direction: pure power-of-4
+       lengths take a single radix-4 pass, the others a radix-2 split. */
+    switch (L)
+    {
+        case 16:
+        case 64:
+        case 256:
+        case 1024:
+        case 4096:
+            if (ifftFlag == 1U)
+                arm_radix4_butterfly_inverse_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 );
+            else
+                arm_radix4_butterfly_q15 ( p1, L, (q15_t*)S->pTwiddle, 1 );
+            break;
+
+        case 32:
+        case 128:
+        case 512:
+        case 2048:
+            if (ifftFlag == 1U)
+                arm_cfft_radix4by2_inverse_q15 ( p1, L, S->pTwiddle );
+            else
+                arm_cfft_radix4by2_q15 ( p1, L, S->pTwiddle );
+            break;
+
+        default:
+            /* unsupported length: no transform, matching the original */
+            break;
+    }
+
+    if ( bitReverseFlag )
+        arm_bitreversal_16 ((uint16_t*) p1, S->bitRevLength, S->pBitRevTable);
+}
+
+/**
+ @} end of ComplexFFT group
+ */
+
+/*
+ * Radix-4-by-2 decomposition for forward Q15 CFFT lengths of the form
+ * 2 * 4^n (non-MVE path).  A radix-2 stage rotates the upper half by
+ * the twiddles, two half-length radix-4 columns run with twiddle
+ * modifier 2, and a final pass restores one bit of scaling (<<1).
+ * The ARM_MATH_DSP branch works on packed 2 x q15 pairs with SIMD
+ * intrinsics; the generic branch is plain scalar arithmetic.
+ */
+void arm_cfft_radix4by2_q15(
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pCoef)
+{
+    uint32_t i;
+    uint32_t n2;
+    q15_t p0, p1, p2, p3;
+#if defined (ARM_MATH_DSP)
+    q31_t T, S, R;
+    q31_t coeff, out1, out2;
+    const q15_t *pC = pCoef;
+    q15_t *pSi = pSrc;
+    q15_t *pSl = pSrc + fftLen;
+#else
+    uint32_t l;
+    q15_t xt, yt, cosVal, sinVal;
+#endif
+
+    n2 = fftLen >> 1U;
+
+#if defined (ARM_MATH_DSP)
+
+    /* radix-2 stage: one complex pair (two q15) per iteration */
+    for (i = n2; i > 0; i--)
+    {
+        coeff = read_q15x2_ia ((q15_t **) &pC);
+
+        T = read_q15x2 (pSi);
+        T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
+
+        S = read_q15x2 (pSl);
+        S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */
+
+        /* R = T - S per q15 half, saturating */
+        R = __QSUB16(T, S);
+
+        write_q15x2_ia (&pSi, __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+        out1 = __SMUAD(coeff, R) >> 16U;
+        out2 = __SMUSDX(coeff, R);
+#else
+        out1 = __SMUSDX(R, coeff) >> 16U;
+        out2 = __SMUAD(coeff, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+        write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 ) );
+    }
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+    for (i = 0; i < n2; i++)
+    {
+        cosVal = pCoef[2 * i];
+        sinVal = pCoef[2 * i + 1];
+
+        l = i + n2;
+
+        /* butterfly with inputs pre-scaled by 1/2; the sum leg takes one
+           extra >> 1 of scaling */
+        xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
+        pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
+
+        yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
+        pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
+
+        pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) +
+                       ((int16_t) (((q31_t) yt * sinVal) >> 16U))   );
+
+        pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) -
+                           ((int16_t) (((q31_t) xt * sinVal) >> 16U))   );
+    }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+    /* first col */
+    arm_radix4_butterfly_q15( pSrc, n2, (q15_t*)pCoef, 2U);
+
+    /* second col */
+    arm_radix4_butterfly_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);
+
+    /* restore one bit of scaling across the whole 2*fftLen buffer */
+    n2 = fftLen >> 1U;
+    for (i = 0; i < n2; i++)
+    {
+        p0 = pSrc[4 * i + 0];
+        p1 = pSrc[4 * i + 1];
+        p2 = pSrc[4 * i + 2];
+        p3 = pSrc[4 * i + 3];
+
+        p0 <<= 1U;
+        p1 <<= 1U;
+        p2 <<= 1U;
+        p3 <<= 1U;
+
+        pSrc[4 * i + 0] = p0;
+        pSrc[4 * i + 1] = p1;
+        pSrc[4 * i + 2] = p2;
+        pSrc[4 * i + 3] = p3;
+    }
+
+}
+
+/*
+ * Radix-4-by-2 decomposition for inverse Q15 CFFT lengths of the form
+ * 2 * 4^n (non-MVE path).  Same structure as arm_cfft_radix4by2_q15,
+ * but the radix-2 rotation uses the inverse-direction sign combination
+ * (__SMUSD/__SMUADX on the DSP branch; cos/sin signs swapped on the
+ * scalar branch), and the radix-4 columns call the inverse kernel.
+ */
+void arm_cfft_radix4by2_inverse_q15(
+        q15_t * pSrc,
+        uint32_t fftLen,
+  const q15_t * pCoef)
+{
+    uint32_t i;
+    uint32_t n2;
+    q15_t p0, p1, p2, p3;
+#if defined (ARM_MATH_DSP)
+    q31_t T, S, R;
+    q31_t coeff, out1, out2;
+    const q15_t *pC = pCoef;
+    q15_t *pSi = pSrc;
+    q15_t *pSl = pSrc + fftLen;
+#else
+    uint32_t l;
+    q15_t xt, yt, cosVal, sinVal;
+#endif
+
+    n2 = fftLen >> 1U;
+
+#if defined (ARM_MATH_DSP)
+
+    /* radix-2 stage: one complex pair (two q15) per iteration */
+    for (i = n2; i > 0; i--)
+    {
+        coeff = read_q15x2_ia ((q15_t **) &pC);
+
+        T = read_q15x2 (pSi);
+        T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
+
+        S = read_q15x2 (pSl);
+        S = __SHADD16(S, 0); /* this is just a SIMD arithmetic shift right by 1 */
+
+        /* R = T - S per q15 half, saturating */
+        R = __QSUB16(T, S);
+
+        write_q15x2_ia (&pSi, __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+        out1 = __SMUSD(coeff, R) >> 16U;
+        out2 = __SMUADX(coeff, R);
+#else
+        out1 = __SMUADX(R, coeff) >> 16U;
+        out2 = __SMUSD(__QSUB(0, coeff), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+        write_q15x2_ia (&pSl, (q31_t)__PKHBT( out1, out2, 0 ));
+    }
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+    for (i = 0; i < n2; i++)
+    {
+        cosVal = pCoef[2 * i];
+        sinVal = pCoef[2 * i + 1];
+
+        l = i + n2;
+
+        /* butterfly with inputs pre-scaled by 1/2; the sum leg takes one
+           extra >> 1 of scaling */
+        xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
+        pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
+
+        yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
+        pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
+
+        pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16U)) -
+                       ((int16_t) (((q31_t) yt * sinVal) >> 16U))   );
+
+        pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16U)) +
+                           ((int16_t) (((q31_t) xt * sinVal) >> 16U))   );
+    }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+    /* first col */
+    arm_radix4_butterfly_inverse_q15( pSrc, n2, (q15_t*)pCoef, 2U);
+
+    /* second col */
+    arm_radix4_butterfly_inverse_q15( pSrc + fftLen, n2, (q15_t*)pCoef, 2U);
+
+    /* restore one bit of scaling across the whole 2*fftLen buffer */
+    n2 = fftLen >> 1U;
+    for (i = 0; i < n2; i++)
+    {
+        p0 = pSrc[4 * i + 0];
+        p1 = pSrc[4 * i + 1];
+        p2 = pSrc[4 * i + 2];
+        p3 = pSrc[4 * i + 3];
+
+        p0 <<= 1U;
+        p1 <<= 1U;
+        p2 <<= 1U;
+        p3 <<= 1U;
+
+        pSrc[4 * i + 0] = p0;
+        pSrc[4 * i + 1] = p1;
+        pSrc[4 * i + 2] = p2;
+        pSrc[4 * i + 3] = p3;
+    }
+}
+
+#endif /* defined(ARM_MATH_MVEI) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q31.c
new file mode 100644
index 0000000000..a26927e327
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_q31.c
@@ -0,0 +1,845 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_q31.c
+ * Description: Combined Radix Decimation in Frequency CFFT fixed point processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_vec_fft.h"
+
+
+/*
+ * Forward radix-4 decimation-in-frequency butterfly stages for the Q31
+ * CFFT (Helium/MVE).  Same structure as the Q15 variant but with
+ * 2-complex-per-vector q31x4_t processing and 32-bit loads/stores.
+ * Halving intrinsics down-scale the data at every stage (final formats
+ * documented in the comments near the end).
+ *
+ * S      : CFFT instance (rearranged twiddle tables + per-stage offsets)
+ * pSrc   : in/out interleaved complex Q31 buffer
+ * fftLen : transform length; a power of 4 on this path
+ */
+static void _arm_radix4_butterfly_q31_mve(
+    const arm_cfft_instance_q31 * S,
+    q31_t *pSrc,
+    uint32_t fftLen)
+{
+    q31x4_t vecTmp0, vecTmp1;
+    q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
+    q31x4_t vecA, vecB, vecC, vecD;
+    q31x4_t vecW;
+    uint32_t blkCnt;
+    uint32_t n1, n2;
+    uint32_t stage = 0;
+    int32_t iter = 1;
+    /*
+     * Byte offsets of the four butterfly legs for the last-stage 32-bit
+     * gathers/scatters, relative to the base address after its 64-byte
+     * post-increment.  NOTE(review): scaled by sizeof(q31_t *) (pointer
+     * size) -- confirm this matches the intended element offsets on any
+     * non-32-bit pointer target.
+     */
+    static const uint32_t strides[4] = {
+        (0 - 16) * sizeof(q31_t *), (1 - 16) * sizeof(q31_t *),
+        (8 - 16) * sizeof(q31_t *), (9 - 16) * sizeof(q31_t *)
+    };
+
+
+    /*
+     * Process first stages
+     * Each stage in middle stages provides two down scaling of the input
+     */
+    n2 = fftLen;
+    n1 = n2;
+    n2 >>= 2u;
+
+    for (int k = fftLen / 4u; k > 1; k >>= 2u)
+    {
+        /* 'iter' butterfly groups per stage; quadruples after each stage */
+        for (int i = 0; i < iter; i++)
+        {
+            q31_t const *p_rearranged_twiddle_tab_stride2 =
+                &S->rearranged_twiddle_stride2[
+                S->rearranged_twiddle_tab_stride2_arr[stage]];
+            q31_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
+                S->rearranged_twiddle_tab_stride3_arr[stage]];
+            q31_t const *p_rearranged_twiddle_tab_stride1 =
+                &S->rearranged_twiddle_stride1[
+                S->rearranged_twiddle_tab_stride1_arr[stage]];
+            q31_t const *pW1, *pW2, *pW3;
+            q31_t *inA = pSrc + CMPLX_DIM * i * n1;
+            q31_t *inB = inA + n2 * CMPLX_DIM;
+            q31_t *inC = inB + n2 * CMPLX_DIM;
+            q31_t *inD = inC + n2 * CMPLX_DIM;
+
+            pW1 = p_rearranged_twiddle_tab_stride1;
+            pW2 = p_rearranged_twiddle_tab_stride2;
+            pW3 = p_rearranged_twiddle_tab_stride3;
+
+            blkCnt = n2 / 2;
+            /*
+             * load 2 x q31 complex pair
+             */
+            vecA = vldrwq_s32(inA);
+            vecC = vldrwq_s32(inC);
+            while (blkCnt > 0U)
+            {
+                vecB = vldrwq_s32(inB);
+                vecD = vldrwq_s32(inD);
+
+                vecSum0 = vhaddq(vecA, vecC);
+                vecDiff0 = vhsubq(vecA, vecC);
+
+                vecSum1 = vhaddq(vecB, vecD);
+                vecDiff1 = vhsubq(vecB, vecD);
+                /*
+                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
+                 */
+                vecTmp0 = vhaddq(vecSum0, vecSum1);
+                vst1q(inA, vecTmp0);
+                inA += 4;
+                /*
+                 * [ 1 -1 1 -1 ] * [ A B C D ]'
+                 */
+                vecTmp0 = vhsubq(vecSum0, vecSum1);
+                /*
+                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
+                 */
+                vecW = vld1q(pW2);
+                pW2 += 4;
+                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);
+
+                vst1q(inB, vecTmp1);
+                inB += 4;
+                /*
+                 * [ 1 -i -1 +i ] * [ A B C D ]'
+                 */
+                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
+                /*
+                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
+                 */
+                vecW = vld1q(pW1);
+                pW1 += 4;
+                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);
+                vst1q(inC, vecTmp1);
+                inC += 4;
+                /*
+                 * [ 1 +i -1 -i ] * [ A B C D ]'
+                 */
+                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
+                /*
+                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
+                 */
+                vecW = vld1q(pW3);
+                pW3 += 4;
+                vecTmp1 = MVE_CMPLX_MULT_FX_AxB(vecW, vecTmp0);
+                vst1q(inD, vecTmp1);
+                inD += 4;
+
+                vecA = vldrwq_s32(inA);
+                vecC = vldrwq_s32(inC);
+
+                blkCnt--;
+            }
+        }
+        n1 = n2;
+        n2 >>= 2u;
+        iter = iter << 2;
+        stage++;
+    }
+
+    /*
+     * End of 1st stages process
+     * data is in 11.21(q21) format for the 1024 point as there are 3 middle stages
+     * data is in 9.23(q23) format for the 256 point as there are 2 middle stages
+     * data is in 7.25(q25) format for the 64 point as there are 1 middle stage
+     * data is in 5.27(q27) format for the 16 point as there are no middle stages
+     */
+
+    /*
+     * start of Last stage process
+     */
+    uint32x4_t vecScGathAddr = vld1q_u32(strides);
+    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
+
+    /*
+     * load scheduling
+     */
+    vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
+    vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);
+
+    /* each iteration handles 2 complex samples per leg (8 in total) */
+    blkCnt = (fftLen >> 3);
+    while (blkCnt > 0U)
+    {
+        vecSum0 = vhaddq(vecA, vecC);
+        vecDiff0 = vhsubq(vecA, vecC);
+
+        vecB = vldrwq_gather_base_s32(vecScGathAddr, 8);
+        vecD = vldrwq_gather_base_s32(vecScGathAddr, 24);
+
+        vecSum1 = vhaddq(vecB, vecD);
+        vecDiff1 = vhsubq(vecB, vecD);
+        /*
+         * pre-load for next iteration
+         */
+        vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
+        vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);
+
+        vecTmp0 = vhaddq(vecSum0, vecSum1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64, vecTmp0);
+
+        vecTmp0 = vhsubq(vecSum0, vecSum1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, vecTmp0);
+
+        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 16, vecTmp0);
+
+        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 24, vecTmp0);
+
+        blkCnt--;
+    }
+
+    /*
+     * output is in 11.21(q21) format for the 1024 point
+     * output is in 9.23(q23) format for the 256 point
+     * output is in 7.25(q25) format for the 64 point
+     * output is in 5.27(q27) format for the 16 point
+     */
+}
+
+
+/*
+ * Radix-4-by-2 decomposition for forward Q31 CFFT lengths of the form
+ * 2 * 4^n (Helium/MVE).  Stage 1 performs the radix-2 split, rotating
+ * the difference leg by the conjugated twiddle
+ * (MVE_CMPLX_MULT_FX_AxConjB); two half-length radix-4 forward
+ * butterflies follow, and a final pass rescales every sample by <<1.
+ *
+ * S      : CFFT instance (twiddle table in S->pTwiddle)
+ * pSrc   : in/out interleaved complex Q31 buffer
+ * fftLen : transform length (2 * a power of 4)
+ */
+static void arm_cfft_radix4by2_q31_mve(const arm_cfft_instance_q31 *S, q31_t *pSrc, uint32_t fftLen)
+{
+    uint32_t n2;
+    q31_t *pIn0;
+    q31_t *pIn1;
+    const q31_t *pCoef = S->pTwiddle;
+    uint32_t blkCnt;
+    q31x4_t vecIn0, vecIn1, vecSum, vecDiff;
+    q31x4_t vecCmplxTmp, vecTw;
+
+    n2 = fftLen >> 1;
+    pIn0 = pSrc;
+    pIn1 = pSrc + fftLen;
+
+    blkCnt = n2 / 2;
+
+    while (blkCnt > 0U)
+    {
+        vecIn0 = vld1q_s32(pIn0);
+        vecIn1 = vld1q_s32(pIn1);
+
+        /* pre-scale both halves by 1/2 for butterfly headroom */
+        vecIn0 = vecIn0 >> 1;
+        vecIn1 = vecIn1 >> 1;
+        vecSum = vhaddq(vecIn0, vecIn1);
+        vst1q(pIn0, vecSum);
+        pIn0 += 4;
+
+        vecTw = vld1q_s32(pCoef);
+        pCoef += 4;
+        vecDiff = vhsubq(vecIn0, vecIn1);
+
+        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxConjB(vecDiff, vecTw);
+        vst1q(pIn1, vecCmplxTmp);
+        pIn1 += 4;
+
+        blkCnt--;
+    }
+
+    /* two half-length radix-4 forward transforms: lower and upper halves */
+    _arm_radix4_butterfly_q31_mve(S, pSrc, n2);
+
+    _arm_radix4_butterfly_q31_mve(S, pSrc + fftLen, n2);
+
+    /* rescale the whole 2*fftLen sample buffer by one left shift */
+    pIn0 = pSrc;
+    blkCnt = (fftLen << 1) >> 2;
+    while (blkCnt > 0U)
+    {
+        vecIn0 = vld1q_s32(pIn0);
+        vecIn0 = vecIn0 << 1;
+        vst1q(pIn0, vecIn0);
+        pIn0 += 4;
+        blkCnt--;
+    }
+    /*
+     * tail
+     * (will be merged thru tail predication)
+     */
+    blkCnt = (fftLen << 1) & 3;
+    if (blkCnt > 0U)
+    {
+        mve_pred16_t p0 = vctp32q(blkCnt);
+
+        vecIn0 = vld1q_s32(pIn0);
+        vecIn0 = vecIn0 << 1;
+        vstrwq_p(pIn0, vecIn0, p0);
+    }
+
+}
+
+/*
+ * Inverse radix-4 decimation-in-frequency butterfly stages for the Q31
+ * CFFT (Helium/MVE).  Mirrors _arm_radix4_butterfly_q31_mve but
+ * multiplies by conjugated twiddles (MVE_CMPLX_MULT_FX_AxConjB) and
+ * swaps the +/-i legs, as required for the inverse transform.
+ *
+ * S      : CFFT instance (rearranged twiddle tables + per-stage offsets)
+ * pSrc   : in/out interleaved complex Q31 buffer
+ * fftLen : transform length; a power of 4 on this path
+ */
+static void _arm_radix4_butterfly_inverse_q31_mve(
+    const arm_cfft_instance_q31 *S,
+    q31_t *pSrc,
+    uint32_t fftLen)
+{
+    q31x4_t vecTmp0, vecTmp1;
+    q31x4_t vecSum0, vecDiff0, vecSum1, vecDiff1;
+    q31x4_t vecA, vecB, vecC, vecD;
+    q31x4_t vecW;
+    uint32_t blkCnt;
+    uint32_t n1, n2;
+    uint32_t stage = 0;
+    int32_t iter = 1;
+    /* last-stage gather/scatter leg offsets; see forward variant for the
+       NOTE on the sizeof(q31_t *) scaling */
+    static const uint32_t strides[4] = {
+        (0 - 16) * sizeof(q31_t *), (1 - 16) * sizeof(q31_t *),
+        (8 - 16) * sizeof(q31_t *), (9 - 16) * sizeof(q31_t *)
+    };
+
+    /*
+     * Process first stages
+     * Each stage in middle stages provides two down scaling of the input
+     */
+    n2 = fftLen;
+    n1 = n2;
+    n2 >>= 2u;
+
+    for (int k = fftLen / 4u; k > 1; k >>= 2u)
+    {
+        for (int i = 0; i < iter; i++)
+        {
+            q31_t const *p_rearranged_twiddle_tab_stride2 =
+                &S->rearranged_twiddle_stride2[
+                S->rearranged_twiddle_tab_stride2_arr[stage]];
+            q31_t const *p_rearranged_twiddle_tab_stride3 = &S->rearranged_twiddle_stride3[
+                S->rearranged_twiddle_tab_stride3_arr[stage]];
+            q31_t const *p_rearranged_twiddle_tab_stride1 =
+                &S->rearranged_twiddle_stride1[
+                S->rearranged_twiddle_tab_stride1_arr[stage]];
+
+            q31_t const *pW1, *pW2, *pW3;
+            q31_t *inA = pSrc + CMPLX_DIM * i * n1;
+            q31_t *inB = inA + n2 * CMPLX_DIM;
+            q31_t *inC = inB + n2 * CMPLX_DIM;
+            q31_t *inD = inC + n2 * CMPLX_DIM;
+
+            pW1 = p_rearranged_twiddle_tab_stride1;
+            pW2 = p_rearranged_twiddle_tab_stride2;
+            pW3 = p_rearranged_twiddle_tab_stride3;
+
+            blkCnt = n2 / 2;
+            /*
+             * load 2 x q31 complex pair
+             */
+            vecA = vldrwq_s32(inA);
+            vecC = vldrwq_s32(inC);
+            while (blkCnt > 0U)
+            {
+                vecB = vldrwq_s32(inB);
+                vecD = vldrwq_s32(inD);
+
+                vecSum0 = vhaddq(vecA, vecC);
+                vecDiff0 = vhsubq(vecA, vecC);
+
+                vecSum1 = vhaddq(vecB, vecD);
+                vecDiff1 = vhsubq(vecB, vecD);
+                /*
+                 * [ 1 1 1 1 ] * [ A B C D ]' .* 1
+                 */
+                vecTmp0 = vhaddq(vecSum0, vecSum1);
+                vst1q(inA, vecTmp0);
+                inA += 4;
+                /*
+                 * [ 1 -1 1 -1 ] * [ A B C D ]'
+                 */
+                vecTmp0 = vhsubq(vecSum0, vecSum1);
+                /*
+                 * [ 1 -1 1 -1 ] * [ A B C D ]'.* W2
+                 */
+                vecW = vld1q(pW2);
+                pW2 += 4;
+                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);
+
+                vst1q(inB, vecTmp1);
+                inB += 4;
+                /*
+                 * [ 1 -i -1 +i ] * [ A B C D ]'
+                 */
+                vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
+                /*
+                 * [ 1 -i -1 +i ] * [ A B C D ]'.* W1
+                 */
+                vecW = vld1q(pW1);
+                pW1 += 4;
+                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);
+                vst1q(inC, vecTmp1);
+                inC += 4;
+                /*
+                 * [ 1 +i -1 -i ] * [ A B C D ]'
+                 */
+                vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
+                /*
+                 * [ 1 +i -1 -i ] * [ A B C D ]'.* W3
+                 */
+                vecW = vld1q(pW3);
+                pW3 += 4;
+                vecTmp1 = MVE_CMPLX_MULT_FX_AxConjB(vecTmp0, vecW);
+                vst1q(inD, vecTmp1);
+                inD += 4;
+
+                vecA = vldrwq_s32(inA);
+                vecC = vldrwq_s32(inC);
+
+                blkCnt--;
+            }
+        }
+        n1 = n2;
+        n2 >>= 2u;
+        iter = iter << 2;
+        stage++;
+    }
+
+    /*
+     * End of 1st stages process
+     * data is in 11.21(q21) format for the 1024 point as there are 3 middle stages
+     * data is in 9.23(q23) format for the 256 point as there are 2 middle stages
+     * data is in 7.25(q25) format for the 64 point as there are 1 middle stage
+     * data is in 5.27(q27) format for the 16 point as there are no middle stages
+     */
+
+    /*
+     * start of Last stage process
+     */
+    uint32x4_t vecScGathAddr = vld1q_u32(strides);
+    vecScGathAddr = vecScGathAddr + (uint32_t) pSrc;
+
+    /*
+     * load scheduling
+     */
+    vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
+    vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);
+
+    /* each iteration handles 2 complex samples per leg (8 in total) */
+    blkCnt = (fftLen >> 3);
+    while (blkCnt > 0U)
+    {
+        vecSum0 = vhaddq(vecA, vecC);
+        vecDiff0 = vhsubq(vecA, vecC);
+
+        vecB = vldrwq_gather_base_s32(vecScGathAddr, 8);
+        vecD = vldrwq_gather_base_s32(vecScGathAddr, 24);
+
+        vecSum1 = vhaddq(vecB, vecD);
+        vecDiff1 = vhsubq(vecB, vecD);
+        /*
+         * pre-load for next iteration
+         */
+        vecA = vldrwq_gather_base_wb_s32(&vecScGathAddr, 64);
+        vecC = vldrwq_gather_base_s32(vecScGathAddr, 16);
+
+        vecTmp0 = vhaddq(vecSum0, vecSum1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64, vecTmp0);
+
+        vecTmp0 = vhsubq(vecSum0, vecSum1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 8, vecTmp0);
+
+        vecTmp0 = MVE_CMPLX_ADD_FX_A_ixB(vecDiff0, vecDiff1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 16, vecTmp0);
+
+        vecTmp0 = MVE_CMPLX_SUB_FX_A_ixB(vecDiff0, vecDiff1);
+        vstrwq_scatter_base_s32(vecScGathAddr, -64 + 24, vecTmp0);
+
+        blkCnt--;
+    }
+    /*
+     * output is in 11.21(q21) format for the 1024 point
+     * output is in 9.23(q23) format for the 256 point
+     * output is in 7.25(q25) format for the 64 point
+     * output is in 5.27(q27) format for the 16 point
+     */
+}
+
/*
 * Radix-4-by-2 decomposition of the inverse Q31 CFFT (Helium/MVE path).
 * Runs one vectorized radix-2 split stage (with a twiddle multiply on the
 * second half), then two half-length radix-4 inverse transforms, and
 * finally rescales the whole buffer by a left shift of one.
 * The caller (arm_cfft_q31) applies bit reversal afterwards if requested.
 */
static void arm_cfft_radix4by2_inverse_q31_mve(const arm_cfft_instance_q31 *S, q31_t *pSrc, uint32_t fftLen)
{
    uint32_t n2;                        /* half transform length (complex samples) */
    q31_t *pIn0;                        /* walks the first half of the buffer */
    q31_t *pIn1;                        /* walks the second half of the buffer */
    const q31_t *pCoef = S->pTwiddle;   /* interleaved {cos, sin} twiddle table */

    //uint16_t twidCoefModifier = arm_cfft_radix2_twiddle_factor(S->fftLen);
    //q31_t twidIncr = (2 * twidCoefModifier * sizeof(q31_t));
    uint32_t blkCnt;
    //uint64x2_t vecOffs;
    q31x4_t vecIn0, vecIn1, vecSum, vecDiff;
    q31x4_t vecCmplxTmp, vecTw;

    n2 = fftLen >> 1;

    pIn0 = pSrc;
    /* Second half starts fftLen q31 words in (n2 complex samples, re/im interleaved). */
    pIn1 = pSrc + fftLen;
    //vecOffs[0] = 0;
    //vecOffs[1] = (uint64_t) twidIncr;
    /* Each iteration consumes 2 complex samples (4 q31 words) from each half. */
    blkCnt = n2 / 2;

    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn1 = vld1q_s32(pIn1);

        /* Pre-scale both halves by 1/2; together with the halving add/sub
           below this mirrors the >>2 guard-bit scaling of the scalar version. */
        vecIn0 = vecIn0 >> 1;
        vecIn1 = vecIn1 >> 1;
        vecSum = vhaddq(vecIn0, vecIn1);    /* (a + b) / 2 */
        vst1q(pIn0, vecSum);
        pIn0 += 4;

        //vecTw = (q31x4_t) vldrdq_gather_offset_s64(pCoef, vecOffs);
        /* Load 2 complex twiddle factors {cos0, sin0, cos1, sin1}. */
        vecTw = vld1q_s32(pCoef);
        pCoef += 4;
        vecDiff = vhsubq(vecIn0, vecIn1);   /* (a - b) / 2 */

        /* Complex multiply (A x B): inverse transform uses the non-conjugated twiddle. */
        vecCmplxTmp = MVE_CMPLX_MULT_FX_AxB(vecDiff, vecTw);
        vst1q(pIn1, vecCmplxTmp);
        pIn1 += 4;

        //vecOffs = vaddq((q31x4_t) vecOffs, 2 * twidIncr);
        blkCnt--;
    }

    /* Half-length radix-4 inverse transform on the first half ... */
    _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, n2);

    /* ... and on the second half (starts fftLen q31 words in). */
    _arm_radix4_butterfly_inverse_q31_mve(S, pSrc + fftLen, n2);

    /* Rescale the whole 2*fftLen-word buffer: left-shift every sample by one
       (4 q31 words per vector iteration). */
    pIn0 = pSrc;
    blkCnt = (fftLen << 1) >> 2;
    while (blkCnt > 0U)
    {
        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vst1q(pIn0, vecIn0);
        pIn0 += 4;
        blkCnt--;
    }
    /*
     * tail
     * (will be merged thru tail predication)
     */
    blkCnt = (fftLen << 1) & 3;
    if (blkCnt > 0U)
    {
        mve_pred16_t p0 = vctp32q(blkCnt);

        vecIn0 = vld1q_s32(pIn0);
        vecIn0 = vecIn0 << 1;
        vstrwq_p(pIn0, vecIn0, p0);
    }

}
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Processing function for the Q31 complex FFT.
+ @param[in] S points to an instance of the fixed-point CFFT structure
+ @param[in,out] p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return none
+ */
+void arm_cfft_q31(
+ const arm_cfft_instance_q31 * S,
+ q31_t * pSrc,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ uint32_t fftLen = S->fftLen;
+
+ if (ifftFlag == 1U) {
+
+ switch (fftLen) {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ _arm_radix4_butterfly_inverse_q31_mve(S, pSrc, fftLen);
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_inverse_q31_mve(S, pSrc, fftLen);
+ break;
+ }
+ } else {
+ switch (fftLen) {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ _arm_radix4_butterfly_q31_mve(S, pSrc, fftLen);
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_q31_mve(S, pSrc, fftLen);
+ break;
+ }
+ }
+
+
+ if (bitReverseFlag)
+ {
+
+ arm_bitreversal_32_inpl_mve((uint32_t*)pSrc, S->bitRevLength, S->pBitRevTable);
+
+ }
+}
+#else
+
+extern void arm_radix4_butterfly_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint32_t twidCoefModifier);
+
+extern void arm_radix4_butterfly_inverse_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint32_t twidCoefModifier);
+
+extern void arm_bitreversal_32(
+ uint32_t * pSrc,
+ const uint16_t bitRevLen,
+ const uint16_t * pBitRevTable);
+
+void arm_cfft_radix4by2_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef);
+
+void arm_cfft_radix4by2_inverse_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef);
+
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Processing function for the Q31 complex FFT.
+ @param[in] S points to an instance of the fixed-point CFFT structure
+ @param[in,out] p1 points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return none
+ */
+void arm_cfft_q31(
+ const arm_cfft_instance_q31 * S,
+ q31_t * p1,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ uint32_t L = S->fftLen;
+
+ if (ifftFlag == 1U)
+ {
+ switch (L)
+ {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ arm_radix4_butterfly_inverse_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_inverse_q31 ( p1, L, S->pTwiddle );
+ break;
+ }
+ }
+ else
+ {
+ switch (L)
+ {
+ case 16:
+ case 64:
+ case 256:
+ case 1024:
+ case 4096:
+ arm_radix4_butterfly_q31 ( p1, L, (q31_t*)S->pTwiddle, 1 );
+ break;
+
+ case 32:
+ case 128:
+ case 512:
+ case 2048:
+ arm_cfft_radix4by2_q31 ( p1, L, S->pTwiddle );
+ break;
+ }
+ }
+
+ if ( bitReverseFlag )
+ arm_bitreversal_32 ((uint32_t*) p1, S->bitRevLength, S->pBitRevTable);
+}
+
+/**
+ @} end of ComplexFFT group
+ */
+
/*
 * Radix-4-by-2 forward CFFT for Q31 (generic C path).
 * One radix-2 DIF split stage with twiddle multiply, two half-length
 * radix-4 transforms, then a final left-shift-by-one rescale.
 */
void arm_cfft_radix4by2_q31(
        q31_t * pSrc,
        uint32_t fftLen,
  const q31_t * pCoef)
{
    uint32_t i, l;
    uint32_t n2;
    q31_t xt, yt, cosVal, sinVal;
    q31_t p0, p1;

    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        /* Twiddle table is interleaved {cos, sin}. */
        cosVal = pCoef[2 * i];
        sinVal = pCoef[2 * i + 1];

        /* Butterfly partner: sample i in the first half, l in the second half. */
        l = i + n2;

        /* Inputs are pre-scaled by >>2 (two guard bits) to prevent overflow
           in the add/sub and the subsequent radix-4 stages. */
        xt = (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);
        pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);

        yt = (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
        pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);

        /* (xt + j*yt) * (cosVal - j*sinVal): difference term times the
           conjugated twiddle (forward transform). */
        mult_32x32_keep32_R(p0, xt, cosVal);
        mult_32x32_keep32_R(p1, yt, cosVal);
        multAcc_32x32_keep32_R(p0, yt, sinVal);
        multSub_32x32_keep32_R(p1, xt, sinVal);

        /* <<1 compensates the extra headroom of the keep32 multiply. */
        pSrc[2 * l] = p0 << 1;
        pSrc[2 * l + 1] = p1 << 1;
    }


    /* first col: radix-4 transform of the first half */
    arm_radix4_butterfly_q31 (pSrc, n2, (q31_t*)pCoef, 2U);

    /* second col: radix-4 transform of the second half (starts fftLen q31 words in) */
    arm_radix4_butterfly_q31 (pSrc + fftLen, n2, (q31_t*)pCoef, 2U);

    /* Rescale all 2*fftLen output words: left-shift each sample by one
       (processed as 4 q31 words, i.e. 2 complex samples, per iteration). */
    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        p0 = pSrc[4 * i + 0];
        p1 = pSrc[4 * i + 1];
        xt = pSrc[4 * i + 2];
        yt = pSrc[4 * i + 3];

        p0 <<= 1U;
        p1 <<= 1U;
        xt <<= 1U;
        yt <<= 1U;

        pSrc[4 * i + 0] = p0;
        pSrc[4 * i + 1] = p1;
        pSrc[4 * i + 2] = xt;
        pSrc[4 * i + 3] = yt;
    }

}
+
/*
 * Radix-4-by-2 inverse CFFT for Q31 (generic C path).
 * Mirrors arm_cfft_radix4by2_q31 but multiplies by the non-conjugated
 * twiddle and calls the inverse radix-4 kernel.
 */
void arm_cfft_radix4by2_inverse_q31(
        q31_t * pSrc,
        uint32_t fftLen,
  const q31_t * pCoef)
{
    uint32_t i, l;
    uint32_t n2;
    q31_t xt, yt, cosVal, sinVal;
    q31_t p0, p1;

    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        /* Twiddle table is interleaved {cos, sin}. */
        cosVal = pCoef[2 * i];
        sinVal = pCoef[2 * i + 1];

        /* Butterfly partner in the second half of the buffer. */
        l = i + n2;

        /* >>2 pre-scaling provides two guard bits against overflow. */
        xt = (pSrc[2 * i] >> 2U) - (pSrc[2 * l] >> 2U);
        pSrc[2 * i] = (pSrc[2 * i] >> 2U) + (pSrc[2 * l] >> 2U);

        yt = (pSrc[2 * i + 1] >> 2U) - (pSrc[2 * l + 1] >> 2U);
        pSrc[2 * i + 1] = (pSrc[2 * l + 1] >> 2U) + (pSrc[2 * i + 1] >> 2U);

        /* (xt + j*yt) * (cosVal + j*sinVal): non-conjugated twiddle for the
           inverse transform. */
        mult_32x32_keep32_R(p0, xt, cosVal);
        mult_32x32_keep32_R(p1, yt, cosVal);
        multSub_32x32_keep32_R(p0, yt, sinVal);
        multAcc_32x32_keep32_R(p1, xt, sinVal);

        /* <<1 compensates the extra headroom of the keep32 multiply. */
        pSrc[2 * l] = p0 << 1U;
        pSrc[2 * l + 1] = p1 << 1U;
    }

    /* first col: inverse radix-4 transform of the first half */
    arm_radix4_butterfly_inverse_q31( pSrc, n2, (q31_t*)pCoef, 2U);

    /* second col: inverse radix-4 transform of the second half */
    arm_radix4_butterfly_inverse_q31( pSrc + fftLen, n2, (q31_t*)pCoef, 2U);

    /* Rescale all output words by <<1 (2 complex samples per iteration). */
    n2 = fftLen >> 1U;
    for (i = 0; i < n2; i++)
    {
        p0 = pSrc[4 * i + 0];
        p1 = pSrc[4 * i + 1];
        xt = pSrc[4 * i + 2];
        yt = pSrc[4 * i + 3];

        p0 <<= 1U;
        p1 <<= 1U;
        xt <<= 1U;
        yt <<= 1U;

        pSrc[4 * i + 0] = p0;
        pSrc[4 * i + 1] = p1;
        pSrc[4 * i + 2] = xt;
        pSrc[4 * i + 3] = yt;
    }
}
+#endif /* defined(ARM_MATH_MVEI) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f16.c
new file mode 100644
index 0000000000..c4e76809b0
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f16.c
@@ -0,0 +1,475 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix2_f16.c
+ * Description: Radix-2 Decimation in Frequency CFFT & CIFFT Floating point processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+void arm_radix2_butterfly_f16(
+ float16_t * pSrc,
+ uint32_t fftLen,
+ const float16_t * pCoef,
+ uint16_t twidCoefModifier);
+
+void arm_radix2_butterfly_inverse_f16(
+ float16_t * pSrc,
+ uint32_t fftLen,
+ const float16_t * pCoef,
+ uint16_t twidCoefModifier,
+ float16_t onebyfftLen);
+
+extern void arm_bitreversal_f16(
+ float16_t * pSrc,
+ uint16_t fftSize,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab);
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Radix-2 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f16 and will be removed in the future
+ @param[in] S points to an instance of the floating-point Radix-2 CFFT/CIFFT structure
+ @param[in,out] pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @return none
+ */
+
+void arm_cfft_radix2_f16(
+const arm_cfft_radix2_instance_f16 * S,
+ float16_t * pSrc)
+{
+
+ if (S->ifftFlag == 1U)
+ {
+ /* Complex IFFT radix-2 */
+ arm_radix2_butterfly_inverse_f16(pSrc, S->fftLen, S->pTwiddle,
+ S->twidCoefModifier, S->onebyfftLen);
+ }
+ else
+ {
+ /* Complex FFT radix-2 */
+ arm_radix2_butterfly_f16(pSrc, S->fftLen, S->pTwiddle,
+ S->twidCoefModifier);
+ }
+
+ if (S->bitReverseFlag == 1U)
+ {
+ /* Bit Reversal */
+ arm_bitreversal_f16(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+ }
+
+}
+
+
+/**
+ @} end of ComplexFFT group
+ */
+
+
+
+/* ----------------------------------------------------------------------
+** Internal helper function used by the FFTs
+** ------------------------------------------------------------------- */
+
/*
* @brief  Core function for the floating-point CFFT butterfly process.
* @param[in, out] *pSrc             points to the in-place buffer of floating-point data type.
* @param[in]      fftLen            length of the FFT.
* @param[in]      *pCoef            points to the twiddle coefficient buffer.
* @param[in]      twidCoefModifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
* @return none.
*
* Decimation-in-frequency radix-2 stages; the result is left in
* bit-reversed order and is reordered by a separate bit-reversal pass
* in the caller.
*/

void arm_radix2_butterfly_f16(
float16_t * pSrc,
uint32_t fftLen,
const float16_t * pCoef,
uint16_t twidCoefModifier)
{

    uint32_t i, j, k, l;
    uint32_t n1, n2, ia;
    float16_t xt, yt, cosVal, sinVal;
    float16_t p0, p1, p2, p3;
    float16_t a0, a1;

#if defined (ARM_MATH_DSP)

    /* Initializations for the first stage */
    n2 = fftLen >> 1;
    ia = 0;
    i = 0;

    /* First stage peeled out of the stage loop: every butterfly uses a
       distinct twiddle factor, so no inner group loop is needed. */
    // loop for groups
    for (k = n2; k > 0; k--)
    {
        cosVal = pCoef[ia * 2];
        sinVal = pCoef[(ia * 2) + 1];

        /* Twiddle coefficients index modifier */
        ia += twidCoefModifier;

        /* index calculation for the input as, */
        /* pSrc[i + 0], pSrc[i + fftLen/2] */
        l = i + n2;

        /* Butterfly implementation */
        a0 = pSrc[2 * i] + pSrc[2 * l];
        xt = pSrc[2 * i] - pSrc[2 * l];

        yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
        a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];

        /* (xt + j*yt) * (cosVal - j*sinVal): difference term times the
           conjugated twiddle factor. */
        p0 = xt * cosVal;
        p1 = yt * sinVal;
        p2 = yt * cosVal;
        p3 = xt * sinVal;

        pSrc[2 * i] = a0;
        pSrc[2 * i + 1] = a1;

        pSrc[2 * l] = p0 + p1;
        pSrc[2 * l + 1] = p2 - p3;

        i++;
    } // groups loop end

    twidCoefModifier <<= 1U;

    /* Middle stages: all butterflies in a group share one twiddle factor. */
    // loop for stage
    for (k = n2; k > 2; k = k >> 1)
    {
        n1 = n2;
        n2 = n2 >> 1;
        ia = 0;

        // loop for groups
        j = 0;
        do
        {
            cosVal = pCoef[ia * 2];
            sinVal = pCoef[(ia * 2) + 1];
            ia += twidCoefModifier;

            // loop for butterfly
            i = j;
            do
            {
                l = i + n2;
                a0 = pSrc[2 * i] + pSrc[2 * l];
                xt = pSrc[2 * i] - pSrc[2 * l];

                yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
                a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];

                p0 = xt * cosVal;
                p1 = yt * sinVal;
                p2 = yt * cosVal;
                p3 = xt * sinVal;

                pSrc[2 * i] = a0;
                pSrc[2 * i + 1] = a1;

                pSrc[2 * l] = p0 + p1;
                pSrc[2 * l + 1] = p2 - p3;

                i += n1;
            } while ( i < fftLen ); // butterfly loop end
            j++;
        } while ( j < n2); // groups loop end
        twidCoefModifier <<= 1U;
    } // stages loop end

    /* Last stage (n2 == 1): the twiddle factor is 1, so the butterfly
       reduces to a plain sum/difference of adjacent complex samples. */
    // loop for butterfly
    for (i = 0; i < fftLen; i += 2)
    {
        a0 = pSrc[2 * i] + pSrc[2 * i + 2];
        xt = pSrc[2 * i] - pSrc[2 * i + 2];

        yt = pSrc[2 * i + 1] - pSrc[2 * i + 3];
        a1 = pSrc[2 * i + 3] + pSrc[2 * i + 1];

        pSrc[2 * i] = a0;
        pSrc[2 * i + 1] = a1;
        pSrc[2 * i + 2] = xt;
        pSrc[2 * i + 3] = yt;
    } // butterfly loop end

#else

    /* Generic path: every stage, including the first and last, goes through
       the same stage/group/butterfly loop nest. */
    n2 = fftLen;

    // loop for stage
    for (k = fftLen; k > 1; k = k >> 1)
    {
        n1 = n2;
        n2 = n2 >> 1;
        ia = 0;

        // loop for groups
        j = 0;
        do
        {
            cosVal = pCoef[ia * 2];
            sinVal = pCoef[(ia * 2) + 1];
            ia += twidCoefModifier;

            // loop for butterfly
            i = j;
            do
            {
                l = i + n2;
                a0 = pSrc[2 * i] + pSrc[2 * l];
                xt = pSrc[2 * i] - pSrc[2 * l];

                yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
                a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];

                p0 = xt * cosVal;
                p1 = yt * sinVal;
                p2 = yt * cosVal;
                p3 = xt * sinVal;

                pSrc[2 * i] = a0;
                pSrc[2 * i + 1] = a1;

                pSrc[2 * l] = p0 + p1;
                pSrc[2 * l + 1] = p2 - p3;

                i += n1;
            } while (i < fftLen);
            j++;
        } while (j < n2);
        twidCoefModifier <<= 1U;
    }

#endif // #if defined (ARM_MATH_DSP)

}
+
+
/*
* @brief  Core function for the floating-point CIFFT butterfly process.
* @param[in, out] *pSrc             points to the in-place buffer of floating-point data type.
* @param[in]      fftLen            length of the inverse FFT.
* @param[in]      *pCoef            points to the twiddle coefficient buffer.
* @param[in]      twidCoefModifier  twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
* @param[in]      onebyfftLen       1/fftLen, applied in the last stage to normalize the inverse transform.
* @return none.
*
* Same structure as arm_radix2_butterfly_f16 but uses the non-conjugated
* twiddle factor and scales the final stage by 1/fftLen.
*/
void arm_radix2_butterfly_inverse_f16(
float16_t * pSrc,
uint32_t fftLen,
const float16_t * pCoef,
uint16_t twidCoefModifier,
float16_t onebyfftLen)
{

    uint32_t i, j, k, l;
    uint32_t n1, n2, ia;
    float16_t xt, yt, cosVal, sinVal;
    float16_t p0, p1, p2, p3;
    float16_t a0, a1;

#if defined (ARM_MATH_DSP)

    /* First stage peeled out of the stage loop: one butterfly per twiddle. */
    n2 = fftLen >> 1;
    ia = 0;

    // loop for groups
    for (i = 0; i < n2; i++)
    {
        cosVal = pCoef[ia * 2];
        sinVal = pCoef[(ia * 2) + 1];
        ia += twidCoefModifier;

        l = i + n2;
        a0 = pSrc[2 * i] + pSrc[2 * l];
        xt = pSrc[2 * i] - pSrc[2 * l];

        yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
        a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];

        /* (xt + j*yt) * (cosVal + j*sinVal): non-conjugated twiddle for
           the inverse transform. */
        p0 = xt * cosVal;
        p1 = yt * sinVal;
        p2 = yt * cosVal;
        p3 = xt * sinVal;

        pSrc[2 * i] = a0;
        pSrc[2 * i + 1] = a1;

        pSrc[2 * l] = p0 - p1;
        pSrc[2 * l + 1] = p2 + p3;
    } // groups loop end

    twidCoefModifier <<= 1U;

    /* Middle stages: butterflies within a group share one twiddle factor. */
    // loop for stage
    for (k = fftLen / 2; k > 2; k = k >> 1)
    {
        n1 = n2;
        n2 = n2 >> 1;
        ia = 0;

        // loop for groups
        j = 0;
        do
        {
            cosVal = pCoef[ia * 2];
            sinVal = pCoef[(ia * 2) + 1];
            ia += twidCoefModifier;

            // loop for butterfly
            i = j;
            do
            {
                l = i + n2;
                a0 = pSrc[2 * i] + pSrc[2 * l];
                xt = pSrc[2 * i] - pSrc[2 * l];

                yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
                a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];

                p0 = xt * cosVal;
                p1 = yt * sinVal;
                p2 = yt * cosVal;
                p3 = xt * sinVal;

                pSrc[2 * i] = a0;
                pSrc[2 * i + 1] = a1;

                pSrc[2 * l] = p0 - p1;
                pSrc[2 * l + 1] = p2 + p3;

                i += n1;
            } while ( i < fftLen ); // butterfly loop end
            j++;
        } while (j < n2); // groups loop end

        twidCoefModifier <<= 1U;
    } // stages loop end

    /* Last stage (twiddle factor is 1): sum/difference of adjacent complex
       samples, scaled by 1/fftLen to normalize the inverse transform. */
    // loop for butterfly
    for (i = 0; i < fftLen; i += 2)
    {
        a0 = pSrc[2 * i] + pSrc[2 * i + 2];
        xt = pSrc[2 * i] - pSrc[2 * i + 2];

        a1 = pSrc[2 * i + 3] + pSrc[2 * i + 1];
        yt = pSrc[2 * i + 1] - pSrc[2 * i + 3];

        p0 = a0 * onebyfftLen;
        p2 = xt * onebyfftLen;
        p1 = a1 * onebyfftLen;
        p3 = yt * onebyfftLen;

        pSrc[2 * i] = p0;
        pSrc[2 * i + 1] = p1;
        pSrc[2 * i + 2] = p2;
        pSrc[2 * i + 3] = p3;
    } // butterfly loop end

#else

    /* Generic path: shared loop nest for all stages except the last. */
    n2 = fftLen;

    // loop for stage
    for (k = fftLen; k > 2; k = k >> 1)
    {
        n1 = n2;
        n2 = n2 >> 1;
        ia = 0;

        // loop for groups
        j = 0;
        do
        {
            cosVal = pCoef[ia * 2];
            sinVal = pCoef[(ia * 2) + 1];
            ia = ia + twidCoefModifier;

            // loop for butterfly
            i = j;
            do
            {
                l = i + n2;
                a0 = pSrc[2 * i] + pSrc[2 * l];
                xt = pSrc[2 * i] - pSrc[2 * l];

                yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
                a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];

                p0 = xt * cosVal;
                p1 = yt * sinVal;
                p2 = yt * cosVal;
                p3 = xt * sinVal;

                pSrc[2 * i] = a0;
                pSrc[2 * i + 1] = a1;

                pSrc[2 * l] = p0 - p1;
                pSrc[2 * l + 1] = p2 + p3;

                i += n1;
            } while ( i < fftLen ); // butterfly loop end
            j++;
        } while ( j < n2 ); // groups loop end

        twidCoefModifier = twidCoefModifier << 1U;
    } // stages loop end

    /* Last stage handled separately so the 1/fftLen normalization can be
       folded into the final stores. */
    n1 = n2;
    n2 = n2 >> 1;

    // loop for butterfly
    for (i = 0; i < fftLen; i += n1)
    {
        l = i + n2;

        a0 = pSrc[2 * i] + pSrc[2 * l];
        xt = pSrc[2 * i] - pSrc[2 * l];

        a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
        yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];

        p0 = a0 * onebyfftLen;
        p2 = xt * onebyfftLen;
        p1 = a1 * onebyfftLen;
        p3 = yt * onebyfftLen;

        pSrc[2 * i] = p0;
        pSrc[2U * l] = p2;

        pSrc[2 * i + 1] = p1;
        pSrc[2U * l + 1U] = p3;
    } // butterfly loop end

#endif // #if defined (ARM_MATH_DSP)

}
+
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c
new file mode 100644
index 0000000000..ab218a50bd
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_f32.c
@@ -0,0 +1,470 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix2_f32.c
+ * Description: Radix-2 Decimation in Frequency CFFT & CIFFT Floating point processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+void arm_radix2_butterfly_f32(
+ float32_t * pSrc,
+ uint32_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier);
+
+void arm_radix2_butterfly_inverse_f32(
+ float32_t * pSrc,
+ uint32_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier,
+ float32_t onebyfftLen);
+
+extern void arm_bitreversal_f32(
+ float32_t * pSrc,
+ uint16_t fftSize,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab);
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Radix-2 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future
+ @param[in] S points to an instance of the floating-point Radix-2 CFFT/CIFFT structure
+ @param[in,out] pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @return none
+ */
+
+void arm_cfft_radix2_f32(
+const arm_cfft_radix2_instance_f32 * S,
+ float32_t * pSrc)
+{
+
+ if (S->ifftFlag == 1U)
+ {
+ /* Complex IFFT radix-2 */
+ arm_radix2_butterfly_inverse_f32(pSrc, S->fftLen, S->pTwiddle,
+ S->twidCoefModifier, S->onebyfftLen);
+ }
+ else
+ {
+ /* Complex FFT radix-2 */
+ arm_radix2_butterfly_f32(pSrc, S->fftLen, S->pTwiddle,
+ S->twidCoefModifier);
+ }
+
+ if (S->bitReverseFlag == 1U)
+ {
+ /* Bit Reversal */
+ arm_bitreversal_f32(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+ }
+
+}
+
+
+/**
+ @} end of ComplexFFT group
+ */
+
+
+
+/* ----------------------------------------------------------------------
+ ** Internal helper function used by the FFTs
+ ** ------------------------------------------------------------------- */
+
/**
  brief  Core function for the floating-point CFFT butterfly process.
  param[in,out] pSrc             points to in-place buffer of floating-point data type
  param[in]     fftLen           length of the FFT
  param[in]     pCoef            points to twiddle coefficient buffer
  param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
  return        none

  Decimation-in-frequency radix-2 stages; the result is left in
  bit-reversed order and is reordered by a separate bit-reversal pass
  in the caller.
 */

void arm_radix2_butterfly_f32(
        float32_t * pSrc,
        uint32_t fftLen,
  const float32_t * pCoef,
        uint16_t twidCoefModifier)
{

    uint32_t i, j, k, l;
    uint32_t n1, n2, ia;
    float32_t xt, yt, cosVal, sinVal;
    float32_t p0, p1, p2, p3;
    float32_t a0, a1;

#if defined (ARM_MATH_DSP)

    /* Initializations for the first stage */
    n2 = fftLen >> 1;
    ia = 0;
    i = 0;

    /* First stage peeled out of the stage loop: every butterfly uses a
       distinct twiddle factor, so no inner group loop is needed. */
    // loop for groups
    for (k = n2; k > 0; k--)
    {
        cosVal = pCoef[ia * 2];
        sinVal = pCoef[(ia * 2) + 1];

        /* Twiddle coefficients index modifier */
        ia += twidCoefModifier;

        /* index calculation for the input as, */
        /* pSrc[i + 0], pSrc[i + fftLen/2] */
        l = i + n2;

        /* Butterfly implementation */
        a0 = pSrc[2 * i] + pSrc[2 * l];
        xt = pSrc[2 * i] - pSrc[2 * l];

        yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
        a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];

        /* (xt + j*yt) * (cosVal - j*sinVal): difference term times the
           conjugated twiddle factor. */
        p0 = xt * cosVal;
        p1 = yt * sinVal;
        p2 = yt * cosVal;
        p3 = xt * sinVal;

        pSrc[2 * i] = a0;
        pSrc[2 * i + 1] = a1;

        pSrc[2 * l] = p0 + p1;
        pSrc[2 * l + 1] = p2 - p3;

        i++;
    } // groups loop end

    twidCoefModifier <<= 1U;

    /* Middle stages: all butterflies in a group share one twiddle factor. */
    // loop for stage
    for (k = n2; k > 2; k = k >> 1)
    {
        n1 = n2;
        n2 = n2 >> 1;
        ia = 0;

        // loop for groups
        j = 0;
        do
        {
            cosVal = pCoef[ia * 2];
            sinVal = pCoef[(ia * 2) + 1];
            ia += twidCoefModifier;

            // loop for butterfly
            i = j;
            do
            {
                l = i + n2;
                a0 = pSrc[2 * i] + pSrc[2 * l];
                xt = pSrc[2 * i] - pSrc[2 * l];

                yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
                a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];

                p0 = xt * cosVal;
                p1 = yt * sinVal;
                p2 = yt * cosVal;
                p3 = xt * sinVal;

                pSrc[2 * i] = a0;
                pSrc[2 * i + 1] = a1;

                pSrc[2 * l] = p0 + p1;
                pSrc[2 * l + 1] = p2 - p3;

                i += n1;
            } while ( i < fftLen ); // butterfly loop end
            j++;
        } while ( j < n2); // groups loop end
        twidCoefModifier <<= 1U;
    } // stages loop end

    /* Last stage (n2 == 1): the twiddle factor is 1, so the butterfly
       reduces to a plain sum/difference of adjacent complex samples. */
    // loop for butterfly
    for (i = 0; i < fftLen; i += 2)
    {
        a0 = pSrc[2 * i] + pSrc[2 * i + 2];
        xt = pSrc[2 * i] - pSrc[2 * i + 2];

        yt = pSrc[2 * i + 1] - pSrc[2 * i + 3];
        a1 = pSrc[2 * i + 3] + pSrc[2 * i + 1];

        pSrc[2 * i] = a0;
        pSrc[2 * i + 1] = a1;
        pSrc[2 * i + 2] = xt;
        pSrc[2 * i + 3] = yt;
    } // butterfly loop end

#else /* #if defined (ARM_MATH_DSP) */

    /* Generic path: every stage, including the first and last, goes through
       the same stage/group/butterfly loop nest. */
    n2 = fftLen;

    // loop for stage
    for (k = fftLen; k > 1; k = k >> 1)
    {
        n1 = n2;
        n2 = n2 >> 1;
        ia = 0;

        // loop for groups
        j = 0;
        do
        {
            cosVal = pCoef[ia * 2];
            sinVal = pCoef[(ia * 2) + 1];
            ia += twidCoefModifier;

            // loop for butterfly
            i = j;
            do
            {
                l = i + n2;
                a0 = pSrc[2 * i] + pSrc[2 * l];
                xt = pSrc[2 * i] - pSrc[2 * l];

                yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
                a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];

                p0 = xt * cosVal;
                p1 = yt * sinVal;
                p2 = yt * cosVal;
                p3 = xt * sinVal;

                pSrc[2 * i] = a0;
                pSrc[2 * i + 1] = a1;

                pSrc[2 * l] = p0 + p1;
                pSrc[2 * l + 1] = p2 - p3;

                i += n1;
            } while (i < fftLen);
            j++;
        } while (j < n2);
        twidCoefModifier <<= 1U;
    }

#endif /* #if defined (ARM_MATH_DSP) */

}
+
+
/**
  brief  Core function for the floating-point CIFFT butterfly process.
  param[in,out] pSrc             points to in-place buffer of floating-point data type
  param[in]     fftLen           length of the inverse FFT
  param[in]     pCoef            points to twiddle coefficient buffer
  param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
  param[in]     onebyfftLen      1/fftLen, applied in the last stage to normalize the inverse transform
  return        none

  Same structure as arm_radix2_butterfly_f32 but uses the non-conjugated
  twiddle factor and scales the final stage by 1/fftLen.
 */
void arm_radix2_butterfly_inverse_f32(
        float32_t * pSrc,
        uint32_t fftLen,
  const float32_t * pCoef,
        uint16_t twidCoefModifier,
        float32_t onebyfftLen)
{

    uint32_t i, j, k, l;
    uint32_t n1, n2, ia;
    float32_t xt, yt, cosVal, sinVal;
    float32_t p0, p1, p2, p3;
    float32_t a0, a1;

#if defined (ARM_MATH_DSP)

    /* First stage peeled out of the stage loop: one butterfly per twiddle. */
    n2 = fftLen >> 1;
    ia = 0;

    // loop for groups
    for (i = 0; i < n2; i++)
    {
        cosVal = pCoef[ia * 2];
        sinVal = pCoef[(ia * 2) + 1];
        ia += twidCoefModifier;

        l = i + n2;
        a0 = pSrc[2 * i] + pSrc[2 * l];
        xt = pSrc[2 * i] - pSrc[2 * l];

        yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
        a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];

        /* (xt + j*yt) * (cosVal + j*sinVal): non-conjugated twiddle for
           the inverse transform. */
        p0 = xt * cosVal;
        p1 = yt * sinVal;
        p2 = yt * cosVal;
        p3 = xt * sinVal;

        pSrc[2 * i] = a0;
        pSrc[2 * i + 1] = a1;

        pSrc[2 * l] = p0 - p1;
        pSrc[2 * l + 1] = p2 + p3;
    } // groups loop end

    twidCoefModifier <<= 1U;

    /* Middle stages: butterflies within a group share one twiddle factor. */
    // loop for stage
    for (k = fftLen / 2; k > 2; k = k >> 1)
    {
        n1 = n2;
        n2 = n2 >> 1;
        ia = 0;

        // loop for groups
        j = 0;
        do
        {
            cosVal = pCoef[ia * 2];
            sinVal = pCoef[(ia * 2) + 1];
            ia += twidCoefModifier;

            // loop for butterfly
            i = j;
            do
            {
                l = i + n2;
                a0 = pSrc[2 * i] + pSrc[2 * l];
                xt = pSrc[2 * i] - pSrc[2 * l];

                yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
                a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];

                p0 = xt * cosVal;
                p1 = yt * sinVal;
                p2 = yt * cosVal;
                p3 = xt * sinVal;

                pSrc[2 * i] = a0;
                pSrc[2 * i + 1] = a1;

                pSrc[2 * l] = p0 - p1;
                pSrc[2 * l + 1] = p2 + p3;

                i += n1;
            } while ( i < fftLen ); // butterfly loop end
            j++;
        } while (j < n2); // groups loop end

        twidCoefModifier <<= 1U;
    } // stages loop end

    /* Last stage (twiddle factor is 1): sum/difference of adjacent complex
       samples, scaled by 1/fftLen to normalize the inverse transform. */
    // loop for butterfly
    for (i = 0; i < fftLen; i += 2)
    {
        a0 = pSrc[2 * i] + pSrc[2 * i + 2];
        xt = pSrc[2 * i] - pSrc[2 * i + 2];

        a1 = pSrc[2 * i + 3] + pSrc[2 * i + 1];
        yt = pSrc[2 * i + 1] - pSrc[2 * i + 3];

        p0 = a0 * onebyfftLen;
        p2 = xt * onebyfftLen;
        p1 = a1 * onebyfftLen;
        p3 = yt * onebyfftLen;

        pSrc[2 * i] = p0;
        pSrc[2 * i + 1] = p1;
        pSrc[2 * i + 2] = p2;
        pSrc[2 * i + 3] = p3;
    } // butterfly loop end

#else /* #if defined (ARM_MATH_DSP) */

    /* Generic path: shared loop nest for all stages except the last. */
    n2 = fftLen;

    // loop for stage
    for (k = fftLen; k > 2; k = k >> 1)
    {
        n1 = n2;
        n2 = n2 >> 1;
        ia = 0;

        // loop for groups
        j = 0;
        do
        {
            cosVal = pCoef[ia * 2];
            sinVal = pCoef[(ia * 2) + 1];
            ia = ia + twidCoefModifier;

            // loop for butterfly
            i = j;
            do
            {
                l = i + n2;
                a0 = pSrc[2 * i] + pSrc[2 * l];
                xt = pSrc[2 * i] - pSrc[2 * l];

                yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
                a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];

                p0 = xt * cosVal;
                p1 = yt * sinVal;
                p2 = yt * cosVal;
                p3 = xt * sinVal;

                pSrc[2 * i] = a0;
                pSrc[2 * i + 1] = a1;

                pSrc[2 * l] = p0 - p1;
                pSrc[2 * l + 1] = p2 + p3;

                i += n1;
            } while ( i < fftLen ); // butterfly loop end
            j++;
        } while ( j < n2 ); // groups loop end

        twidCoefModifier = twidCoefModifier << 1U;
    } // stages loop end

    /* Last stage handled separately so the 1/fftLen normalization can be
       folded into the final stores. */
    n1 = n2;
    n2 = n2 >> 1;

    // loop for butterfly
    for (i = 0; i < fftLen; i += n1)
    {
        l = i + n2;

        a0 = pSrc[2 * i] + pSrc[2 * l];
        xt = pSrc[2 * i] - pSrc[2 * l];

        a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
        yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];

        p0 = a0 * onebyfftLen;
        p2 = xt * onebyfftLen;
        p1 = a1 * onebyfftLen;
        p3 = yt * onebyfftLen;

        pSrc[2 * i] = p0;
        pSrc[2 * l] = p2;

        pSrc[2 * i + 1] = p1;
        pSrc[2 * l + 1] = p3;
    } // butterfly loop end

#endif /* #if defined (ARM_MATH_DSP) */

}
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f16.c
new file mode 100644
index 0000000000..f536a22533
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f16.c
@@ -0,0 +1,214 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix2_init_f16.c
+ * Description: Radix-2 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+#include "arm_common_tables.h"
+#include "arm_common_tables_f16.h"
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Initialization function for the floating-point CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f16 and will be removed in the future.
+ @param[in,out] S points to an instance of the floating-point CFFT/CIFFT structure
+ @param[in] fftLen length of the FFT
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Details
+ The parameter ifftFlag controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 32, 64, 128, 256, 512, 1024, 2048, 4096.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+*/
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+arm_status arm_cfft_radix2_init_f16(
+ arm_cfft_radix2_instance_f16 * S,
+ uint16_t fftLen,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_4096)
+
+
+ /* Initialise the default arm status */
+ status = ARM_MATH_SUCCESS;
+
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+
+ /* Initialise the Twiddle coefficient pointer; one 4096-point table serves every supported length */
+ S->pTwiddle = (float16_t *) twiddleCoefF16_4096;
+
+ /* Initialise the Flag for selection of CFFT or CIFFT */
+ S->ifftFlag = ifftFlag;
+
+ /* Initialise the Flag for calculation Bit reversal or not */
+ S->bitReverseFlag = bitReverseFlag;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
+
+ /* Initializations of structure parameters depending on the FFT length */
+ switch (S->fftLen)
+ {
+
+ case 4096U:
+ /* Initializations of structure parameters for 4096 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 1U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 1U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) armBitRevTable;
+ /* Initialise the 1/fftLen Value */
+ S->onebyfftLen = 0.000244140625f;
+ break;
+
+ case 2048U:
+ /* Initializations of structure parameters for 2048 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 2U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 2U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[1];
+ /* Initialise the 1/fftLen Value */
+ S->onebyfftLen = 0.00048828125f;
+ break;
+
+ case 1024U:
+ /* Initializations of structure parameters for 1024 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 4U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 4U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+ /* Initialise the 1/fftLen Value */
+ S->onebyfftLen = 0.0009765625f;
+ break;
+
+ case 512U:
+ /* Initializations of structure parameters for 512 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 8U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 8U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[7];
+ /* Initialise the 1/fftLen Value */
+ S->onebyfftLen = 0.001953125f;
+ break;
+
+ case 256U:
+ /* Initializations of structure parameters for 256 point FFT */
+ S->twidCoefModifier = 16U; /* twiddle stride = 4096 / 256 */
+ S->bitRevFactor = 16U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+ S->onebyfftLen = 0.00390625f;
+ break;
+
+ case 128U:
+ /* Initializations of structure parameters for 128 point FFT */
+ S->twidCoefModifier = 32U; /* twiddle stride = 4096 / 128 */
+ S->bitRevFactor = 32U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[31];
+ S->onebyfftLen = 0.0078125f;
+ break;
+
+ case 64U:
+ /* Initializations of structure parameters for 64 point FFT */
+ S->twidCoefModifier = 64U; /* twiddle stride = 4096 / 64 */
+ S->bitRevFactor = 64U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+ S->onebyfftLen = 0.015625f;
+ break;
+
+ case 32U:
+ /* Initializations of structure parameters for 32 point FFT */
+ S->twidCoefModifier = 128U; /* twiddle stride = 4096 / 32 */
+ S->bitRevFactor = 128U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[127];
+ S->onebyfftLen = 0.03125f;
+ break;
+
+ case 16U:
+ /* Initializations of structure parameters for 16 point FFT */
+ S->twidCoefModifier = 256U; /* twiddle stride = 4096 / 16 */
+ S->bitRevFactor = 256U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+ S->onebyfftLen = 0.0625f;
+ break;
+
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+#endif
+#endif
+#endif
+ return (status);
+}
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+/**
+ @} end of ComplexFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c
new file mode 100644
index 0000000000..ae9f29a8a7
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_f32.c
@@ -0,0 +1,209 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix2_init_f32.c
+ * Description: Radix-2 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Initialization function for the floating-point CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future.
+ @param[in,out] S points to an instance of the floating-point CFFT/CIFFT structure
+ @param[in] fftLen length of the FFT
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Details
+ The parameter ifftFlag controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 32, 64, 128, 256, 512, 1024, 2048, 4096.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+*/
+
+arm_status arm_cfft_radix2_init_f32(
+ arm_cfft_radix2_instance_f32 * S,
+ uint16_t fftLen,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096)
+
+ /* Initialise the default arm status */
+ status = ARM_MATH_SUCCESS;
+
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+
+ /* Initialise the Twiddle coefficient pointer; one 4096-point table serves every supported length */
+ S->pTwiddle = (float32_t *) twiddleCoef;
+
+ /* Initialise the Flag for selection of CFFT or CIFFT */
+ S->ifftFlag = ifftFlag;
+
+ /* Initialise the Flag for calculation Bit reversal or not */
+ S->bitReverseFlag = bitReverseFlag;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
+
+ /* Initializations of structure parameters depending on the FFT length */
+ switch (S->fftLen)
+ {
+
+ case 4096U:
+ /* Initializations of structure parameters for 4096 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 1U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 1U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) armBitRevTable;
+ /* Initialise the 1/fftLen Value */
+ S->onebyfftLen = 0.000244140625f;
+ break;
+
+ case 2048U:
+ /* Initializations of structure parameters for 2048 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 2U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 2U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[1];
+ /* Initialise the 1/fftLen Value */
+ S->onebyfftLen = 0.00048828125f;
+ break;
+
+ case 1024U:
+ /* Initializations of structure parameters for 1024 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 4U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 4U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+ /* Initialise the 1/fftLen Value */
+ S->onebyfftLen = 0.0009765625f;
+ break;
+
+ case 512U:
+ /* Initializations of structure parameters for 512 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 8U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 8U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[7];
+ /* Initialise the 1/fftLen Value */
+ S->onebyfftLen = 0.001953125f;
+ break;
+
+ case 256U:
+ /* Initializations of structure parameters for 256 point FFT */
+ S->twidCoefModifier = 16U; /* twiddle stride = 4096 / 256 */
+ S->bitRevFactor = 16U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+ S->onebyfftLen = 0.00390625f;
+ break;
+
+ case 128U:
+ /* Initializations of structure parameters for 128 point FFT */
+ S->twidCoefModifier = 32U; /* twiddle stride = 4096 / 128 */
+ S->bitRevFactor = 32U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[31];
+ S->onebyfftLen = 0.0078125f;
+ break;
+
+ case 64U:
+ /* Initializations of structure parameters for 64 point FFT */
+ S->twidCoefModifier = 64U; /* twiddle stride = 4096 / 64 */
+ S->bitRevFactor = 64U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+ S->onebyfftLen = 0.015625f;
+ break;
+
+ case 32U:
+ /* Initializations of structure parameters for 32 point FFT */
+ S->twidCoefModifier = 128U; /* twiddle stride = 4096 / 32 */
+ S->bitRevFactor = 128U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[127];
+ S->onebyfftLen = 0.03125f;
+ break;
+
+ case 16U:
+ /* Initializations of structure parameters for 16 point FFT */
+ S->twidCoefModifier = 256U; /* twiddle stride = 4096 / 16 */
+ S->bitRevFactor = 256U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+ S->onebyfftLen = 0.0625f;
+ break;
+
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+#endif
+#endif
+#endif
+ return (status);
+}
+
+/**
+ @} end of ComplexFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c
new file mode 100644
index 0000000000..68c99308d9
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q15.c
@@ -0,0 +1,194 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix2_init_q15.c
+ * Description: Radix-2 Decimation in Frequency Q15 FFT & IFFT initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+
+/**
+ @ingroup groupTransforms
+ */
+
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q15 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed
+ @param[in,out] S points to an instance of the Q15 CFFT/CIFFT structure.
+ @param[in] fftLen length of the FFT.
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Details
+ The parameter ifftFlag controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 32, 64, 128, 256, 512, 1024, 2048, 4096.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+*/
+
+arm_status arm_cfft_radix2_init_q15(
+ arm_cfft_radix2_instance_q15 * S,
+ uint16_t fftLen,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_4096)
+
+ /* Initialise the default arm status */
+ status = ARM_MATH_SUCCESS;
+
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+
+ /* Initialise the Twiddle coefficient pointer; one 4096-point table serves every supported length */
+ S->pTwiddle = (q15_t *) twiddleCoef_4096_q15;
+ /* Initialise the Flag for selection of CFFT or CIFFT */
+ S->ifftFlag = ifftFlag;
+ /* Initialise the Flag for calculation Bit reversal or not */
+ S->bitReverseFlag = bitReverseFlag;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
+
+ /* Initializations of structure parameters depending on the FFT length */
+ switch (S->fftLen)
+ {
+ case 4096U:
+ /* Initializations of structure parameters for 4096 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 1U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 1U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) armBitRevTable;
+
+ break;
+
+ case 2048U:
+ /* Initializations of structure parameters for 2048 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 2U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 2U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[1];
+
+ break;
+
+ case 1024U:
+ /* Initializations of structure parameters for 1024 point FFT */
+ S->twidCoefModifier = 4U; /* twiddle stride = 4096 / 1024 */
+ S->bitRevFactor = 4U; /* bit-reversal table stride */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[3]; /* start at entry (bitRevFactor - 1) */
+
+ break;
+
+ case 512U:
+ /* Initializations of structure parameters for 512 point FFT */
+ S->twidCoefModifier = 8U; /* twiddle stride = 4096 / 512 */
+ S->bitRevFactor = 8U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[7];
+
+ break;
+
+ case 256U:
+ /* Initializations of structure parameters for 256 point FFT */
+ S->twidCoefModifier = 16U; /* twiddle stride = 4096 / 256 */
+ S->bitRevFactor = 16U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+
+ break;
+
+ case 128U:
+ /* Initializations of structure parameters for 128 point FFT */
+ S->twidCoefModifier = 32U; /* twiddle stride = 4096 / 128 */
+ S->bitRevFactor = 32U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[31];
+
+ break;
+
+ case 64U:
+ /* Initializations of structure parameters for 64 point FFT */
+ S->twidCoefModifier = 64U; /* twiddle stride = 4096 / 64 */
+ S->bitRevFactor = 64U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+
+ break;
+
+ case 32U:
+ /* Initializations of structure parameters for 32 point FFT */
+ S->twidCoefModifier = 128U; /* twiddle stride = 4096 / 32 */
+ S->bitRevFactor = 128U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[127];
+
+ break;
+
+ case 16U:
+ /* Initializations of structure parameters for 16 point FFT */
+ S->twidCoefModifier = 256U; /* twiddle stride = 4096 / 16 */
+ S->bitRevFactor = 256U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+
+ break;
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+#endif
+#endif
+#endif
+ return (status);
+}
+
+/**
+ @} end of ComplexFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c
new file mode 100644
index 0000000000..4dc39f7d66
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_init_q31.c
@@ -0,0 +1,191 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix2_init_q31.c
+ * Description: Radix-2 Decimation in Frequency Fixed-point CFFT & CIFFT Initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q31 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
+ @param[in,out] S points to an instance of the Q31 CFFT/CIFFT structure
+ @param[in] fftLen length of the FFT
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Details
+ The parameter ifftFlag controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 32, 64, 128, 256, 512, 1024, 2048, 4096.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+*/
+
+arm_status arm_cfft_radix2_init_q31(
+ arm_cfft_radix2_instance_q31 * S,
+ uint16_t fftLen,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_4096)
+
+ /* Initialise the default arm status */
+ status = ARM_MATH_SUCCESS;
+
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+
+ /* Initialise the Twiddle coefficient pointer; one 4096-point table serves every supported length */
+ S->pTwiddle = (q31_t *) twiddleCoef_4096_q31;
+
+ /* Initialise the Flag for selection of CFFT or CIFFT */
+ S->ifftFlag = ifftFlag;
+
+ /* Initialise the Flag for calculation Bit reversal or not */
+ S->bitReverseFlag = bitReverseFlag;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
+
+ /* Initializations of Instance structure depending on the FFT length */
+ switch (S->fftLen)
+ {
+ /* Initializations of structure parameters for 4096 point FFT */
+ case 4096U:
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 1U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 1U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) armBitRevTable;
+ break;
+
+ /* Initializations of structure parameters for 2048 point FFT */
+ case 2048U:
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 2U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 2U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[1];
+ break;
+
+ /* Initializations of structure parameters for 1024 point FFT */
+ case 1024U:
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 4U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 4U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+ break;
+
+ /* Initializations of structure parameters for 512 point FFT */
+ case 512U:
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 8U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 8U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[7];
+ break;
+
+ case 256U:
+ /* Initializations of structure parameters for 256 point FFT */
+ S->twidCoefModifier = 16U; /* twiddle stride = 4096 / 256 */
+ S->bitRevFactor = 16U; /* bit-reversal table stride; table starts at entry (bitRevFactor - 1) */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+ break;
+
+ case 128U:
+ /* Initializations of structure parameters for 128 point FFT */
+ S->twidCoefModifier = 32U; /* twiddle stride = 4096 / 128 */
+ S->bitRevFactor = 32U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[31];
+ break;
+
+ case 64U:
+ /* Initializations of structure parameters for 64 point FFT */
+ S->twidCoefModifier = 64U; /* twiddle stride = 4096 / 64 */
+ S->bitRevFactor = 64U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+ break;
+
+ case 32U:
+ /* Initializations of structure parameters for 32 point FFT */
+ S->twidCoefModifier = 128U; /* twiddle stride = 4096 / 32 */
+ S->bitRevFactor = 128U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[127];
+ break;
+
+ case 16U:
+ /* Initializations of structure parameters for 16 point FFT */
+ S->twidCoefModifier = 256U; /* twiddle stride = 4096 / 16 */
+ S->bitRevFactor = 256U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+ break;
+
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+#endif
+#endif
+#endif
+ return (status);
+}
+
+/**
+ @} end of ComplexFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c
new file mode 100644
index 0000000000..ca15ea18b7
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q15.c
@@ -0,0 +1,689 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix2_q15.c
+ * Description: Radix-2 Decimation in Frequency CFFT & CIFFT Fixed point processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+void arm_radix2_butterfly_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pCoef,
+ uint16_t twidCoefModifier);
+
+void arm_radix2_butterfly_inverse_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pCoef,
+ uint16_t twidCoefModifier);
+
+void arm_bitreversal_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab);
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Processing function for the fixed-point CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
+ @param[in] S points to an instance of the fixed-point CFFT/CIFFT structure
+ @param[in,out] pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @return none
+ */
+
+void arm_cfft_radix2_q15(
+  const arm_cfft_radix2_instance_q15 * S,
+  q15_t * pSrc)
+{
+
+  if (S->ifftFlag == 1U)
+  {
+    /* Inverse transform: conjugate-twiddle butterfly kernel */
+    arm_radix2_butterfly_inverse_q15 (pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
+  }
+  else
+  {
+    /* Forward transform */
+    arm_radix2_butterfly_q15 (pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
+  }
+
+  /* The DIF butterflies leave the result in bit-reversed order;
+     reorder in place to natural order. */
+  arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+}
+
+/**
+ @} end of ComplexFFT group
+ */
+
+/*
+ * Internal helper: core radix-2 decimation-in-frequency butterfly kernel for
+ * the forward q15 CFFT.  Operates in place on interleaved {real, imag} q15
+ * data.  Each stage halves the intermediate values so fixed-point overflow
+ * cannot occur; the overall output is therefore scaled down relative to a
+ * full-precision FFT.
+ * NOTE(review): pCoef is assumed to be the interleaved {cos, sin} twiddle
+ * table stepped by twidCoefModifier per group, as produced by the radix-2
+ * init functions -- confirm against the instance setup.
+ */
+void arm_radix2_butterfly_q15(
+  q15_t * pSrc,
+  uint32_t fftLen,
+  const q15_t * pCoef,
+  uint16_t twidCoefModifier)
+{
+#if defined (ARM_MATH_DSP)
+
+  uint32_t i, j, k, l;
+  uint32_t n1, n2, ia;
+  q15_t in;
+  q31_t T, S, R;                 /* each holds one complex sample as two packed q15 halves */
+  q31_t coeff, out1, out2;
+
+  //N = fftLen;
+  n2 = fftLen;
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+
+  // loop for groups
+  for (i = 0; i < n2; i++)
+  {
+     coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+     ia = ia + twidCoefModifier;
+
+     l = i + n2;
+
+     /* Read the complex pair as one 32-bit word and halve both q15
+        halves separately (per-stage scaling to avoid overflow). */
+     T = read_q15x2 (pSrc + (2 * i));
+     in = ((int16_t) (T & 0xFFFF)) >> 1;
+     T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+
+     S = read_q15x2 (pSrc + (2 * l));
+     in = ((int16_t) (S & 0xFFFF)) >> 1;
+     S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+
+     /* R = T - S, saturating per 16-bit half */
+     R = __QSUB16(T, S);
+
+     /* top output = (T + S) / 2, halving add */
+     write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+     /* bottom output = R * twiddle (complex multiply); operand order of the
+        dual-MAC intrinsics depends on how the halves are packed per endianness */
+     out1 = __SMUAD(coeff, R) >> 16;
+     out2 = __SMUSDX(coeff, R);
+#else
+     out1 = __SMUSDX(R, coeff) >> 16U;
+     out2 = __SMUAD(coeff, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+     write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+     coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+     ia = ia + twidCoefModifier;
+
+     /* loop for butterfly */
+     /* (manual 2x unroll: process the next butterfly of this first stage) */
+     i++;
+     l++;
+
+     T = read_q15x2 (pSrc + (2 * i));
+     in = ((int16_t) (T & 0xFFFF)) >> 1;
+     T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+
+     S = read_q15x2 (pSrc + (2 * l));
+     in = ((int16_t) (S & 0xFFFF)) >> 1;
+     S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+
+     R = __QSUB16(T, S);
+
+     write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+     out1 = __SMUAD(coeff, R) >> 16;
+     out2 = __SMUSDX(coeff, R);
+#else
+
+     out1 = __SMUSDX(R, coeff) >> 16U;
+     out2 = __SMUAD(coeff, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+     write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+  } /* groups loop end */
+
+  twidCoefModifier = twidCoefModifier << 1U;
+
+  /* loop for stage */
+  for (k = fftLen / 2; k > 2; k = k >> 1)
+  {
+     n1 = n2;
+     n2 = n2 >> 1;
+     ia = 0;
+
+     /* loop for groups */
+     for (j = 0; j < n2; j++)
+     {
+        coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+        ia = ia + twidCoefModifier;
+
+        /* loop for butterfly */
+        for (i = j; i < fftLen; i += n1)
+        {
+           l = i + n2;
+
+           T = read_q15x2 (pSrc + (2 * i));
+
+           S = read_q15x2 (pSrc + (2 * l));
+
+           R = __QSUB16(T, S);
+
+           write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+           out1 = __SMUAD(coeff, R) >> 16;
+           out2 = __SMUSDX(coeff, R);
+#else
+           out1 = __SMUSDX(R, coeff) >> 16U;
+           out2 = __SMUAD(coeff, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+           write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+           /* second butterfly of the 2x-unrolled body */
+           i += n1;
+
+           l = i + n2;
+
+           T = read_q15x2 (pSrc + (2 * i));
+
+           S = read_q15x2 (pSrc + (2 * l));
+
+           R = __QSUB16(T, S);
+
+           write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+           out1 = __SMUAD(coeff, R) >> 16;
+           out2 = __SMUSDX(coeff, R);
+#else
+           out1 = __SMUSDX(R, coeff) >> 16U;
+           out2 = __SMUAD(coeff, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+           write_q15x2 (pSrc + (2U * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+        } /* butterfly loop end */
+
+     } /* groups loop end */
+
+     twidCoefModifier = twidCoefModifier << 1U;
+  } /* stages loop end */
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+
+  coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+  ia = ia + twidCoefModifier;
+
+  /* loop for butterfly */
+  /* Last stage: the twiddle factor is unity, so only a saturating
+     add/subtract is needed (no halving here). */
+  for (i = 0; i < fftLen; i += n1)
+  {
+     l = i + n2;
+
+     T = read_q15x2 (pSrc + (2 * i));
+
+     S = read_q15x2 (pSrc + (2 * l));
+
+     R = __QSUB16(T, S);
+
+     write_q15x2 (pSrc + (2 * i), __QADD16(T, S));
+
+     write_q15x2 (pSrc + (2 * l), R);
+
+     i += n1;
+     l = i + n2;
+
+     T = read_q15x2 (pSrc + (2 * i));
+
+     S = read_q15x2 (pSrc + (2 * l));
+
+     R = __QSUB16(T, S);
+
+     write_q15x2 (pSrc + (2 * i), __QADD16(T, S));
+
+     write_q15x2 (pSrc + (2 * l), R);
+
+  } /* butterfly loop end */
+
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+  uint32_t i, j, k, l;
+  uint32_t n1, n2, ia;
+  q15_t xt, yt, cosVal, sinVal;
+
+
+  // N = fftLen;
+  n2 = fftLen;
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+
+  /* loop for groups */
+  for (j = 0; j < n2; j++)
+  {
+     cosVal = pCoef[(ia * 2)];
+     sinVal = pCoef[(ia * 2) + 1];
+     ia = ia + twidCoefModifier;
+
+     /* loop for butterfly */
+     for (i = j; i < fftLen; i += n1)
+     {
+        l = i + n2;
+        /* inputs are pre-halved and the sum path halved again:
+           scalar equivalent of the DSP branch's per-stage scaling */
+        xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
+        pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
+
+        yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
+        pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) +
+                           (pSrc[2 * i + 1] >> 1U)  ) >> 1U;
+
+        /* bottom output = difference * twiddle (q15 complex multiply) */
+        pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) +
+                       ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+
+        pSrc[2U * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) -
+                            ((int16_t) (((q31_t) xt * sinVal) >> 16)));
+
+     } /* butterfly loop end */
+
+  } /* groups loop end */
+
+  twidCoefModifier = twidCoefModifier << 1U;
+
+  /* loop for stage */
+  for (k = fftLen / 2; k > 2; k = k >> 1)
+  {
+     n1 = n2;
+     n2 = n2 >> 1;
+     ia = 0;
+
+     /* loop for groups */
+     for (j = 0; j < n2; j++)
+     {
+        cosVal = pCoef[ia * 2];
+        sinVal = pCoef[(ia * 2) + 1];
+        ia = ia + twidCoefModifier;
+
+        /* loop for butterfly */
+        for (i = j; i < fftLen; i += n1)
+        {
+           l = i + n2;
+           xt = pSrc[2 * i] - pSrc[2 * l];
+           pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U;
+
+           yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+           pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U;
+
+           pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) +
+                          ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+
+           pSrc[2U * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) -
+                               ((int16_t) (((q31_t) xt * sinVal) >> 16)));
+
+        } /* butterfly loop end */
+
+     } /* groups loop end */
+
+     twidCoefModifier = twidCoefModifier << 1U;
+  } /* stages loop end */
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+
+  /* loop for groups */
+  for (j = 0; j < n2; j++)
+  {
+     cosVal = pCoef[ia * 2];
+     sinVal = pCoef[(ia * 2) + 1];
+
+     ia = ia + twidCoefModifier;
+
+     /* loop for butterfly */
+     /* last stage: unity twiddle, plain add/subtract, no scaling */
+     for (i = j; i < fftLen; i += n1)
+     {
+        l = i + n2;
+        xt = pSrc[2 * i] - pSrc[2 * l];
+        pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+        yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+        pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+        pSrc[2 * l] = xt;
+
+        pSrc[2 * l + 1] = yt;
+
+     } /* butterfly loop end */
+
+  } /* groups loop end */
+
+  twidCoefModifier = twidCoefModifier << 1U;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
+
+
+/*
+ * Internal helper: core radix-2 DIF butterfly kernel for the inverse q15
+ * CFFT.  Identical stage structure and per-stage halving as the forward
+ * kernel, but the complex multiply uses the conjugated twiddle (note
+ * __SMUSD/__SMUADX here vs __SMUAD/__SMUSDX in the forward kernel, and the
+ * swapped +/- in the scalar branch).
+ */
+void arm_radix2_butterfly_inverse_q15(
+  q15_t * pSrc,
+  uint32_t fftLen,
+  const q15_t * pCoef,
+  uint16_t twidCoefModifier)
+{
+#if defined (ARM_MATH_DSP)
+
+  uint32_t i, j, k, l;
+  uint32_t n1, n2, ia;
+  q15_t in;
+  q31_t T, S, R;                 /* each holds one complex sample as two packed q15 halves */
+  q31_t coeff, out1, out2;
+
+  // N = fftLen;
+  n2 = fftLen;
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+
+  /* loop for groups */
+  for (i = 0; i < n2; i++)
+  {
+     coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+     ia = ia + twidCoefModifier;
+
+     l = i + n2;
+
+     /* Read the complex pair as one word and halve both q15 halves
+        (per-stage scaling to avoid overflow). */
+     T = read_q15x2 (pSrc + (2 * i));
+     in = ((int16_t) (T & 0xFFFF)) >> 1;
+     T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+
+     S = read_q15x2 (pSrc + (2 * l));
+     in = ((int16_t) (S & 0xFFFF)) >> 1;
+     S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+
+     R = __QSUB16(T, S);
+
+     write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+     /* bottom output = R * conj(twiddle) */
+     out1 = __SMUSD(coeff, R) >> 16;
+     out2 = __SMUADX(coeff, R);
+#else
+     out1 = __SMUADX(R, coeff) >> 16U;
+     out2 = __SMUSD(__QSUB(0, coeff), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+     write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+     coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+     ia = ia + twidCoefModifier;
+
+     /* loop for butterfly */
+     /* (manual 2x unroll: process the next butterfly of this first stage) */
+     i++;
+     l++;
+
+     T = read_q15x2 (pSrc + (2 * i));
+     in = ((int16_t) (T & 0xFFFF)) >> 1;
+     T = ((T >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+
+     S = read_q15x2 (pSrc + (2 * l));
+     in = ((int16_t) (S & 0xFFFF)) >> 1;
+     S = ((S >> 1) & 0xFFFF0000) | (in & 0xFFFF);
+
+     R = __QSUB16(T, S);
+
+     write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+     out1 = __SMUSD(coeff, R) >> 16;
+     out2 = __SMUADX(coeff, R);
+#else
+     out1 = __SMUADX(R, coeff) >> 16U;
+     out2 = __SMUSD(__QSUB(0, coeff), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+     write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+  } /* groups loop end */
+
+  twidCoefModifier = twidCoefModifier << 1U;
+
+  /* loop for stage */
+  for (k = fftLen / 2; k > 2; k = k >> 1)
+  {
+     n1 = n2;
+     n2 = n2 >> 1;
+     ia = 0;
+
+     /* loop for groups */
+     for (j = 0; j < n2; j++)
+     {
+        coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+        ia = ia + twidCoefModifier;
+
+        /* loop for butterfly */
+        for (i = j; i < fftLen; i += n1)
+        {
+           l = i + n2;
+
+           T = read_q15x2 (pSrc + (2 * i));
+
+           S = read_q15x2 (pSrc + (2 * l));
+
+           R = __QSUB16(T, S);
+
+           write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+           out1 = __SMUSD(coeff, R) >> 16;
+           out2 = __SMUADX(coeff, R);
+#else
+           out1 = __SMUADX(R, coeff) >> 16U;
+           out2 = __SMUSD(__QSUB(0, coeff), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+           write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+           /* second butterfly of the 2x-unrolled body */
+           i += n1;
+
+           l = i + n2;
+
+           T = read_q15x2 (pSrc + (2 * i));
+
+           S = read_q15x2 (pSrc + (2 * l));
+
+           R = __QSUB16(T, S);
+
+           write_q15x2 (pSrc + (2 * i), __SHADD16(T, S));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+           out1 = __SMUSD(coeff, R) >> 16;
+           out2 = __SMUADX(coeff, R);
+#else
+           out1 = __SMUADX(R, coeff) >> 16U;
+           out2 = __SMUSD(__QSUB(0, coeff), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+           write_q15x2 (pSrc + (2 * l), (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF));
+
+        } /* butterfly loop end */
+
+     } /* groups loop end */
+
+     twidCoefModifier = twidCoefModifier << 1U;
+  } /* stages loop end */
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+
+  /* loop for groups */
+  /* Last stage: unity twiddle, saturating add/subtract only (no halving). */
+  for (j = 0; j < n2; j++)
+  {
+     coeff = read_q15x2 ((q15_t *)pCoef + (ia * 2U));
+
+     ia = ia + twidCoefModifier;
+
+     /* loop for butterfly */
+     for (i = j; i < fftLen; i += n1)
+     {
+        l = i + n2;
+
+        T = read_q15x2 (pSrc + (2 * i));
+
+        S = read_q15x2 (pSrc + (2 * l));
+
+        R = __QSUB16(T, S);
+
+        write_q15x2 (pSrc + (2 * i), __QADD16(T, S));
+
+        write_q15x2 (pSrc + (2 * l), R);
+
+     } /* butterfly loop end */
+
+  } /* groups loop end */
+
+  twidCoefModifier = twidCoefModifier << 1U;
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+  uint32_t i, j, k, l;
+  uint32_t n1, n2, ia;
+  q15_t xt, yt, cosVal, sinVal;
+
+  // N = fftLen;
+  n2 = fftLen;
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+
+  /* loop for groups */
+  for (j = 0; j < n2; j++)
+  {
+     cosVal = pCoef[(ia * 2)];
+     sinVal = pCoef[(ia * 2) + 1];
+     ia = ia + twidCoefModifier;
+
+     /* loop for butterfly */
+     for (i = j; i < fftLen; i += n1)
+     {
+        l = i + n2;
+        /* inputs pre-halved, sum path halved again (per-stage scaling) */
+        xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
+        pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
+
+        yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
+        pSrc[2 * i + 1] = ((pSrc[2 * l + 1] >> 1U) +
+                           (pSrc[2 * i + 1] >> 1U) ) >> 1U;
+
+        /* bottom output = difference * conj(twiddle): note the swapped
+           signs relative to the forward kernel */
+        pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) -
+                       ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+
+        pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) +
+                           ((int16_t) (((q31_t) xt * sinVal) >> 16)));
+
+     } /* butterfly loop end */
+
+  } /* groups loop end */
+
+  twidCoefModifier = twidCoefModifier << 1U;
+
+  /* loop for stage */
+  for (k = fftLen / 2; k > 2; k = k >> 1)
+  {
+     n1 = n2;
+     n2 = n2 >> 1;
+     ia = 0;
+
+     /* loop for groups */
+     for (j = 0; j < n2; j++)
+     {
+        cosVal = pCoef[(ia * 2)];
+        sinVal = pCoef[(ia * 2) + 1];
+        ia = ia + twidCoefModifier;
+
+        /* loop for butterfly */
+        for (i = j; i < fftLen; i += n1)
+        {
+           l = i + n2;
+           xt = pSrc[2 * i] - pSrc[2 * l];
+           pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U;
+
+           yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+           pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U;
+
+           pSrc[2 * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) -
+                          ((int16_t) (((q31_t) yt * sinVal) >> 16)) );
+
+           pSrc[2 * l + 1] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) +
+                              ((int16_t) (((q31_t) xt * sinVal) >> 16)) );
+
+        } /* butterfly loop end */
+
+     } /* groups loop end */
+
+     twidCoefModifier = twidCoefModifier << 1U;
+  } /* stages loop end */
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+
+  cosVal = pCoef[(ia * 2)];
+  sinVal = pCoef[(ia * 2) + 1];
+
+  ia = ia + twidCoefModifier;
+
+  /* loop for butterfly */
+  /* last stage: unity twiddle, plain add/subtract, no scaling */
+  for (i = 0; i < fftLen; i += n1)
+  {
+     l = i + n2;
+     xt = pSrc[2 * i] - pSrc[2 * l];
+     pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+     yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+     pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+     pSrc[2 * l] = xt;
+
+     pSrc[2 * l + 1] = yt;
+
+  } /* butterfly loop end */
+
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c
new file mode 100644
index 0000000000..996e91d3a7
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix2_q31.c
@@ -0,0 +1,337 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix2_q31.c
+ * Description: Radix-2 Decimation in Frequency CFFT & CIFFT Fixed point processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+void arm_radix2_butterfly_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint16_t twidCoefModifier);
+
+void arm_radix2_butterfly_inverse_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint16_t twidCoefModifier);
+
+void arm_bitreversal_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab);
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Processing function for the fixed-point CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
+ @param[in] S points to an instance of the fixed-point CFFT/CIFFT structure
+ @param[in,out] pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @return none
+ */
+
+void arm_cfft_radix2_q31(
+  const arm_cfft_radix2_instance_q31 * S,
+  q31_t * pSrc)
+{
+
+  if (S->ifftFlag == 1U)
+  {
+    /* Inverse transform: conjugate-twiddle butterfly kernel */
+    arm_radix2_butterfly_inverse_q31(pSrc, S->fftLen,
+                                     S->pTwiddle, S->twidCoefModifier);
+  }
+  else
+  {
+    /* Forward transform */
+    arm_radix2_butterfly_q31(pSrc, S->fftLen,
+                             S->pTwiddle, S->twidCoefModifier);
+  }
+
+  /* The DIF butterflies leave the result in bit-reversed order;
+     reorder in place to natural order. */
+  arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+}
+
+/**
+ @} end of ComplexFFT group
+ */
+
+/*
+ * Internal helper: core radix-2 DIF butterfly kernel for the forward q31
+ * CFFT.  Operates in place on interleaved {real, imag} q31 data.  Stages
+ * halve intermediate values to guard against fixed-point overflow; the
+ * complex multiplies use the mult_32x32_keep32_R / multAcc / multSub
+ * rounding macros.
+ */
+void arm_radix2_butterfly_q31(
+  q31_t * pSrc,
+  uint32_t fftLen,
+  const q31_t * pCoef,
+  uint16_t twidCoefModifier)
+{
+
+  unsigned i, j, k, l, m;
+  unsigned n1, n2, ia;
+  q31_t xt, yt, cosVal, sinVal;
+  q31_t p0, p1;
+
+  //N = fftLen;
+  n2 = fftLen;
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+
+  // loop for groups
+  for (i = 0; i < n2; i++)
+  {
+    cosVal = pCoef[ia * 2];
+    sinVal = pCoef[(ia * 2) + 1];
+    ia = ia + twidCoefModifier;
+
+    l = i + n2;
+    /* first stage: inputs pre-halved and the sum path halved again */
+    xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
+    pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
+
+    yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
+    pSrc[2 * i + 1] =
+      ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
+
+    /* bottom output = (xt + j*yt) * twiddle, rounded 32x32 multiplies */
+    mult_32x32_keep32_R(p0, xt, cosVal);
+    mult_32x32_keep32_R(p1, yt, cosVal);
+    multAcc_32x32_keep32_R(p0, yt, sinVal);
+    multSub_32x32_keep32_R(p1, xt, sinVal);
+
+    pSrc[2U * l] = p0;
+    pSrc[2U * l + 1U] = p1;
+
+  } // groups loop end
+
+  twidCoefModifier <<= 1U;
+
+  // loop for stage
+  for (k = fftLen / 2; k > 2; k = k >> 1)
+  {
+    n1 = n2;
+    n2 = n2 >> 1;
+    ia = 0;
+
+    // loop for groups
+    for (j = 0; j < n2; j++)
+    {
+      cosVal = pCoef[ia * 2];
+      sinVal = pCoef[(ia * 2) + 1];
+      ia = ia + twidCoefModifier;
+
+      // loop for butterfly
+      i = j;
+      m = fftLen / n1;   /* number of butterflies in this group */
+      do
+      {
+        l = i + n2;
+        xt = pSrc[2 * i] - pSrc[2 * l];
+        pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U;
+
+        yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+        pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U;
+
+        mult_32x32_keep32_R(p0, xt, cosVal);
+        mult_32x32_keep32_R(p1, yt, cosVal);
+        multAcc_32x32_keep32_R(p0, yt, sinVal);
+        multSub_32x32_keep32_R(p1, xt, sinVal);
+
+        pSrc[2U * l] = p0;
+        pSrc[2U * l + 1U] = p1;
+        i += n1;
+        m--;
+      } while ( m > 0); // butterfly loop end
+
+    } // groups loop end
+
+    twidCoefModifier <<= 1U;
+  } // stages loop end
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+
+  cosVal = pCoef[ia * 2];
+  sinVal = pCoef[(ia * 2) + 1];
+  ia = ia + twidCoefModifier;
+
+  // loop for butterfly
+  // last stage: unity twiddle, plain add/subtract, 2x unrolled
+  for (i = 0; i < fftLen; i += n1)
+  {
+    l = i + n2;
+    xt = pSrc[2 * i] - pSrc[2 * l];
+    pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+    yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+    pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+    pSrc[2U * l] = xt;
+
+    pSrc[2U * l + 1U] = yt;
+
+    i += n1;
+    l = i + n2;
+
+    xt = pSrc[2 * i] - pSrc[2 * l];
+    pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+    yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+    pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+    pSrc[2U * l] = xt;
+
+    pSrc[2U * l + 1U] = yt;
+
+  } // butterfly loop end
+
+}
+
+
+/*
+ * Internal helper: core radix-2 DIF butterfly kernel for the inverse q31
+ * CFFT.  Same stage structure and scaling as the forward kernel, but the
+ * complex multiply uses the conjugated twiddle (multSub on the real
+ * accumulator, multAcc on the imaginary one -- the mirror of the forward
+ * kernel).
+ */
+void arm_radix2_butterfly_inverse_q31(
+  q31_t * pSrc,
+  uint32_t fftLen,
+  const q31_t * pCoef,
+  uint16_t twidCoefModifier)
+{
+
+  unsigned i, j, k, l;
+  unsigned n1, n2, ia;
+  q31_t xt, yt, cosVal, sinVal;
+  q31_t p0, p1;
+
+  //N = fftLen;
+  n2 = fftLen;
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+
+  // loop for groups
+  for (i = 0; i < n2; i++)
+  {
+    cosVal = pCoef[ia * 2];
+    sinVal = pCoef[(ia * 2) + 1];
+    ia = ia + twidCoefModifier;
+
+    l = i + n2;
+    /* first stage: inputs pre-halved and the sum path halved again */
+    xt = (pSrc[2 * i] >> 1U) - (pSrc[2 * l] >> 1U);
+    pSrc[2 * i] = ((pSrc[2 * i] >> 1U) + (pSrc[2 * l] >> 1U)) >> 1U;
+
+    yt = (pSrc[2 * i + 1] >> 1U) - (pSrc[2 * l + 1] >> 1U);
+    pSrc[2 * i + 1] =
+      ((pSrc[2 * l + 1] >> 1U) + (pSrc[2 * i + 1] >> 1U)) >> 1U;
+
+    /* bottom output = (xt + j*yt) * conj(twiddle) */
+    mult_32x32_keep32_R(p0, xt, cosVal);
+    mult_32x32_keep32_R(p1, yt, cosVal);
+    multSub_32x32_keep32_R(p0, yt, sinVal);
+    multAcc_32x32_keep32_R(p1, xt, sinVal);
+
+    pSrc[2U * l] = p0;
+    pSrc[2U * l + 1U] = p1;
+  } // groups loop end
+
+  twidCoefModifier = twidCoefModifier << 1U;
+
+  // loop for stage
+  for (k = fftLen / 2; k > 2; k = k >> 1)
+  {
+    n1 = n2;
+    n2 = n2 >> 1;
+    ia = 0;
+
+    // loop for groups
+    for (j = 0; j < n2; j++)
+    {
+      cosVal = pCoef[ia * 2];
+      sinVal = pCoef[(ia * 2) + 1];
+      ia = ia + twidCoefModifier;
+
+      // loop for butterfly
+      for (i = j; i < fftLen; i += n1)
+      {
+        l = i + n2;
+        xt = pSrc[2 * i] - pSrc[2 * l];
+        pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1U;
+
+        yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+        pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1U;
+
+        mult_32x32_keep32_R(p0, xt, cosVal);
+        mult_32x32_keep32_R(p1, yt, cosVal);
+        multSub_32x32_keep32_R(p0, yt, sinVal);
+        multAcc_32x32_keep32_R(p1, xt, sinVal);
+
+        pSrc[2U * l] = p0;
+        pSrc[2U * l + 1U] = p1;
+      } // butterfly loop end
+
+    } // groups loop end
+
+    twidCoefModifier = twidCoefModifier << 1U;
+  } // stages loop end
+
+  n1 = n2;
+  n2 = n2 >> 1;
+  ia = 0;
+
+  cosVal = pCoef[ia * 2];
+  sinVal = pCoef[(ia * 2) + 1];
+  ia = ia + twidCoefModifier;
+
+  // loop for butterfly
+  // last stage: unity twiddle, plain add/subtract, 2x unrolled
+  for (i = 0; i < fftLen; i += n1)
+  {
+    l = i + n2;
+    xt = pSrc[2 * i] - pSrc[2 * l];
+    pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+    yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+    pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+    pSrc[2U * l] = xt;
+
+    pSrc[2U * l + 1U] = yt;
+
+    i += n1;
+    l = i + n2;
+
+    xt = pSrc[2 * i] - pSrc[2 * l];
+    pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+    yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+    pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+    pSrc[2U * l] = xt;
+
+    pSrc[2U * l + 1U] = yt;
+
+  } // butterfly loop end
+
+}
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f16.c
new file mode 100644
index 0000000000..721b915b86
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f16.c
@@ -0,0 +1,1272 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix4_f16.c
+ * Description: Radix-4 Decimation in Frequency CFFT & CIFFT Floating point processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+extern void arm_bitreversal_f16(
+ float16_t * pSrc,
+ uint16_t fftSize,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab);
+
+void arm_radix4_butterfly_f16(
+ float16_t * pSrc,
+ uint16_t fftLen,
+ const float16_t * pCoef,
+ uint16_t twidCoefModifier);
+
+void arm_radix4_butterfly_inverse_f16(
+ float16_t * pSrc,
+ uint16_t fftLen,
+ const float16_t * pCoef,
+ uint16_t twidCoefModifier,
+ float16_t onebyfftLen);
+
+
+void arm_cfft_radix4by2_f16(
+ float16_t * pSrc,
+ uint32_t fftLen,
+ const float16_t * pCoef);
+
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/*
+* @brief Core function for the floating-point CFFT butterfly process.
+* @param[in, out] *pSrc points to the in-place buffer of floating-point data type.
+* @param[in] fftLen length of the FFT.
+* @param[in] *pCoef points to the twiddle coefficient buffer.
+* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+* @return none.
+*/
+
+/*
+ * Radix-4-by-2 decomposition for f16 CFFT lengths that are 2 * power-of-4:
+ * one radix-2 DIF stage over the full length, then two half-length radix-4
+ * transforms (the two "columns"), reusing the same twiddle table with a
+ * modifier of 2.
+ */
+void arm_cfft_radix4by2_f16(
+  float16_t * pSrc,
+  uint32_t fftLen,
+  const float16_t * pCoef)
+{
+  uint32_t i, l;
+  uint32_t n2, ia;
+  float16_t xt, yt, cosVal, sinVal;
+  float16_t p0, p1,p2,p3,a0,a1;
+
+  n2 = fftLen >> 1;
+  ia = 0;
+  for (i = 0; i < n2; i++)
+  {
+      cosVal = pCoef[2*ia];
+      sinVal = pCoef[2*ia + 1];
+      ia++;
+
+      l = i + n2;
+
+      /* Butterfly implementation */
+      /* top half: plain sum; bottom half: difference times twiddle */
+      a0 = pSrc[2 * i] + pSrc[2 * l];
+      xt = pSrc[2 * i] - pSrc[2 * l];
+
+      yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+      a1 = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+      p0 = xt * cosVal;
+      p1 = yt * sinVal;
+      p2 = yt * cosVal;
+      p3 = xt * sinVal;
+
+      pSrc[2 * i] = a0;
+      pSrc[2 * i + 1] = a1;
+
+      pSrc[2 * l] = p0 + p1;
+      pSrc[2 * l + 1] = p2 - p3;
+
+  }
+
+  // first col
+  arm_radix4_butterfly_f16( pSrc, n2, (float16_t*)pCoef, 2U);
+  // second col
+  arm_radix4_butterfly_f16( pSrc + fftLen, n2, (float16_t*)pCoef, 2U);
+
+}
+
+
+/**
+ @brief Processing function for the floating-point Radix-4 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f16 and will be removed in the future.
+ @param[in] S points to an instance of the floating-point Radix-4 CFFT/CIFFT structure
+ @param[in,out] pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @return none
+ */
+
+void arm_cfft_radix4_f16(
+  const arm_cfft_radix4_instance_f16 * S,
+  float16_t * pSrc)
+{
+  if (S->ifftFlag == 1U)
+  {
+    /* Complex IFFT radix-4 (output scaled by S->onebyfftLen) */
+    arm_radix4_butterfly_inverse_f16(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier, S->onebyfftLen);
+  }
+  else
+  {
+    /* Complex FFT radix-4 */
+    arm_radix4_butterfly_f16(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
+  }
+
+  if (S->bitReverseFlag == 1U)
+  {
+    /* Bit Reversal: reorder the DIF output to natural order in place */
+    arm_bitreversal_f16(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+  }
+
+}
+
+/**
+ @} end of ComplexFFT group
+ */
+
+/* ----------------------------------------------------------------------
+ * Internal helper function used by the FFTs
+ * ---------------------------------------------------------------------- */
+
+/*
+* @brief Core function for the floating-point CFFT butterfly process.
+* @param[in, out] *pSrc points to the in-place buffer of floating-point data type.
+* @param[in] fftLen length of the FFT.
+* @param[in] *pCoef points to the twiddle coefficient buffer.
+* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+* @return none.
+*/
+
+void arm_radix4_butterfly_f16(
+float16_t * pSrc,
+uint16_t fftLen,
+const float16_t * pCoef,
+uint16_t twidCoefModifier)
+{
+
+ float16_t co1, co2, co3, si1, si2, si3;
+ uint32_t ia1, ia2, ia3;
+ uint32_t i0, i1, i2, i3;
+ uint32_t n1, n2, j, k;
+
+#if defined (ARM_MATH_DSP)
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ float16_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
+ float16_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
+ Ybminusd;
+ float16_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
+ float16_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
+ float16_t *ptr1;
+ float16_t p0,p1,p2,p3,p4,p5;
+ float16_t a0,a1,a2,a3,a4,a5,a6,a7;
+
+ /* Initializations for the first stage */
+ n2 = fftLen;
+ n1 = n2;
+
+ /* n2 = fftLen/4 */
+ n2 >>= 2U;
+ i0 = 0U;
+ ia1 = 0U;
+
+ j = n2;
+
+ /* Calculation of first stage */
+ do
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ xaIn = pSrc[(2U * i0)];
+ yaIn = pSrc[(2U * i0) + 1U];
+
+ xbIn = pSrc[(2U * i1)];
+ ybIn = pSrc[(2U * i1) + 1U];
+
+ xcIn = pSrc[(2U * i2)];
+ ycIn = pSrc[(2U * i2) + 1U];
+
+ xdIn = pSrc[(2U * i3)];
+ ydIn = pSrc[(2U * i3) + 1U];
+
+ /* xa + xc */
+ Xaplusc = xaIn + xcIn;
+ /* xb + xd */
+ Xbplusd = xbIn + xdIn;
+ /* ya + yc */
+ Yaplusc = yaIn + ycIn;
+ /* yb + yd */
+ Ybplusd = ybIn + ydIn;
+
+ /* index calculation for the coefficients */
+ ia2 = ia1 + ia1;
+ co2 = pCoef[ia2 * 2U];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+
+ /* xa - xc */
+ Xaminusc = xaIn - xcIn;
+ /* xb - xd */
+ Xbminusd = xbIn - xdIn;
+ /* ya - yc */
+ Yaminusc = yaIn - ycIn;
+ /* yb - yd */
+ Ybminusd = ybIn - ydIn;
+
+ /* xa' = xa + xb + xc + xd */
+ pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+ /* ya' = ya + yb + yc + yd */
+ pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+ /* (xa - xc) + (yb - yd) */
+ Xb12C_out = (Xaminusc + Ybminusd);
+ /* (ya - yc) + (xb - xd) */
+ Yb12C_out = (Yaminusc - Xbminusd);
+ /* (xa + xc) - (xb + xd) */
+ Xc12C_out = (Xaplusc - Xbplusd);
+ /* (ya + yc) - (yb + yd) */
+ Yc12C_out = (Yaplusc - Ybplusd);
+ /* (xa - xc) - (yb - yd) */
+ Xd12C_out = (Xaminusc - Ybminusd);
+ /* (ya - yc) + (xb - xd) */
+ Yd12C_out = (Xbminusd + Yaminusc);
+
+ co1 = pCoef[ia1 * 2U];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+
+ /* index calculation for the coefficients */
+ ia3 = ia2 + ia1;
+ co3 = pCoef[ia3 * 2U];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+
+ Xb12_out = Xb12C_out * co1;
+ Yb12_out = Yb12C_out * co1;
+ Xc12_out = Xc12C_out * co2;
+ Yc12_out = Yc12C_out * co2;
+ Xd12_out = Xd12C_out * co3;
+ Yd12_out = Yd12C_out * co3;
+
+ /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+ //Xb12_out -= Yb12C_out * si1;
+ p0 = Yb12C_out * si1;
+ /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+ //Yb12_out += Xb12C_out * si1;
+ p1 = Xb12C_out * si1;
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ //Xc12_out -= Yc12C_out * si2;
+ p2 = Yc12C_out * si2;
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ //Yc12_out += Xc12C_out * si2;
+ p3 = Xc12C_out * si2;
+ /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+ //Xd12_out -= Yd12C_out * si3;
+ p4 = Yd12C_out * si3;
+ /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+ //Yd12_out += Xd12C_out * si3;
+ p5 = Xd12C_out * si3;
+
+ Xb12_out += p0;
+ Yb12_out -= p1;
+ Xc12_out += p2;
+ Yc12_out -= p3;
+ Xd12_out += p4;
+ Yd12_out -= p5;
+
+ /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = Xc12_out;
+
+ /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+ pSrc[(2U * i1) + 1U] = Yc12_out;
+
+ /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+ pSrc[2U * i2] = Xb12_out;
+
+ /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+ pSrc[(2U * i2) + 1U] = Yb12_out;
+
+ /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+ pSrc[2U * i3] = Xd12_out;
+
+ /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+ pSrc[(2U * i3) + 1U] = Yd12_out;
+
+ /* Twiddle coefficients index modifier */
+ ia1 += twidCoefModifier;
+
+ /* Updating input index */
+ i0++;
+
+ }
+ while (--j);
+
+ twidCoefModifier <<= 2U;
+
+ /* Calculation of second stage to excluding last stage */
+ for (k = fftLen >> 2U; k > 4U; k >>= 2U)
+ {
+ /* Initializations for the first stage */
+ n1 = n2;
+ n2 >>= 2U;
+ ia1 = 0U;
+
+ /* Calculation of first stage */
+ j = 0;
+ do
+ {
+ /* index calculation for the coefficients */
+ ia2 = ia1 + ia1;
+ ia3 = ia2 + ia1;
+ co1 = pCoef[ia1 * 2U];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+ co2 = pCoef[ia2 * 2U];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+ co3 = pCoef[ia3 * 2U];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+
+ /* Twiddle coefficients index modifier */
+ ia1 += twidCoefModifier;
+
+ i0 = j;
+ do
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ xaIn = pSrc[(2U * i0)];
+ yaIn = pSrc[(2U * i0) + 1U];
+
+ xbIn = pSrc[(2U * i1)];
+ ybIn = pSrc[(2U * i1) + 1U];
+
+ xcIn = pSrc[(2U * i2)];
+ ycIn = pSrc[(2U * i2) + 1U];
+
+ xdIn = pSrc[(2U * i3)];
+ ydIn = pSrc[(2U * i3) + 1U];
+
+ /* xa - xc */
+ Xaminusc = xaIn - xcIn;
+ /* (xb - xd) */
+ Xbminusd = xbIn - xdIn;
+ /* ya - yc */
+ Yaminusc = yaIn - ycIn;
+ /* (yb - yd) */
+ Ybminusd = ybIn - ydIn;
+
+ /* xa + xc */
+ Xaplusc = xaIn + xcIn;
+ /* xb + xd */
+ Xbplusd = xbIn + xdIn;
+ /* ya + yc */
+ Yaplusc = yaIn + ycIn;
+ /* yb + yd */
+ Ybplusd = ybIn + ydIn;
+
+ /* (xa - xc) + (yb - yd) */
+ Xb12C_out = (Xaminusc + Ybminusd);
+ /* (ya - yc) - (xb - xd) */
+ Yb12C_out = (Yaminusc - Xbminusd);
+ /* xa + xc -(xb + xd) */
+ Xc12C_out = (Xaplusc - Xbplusd);
+ /* (ya + yc) - (yb + yd) */
+ Yc12C_out = (Yaplusc - Ybplusd);
+ /* (xa - xc) - (yb - yd) */
+ Xd12C_out = (Xaminusc - Ybminusd);
+ /* (ya - yc) + (xb - xd) */
+ Yd12C_out = (Xbminusd + Yaminusc);
+
+ pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+ pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+ Xb12_out = Xb12C_out * co1;
+ Yb12_out = Yb12C_out * co1;
+ Xc12_out = Xc12C_out * co2;
+ Yc12_out = Yc12C_out * co2;
+ Xd12_out = Xd12C_out * co3;
+ Yd12_out = Yd12C_out * co3;
+
+ /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+ //Xb12_out -= Yb12C_out * si1;
+ p0 = Yb12C_out * si1;
+ /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+ //Yb12_out += Xb12C_out * si1;
+ p1 = Xb12C_out * si1;
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ //Xc12_out -= Yc12C_out * si2;
+ p2 = Yc12C_out * si2;
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ //Yc12_out += Xc12C_out * si2;
+ p3 = Xc12C_out * si2;
+ /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+ //Xd12_out -= Yd12C_out * si3;
+ p4 = Yd12C_out * si3;
+ /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+ //Yd12_out += Xd12C_out * si3;
+ p5 = Xd12C_out * si3;
+
+ Xb12_out += p0;
+ Yb12_out -= p1;
+ Xc12_out += p2;
+ Yc12_out -= p3;
+ Xd12_out += p4;
+ Yd12_out -= p5;
+
+ /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = Xc12_out;
+
+ /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+ pSrc[(2U * i1) + 1U] = Yc12_out;
+
+ /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+ pSrc[2U * i2] = Xb12_out;
+
+ /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+ pSrc[(2U * i2) + 1U] = Yb12_out;
+
+ /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+ pSrc[2U * i3] = Xd12_out;
+
+ /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+ pSrc[(2U * i3) + 1U] = Yd12_out;
+
+ i0 += n1;
+ } while (i0 < fftLen);
+ j++;
+ } while (j <= (n2 - 1U));
+ twidCoefModifier <<= 2U;
+ }
+
+ j = fftLen >> 2;
+ ptr1 = &pSrc[0];
+
+ /* Calculations of last stage */
+ do
+ {
+ xaIn = ptr1[0];
+ yaIn = ptr1[1];
+ xbIn = ptr1[2];
+ ybIn = ptr1[3];
+ xcIn = ptr1[4];
+ ycIn = ptr1[5];
+ xdIn = ptr1[6];
+ ydIn = ptr1[7];
+
+ /* xa + xc */
+ Xaplusc = xaIn + xcIn;
+
+ /* xa - xc */
+ Xaminusc = xaIn - xcIn;
+
+ /* ya + yc */
+ Yaplusc = yaIn + ycIn;
+
+ /* ya - yc */
+ Yaminusc = yaIn - ycIn;
+
+ /* xb + xd */
+ Xbplusd = xbIn + xdIn;
+
+ /* yb + yd */
+ Ybplusd = ybIn + ydIn;
+
+ /* (xb-xd) */
+ Xbminusd = xbIn - xdIn;
+
+ /* (yb-yd) */
+ Ybminusd = ybIn - ydIn;
+
+ /* xa' = xa + xb + xc + xd */
+ a0 = (Xaplusc + Xbplusd);
+ /* ya' = ya + yb + yc + yd */
+ a1 = (Yaplusc + Ybplusd);
+ /* xc' = (xa-xb+xc-xd) */
+ a2 = (Xaplusc - Xbplusd);
+ /* yc' = (ya-yb+yc-yd) */
+ a3 = (Yaplusc - Ybplusd);
+ /* xb' = (xa+yb-xc-yd) */
+ a4 = (Xaminusc + Ybminusd);
+ /* yb' = (ya-xb-yc+xd) */
+ a5 = (Yaminusc - Xbminusd);
+ /* xd' = (xa-yb-xc+yd)) */
+ a6 = (Xaminusc - Ybminusd);
+ /* yd' = (ya+xb-yc-xd) */
+ a7 = (Xbminusd + Yaminusc);
+
+ ptr1[0] = a0;
+ ptr1[1] = a1;
+ ptr1[2] = a2;
+ ptr1[3] = a3;
+ ptr1[4] = a4;
+ ptr1[5] = a5;
+ ptr1[6] = a6;
+ ptr1[7] = a7;
+
+ /* increment pointer by 8 */
+ ptr1 += 8U;
+ } while (--j);
+
+#else
+
+ float16_t t1, t2, r1, r2, s1, s2;
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Initializations for the fft calculation */
+ n2 = fftLen;
+ n1 = n2;
+ for (k = fftLen; k > 1U; k >>= 2U)
+ {
+ /* Initializations for the fft calculation */
+ n1 = n2;
+ n2 >>= 2U;
+ ia1 = 0U;
+
+ /* FFT Calculation */
+ j = 0;
+ do
+ {
+ /* index calculation for the coefficients */
+ ia2 = ia1 + ia1;
+ ia3 = ia2 + ia1;
+ co1 = pCoef[ia1 * 2U];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+ co2 = pCoef[ia2 * 2U];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+ co3 = pCoef[ia3 * 2U];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+
+ /* Twiddle coefficients index modifier */
+ ia1 = ia1 + twidCoefModifier;
+
+ i0 = j;
+ do
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* xa + xc */
+ r1 = pSrc[(2U * i0)] + pSrc[(2U * i2)];
+
+ /* xa - xc */
+ r2 = pSrc[(2U * i0)] - pSrc[(2U * i2)];
+
+ /* ya + yc */
+ s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+
+ /* ya - yc */
+ s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+ /* xb + xd */
+ t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+ /* xa' = xa + xb + xc + xd */
+ pSrc[2U * i0] = r1 + t1;
+
+ /* xa + xc -(xb + xd) */
+ r1 = r1 - t1;
+
+ /* yb + yd */
+ t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+
+ /* ya' = ya + yb + yc + yd */
+ pSrc[(2U * i0) + 1U] = s1 + t2;
+
+ /* (ya + yc) - (yb + yd) */
+ s1 = s1 - t2;
+
+ /* (yb - yd) */
+ t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+
+ /* (xb - xd) */
+ t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+ /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = (r1 * co2) + (s1 * si2);
+
+ /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+ pSrc[(2U * i1) + 1U] = (s1 * co2) - (r1 * si2);
+
+ /* (xa - xc) + (yb - yd) */
+ r1 = r2 + t1;
+
+ /* (xa - xc) - (yb - yd) */
+ r2 = r2 - t1;
+
+ /* (ya - yc) - (xb - xd) */
+ s1 = s2 - t2;
+
+ /* (ya - yc) + (xb - xd) */
+ s2 = s2 + t2;
+
+ /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+ pSrc[2U * i2] = (r1 * co1) + (s1 * si1);
+
+ /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+ pSrc[(2U * i2) + 1U] = (s1 * co1) - (r1 * si1);
+
+ /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+ pSrc[2U * i3] = (r2 * co3) + (s2 * si3);
+
+ /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+ pSrc[(2U * i3) + 1U] = (s2 * co3) - (r2 * si3);
+
+ i0 += n1;
+ } while ( i0 < fftLen);
+ j++;
+ } while (j <= (n2 - 1U));
+ twidCoefModifier <<= 2U;
+ }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
+
+/*
+* @brief Core function for the floating-point CIFFT butterfly process.
+* @param[in, out] *pSrc points to the in-place buffer of floating-point data type.
+* @param[in] fftLen length of the FFT.
+* @param[in] *pCoef points to twiddle coefficient buffer.
+* @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+* @param[in] onebyfftLen value of 1/fftLen.
+* @return none.
+*/
+
+void arm_radix4_butterfly_inverse_f16(
+float16_t * pSrc,
+uint16_t fftLen,
+const float16_t * pCoef,
+uint16_t twidCoefModifier,
+float16_t onebyfftLen)
+{
+ float16_t co1, co2, co3, si1, si2, si3;
+ uint32_t ia1, ia2, ia3;
+ uint32_t i0, i1, i2, i3;
+ uint32_t n1, n2, j, k;
+
+#if defined (ARM_MATH_DSP)
+
+ float16_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
+ float16_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
+ Ybminusd;
+ float16_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
+ float16_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
+ float16_t *ptr1;
+ float16_t p0,p1,p2,p3,p4,p5,p6,p7;
+ float16_t a0,a1,a2,a3,a4,a5,a6,a7;
+
+
+ /* Initializations for the first stage */
+ n2 = fftLen;
+ n1 = n2;
+
+ /* n2 = fftLen/4 */
+ n2 >>= 2U;
+ i0 = 0U;
+ ia1 = 0U;
+
+ j = n2;
+
+ /* Calculation of first stage */
+ /* NOTE: the inverse transform applies conjugated twiddles, i.e. the
+ sine terms carry the opposite sign compared to the forward FFT. */
+ do
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* Butterfly implementation */
+ xaIn = pSrc[(2U * i0)];
+ yaIn = pSrc[(2U * i0) + 1U];
+
+ xcIn = pSrc[(2U * i2)];
+ ycIn = pSrc[(2U * i2) + 1U];
+
+ xbIn = pSrc[(2U * i1)];
+ ybIn = pSrc[(2U * i1) + 1U];
+
+ xdIn = pSrc[(2U * i3)];
+ ydIn = pSrc[(2U * i3) + 1U];
+
+ /* xa + xc */
+ Xaplusc = xaIn + xcIn;
+ /* xb + xd */
+ Xbplusd = xbIn + xdIn;
+ /* ya + yc */
+ Yaplusc = yaIn + ycIn;
+ /* yb + yd */
+ Ybplusd = ybIn + ydIn;
+
+ /* index calculation for the coefficients */
+ ia2 = ia1 + ia1;
+ co2 = pCoef[ia2 * 2U];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+
+ /* xa - xc */
+ Xaminusc = xaIn - xcIn;
+ /* xb - xd */
+ Xbminusd = xbIn - xdIn;
+ /* ya - yc */
+ Yaminusc = yaIn - ycIn;
+ /* yb - yd */
+ Ybminusd = ybIn - ydIn;
+
+ /* xa' = xa + xb + xc + xd */
+ pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+
+ /* ya' = ya + yb + yc + yd */
+ pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+ /* (xa - xc) - (yb - yd) */
+ Xb12C_out = (Xaminusc - Ybminusd);
+ /* (ya - yc) + (xb - xd) */
+ Yb12C_out = (Yaminusc + Xbminusd);
+ /* (xa + xc) - (xb + xd) */
+ Xc12C_out = (Xaplusc - Xbplusd);
+ /* (ya + yc) - (yb + yd) */
+ Yc12C_out = (Yaplusc - Ybplusd);
+ /* (xa - xc) + (yb - yd) */
+ Xd12C_out = (Xaminusc + Ybminusd);
+ /* (ya - yc) - (xb - xd) */
+ Yd12C_out = (Yaminusc - Xbminusd);
+
+ co1 = pCoef[ia1 * 2U];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+
+ /* index calculation for the coefficients */
+ ia3 = ia2 + ia1;
+ co3 = pCoef[ia3 * 2U];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+
+ Xb12_out = Xb12C_out * co1;
+ Yb12_out = Yb12C_out * co1;
+ Xc12_out = Xc12C_out * co2;
+ Yc12_out = Yc12C_out * co2;
+ Xd12_out = Xd12C_out * co3;
+ Yd12_out = Yd12C_out * co3;
+
+ /* xb' = (xa-yb-xc+yd)co1 - (ya+xb-yc-xd)(si1) */
+ //Xb12_out -= Yb12C_out * si1;
+ p0 = Yb12C_out * si1;
+ /* yb' = (ya+xb-yc-xd)co1 + (xa-yb-xc+yd)(si1) */
+ //Yb12_out += Xb12C_out * si1;
+ p1 = Xb12C_out * si1;
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ //Xc12_out -= Yc12C_out * si2;
+ p2 = Yc12C_out * si2;
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ //Yc12_out += Xc12C_out * si2;
+ p3 = Xc12C_out * si2;
+ /* xd' = (xa+yb-xc-yd)co3 - (ya-xb-yc+xd)(si3) */
+ //Xd12_out -= Yd12C_out * si3;
+ p4 = Yd12C_out * si3;
+ /* yd' = (ya-xb-yc+xd)co3 + (xa+yb-xc-yd)(si3) */
+ //Yd12_out += Xd12C_out * si3;
+ p5 = Xd12C_out * si3;
+
+ Xb12_out -= p0;
+ Yb12_out += p1;
+ Xc12_out -= p2;
+ Yc12_out += p3;
+ Xd12_out -= p4;
+ Yd12_out += p5;
+
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = Xc12_out;
+
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ pSrc[(2U * i1) + 1U] = Yc12_out;
+
+ /* xb' = (xa-yb-xc+yd)co1 - (ya+xb-yc-xd)(si1) */
+ pSrc[2U * i2] = Xb12_out;
+
+ /* yb' = (ya+xb-yc-xd)co1 + (xa-yb-xc+yd)(si1) */
+ pSrc[(2U * i2) + 1U] = Yb12_out;
+
+ /* xd' = (xa+yb-xc-yd)co3 - (ya-xb-yc+xd)(si3) */
+ pSrc[2U * i3] = Xd12_out;
+
+ /* yd' = (ya-xb-yc+xd)co3 + (xa+yb-xc-yd)(si3) */
+ pSrc[(2U * i3) + 1U] = Yd12_out;
+
+ /* Twiddle coefficients index modifier */
+ ia1 = ia1 + twidCoefModifier;
+
+ /* Updating input index */
+ i0 = i0 + 1U;
+
+ } while (--j);
+
+ twidCoefModifier <<= 2U;
+
+ /* Calculation of second stage to excluding last stage */
+ for (k = fftLen >> 2U; k > 4U; k >>= 2U)
+ {
+ /* Initializations for the first stage */
+ n1 = n2;
+ n2 >>= 2U;
+ ia1 = 0U;
+
+ /* Calculation of first stage */
+ j = 0;
+ do
+ {
+ /* index calculation for the coefficients */
+ ia2 = ia1 + ia1;
+ ia3 = ia2 + ia1;
+ co1 = pCoef[ia1 * 2U];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+ co2 = pCoef[ia2 * 2U];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+ co3 = pCoef[ia3 * 2U];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+
+ /* Twiddle coefficients index modifier */
+ ia1 = ia1 + twidCoefModifier;
+
+ i0 = j;
+ do
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ xaIn = pSrc[(2U * i0)];
+ yaIn = pSrc[(2U * i0) + 1U];
+
+ xbIn = pSrc[(2U * i1)];
+ ybIn = pSrc[(2U * i1) + 1U];
+
+ xcIn = pSrc[(2U * i2)];
+ ycIn = pSrc[(2U * i2) + 1U];
+
+ xdIn = pSrc[(2U * i3)];
+ ydIn = pSrc[(2U * i3) + 1U];
+
+ /* xa - xc */
+ Xaminusc = xaIn - xcIn;
+ /* (xb - xd) */
+ Xbminusd = xbIn - xdIn;
+ /* ya - yc */
+ Yaminusc = yaIn - ycIn;
+ /* (yb - yd) */
+ Ybminusd = ybIn - ydIn;
+
+ /* xa + xc */
+ Xaplusc = xaIn + xcIn;
+ /* xb + xd */
+ Xbplusd = xbIn + xdIn;
+ /* ya + yc */
+ Yaplusc = yaIn + ycIn;
+ /* yb + yd */
+ Ybplusd = ybIn + ydIn;
+
+ /* (xa - xc) - (yb - yd) */
+ Xb12C_out = (Xaminusc - Ybminusd);
+ /* (ya - yc) + (xb - xd) */
+ Yb12C_out = (Yaminusc + Xbminusd);
+ /* xa + xc -(xb + xd) */
+ Xc12C_out = (Xaplusc - Xbplusd);
+ /* (ya + yc) - (yb + yd) */
+ Yc12C_out = (Yaplusc - Ybplusd);
+ /* (xa - xc) + (yb - yd) */
+ Xd12C_out = (Xaminusc + Ybminusd);
+ /* (ya - yc) - (xb - xd) */
+ Yd12C_out = (Yaminusc - Xbminusd);
+
+ pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+ pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+ Xb12_out = Xb12C_out * co1;
+ Yb12_out = Yb12C_out * co1;
+ Xc12_out = Xc12C_out * co2;
+ Yc12_out = Yc12C_out * co2;
+ Xd12_out = Xd12C_out * co3;
+ Yd12_out = Yd12C_out * co3;
+
+ /* xb' = (xa-yb-xc+yd)co1 - (ya+xb-yc-xd)(si1) */
+ //Xb12_out -= Yb12C_out * si1;
+ p0 = Yb12C_out * si1;
+ /* yb' = (ya+xb-yc-xd)co1 + (xa-yb-xc+yd)(si1) */
+ //Yb12_out += Xb12C_out * si1;
+ p1 = Xb12C_out * si1;
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ //Xc12_out -= Yc12C_out * si2;
+ p2 = Yc12C_out * si2;
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ //Yc12_out += Xc12C_out * si2;
+ p3 = Xc12C_out * si2;
+ /* xd' = (xa+yb-xc-yd)co3 - (ya-xb-yc+xd)(si3) */
+ //Xd12_out -= Yd12C_out * si3;
+ p4 = Yd12C_out * si3;
+ /* yd' = (ya-xb-yc+xd)co3 + (xa+yb-xc-yd)(si3) */
+ //Yd12_out += Xd12C_out * si3;
+ p5 = Xd12C_out * si3;
+
+ Xb12_out -= p0;
+ Yb12_out += p1;
+ Xc12_out -= p2;
+ Yc12_out += p3;
+ Xd12_out -= p4;
+ Yd12_out += p5;
+
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = Xc12_out;
+
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ pSrc[(2U * i1) + 1U] = Yc12_out;
+
+ /* xb' = (xa-yb-xc+yd)co1 - (ya+xb-yc-xd)(si1) */
+ pSrc[2U * i2] = Xb12_out;
+
+ /* yb' = (ya+xb-yc-xd)co1 + (xa-yb-xc+yd)(si1) */
+ pSrc[(2U * i2) + 1U] = Yb12_out;
+
+ /* xd' = (xa+yb-xc-yd)co3 - (ya-xb-yc+xd)(si3) */
+ pSrc[2U * i3] = Xd12_out;
+
+ /* yd' = (ya-xb-yc+xd)co3 + (xa+yb-xc-yd)(si3) */
+ pSrc[(2U * i3) + 1U] = Yd12_out;
+
+ i0 += n1;
+ } while (i0 < fftLen);
+ j++;
+ } while (j <= (n2 - 1U));
+ twidCoefModifier <<= 2U;
+ }
+ /* Initializations of last stage */
+
+ j = fftLen >> 2;
+ ptr1 = &pSrc[0];
+
+ /* Calculations of last stage */
+ /* Last stage has unit twiddles; each output is scaled by 1/fftLen. */
+ do
+ {
+ xaIn = ptr1[0];
+ yaIn = ptr1[1];
+ xbIn = ptr1[2];
+ ybIn = ptr1[3];
+ xcIn = ptr1[4];
+ ycIn = ptr1[5];
+ xdIn = ptr1[6];
+ ydIn = ptr1[7];
+
+ /* Butterfly implementation */
+ /* xa + xc */
+ Xaplusc = xaIn + xcIn;
+
+ /* xa - xc */
+ Xaminusc = xaIn - xcIn;
+
+ /* ya + yc */
+ Yaplusc = yaIn + ycIn;
+
+ /* ya - yc */
+ Yaminusc = yaIn - ycIn;
+
+ /* xb + xd */
+ Xbplusd = xbIn + xdIn;
+
+ /* yb + yd */
+ Ybplusd = ybIn + ydIn;
+
+ /* (xb-xd) */
+ Xbminusd = xbIn - xdIn;
+
+ /* (yb-yd) */
+ Ybminusd = ybIn - ydIn;
+
+ /* xa' = (xa+xb+xc+xd) * onebyfftLen */
+ a0 = (Xaplusc + Xbplusd);
+ /* ya' = (ya+yb+yc+yd) * onebyfftLen */
+ a1 = (Yaplusc + Ybplusd);
+ /* xc' = (xa-xb+xc-xd) * onebyfftLen */
+ a2 = (Xaplusc - Xbplusd);
+ /* yc' = (ya-yb+yc-yd) * onebyfftLen */
+ a3 = (Yaplusc - Ybplusd);
+ /* xb' = (xa-yb-xc+yd) * onebyfftLen */
+ a4 = (Xaminusc - Ybminusd);
+ /* yb' = (ya+xb-yc-xd) * onebyfftLen */
+ a5 = (Yaminusc + Xbminusd);
+ /* xd' = (xa+yb-xc-yd) * onebyfftLen */
+ a6 = (Xaminusc + Ybminusd);
+ /* yd' = (ya-xb-yc+xd) * onebyfftLen */
+ a7 = (Yaminusc - Xbminusd);
+
+ p0 = a0 * onebyfftLen;
+ p1 = a1 * onebyfftLen;
+ p2 = a2 * onebyfftLen;
+ p3 = a3 * onebyfftLen;
+ p4 = a4 * onebyfftLen;
+ p5 = a5 * onebyfftLen;
+ p6 = a6 * onebyfftLen;
+ p7 = a7 * onebyfftLen;
+
+ /* xa' = (xa+xb+xc+xd) * onebyfftLen */
+ ptr1[0] = p0;
+ /* ya' = (ya+yb+yc+yd) * onebyfftLen */
+ ptr1[1] = p1;
+ /* xc' = (xa-xb+xc-xd) * onebyfftLen */
+ ptr1[2] = p2;
+ /* yc' = (ya-yb+yc-yd) * onebyfftLen */
+ ptr1[3] = p3;
+ /* xb' = (xa-yb-xc+yd) * onebyfftLen */
+ ptr1[4] = p4;
+ /* yb' = (ya+xb-yc-xd) * onebyfftLen */
+ ptr1[5] = p5;
+ /* xd' = (xa+yb-xc-yd) * onebyfftLen */
+ ptr1[6] = p6;
+ /* yd' = (ya-xb-yc+xd) * onebyfftLen */
+ ptr1[7] = p7;
+
+ /* increment source pointer by 8 for next calculations */
+ ptr1 = ptr1 + 8U;
+
+ } while (--j);
+
+#else
+
+ float16_t t1, t2, r1, r2, s1, s2;
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Initializations for the first stage */
+ n2 = fftLen;
+ n1 = n2;
+
+ /* Calculation of first stage */
+ for (k = fftLen; k > 4U; k >>= 2U)
+ {
+ /* Initializations for the first stage */
+ n1 = n2;
+ n2 >>= 2U;
+ ia1 = 0U;
+
+ /* Calculation of first stage */
+ j = 0;
+ do
+ {
+ /* index calculation for the coefficients */
+ ia2 = ia1 + ia1;
+ ia3 = ia2 + ia1;
+ co1 = pCoef[ia1 * 2U];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+ co2 = pCoef[ia2 * 2U];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+ co3 = pCoef[ia3 * 2U];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+
+ /* Twiddle coefficients index modifier */
+ ia1 = ia1 + twidCoefModifier;
+
+ i0 = j;
+ do
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* xa + xc */
+ r1 = pSrc[(2U * i0)] + pSrc[(2U * i2)];
+
+ /* xa - xc */
+ r2 = pSrc[(2U * i0)] - pSrc[(2U * i2)];
+
+ /* ya + yc */
+ s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+
+ /* ya - yc */
+ s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+ /* xb + xd */
+ t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+ /* xa' = xa + xb + xc + xd */
+ pSrc[2U * i0] = r1 + t1;
+
+ /* xa + xc -(xb + xd) */
+ r1 = r1 - t1;
+
+ /* yb + yd */
+ t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+
+ /* ya' = ya + yb + yc + yd */
+ pSrc[(2U * i0) + 1U] = s1 + t2;
+
+ /* (ya + yc) - (yb + yd) */
+ s1 = s1 - t2;
+
+ /* (yb - yd) */
+ t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+
+ /* (xb - xd) */
+ t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = (r1 * co2) - (s1 * si2);
+
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ pSrc[(2U * i1) + 1U] = (s1 * co2) + (r1 * si2);
+
+ /* (xa - xc) - (yb - yd) */
+ r1 = r2 - t1;
+
+ /* (xa - xc) + (yb - yd) */
+ r2 = r2 + t1;
+
+ /* (ya - yc) + (xb - xd) */
+ s1 = s2 + t2;
+
+ /* (ya - yc) - (xb - xd) */
+ s2 = s2 - t2;
+
+ /* xb' = (xa-yb-xc+yd)co1 - (ya+xb-yc-xd)(si1) */
+ pSrc[2U * i2] = (r1 * co1) - (s1 * si1);
+
+ /* yb' = (ya+xb-yc-xd)co1 + (xa-yb-xc+yd)(si1) */
+ pSrc[(2U * i2) + 1U] = (s1 * co1) + (r1 * si1);
+
+ /* xd' = (xa+yb-xc-yd)co3 - (ya-xb-yc+xd)(si3) */
+ pSrc[2U * i3] = (r2 * co3) - (s2 * si3);
+
+ /* yd' = (ya-xb-yc+xd)co3 + (xa+yb-xc-yd)(si3) */
+ pSrc[(2U * i3) + 1U] = (s2 * co3) + (r2 * si3);
+
+ i0 += n1;
+ } while ( i0 < fftLen);
+ j++;
+ } while (j <= (n2 - 1U));
+ twidCoefModifier <<= 2U;
+ }
+ /* Initializations of last stage */
+ n1 = n2;
+ n2 >>= 2U;
+
+ /* Calculations of last stage */
+ /* Last stage has unit twiddles; each output is scaled by 1/fftLen. */
+ for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* Butterfly implementation */
+ /* xa + xc */
+ r1 = pSrc[2U * i0] + pSrc[2U * i2];
+
+ /* xa - xc */
+ r2 = pSrc[2U * i0] - pSrc[2U * i2];
+
+ /* ya + yc */
+ s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+
+ /* ya - yc */
+ s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+ /* xb + xd */
+ t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+ /* xa' = xa + xb + xc + xd */
+ pSrc[2U * i0] = (r1 + t1) * onebyfftLen;
+
+ /* (xa + xc) - (xb + xd) */
+ r1 = r1 - t1;
+
+ /* yb + yd */
+ t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+
+ /* ya' = ya + yb + yc + yd */
+ pSrc[(2U * i0) + 1U] = (s1 + t2) * onebyfftLen;
+
+ /* (ya + yc) - (yb + yd) */
+ s1 = s1 - t2;
+
+ /* (yb-yd) */
+ t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+
+ /* (xb-xd) */
+ t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+ /* xc' = (xa-xb+xc-xd) * onebyfftLen */
+ pSrc[2U * i1] = r1 * onebyfftLen;
+
+ /* yc' = (ya-yb+yc-yd) * onebyfftLen */
+ pSrc[(2U * i1) + 1U] = s1 * onebyfftLen;
+
+ /* (xa - xc) - (yb-yd) */
+ r1 = r2 - t1;
+
+ /* (xa - xc) + (yb-yd) */
+ r2 = r2 + t1;
+
+ /* (ya - yc) + (xb-xd) */
+ s1 = s2 + t2;
+
+ /* (ya - yc) - (xb-xd) */
+ s2 = s2 - t2;
+
+ /* xb' = (xa-yb-xc+yd) * onebyfftLen */
+ pSrc[2U * i2] = r1 * onebyfftLen;
+
+ /* yb' = (ya+xb-yc-xd) * onebyfftLen */
+ pSrc[(2U * i2) + 1U] = s1 * onebyfftLen;
+
+ /* xd' = (xa+yb-xc-yd) * onebyfftLen */
+ pSrc[2U * i3] = r2 * onebyfftLen;
+
+ /* yd' = (ya-xb-yc+xd) * onebyfftLen */
+ pSrc[(2U * i3) + 1U] = s2 * onebyfftLen;
+ }
+
+#endif /* #if defined (ARM_MATH_DSP) */
+}
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c
new file mode 100644
index 0000000000..9d9d4d5638
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_f32.c
@@ -0,0 +1,1203 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix4_f32.c
+ * Description: Radix-4 Decimation in Frequency CFFT & CIFFT Floating point processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+extern void arm_bitreversal_f32(
+ float32_t * pSrc,
+ uint16_t fftSize,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab);
+
+void arm_radix4_butterfly_f32(
+ float32_t * pSrc,
+ uint16_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier);
+
+void arm_radix4_butterfly_inverse_f32(
+ float32_t * pSrc,
+ uint16_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier,
+ float32_t onebyfftLen);
+
+
+
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+
+/**
+ @brief Processing function for the floating-point Radix-4 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future.
+ @param[in] S points to an instance of the floating-point Radix-4 CFFT/CIFFT structure
+ @param[in,out] pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @return none
+ */
+
+void arm_cfft_radix4_f32(
+  const arm_cfft_radix4_instance_f32 * S,
+  float32_t * pSrc)
+{
+  /* Dispatch to the forward or inverse radix-4 kernel, as selected
+     when the instance was initialized. Processing is in-place. */
+  if (S->ifftFlag != 1U)
+  {
+    /* Complex FFT radix-4 */
+    arm_radix4_butterfly_f32(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
+  }
+  else
+  {
+    /* Complex IFFT radix-4 (output scaled by 1/fftLen) */
+    arm_radix4_butterfly_inverse_f32(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier, S->onebyfftLen);
+  }
+
+  /* Optionally reorder the in-place result into natural order */
+  if (S->bitReverseFlag == 1U)
+  {
+    /* Bit Reversal */
+    arm_bitreversal_f32(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+  }
+}
+
+/**
+ @} end of ComplexFFT group
+ */
+
+/* ----------------------------------------------------------------------
+ * Internal helper function used by the FFTs
+ * ---------------------------------------------------------------------- */
+
+/**
+ brief Core function for the floating-point CFFT butterfly process.
+ param[in,out] pSrc points to the in-place buffer of floating-point data type
+ param[in] fftLen length of the FFT
+ param[in] pCoef points to the twiddle coefficient buffer
+ param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+ return none
+ */
+
+void arm_radix4_butterfly_f32(
+ float32_t * pSrc,
+ uint16_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier)
+{
+ float32_t co1, co2, co3, si1, si2, si3;
+ uint32_t ia1, ia2, ia3;
+ uint32_t i0, i1, i2, i3;
+ uint32_t n1, n2, j, k;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ float32_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
+ float32_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
+ Ybminusd;
+ float32_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
+ float32_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
+ float32_t *ptr1;
+ float32_t p0,p1,p2,p3,p4,p5;
+ float32_t a0,a1,a2,a3,a4,a5,a6,a7;
+
+ /* Initializations for the first stage */
+ n2 = fftLen;
+ n1 = n2;
+
+ /* n2 = fftLen/4 */
+ n2 >>= 2U;
+ i0 = 0U;
+ ia1 = 0U;
+
+ j = n2;
+
+ /* Calculation of first stage */
+ do
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ xaIn = pSrc[(2U * i0)];
+ yaIn = pSrc[(2U * i0) + 1U];
+
+ xbIn = pSrc[(2U * i1)];
+ ybIn = pSrc[(2U * i1) + 1U];
+
+ xcIn = pSrc[(2U * i2)];
+ ycIn = pSrc[(2U * i2) + 1U];
+
+ xdIn = pSrc[(2U * i3)];
+ ydIn = pSrc[(2U * i3) + 1U];
+
+ /* xa + xc */
+ Xaplusc = xaIn + xcIn;
+ /* xb + xd */
+ Xbplusd = xbIn + xdIn;
+ /* ya + yc */
+ Yaplusc = yaIn + ycIn;
+ /* yb + yd */
+ Ybplusd = ybIn + ydIn;
+
+ /* index calculation for the coefficients */
+ ia2 = ia1 + ia1;
+ co2 = pCoef[ia2 * 2U];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+
+ /* xa - xc */
+ Xaminusc = xaIn - xcIn;
+ /* xb - xd */
+ Xbminusd = xbIn - xdIn;
+ /* ya - yc */
+ Yaminusc = yaIn - ycIn;
+ /* yb - yd */
+ Ybminusd = ybIn - ydIn;
+
+ /* xa' = xa + xb + xc + xd */
+ pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+ /* ya' = ya + yb + yc + yd */
+ pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+ /* (xa - xc) + (yb - yd) */
+ Xb12C_out = (Xaminusc + Ybminusd);
+ /* (ya - yc) + (xb - xd) */
+ Yb12C_out = (Yaminusc - Xbminusd);
+ /* (xa + xc) - (xb + xd) */
+ Xc12C_out = (Xaplusc - Xbplusd);
+ /* (ya + yc) - (yb + yd) */
+ Yc12C_out = (Yaplusc - Ybplusd);
+ /* (xa - xc) - (yb - yd) */
+ Xd12C_out = (Xaminusc - Ybminusd);
+ /* (ya - yc) + (xb - xd) */
+ Yd12C_out = (Xbminusd + Yaminusc);
+
+ co1 = pCoef[ia1 * 2U];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+
+ /* index calculation for the coefficients */
+ ia3 = ia2 + ia1;
+ co3 = pCoef[ia3 * 2U];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+
+ Xb12_out = Xb12C_out * co1;
+ Yb12_out = Yb12C_out * co1;
+ Xc12_out = Xc12C_out * co2;
+ Yc12_out = Yc12C_out * co2;
+ Xd12_out = Xd12C_out * co3;
+ Yd12_out = Yd12C_out * co3;
+
+ /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+ //Xb12_out -= Yb12C_out * si1;
+ p0 = Yb12C_out * si1;
+ /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+ //Yb12_out += Xb12C_out * si1;
+ p1 = Xb12C_out * si1;
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ //Xc12_out -= Yc12C_out * si2;
+ p2 = Yc12C_out * si2;
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ //Yc12_out += Xc12C_out * si2;
+ p3 = Xc12C_out * si2;
+ /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+ //Xd12_out -= Yd12C_out * si3;
+ p4 = Yd12C_out * si3;
+ /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+ //Yd12_out += Xd12C_out * si3;
+ p5 = Xd12C_out * si3;
+
+ Xb12_out += p0;
+ Yb12_out -= p1;
+ Xc12_out += p2;
+ Yc12_out -= p3;
+ Xd12_out += p4;
+ Yd12_out -= p5;
+
+ /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = Xc12_out;
+
+ /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+ pSrc[(2U * i1) + 1U] = Yc12_out;
+
+ /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+ pSrc[2U * i2] = Xb12_out;
+
+ /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+ pSrc[(2U * i2) + 1U] = Yb12_out;
+
+ /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+ pSrc[2U * i3] = Xd12_out;
+
+ /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+ pSrc[(2U * i3) + 1U] = Yd12_out;
+
+ /* Twiddle coefficients index modifier */
+ ia1 += twidCoefModifier;
+
+ /* Updating input index */
+ i0++;
+
+ }
+ while (--j);
+
+ twidCoefModifier <<= 2U;
+
+ /* Calculation of second stage to excluding last stage */
+ for (k = fftLen >> 2U; k > 4U; k >>= 2U)
+ {
+ /* Initializations for the first stage */
+ n1 = n2;
+ n2 >>= 2U;
+ ia1 = 0U;
+
+ /* Calculation of first stage */
+ j = 0;
+ do
+ {
+ /* index calculation for the coefficients */
+ ia2 = ia1 + ia1;
+ ia3 = ia2 + ia1;
+ co1 = pCoef[(ia1 * 2U)];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+ co2 = pCoef[(ia2 * 2U)];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+ co3 = pCoef[(ia3 * 2U)];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+
+ /* Twiddle coefficients index modifier */
+ ia1 += twidCoefModifier;
+
+ i0 = j;
+ do
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ xaIn = pSrc[(2U * i0)];
+ yaIn = pSrc[(2U * i0) + 1U];
+
+ xbIn = pSrc[(2U * i1)];
+ ybIn = pSrc[(2U * i1) + 1U];
+
+ xcIn = pSrc[(2U * i2)];
+ ycIn = pSrc[(2U * i2) + 1U];
+
+ xdIn = pSrc[(2U * i3)];
+ ydIn = pSrc[(2U * i3) + 1U];
+
+ /* xa - xc */
+ Xaminusc = xaIn - xcIn;
+ /* (xb - xd) */
+ Xbminusd = xbIn - xdIn;
+ /* ya - yc */
+ Yaminusc = yaIn - ycIn;
+ /* (yb - yd) */
+ Ybminusd = ybIn - ydIn;
+
+ /* xa + xc */
+ Xaplusc = xaIn + xcIn;
+ /* xb + xd */
+ Xbplusd = xbIn + xdIn;
+ /* ya + yc */
+ Yaplusc = yaIn + ycIn;
+ /* yb + yd */
+ Ybplusd = ybIn + ydIn;
+
+ /* (xa - xc) + (yb - yd) */
+ Xb12C_out = (Xaminusc + Ybminusd);
+ /* (ya - yc) - (xb - xd) */
+ Yb12C_out = (Yaminusc - Xbminusd);
+ /* xa + xc -(xb + xd) */
+ Xc12C_out = (Xaplusc - Xbplusd);
+ /* (ya + yc) - (yb + yd) */
+ Yc12C_out = (Yaplusc - Ybplusd);
+ /* (xa - xc) - (yb - yd) */
+ Xd12C_out = (Xaminusc - Ybminusd);
+ /* (ya - yc) + (xb - xd) */
+ Yd12C_out = (Xbminusd + Yaminusc);
+
+ pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+ pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+ Xb12_out = Xb12C_out * co1;
+ Yb12_out = Yb12C_out * co1;
+ Xc12_out = Xc12C_out * co2;
+ Yc12_out = Yc12C_out * co2;
+ Xd12_out = Xd12C_out * co3;
+ Yd12_out = Yd12C_out * co3;
+
+ /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+ //Xb12_out -= Yb12C_out * si1;
+ p0 = Yb12C_out * si1;
+ /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+ //Yb12_out += Xb12C_out * si1;
+ p1 = Xb12C_out * si1;
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ //Xc12_out -= Yc12C_out * si2;
+ p2 = Yc12C_out * si2;
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ //Yc12_out += Xc12C_out * si2;
+ p3 = Xc12C_out * si2;
+ /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+ //Xd12_out -= Yd12C_out * si3;
+ p4 = Yd12C_out * si3;
+ /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+ //Yd12_out += Xd12C_out * si3;
+ p5 = Xd12C_out * si3;
+
+ Xb12_out += p0;
+ Yb12_out -= p1;
+ Xc12_out += p2;
+ Yc12_out -= p3;
+ Xd12_out += p4;
+ Yd12_out -= p5;
+
+ /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = Xc12_out;
+
+ /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+ pSrc[(2U * i1) + 1U] = Yc12_out;
+
+ /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+ pSrc[2U * i2] = Xb12_out;
+
+ /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+ pSrc[(2U * i2) + 1U] = Yb12_out;
+
+ /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+ pSrc[2U * i3] = Xd12_out;
+
+ /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+ pSrc[(2U * i3) + 1U] = Yd12_out;
+
+ i0 += n1;
+ } while (i0 < fftLen);
+ j++;
+ } while (j <= (n2 - 1U));
+ twidCoefModifier <<= 2U;
+ }
+
+ j = fftLen >> 2;
+ ptr1 = &pSrc[0];
+
+ /* Calculations of last stage */
+ do
+ {
+ xaIn = ptr1[0];
+ yaIn = ptr1[1];
+ xbIn = ptr1[2];
+ ybIn = ptr1[3];
+ xcIn = ptr1[4];
+ ycIn = ptr1[5];
+ xdIn = ptr1[6];
+ ydIn = ptr1[7];
+
+ /* xa + xc */
+ Xaplusc = xaIn + xcIn;
+
+ /* xa - xc */
+ Xaminusc = xaIn - xcIn;
+
+ /* ya + yc */
+ Yaplusc = yaIn + ycIn;
+
+ /* ya - yc */
+ Yaminusc = yaIn - ycIn;
+
+ /* xb + xd */
+ Xbplusd = xbIn + xdIn;
+
+ /* yb + yd */
+ Ybplusd = ybIn + ydIn;
+
+ /* (xb-xd) */
+ Xbminusd = xbIn - xdIn;
+
+ /* (yb-yd) */
+ Ybminusd = ybIn - ydIn;
+
+ /* xa' = xa + xb + xc + xd */
+ a0 = (Xaplusc + Xbplusd);
+ /* ya' = ya + yb + yc + yd */
+ a1 = (Yaplusc + Ybplusd);
+ /* xc' = (xa-xb+xc-xd) */
+ a2 = (Xaplusc - Xbplusd);
+ /* yc' = (ya-yb+yc-yd) */
+ a3 = (Yaplusc - Ybplusd);
+ /* xb' = (xa+yb-xc-yd) */
+ a4 = (Xaminusc + Ybminusd);
+ /* yb' = (ya-xb-yc+xd) */
+ a5 = (Yaminusc - Xbminusd);
+ /* xd' = (xa-yb-xc+yd)) */
+ a6 = (Xaminusc - Ybminusd);
+ /* yd' = (ya+xb-yc-xd) */
+ a7 = (Xbminusd + Yaminusc);
+
+ ptr1[0] = a0;
+ ptr1[1] = a1;
+ ptr1[2] = a2;
+ ptr1[3] = a3;
+ ptr1[4] = a4;
+ ptr1[5] = a5;
+ ptr1[6] = a6;
+ ptr1[7] = a7;
+
+ /* increment pointer by 8 */
+ ptr1 += 8U;
+ } while (--j);
+
+#else
+
+ float32_t t1, t2, r1, r2, s1, s2;
+
+ /* Initializations for the fft calculation */
+ n2 = fftLen;
+ n1 = n2;
+ for (k = fftLen; k > 1U; k >>= 2U)
+ {
+ /* Initializations for the fft calculation */
+ n1 = n2;
+ n2 >>= 2U;
+ ia1 = 0U;
+
+ /* FFT Calculation */
+ j = 0;
+ do
+ {
+ /* index calculation for the coefficients */
+ ia2 = ia1 + ia1;
+ ia3 = ia2 + ia1;
+ co1 = pCoef[ia1 * 2U];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+ co2 = pCoef[ia2 * 2U];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+ co3 = pCoef[ia3 * 2U];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+
+ /* Twiddle coefficients index modifier */
+ ia1 = ia1 + twidCoefModifier;
+
+ i0 = j;
+ do
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* xa + xc */
+ r1 = pSrc[(2U * i0)] + pSrc[(2U * i2)];
+
+ /* xa - xc */
+ r2 = pSrc[(2U * i0)] - pSrc[(2U * i2)];
+
+ /* ya + yc */
+ s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+
+ /* ya - yc */
+ s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+ /* xb + xd */
+ t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+ /* xa' = xa + xb + xc + xd */
+ pSrc[2U * i0] = r1 + t1;
+
+ /* xa + xc -(xb + xd) */
+ r1 = r1 - t1;
+
+ /* yb + yd */
+ t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+
+ /* ya' = ya + yb + yc + yd */
+ pSrc[(2U * i0) + 1U] = s1 + t2;
+
+ /* (ya + yc) - (yb + yd) */
+ s1 = s1 - t2;
+
+ /* (yb - yd) */
+ t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+
+ /* (xb - xd) */
+ t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+ /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = (r1 * co2) + (s1 * si2);
+
+ /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+ pSrc[(2U * i1) + 1U] = (s1 * co2) - (r1 * si2);
+
+ /* (xa - xc) + (yb - yd) */
+ r1 = r2 + t1;
+
+ /* (xa - xc) - (yb - yd) */
+ r2 = r2 - t1;
+
+ /* (ya - yc) - (xb - xd) */
+ s1 = s2 - t2;
+
+ /* (ya - yc) + (xb - xd) */
+ s2 = s2 + t2;
+
+ /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+ pSrc[2U * i2] = (r1 * co1) + (s1 * si1);
+
+ /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+ pSrc[(2U * i2) + 1U] = (s1 * co1) - (r1 * si1);
+
+ /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+ pSrc[2U * i3] = (r2 * co3) + (s2 * si3);
+
+ /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+ pSrc[(2U * i3) + 1U] = (s2 * co3) - (r2 * si3);
+
+ i0 += n1;
+ } while ( i0 < fftLen);
+ j++;
+ } while (j <= (n2 - 1U));
+ twidCoefModifier <<= 2U;
+ }
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+}
+
+/**
+ brief Core function for the floating-point CIFFT butterfly process.
+ param[in,out] pSrc points to the in-place buffer of floating-point data type
+ param[in] fftLen length of the FFT
+ param[in] pCoef points to twiddle coefficient buffer
+ param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+ param[in] onebyfftLen value of 1/fftLen
+ return none
+ */
+
+void arm_radix4_butterfly_inverse_f32(
+ float32_t * pSrc,
+ uint16_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier,
+ float32_t onebyfftLen)
+{
+ float32_t co1, co2, co3, si1, si2, si3;
+ uint32_t ia1, ia2, ia3;
+ uint32_t i0, i1, i2, i3;
+ uint32_t n1, n2, j, k;
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ float32_t xaIn, yaIn, xbIn, ybIn, xcIn, ycIn, xdIn, ydIn;
+ float32_t Xaplusc, Xbplusd, Yaplusc, Ybplusd, Xaminusc, Xbminusd, Yaminusc,
+ Ybminusd;
+ float32_t Xb12C_out, Yb12C_out, Xc12C_out, Yc12C_out, Xd12C_out, Yd12C_out;
+ float32_t Xb12_out, Yb12_out, Xc12_out, Yc12_out, Xd12_out, Yd12_out;
+ float32_t *ptr1;
+ float32_t p0,p1,p2,p3,p4,p5,p6,p7;
+ float32_t a0,a1,a2,a3,a4,a5,a6,a7;
+
+
+ /* Initializations for the first stage */
+ n2 = fftLen;
+ n1 = n2;
+
+ /* n2 = fftLen/4 */
+ n2 >>= 2U;
+ i0 = 0U;
+ ia1 = 0U;
+
+ j = n2;
+
+ /* Calculation of first stage */
+ do
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* Butterfly implementation */
+ xaIn = pSrc[(2U * i0)];
+ yaIn = pSrc[(2U * i0) + 1U];
+
+ xcIn = pSrc[(2U * i2)];
+ ycIn = pSrc[(2U * i2) + 1U];
+
+ xbIn = pSrc[(2U * i1)];
+ ybIn = pSrc[(2U * i1) + 1U];
+
+ xdIn = pSrc[(2U * i3)];
+ ydIn = pSrc[(2U * i3) + 1U];
+
+ /* xa + xc */
+ Xaplusc = xaIn + xcIn;
+ /* xb + xd */
+ Xbplusd = xbIn + xdIn;
+ /* ya + yc */
+ Yaplusc = yaIn + ycIn;
+ /* yb + yd */
+ Ybplusd = ybIn + ydIn;
+
+ /* index calculation for the coefficients */
+ ia2 = ia1 + ia1;
+ co2 = pCoef[ia2 * 2U];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+
+ /* xa - xc */
+ Xaminusc = xaIn - xcIn;
+ /* xb - xd */
+ Xbminusd = xbIn - xdIn;
+ /* ya - yc */
+ Yaminusc = yaIn - ycIn;
+ /* yb - yd */
+ Ybminusd = ybIn - ydIn;
+
+ /* xa' = xa + xb + xc + xd */
+ pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+
+ /* ya' = ya + yb + yc + yd */
+ pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+ /* (xa - xc) - (yb - yd) */
+ Xb12C_out = (Xaminusc - Ybminusd);
+ /* (ya - yc) + (xb - xd) */
+ Yb12C_out = (Yaminusc + Xbminusd);
+ /* (xa + xc) - (xb + xd) */
+ Xc12C_out = (Xaplusc - Xbplusd);
+ /* (ya + yc) - (yb + yd) */
+ Yc12C_out = (Yaplusc - Ybplusd);
+ /* (xa - xc) + (yb - yd) */
+ Xd12C_out = (Xaminusc + Ybminusd);
+ /* (ya - yc) - (xb - xd) */
+ Yd12C_out = (Yaminusc - Xbminusd);
+
+ co1 = pCoef[ia1 * 2U];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+
+ /* index calculation for the coefficients */
+ ia3 = ia2 + ia1;
+ co3 = pCoef[ia3 * 2U];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+
+ Xb12_out = Xb12C_out * co1;
+ Yb12_out = Yb12C_out * co1;
+ Xc12_out = Xc12C_out * co2;
+ Yc12_out = Yc12C_out * co2;
+ Xd12_out = Xd12C_out * co3;
+ Yd12_out = Yd12C_out * co3;
+
+ /* xb' = (xa-yb-xc+yd)co1 - (ya+xb-yc-xd)(si1) */
+ //Xb12_out -= Yb12C_out * si1;
+ p0 = Yb12C_out * si1;
+ /* yb' = (ya+xb-yc-xd)co1 + (xa-yb-xc+yd)(si1) */
+ //Yb12_out += Xb12C_out * si1;
+ p1 = Xb12C_out * si1;
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ //Xc12_out -= Yc12C_out * si2;
+ p2 = Yc12C_out * si2;
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ //Yc12_out += Xc12C_out * si2;
+ p3 = Xc12C_out * si2;
+ /* xd' = (xa+yb-xc-yd)co3 - (ya-xb-yc+xd)(si3) */
+ //Xd12_out -= Yd12C_out * si3;
+ p4 = Yd12C_out * si3;
+ /* yd' = (ya-xb-yc+xd)co3 + (xa+yb-xc-yd)(si3) */
+ //Yd12_out += Xd12C_out * si3;
+ p5 = Xd12C_out * si3;
+
+ Xb12_out -= p0;
+ Yb12_out += p1;
+ Xc12_out -= p2;
+ Yc12_out += p3;
+ Xd12_out -= p4;
+ Yd12_out += p5;
+
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = Xc12_out;
+
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ pSrc[(2U * i1) + 1U] = Yc12_out;
+
+ /* xb' = (xa-yb-xc+yd)co1 - (ya+xb-yc-xd)(si1) */
+ pSrc[2U * i2] = Xb12_out;
+
+ /* yb' = (ya+xb-yc-xd)co1 + (xa-yb-xc+yd)(si1) */
+ pSrc[(2U * i2) + 1U] = Yb12_out;
+
+ /* xd' = (xa+yb-xc-yd)co3 - (ya-xb-yc+xd)(si3) */
+ pSrc[2U * i3] = Xd12_out;
+
+ /* yd' = (ya-xb-yc+xd)co3 + (xa+yb-xc-yd)(si3) */
+ pSrc[(2U * i3) + 1U] = Yd12_out;
+
+ /* Twiddle coefficients index modifier */
+ ia1 = ia1 + twidCoefModifier;
+
+ /* Updating input index */
+ i0 = i0 + 1U;
+
+ } while (--j);
+
+ twidCoefModifier <<= 2U;
+
+ /* Calculation of second stage to excluding last stage */
+ for (k = fftLen >> 2U; k > 4U; k >>= 2U)
+ {
+ /* Initializations for the current stage */
+ n1 = n2;
+ n2 >>= 2U;
+ ia1 = 0U;
+
+ /* Butterfly process for the current stage */
+ j = 0;
+ do
+ {
+ /* index calculation for the coefficients */
+ ia2 = ia1 + ia1;
+ ia3 = ia2 + ia1;
+ co1 = pCoef[ia1 * 2U];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+ co2 = pCoef[ia2 * 2U];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+ co3 = pCoef[ia3 * 2U];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+
+ /* Twiddle coefficients index modifier */
+ ia1 = ia1 + twidCoefModifier;
+
+ i0 = j;
+ do
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ xaIn = pSrc[(2U * i0)];
+ yaIn = pSrc[(2U * i0) + 1U];
+
+ xbIn = pSrc[(2U * i1)];
+ ybIn = pSrc[(2U * i1) + 1U];
+
+ xcIn = pSrc[(2U * i2)];
+ ycIn = pSrc[(2U * i2) + 1U];
+
+ xdIn = pSrc[(2U * i3)];
+ ydIn = pSrc[(2U * i3) + 1U];
+
+ /* xa - xc */
+ Xaminusc = xaIn - xcIn;
+ /* (xb - xd) */
+ Xbminusd = xbIn - xdIn;
+ /* ya - yc */
+ Yaminusc = yaIn - ycIn;
+ /* (yb - yd) */
+ Ybminusd = ybIn - ydIn;
+
+ /* xa + xc */
+ Xaplusc = xaIn + xcIn;
+ /* xb + xd */
+ Xbplusd = xbIn + xdIn;
+ /* ya + yc */
+ Yaplusc = yaIn + ycIn;
+ /* yb + yd */
+ Ybplusd = ybIn + ydIn;
+
+ /* (xa - xc) - (yb - yd) */
+ Xb12C_out = (Xaminusc - Ybminusd);
+ /* (ya - yc) + (xb - xd) */
+ Yb12C_out = (Yaminusc + Xbminusd);
+ /* xa + xc -(xb + xd) */
+ Xc12C_out = (Xaplusc - Xbplusd);
+ /* (ya + yc) - (yb + yd) */
+ Yc12C_out = (Yaplusc - Ybplusd);
+ /* (xa - xc) + (yb - yd) */
+ Xd12C_out = (Xaminusc + Ybminusd);
+ /* (ya - yc) - (xb - xd) */
+ Yd12C_out = (Yaminusc - Xbminusd);
+
+ pSrc[(2U * i0)] = Xaplusc + Xbplusd;
+ pSrc[(2U * i0) + 1U] = Yaplusc + Ybplusd;
+
+ Xb12_out = Xb12C_out * co1;
+ Yb12_out = Yb12C_out * co1;
+ Xc12_out = Xc12C_out * co2;
+ Yc12_out = Yc12C_out * co2;
+ Xd12_out = Xd12C_out * co3;
+ Yd12_out = Yd12C_out * co3;
+
+ /* xb' = (xa-yb-xc+yd)co1 - (ya+xb-yc-xd)(si1) */
+ //Xb12_out -= Yb12C_out * si1;
+ p0 = Yb12C_out * si1;
+ /* yb' = (ya+xb-yc-xd)co1 + (xa-yb-xc+yd)(si1) */
+ //Yb12_out += Xb12C_out * si1;
+ p1 = Xb12C_out * si1;
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ //Xc12_out -= Yc12C_out * si2;
+ p2 = Yc12C_out * si2;
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ //Yc12_out += Xc12C_out * si2;
+ p3 = Xc12C_out * si2;
+ /* xd' = (xa+yb-xc-yd)co3 - (ya-xb-yc+xd)(si3) */
+ //Xd12_out -= Yd12C_out * si3;
+ p4 = Yd12C_out * si3;
+ /* yd' = (ya-xb-yc+xd)co3 + (xa+yb-xc-yd)(si3) */
+ //Yd12_out += Xd12C_out * si3;
+ p5 = Xd12C_out * si3;
+
+ Xb12_out -= p0;
+ Yb12_out += p1;
+ Xc12_out -= p2;
+ Yc12_out += p3;
+ Xd12_out -= p4;
+ Yd12_out += p5;
+
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = Xc12_out;
+
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ pSrc[(2U * i1) + 1U] = Yc12_out;
+
+ /* xb' = (xa-yb-xc+yd)co1 - (ya+xb-yc-xd)(si1) */
+ pSrc[2U * i2] = Xb12_out;
+
+ /* yb' = (ya+xb-yc-xd)co1 + (xa-yb-xc+yd)(si1) */
+ pSrc[(2U * i2) + 1U] = Yb12_out;
+
+ /* xd' = (xa+yb-xc-yd)co3 - (ya-xb-yc+xd)(si3) */
+ pSrc[2U * i3] = Xd12_out;
+
+ /* yd' = (ya-xb-yc+xd)co3 + (xa+yb-xc-yd)(si3) */
+ pSrc[(2U * i3) + 1U] = Yd12_out;
+
+ i0 += n1;
+ } while (i0 < fftLen);
+ j++;
+ } while (j <= (n2 - 1U));
+ twidCoefModifier <<= 2U;
+ }
+ /* Initializations of last stage */
+
+ j = fftLen >> 2;
+ ptr1 = &pSrc[0];
+
+ /* Calculations of last stage */
+ do
+ {
+ xaIn = ptr1[0];
+ yaIn = ptr1[1];
+ xbIn = ptr1[2];
+ ybIn = ptr1[3];
+ xcIn = ptr1[4];
+ ycIn = ptr1[5];
+ xdIn = ptr1[6];
+ ydIn = ptr1[7];
+
+ /* Butterfly implementation */
+ /* xa + xc */
+ Xaplusc = xaIn + xcIn;
+
+ /* xa - xc */
+ Xaminusc = xaIn - xcIn;
+
+ /* ya + yc */
+ Yaplusc = yaIn + ycIn;
+
+ /* ya - yc */
+ Yaminusc = yaIn - ycIn;
+
+ /* xb + xd */
+ Xbplusd = xbIn + xdIn;
+
+ /* yb + yd */
+ Ybplusd = ybIn + ydIn;
+
+ /* (xb-xd) */
+ Xbminusd = xbIn - xdIn;
+
+ /* (yb-yd) */
+ Ybminusd = ybIn - ydIn;
+
+ /* xa' = (xa+xb+xc+xd) * onebyfftLen */
+ a0 = (Xaplusc + Xbplusd);
+ /* ya' = (ya+yb+yc+yd) * onebyfftLen */
+ a1 = (Yaplusc + Ybplusd);
+ /* xc' = (xa-xb+xc-xd) * onebyfftLen */
+ a2 = (Xaplusc - Xbplusd);
+ /* yc' = (ya-yb+yc-yd) * onebyfftLen */
+ a3 = (Yaplusc - Ybplusd);
+ /* xb' = (xa-yb-xc+yd) * onebyfftLen */
+ a4 = (Xaminusc - Ybminusd);
+ /* yb' = (ya+xb-yc-xd) * onebyfftLen */
+ a5 = (Yaminusc + Xbminusd);
+ /* xd' = (xa+yb-xc-yd) * onebyfftLen */
+ a6 = (Xaminusc + Ybminusd);
+ /* yd' = (ya-xb-yc+xd) * onebyfftLen */
+ a7 = (Yaminusc - Xbminusd);
+
+ p0 = a0 * onebyfftLen;
+ p1 = a1 * onebyfftLen;
+ p2 = a2 * onebyfftLen;
+ p3 = a3 * onebyfftLen;
+ p4 = a4 * onebyfftLen;
+ p5 = a5 * onebyfftLen;
+ p6 = a6 * onebyfftLen;
+ p7 = a7 * onebyfftLen;
+
+ /* xa' = (xa+xb+xc+xd) * onebyfftLen */
+ ptr1[0] = p0;
+ /* ya' = (ya+yb+yc+yd) * onebyfftLen */
+ ptr1[1] = p1;
+ /* xc' = (xa-xb+xc-xd) * onebyfftLen */
+ ptr1[2] = p2;
+ /* yc' = (ya-yb+yc-yd) * onebyfftLen */
+ ptr1[3] = p3;
+ /* xb' = (xa-yb-xc+yd) * onebyfftLen */
+ ptr1[4] = p4;
+ /* yb' = (ya+xb-yc-xd) * onebyfftLen */
+ ptr1[5] = p5;
+ /* xd' = (xa+yb-xc-yd) * onebyfftLen */
+ ptr1[6] = p6;
+ /* yd' = (ya-xb-yc+xd) * onebyfftLen */
+ ptr1[7] = p7;
+
+ /* increment source pointer by 8 for next calculations */
+ ptr1 = ptr1 + 8U;
+
+ } while (--j);
+
+#else
+
+ float32_t t1, t2, r1, r2, s1, s2;
+
+ /* Initializations for the first stage */
+ n2 = fftLen;
+ n1 = n2;
+
+ /* Calculation of all stages except the last */
+ for (k = fftLen; k > 4U; k >>= 2U)
+ {
+ /* Initializations for the current stage */
+ n1 = n2;
+ n2 >>= 2U;
+ ia1 = 0U;
+
+ /* Butterfly process for the current stage */
+ j = 0;
+ do
+ {
+ /* index calculation for the coefficients */
+ ia2 = ia1 + ia1;
+ ia3 = ia2 + ia1;
+ co1 = pCoef[ia1 * 2U];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+ co2 = pCoef[ia2 * 2U];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+ co3 = pCoef[ia3 * 2U];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+
+ /* Twiddle coefficients index modifier */
+ ia1 = ia1 + twidCoefModifier;
+
+ i0 = j;
+ do
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* xa + xc */
+ r1 = pSrc[(2U * i0)] + pSrc[(2U * i2)];
+
+ /* xa - xc */
+ r2 = pSrc[(2U * i0)] - pSrc[(2U * i2)];
+
+ /* ya + yc */
+ s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+
+ /* ya - yc */
+ s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+ /* xb + xd */
+ t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+ /* xa' = xa + xb + xc + xd */
+ pSrc[2U * i0] = r1 + t1;
+
+ /* xa + xc -(xb + xd) */
+ r1 = r1 - t1;
+
+ /* yb + yd */
+ t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+
+ /* ya' = ya + yb + yc + yd */
+ pSrc[(2U * i0) + 1U] = s1 + t2;
+
+ /* (ya + yc) - (yb + yd) */
+ s1 = s1 - t2;
+
+ /* (yb - yd) */
+ t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+
+ /* (xb - xd) */
+ t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = (r1 * co2) - (s1 * si2);
+
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ pSrc[(2U * i1) + 1U] = (s1 * co2) + (r1 * si2);
+
+ /* (xa - xc) - (yb - yd) */
+ r1 = r2 - t1;
+
+ /* (xa - xc) + (yb - yd) */
+ r2 = r2 + t1;
+
+ /* (ya - yc) + (xb - xd) */
+ s1 = s2 + t2;
+
+ /* (ya - yc) - (xb - xd) */
+ s2 = s2 - t2;
+
+ /* xb' = (xa-yb-xc+yd)co1 - (ya+xb-yc-xd)(si1) */
+ pSrc[2U * i2] = (r1 * co1) - (s1 * si1);
+
+ /* yb' = (ya+xb-yc-xd)co1 + (xa-yb-xc+yd)(si1) */
+ pSrc[(2U * i2) + 1U] = (s1 * co1) + (r1 * si1);
+
+ /* xd' = (xa+yb-xc-yd)co3 - (ya-xb-yc+xd)(si3) */
+ pSrc[2U * i3] = (r2 * co3) - (s2 * si3);
+
+ /* yd' = (ya-xb-yc+xd)co3 + (xa+yb-xc-yd)(si3) */
+ pSrc[(2U * i3) + 1U] = (s2 * co3) + (r2 * si3);
+
+ i0 += n1;
+ } while ( i0 < fftLen);
+ j++;
+ } while (j <= (n2 - 1U));
+ twidCoefModifier <<= 2U;
+ }
+ /* Initializations of last stage */
+ n1 = n2;
+ n2 >>= 2U;
+
+ /* Calculations of last stage */
+ for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* Butterfly implementation */
+ /* xa + xc */
+ r1 = pSrc[2U * i0] + pSrc[2U * i2];
+
+ /* xa - xc */
+ r2 = pSrc[2U * i0] - pSrc[2U * i2];
+
+ /* ya + yc */
+ s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+
+ /* ya - yc */
+ s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+ /* xb + xd */
+ t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+ /* xa' = xa + xb + xc + xd */
+ pSrc[2U * i0] = (r1 + t1) * onebyfftLen;
+
+ /* (xa + xc) - (xb + xd) */
+ r1 = r1 - t1;
+
+ /* yb + yd */
+ t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+
+ /* ya' = ya + yb + yc + yd */
+ pSrc[(2U * i0) + 1U] = (s1 + t2) * onebyfftLen;
+
+ /* (ya + yc) - (yb + yd) */
+ s1 = s1 - t2;
+
+ /* (yb-yd) */
+ t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+
+ /* (xb-xd) */
+ t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+ /* xc' = (xa-xb+xc-xd) * onebyfftLen */
+ pSrc[2U * i1] = r1 * onebyfftLen;
+
+ /* yc' = (ya-yb+yc-yd) * onebyfftLen */
+ pSrc[(2U * i1) + 1U] = s1 * onebyfftLen;
+
+ /* (xa - xc) - (yb-yd) */
+ r1 = r2 - t1;
+
+ /* (xa - xc) + (yb-yd) */
+ r2 = r2 + t1;
+
+ /* (ya - yc) + (xb-xd) */
+ s1 = s2 + t2;
+
+ /* (ya - yc) - (xb-xd) */
+ s2 = s2 - t2;
+
+ /* xb' = (xa-yb-xc+yd) * onebyfftLen */
+ pSrc[2U * i2] = r1 * onebyfftLen;
+
+ /* yb' = (ya+xb-yc-xd) * onebyfftLen */
+ pSrc[(2U * i2) + 1U] = s1 * onebyfftLen;
+
+ /* xd' = (xa+yb-xc-yd) * onebyfftLen */
+ pSrc[2U * i3] = r2 * onebyfftLen;
+
+ /* yd' = (ya-xb-yc+xd) * onebyfftLen */
+ pSrc[(2U * i3) + 1U] = s2 * onebyfftLen;
+ }
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+}
+
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f16.c
new file mode 100644
index 0000000000..cb45cd8075
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f16.c
@@ -0,0 +1,171 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix4_init_f16.c
+ * Description: Radix-4 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+#include "arm_common_tables.h"
+#include "arm_common_tables_f16.h"
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Initialization function for the floating-point CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f16 and will be removed in the future.
+ @param[in,out] S points to an instance of the floating-point CFFT/CIFFT structure
+ @param[in] fftLen length of the FFT
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Details
+ The parameter ifftFlag controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024, 4096.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+ */
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+arm_status arm_cfft_radix4_init_f16(
+ arm_cfft_radix4_instance_f16 * S,
+ uint16_t fftLen,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F16_4096)
+
+ /* Initialise the default arm status */
+ status = ARM_MATH_SUCCESS;
+
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+
+ /* Initialise the Twiddle coefficient pointer */
+ S->pTwiddle = (float16_t *) twiddleCoefF16;
+
+ /* Initialise the Flag for selection of CFFT or CIFFT */
+ S->ifftFlag = ifftFlag;
+
+ /* Initialise the Flag for calculation Bit reversal or not */
+ S->bitReverseFlag = bitReverseFlag;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
+
+ /* Initializations of structure parameters depending on the FFT length */
+ switch (S->fftLen)
+ {
+
+ case 4096U:
+ /* Initializations of structure parameters for 4096 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 1U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 1U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) armBitRevTable;
+ /* Initialise the 1/fftLen Value (float literal for the float16_t field,
+ consistent with the other cases) */
+ S->onebyfftLen = 0.000244140625f;
+ break;
+
+ case 1024U:
+ /* Initializations of structure parameters for 1024 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 4U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 4U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+ /* Initialise the 1/fftLen Value */
+ S->onebyfftLen = 0.0009765625f;
+ break;
+
+
+ case 256U:
+ /* Initializations of structure parameters for 256 point FFT */
+ S->twidCoefModifier = 16U;
+ S->bitRevFactor = 16U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+ S->onebyfftLen = 0.00390625f;
+ break;
+
+ case 64U:
+ /* Initializations of structure parameters for 64 point FFT */
+ S->twidCoefModifier = 64U;
+ S->bitRevFactor = 64U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+ S->onebyfftLen = 0.015625f;
+ break;
+
+ case 16U:
+ /* Initializations of structure parameters for 16 point FFT */
+ S->twidCoefModifier = 256U;
+ S->bitRevFactor = 256U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+ S->onebyfftLen = 0.0625f;
+ break;
+
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+#endif
+#endif
+#endif
+ return (status);
+}
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
+/**
+ @} end of ComplexFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c
new file mode 100644
index 0000000000..ec05fc5780
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_f32.c
@@ -0,0 +1,168 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix4_init_f32.c
+ * Description: Radix-4 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Initialization function for the floating-point CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_f32 and will be removed in the future.
+ @param[in,out] S points to an instance of the floating-point CFFT/CIFFT structure
+ @param[in] fftLen length of the FFT
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Details
+ The parameter ifftFlag controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024, 4096.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+ */
+
+arm_status arm_cfft_radix4_init_f32(
+ arm_cfft_radix4_instance_f32 * S,
+ uint16_t fftLen,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096)
+
+ /* Initialise the default arm status */
+ status = ARM_MATH_SUCCESS;
+
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+
+ /* Initialise the Twiddle coefficient pointer */
+ S->pTwiddle = (float32_t *) twiddleCoef;
+
+ /* Initialise the Flag for selection of CFFT or CIFFT */
+ S->ifftFlag = ifftFlag;
+
+ /* Initialise the Flag for calculation Bit reversal or not */
+ S->bitReverseFlag = bitReverseFlag;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_F32_4096)
+
+ /* Initializations of structure parameters depending on the FFT length */
+ switch (S->fftLen)
+ {
+
+ case 4096U:
+ /* Initializations of structure parameters for 4096 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 1U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 1U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) armBitRevTable;
+ /* Initialise the 1/fftLen Value (float literal, consistent with the
+ other cases; avoids an implicit double -> float32_t conversion) */
+ S->onebyfftLen = 0.000244140625f;
+ break;
+
+ case 1024U:
+ /* Initializations of structure parameters for 1024 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 4U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 4U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+ /* Initialise the 1/fftLen Value */
+ S->onebyfftLen = 0.0009765625f;
+ break;
+
+
+ case 256U:
+ /* Initializations of structure parameters for 256 point FFT */
+ S->twidCoefModifier = 16U;
+ S->bitRevFactor = 16U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+ S->onebyfftLen = 0.00390625f;
+ break;
+
+ case 64U:
+ /* Initializations of structure parameters for 64 point FFT */
+ S->twidCoefModifier = 64U;
+ S->bitRevFactor = 64U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+ S->onebyfftLen = 0.015625f;
+ break;
+
+ case 16U:
+ /* Initializations of structure parameters for 16 point FFT */
+ S->twidCoefModifier = 256U;
+ S->bitRevFactor = 256U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+ S->onebyfftLen = 0.0625f;
+ break;
+
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+#endif
+#endif
+#endif
+
+ return (status);
+}
+
+/**
+ @} end of ComplexFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c
new file mode 100644
index 0000000000..7d910b88b0
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q15.c
@@ -0,0 +1,157 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix4_init_q15.c
+ * Description: Radix-4 Decimation in Frequency Q15 FFT & IFFT initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+
+/**
+ @ingroup groupTransforms
+ */
+
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+
+/**
+ @brief Initialization function for the Q15 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
+ @param[in,out] S points to an instance of the Q15 CFFT/CIFFT structure
+ @param[in] fftLen length of the FFT
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Details
+ The parameter ifftFlag controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ The parameter fftLen specifies the length of the CFFT/CIFFT process. Supported FFT lengths are 16, 64, 256, 1024, 4096.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+ */
+
+arm_status arm_cfft_radix4_init_q15(
+ arm_cfft_radix4_instance_q15 * S,
+ uint16_t fftLen,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q15_4096)
+
+ /* Initialise the default arm status */
+ status = ARM_MATH_SUCCESS;
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+ /* Initialise the Twiddle coefficient pointer */
+ S->pTwiddle = (q15_t *) twiddleCoef_4096_q15;
+ /* Initialise the Flag for selection of CFFT or CIFFT */
+ S->ifftFlag = ifftFlag;
+ /* Initialise the Flag for calculation Bit reversal or not */
+ S->bitReverseFlag = bitReverseFlag;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
+
+ /* Initializations of structure parameters depending on the FFT length */
+ switch (S->fftLen)
+ {
+ case 4096U:
+ /* Initializations of structure parameters for 4096 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 1U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 1U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) armBitRevTable;
+
+ break;
+
+ case 1024U:
+ /* Initializations of structure parameters for 1024 point FFT */
+ S->twidCoefModifier = 4U;
+ S->bitRevFactor = 4U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+
+ break;
+
+ case 256U:
+ /* Initializations of structure parameters for 256 point FFT */
+ S->twidCoefModifier = 16U;
+ S->bitRevFactor = 16U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+
+ break;
+
+ case 64U:
+ /* Initializations of structure parameters for 64 point FFT */
+ S->twidCoefModifier = 64U;
+ S->bitRevFactor = 64U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+
+ break;
+
+ case 16U:
+ /* Initializations of structure parameters for 16 point FFT */
+ S->twidCoefModifier = 256U;
+ S->bitRevFactor = 256U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+
+ break;
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+#endif
+#endif
+#endif
+ return (status);
+}
+
+/**
+ @} end of ComplexFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c
new file mode 100644
index 0000000000..04ba393ad3
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_init_q31.c
@@ -0,0 +1,154 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix4_init_q31.c
+ * Description: Radix-4 Decimation in Frequency Q31 FFT & IFFT initialization function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+
+ @brief Initialization function for the Q31 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
+ @param[in,out] S points to an instance of the Q31 CFFT/CIFFT structure.
+ @param[in] fftLen length of the FFT.
+ @param[in] ifftFlag flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Details
+ The parameter ifftFlag controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ The parameter fftLen specifies the length of the CFFT/CIFFT process. Supported FFT lengths are 16, 64, 256, 1024, 4096.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+*/
+
+arm_status arm_cfft_radix4_init_q31(
+ arm_cfft_radix4_instance_q31 * S,
+ uint16_t fftLen,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_TWIDDLECOEF_Q31_4096)
+
+ /* Initialise the default arm status */
+ status = ARM_MATH_SUCCESS;
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+ /* Initialise the Twiddle coefficient pointer */
+ S->pTwiddle = (q31_t *) twiddleCoef_4096_q31;
+ /* Initialise the Flag for selection of CFFT or CIFFT */
+ S->ifftFlag = ifftFlag;
+ /* Initialise the Flag for calculation Bit reversal or not */
+ S->bitReverseFlag = bitReverseFlag;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_BITREV_1024)
+
+ /* Initializations of Instance structure depending on the FFT length */
+ switch (S->fftLen)
+ {
+ /* Initializations of structure parameters for 4096 point FFT */
+ case 4096U:
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 1U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 1U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) armBitRevTable;
+ break;
+
+ /* Initializations of structure parameters for 1024 point FFT */
+ case 1024U:
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 4U;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 4U;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+ break;
+
+ case 256U:
+ /* Initializations of structure parameters for 256 point FFT */
+ S->twidCoefModifier = 16U;
+ S->bitRevFactor = 16U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+ break;
+
+ case 64U:
+ /* Initializations of structure parameters for 64 point FFT */
+ S->twidCoefModifier = 64U;
+ S->bitRevFactor = 64U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+ break;
+
+ case 16U:
+ /* Initializations of structure parameters for 16 point FFT */
+ S->twidCoefModifier = 256U;
+ S->bitRevFactor = 256U;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+ break;
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+#endif
+#endif
+#endif
+ return (status);
+}
+
+/**
+ @} end of ComplexFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c
new file mode 100644
index 0000000000..a467735479
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q15.c
@@ -0,0 +1,1809 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix4_q15.c
+ * Description: This file has function definition of Radix-4 FFT & IFFT function and
+ * In-place bit reversal using bit reversal table
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+
+void arm_radix4_butterfly_q15(
+ q15_t * pSrc16,
+ uint32_t fftLen,
+ const q15_t * pCoef16,
+ uint32_t twidCoefModifier);
+
+void arm_radix4_butterfly_inverse_q15(
+ q15_t * pSrc16,
+ uint32_t fftLen,
+ const q15_t * pCoef16,
+ uint32_t twidCoefModifier);
+
+void arm_bitreversal_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab);
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+
+/**
+ @brief Processing function for the Q15 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q15 and will be removed in the future.
+ @param[in] S points to an instance of the Q15 CFFT/CIFFT structure.
+ @param[in,out] pSrc points to the complex data buffer. Processing occurs in-place.
+ @return none
+
+ @par Input and output formats:
+ Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
+ Hence the output format is different for different FFT sizes.
+ The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
+ @par
+ \image html CFFTQ15.gif "Input and Output Formats for Q15 CFFT"
+ \image html CIFFTQ15.gif "Input and Output Formats for Q15 CIFFT"
+ */
+
+void arm_cfft_radix4_q15(
+ const arm_cfft_radix4_instance_q15 * S,
+ q15_t * pSrc)
+{
+ if (S->ifftFlag == 1U)
+ {
+ /* Complex IFFT radix-4 */
+ arm_radix4_butterfly_inverse_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
+ }
+ else
+ {
+ /* Complex FFT radix-4 */
+ arm_radix4_butterfly_q15(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
+ }
+
+ if (S->bitReverseFlag == 1U)
+ {
+ /* Bit Reversal */
+ arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+ }
+
+}
+
+/**
+ @} end of ComplexFFT group
+ */
+
+/*
+ * Radix-4 FFT algorithm used is :
+ *
+ * Input real and imaginary data:
+ * x(n) = xa + j * ya
+ * x(n+N/4 ) = xb + j * yb
+ * x(n+N/2 ) = xc + j * yc
+ * x(n+3N/4) = xd + j * yd
+ *
+ *
+ * Output real and imaginary data:
+ * x(4r) = xa'+ j * ya'
+ * x(4r+1) = xb'+ j * yb'
+ * x(4r+2) = xc'+ j * yc'
+ * x(4r+3) = xd'+ j * yd'
+ *
+ *
+ * Twiddle factors for radix-4 FFT:
+ * Wn = co1 + j * (- si1)
+ * W2n = co2 + j * (- si2)
+ * W3n = co3 + j * (- si3)
+
+ * The real and imaginary output values for the radix-4 butterfly are
+ * xa' = xa + xb + xc + xd
+ * ya' = ya + yb + yc + yd
+ * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
+ * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
+ * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
+ * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
+ * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
+ * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
+ *
+ */
+
+/**
+ @brief Core function for the Q15 CFFT butterfly process.
+ @param[in,out] pSrc16 points to the in-place buffer of Q15 data type
+ @param[in] fftLen length of the FFT
+ @param[in] pCoef16 points to twiddle coefficient buffer
+ @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+ @return none
+ */
+
+void arm_radix4_butterfly_q15(
+ q15_t * pSrc16,
+ uint32_t fftLen,
+ const q15_t * pCoef16,
+ uint32_t twidCoefModifier)
+{
+
+#if defined (ARM_MATH_DSP)
+
+ q31_t R, S, T, U;
+ q31_t C1, C2, C3, out1, out2;
+ uint32_t n1, n2, ic, i0, j, k;
+
+ q15_t *ptr1;
+ q15_t *pSi0;
+ q15_t *pSi1;
+ q15_t *pSi2;
+ q15_t *pSi3;
+
+ q31_t xaya, xbyb, xcyc, xdyd;
+
+ /* Total process is divided into three stages */
+
+ /* process first stage, middle stages, & last stage */
+
+ /* Initializations for the first stage */
+ n2 = fftLen;
+ n1 = n2;
+
+ /* n2 = fftLen/4 */
+ n2 >>= 2U;
+
+ /* Index for twiddle coefficient */
+ ic = 0U;
+
+ /* Index for input read and output write */
+ j = n2;
+
+ pSi0 = pSrc16;
+ pSi1 = pSi0 + 2 * n2;
+ pSi2 = pSi1 + 2 * n2;
+ pSi3 = pSi2 + 2 * n2;
+
+ /* Input is in 1.15(q15) format */
+
+ /* start of first stage process */
+ do
+ {
+ /* Butterfly implementation */
+
+ /* Reading i0, i0+fftLen/2 inputs */
+ /* Read ya (real), xa(imag) input */
+ T = read_q15x2 (pSi0);
+ T = __SHADD16(T, 0); /* this is just a SIMD arithmetic shift right by 1 */
+ T = __SHADD16(T, 0); /* it turns out doing this twice is 2 cycles, the alternative takes 3 cycles */
+/*
+ in = ((int16_t) (T & 0xFFFF)) >> 2; // alternative code that takes 3 cycles
+ T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+*/
+
+ /* Read yc (real), xc(imag) input */
+ S = read_q15x2 (pSi2);
+ S = __SHADD16(S, 0);
+ S = __SHADD16(S, 0);
+
+ /* R = packed((ya + yc), (xa + xc) ) */
+ R = __QADD16(T, S);
+
+ /* S = packed((ya - yc), (xa - xc) ) */
+ S = __QSUB16(T, S);
+
+ /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+ /* Read yb (real), xb(imag) input */
+ T = read_q15x2 (pSi1);
+ T = __SHADD16(T, 0);
+ T = __SHADD16(T, 0);
+
+ /* Read yd (real), xd(imag) input */
+ U = read_q15x2 (pSi3);
+ U = __SHADD16(U, 0);
+ U = __SHADD16(U, 0);
+
+ /* T = packed((yb + yd), (xb + xd) ) */
+ T = __QADD16(T, U);
+
+ /* writing the butterfly processed i0 sample */
+ /* xa' = xa + xb + xc + xd */
+ /* ya' = ya + yb + yc + yd */
+ write_q15x2_ia (&pSi0, __SHADD16(R, T));
+
+ /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
+ R = __QSUB16(R, T);
+
+ /* co2 & si2 are read from SIMD Coefficient pointer */
+ C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+ out1 = __SMUAD(C2, R) >> 16U;
+ /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+ out2 = __SMUSDX(C2, R);
+#else
+ /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+ out1 = __SMUSDX(R, C2) >> 16U;
+ /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+ out2 = __SMUAD(C2, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Reading i0+fftLen/4 */
+ /* T = packed(yb, xb) */
+ T = read_q15x2 (pSi1);
+ T = __SHADD16(T, 0);
+ T = __SHADD16(T, 0);
+
+ /* writing the butterfly processed i0 + fftLen/4 sample */
+ /* writing output(xc', yc') in little endian format */
+ write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 ));
+
+ /* Butterfly calculations */
+ /* U = packed(yd, xd) */
+ U = read_q15x2 (pSi3);
+ U = __SHADD16(U, 0);
+ U = __SHADD16(U, 0);
+
+ /* T = packed(yb-yd, xb-xd) */
+ T = __QSUB16(T, U);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
+ R = __QASX(S, T);
+ /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
+ S = __QSAX(S, T);
+#else
+ /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
+ R = __QSAX(S, T);
+ /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
+ S = __QASX(S, T);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* co1 & si1 are read from SIMD Coefficient pointer */
+ C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
+ /* Butterfly process for the i0+fftLen/2 sample */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+ out1 = __SMUAD(C1, S) >> 16U;
+ /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+ out2 = __SMUSDX(C1, S);
+#else
+ /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+ out1 = __SMUSDX(S, C1) >> 16U;
+ /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+ out2 = __SMUAD(C1, S);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* writing output(xb', yb') in little endian format */
+ write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 ));
+
+ /* co3 & si3 are read from SIMD Coefficient pointer */
+ C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
+ /* Butterfly process for the i0+3fftLen/4 sample */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
+ out1 = __SMUAD(C3, R) >> 16U;
+ /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
+ out2 = __SMUSDX(C3, R);
+#else
+ /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
+ out1 = __SMUSDX(R, C3) >> 16U;
+ /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
+ out2 = __SMUAD(C3, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* writing output(xd', yd') in little endian format */
+ write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 ));
+
+ /* Twiddle coefficients index modifier */
+ ic = ic + twidCoefModifier;
+
+ } while (--j);
+ /* data is in 4.11(q11) format */
+
+ /* end of first stage process */
+
+
+ /* start of middle stage process */
+
+ /* Twiddle coefficients index modifier */
+ twidCoefModifier <<= 2U;
+
+ /* Calculation of Middle stage */
+ for (k = fftLen / 4U; k > 4U; k >>= 2U)
+ {
+ /* Initializations for the middle stage */
+ n1 = n2;
+ n2 >>= 2U;
+ ic = 0U;
+
+ for (j = 0U; j <= (n2 - 1U); j++)
+ {
+ /* index calculation for the coefficients */
+ C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
+ C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
+ C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
+
+ /* Twiddle coefficients index modifier */
+ ic = ic + twidCoefModifier;
+
+ pSi0 = pSrc16 + 2 * j;
+ pSi1 = pSi0 + 2 * n2;
+ pSi2 = pSi1 + 2 * n2;
+ pSi3 = pSi2 + 2 * n2;
+
+ /* Butterfly implementation */
+ for (i0 = j; i0 < fftLen; i0 += n1)
+ {
+ /* Reading i0, i0+fftLen/2 inputs */
+ /* Read ya (real), xa(imag) input */
+ T = read_q15x2 (pSi0);
+
+ /* Read yc (real), xc(imag) input */
+ S = read_q15x2 (pSi2);
+
+ /* R = packed( (ya + yc), (xa + xc)) */
+ R = __QADD16(T, S);
+
+ /* S = packed((ya - yc), (xa - xc)) */
+ S = __QSUB16(T, S);
+
+ /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+ /* Read yb (real), xb(imag) input */
+ T = read_q15x2 (pSi1);
+
+ /* Read yd (real), xd(imag) input */
+ U = read_q15x2 (pSi3);
+
+ /* T = packed( (yb + yd), (xb + xd)) */
+ T = __QADD16(T, U);
+
+ /* writing the butterfly processed i0 sample */
+
+ /* xa' = xa + xb + xc + xd */
+ /* ya' = ya + yb + yc + yd */
+ out1 = __SHADD16(R, T);
+ out1 = __SHADD16(out1, 0);
+ write_q15x2 (pSi0, out1);
+ pSi0 += 2 * n1;
+
+ /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
+ R = __SHSUB16(R, T);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
+ out1 = __SMUAD(C2, R) >> 16U;
+
+ /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+ out2 = __SMUSDX(C2, R);
+#else
+ /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+ out1 = __SMUSDX(R, C2) >> 16U;
+
+ /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
+ out2 = __SMUAD(C2, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Reading i0+3fftLen/4 */
+ /* Read yb (real), xb(imag) input */
+ T = read_q15x2 (pSi1);
+
+ /* writing the butterfly processed i0 + fftLen/4 sample */
+ /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+ /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+ write_q15x2 (pSi1, __PKHBT( out1, out2, 0 ));
+ pSi1 += 2 * n1;
+
+ /* Butterfly calculations */
+
+ /* Read yd (real), xd(imag) input */
+ U = read_q15x2 (pSi3);
+
+ /* T = packed(yb-yd, xb-xd) */
+ T = __QSUB16(T, U);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
+ R = __SHASX(S, T);
+
+ /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
+ S = __SHSAX(S, T);
+
+
+ /* Butterfly process for the i0+fftLen/2 sample */
+ out1 = __SMUAD(C1, S) >> 16U;
+ out2 = __SMUSDX(C1, S);
+#else
+ /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
+ R = __SHSAX(S, T);
+
+ /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
+ S = __SHASX(S, T);
+
+
+ /* Butterfly process for the i0+fftLen/2 sample */
+ out1 = __SMUSDX(S, C1) >> 16U;
+ out2 = __SMUAD(C1, S);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+ /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+ write_q15x2 (pSi2, __PKHBT( out1, out2, 0 ));
+ pSi2 += 2 * n1;
+
+ /* Butterfly process for the i0+3fftLen/4 sample */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ out1 = __SMUAD(C3, R) >> 16U;
+ out2 = __SMUSDX(C3, R);
+#else
+ out1 = __SMUSDX(R, C3) >> 16U;
+ out2 = __SMUAD(C3, R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
+ /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
+ write_q15x2 (pSi3, __PKHBT( out1, out2, 0 ));
+ pSi3 += 2 * n1;
+ }
+ }
+ /* Twiddle coefficients index modifier */
+ twidCoefModifier <<= 2U;
+ }
+ /* end of middle stage process */
+
+
+ /* data is in 10.6(q6) format for the 1024 point */
+ /* data is in 8.8(q8) format for the 256 point */
+ /* data is in 6.10(q10) format for the 64 point */
+ /* data is in 4.12(q12) format for the 16 point */
+
+ /* Initializations for the last stage */
+ j = fftLen >> 2;
+
+ ptr1 = &pSrc16[0];
+
+ /* start of last stage process */
+
+ /* Butterfly implementation */
+ do
+ {
+ /* Read xa (real), ya(imag) input */
+ xaya = read_q15x2_ia ((q15_t **) &ptr1);
+
+ /* Read xb (real), yb(imag) input */
+ xbyb = read_q15x2_ia ((q15_t **) &ptr1);
+
+ /* Read xc (real), yc(imag) input */
+ xcyc = read_q15x2_ia ((q15_t **) &ptr1);
+
+ /* Read xd (real), yd(imag) input */
+ xdyd = read_q15x2_ia ((q15_t **) &ptr1);
+
+ /* R = packed((ya + yc), (xa + xc)) */
+ R = __QADD16(xaya, xcyc);
+
+ /* T = packed((yb + yd), (xb + xd)) */
+ T = __QADD16(xbyb, xdyd);
+
+ /* pointer updation for writing */
+ ptr1 = ptr1 - 8U;
+
+
+ /* xa' = xa + xb + xc + xd */
+ /* ya' = ya + yb + yc + yd */
+ write_q15x2_ia (&ptr1, __SHADD16(R, T));
+
+ /* T = packed((yb + yd), (xb + xd)) */
+ T = __QADD16(xbyb, xdyd);
+
+ /* xc' = (xa-xb+xc-xd) */
+ /* yc' = (ya-yb+yc-yd) */
+ write_q15x2_ia (&ptr1, __SHSUB16(R, T));
+
+ /* S = packed((ya - yc), (xa - xc)) */
+ S = __QSUB16(xaya, xcyc);
+
+ /* Read yd (real), xd(imag) input */
+ /* T = packed( (yb - yd), (xb - xd)) */
+ U = __QSUB16(xbyb, xdyd);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* xb' = (xa+yb-xc-yd) */
+ /* yb' = (ya-xb-yc+xd) */
+ write_q15x2_ia (&ptr1, __SHSAX(S, U));
+
+ /* xd' = (xa-yb-xc+yd) */
+ /* yd' = (ya+xb-yc-xd) */
+ write_q15x2_ia (&ptr1, __SHASX(S, U));
+#else
+ /* xb' = (xa+yb-xc-yd) */
+ /* yb' = (ya-xb-yc+xd) */
+ write_q15x2_ia (&ptr1, __SHASX(S, U));
+
+ /* xd' = (xa-yb-xc+yd) */
+ /* yd' = (ya+xb-yc-xd) */
+ write_q15x2_ia (&ptr1, __SHSAX(S, U));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ } while (--j);
+
+ /* end of last stage process */
+
+ /* output is in 11.5(q5) format for the 1024 point */
+ /* output is in 9.7(q7) format for the 256 point */
+ /* output is in 7.9(q9) format for the 64 point */
+ /* output is in 5.11(q11) format for the 16 point */
+
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+ q15_t R0, R1, S0, S1, T0, T1, U0, U1;
+ q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
+ uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
+
+ /* Total process is divided into three stages */
+
+ /* process first stage, middle stages, & last stage */
+
+ /* Initializations for the first stage */
+ n2 = fftLen;
+ n1 = n2;
+
+ /* n2 = fftLen/4 */
+ n2 >>= 2U;
+
+ /* Index for twiddle coefficient */
+ ic = 0U;
+
+ /* Index for input read and output write */
+ i0 = 0U;
+ j = n2;
+
+ /* Input is in 1.15(q15) format */
+
+ /* start of first stage process */
+ do
+ {
+ /* Butterfly implementation */
+
+ /* index calculation for the input as, */
+ /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* Reading i0, i0+fftLen/2 inputs */
+
+ /* input is down scale by 4 to avoid overflow */
+ /* Read ya (real), xa(imag) input */
+ T0 = pSrc16[i0 * 2U] >> 2U;
+ T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
+
+ /* input is down scale by 4 to avoid overflow */
+ /* Read yc (real), xc(imag) input */
+ S0 = pSrc16[i2 * 2U] >> 2U;
+ S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
+
+ /* R0 = (ya + yc) */
+ R0 = __SSAT(T0 + S0, 16U);
+ /* R1 = (xa + xc) */
+ R1 = __SSAT(T1 + S1, 16U);
+
+ /* S0 = (ya - yc) */
+ S0 = __SSAT(T0 - S0, 16);
+ /* S1 = (xa - xc) */
+ S1 = __SSAT(T1 - S1, 16);
+
+ /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+ /* input is down scale by 4 to avoid overflow */
+ /* Read yb (real), xb(imag) input */
+ T0 = pSrc16[i1 * 2U] >> 2U;
+ T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
+
+ /* input is down scale by 4 to avoid overflow */
+ /* Read yd (real), xd(imag) input */
+ U0 = pSrc16[i3 * 2U] >> 2U;
+ U1 = pSrc16[(i3 * 2U) + 1] >> 2U;
+
+ /* T0 = (yb + yd) */
+ T0 = __SSAT(T0 + U0, 16U);
+ /* T1 = (xb + xd) */
+ T1 = __SSAT(T1 + U1, 16U);
+
+ /* writing the butterfly processed i0 sample */
+ /* ya' = ya + yb + yc + yd */
+ /* xa' = xa + xb + xc + xd */
+ pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
+ pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
+
+ /* R0 = (ya + yc) - (yb + yd) */
+ /* R1 = (xa + xc) - (xb + xd) */
+ R0 = __SSAT(R0 - T0, 16U);
+ R1 = __SSAT(R1 - T1, 16U);
+
+ /* co2 & si2 are read from Coefficient pointer */
+ Co2 = pCoef16[2U * ic * 2U];
+ Si2 = pCoef16[(2U * ic * 2U) + 1];
+
+ /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+ out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
+ /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+ out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
+
+ /* Reading i0+fftLen/4 */
+ /* input is down scale by 4 to avoid overflow */
+ /* T0 = yb, T1 = xb */
+ T0 = pSrc16[i1 * 2U] >> 2;
+ T1 = pSrc16[(i1 * 2U) + 1] >> 2;
+
+ /* writing the butterfly processed i0 + fftLen/4 sample */
+ /* writing output(xc', yc') in little endian format */
+ pSrc16[i1 * 2U] = out1;
+ pSrc16[(i1 * 2U) + 1] = out2;
+
+ /* Butterfly calculations */
+ /* input is down scale by 4 to avoid overflow */
+ /* U0 = yd, U1 = xd */
+ U0 = pSrc16[i3 * 2U] >> 2;
+ U1 = pSrc16[(i3 * 2U) + 1] >> 2;
+ /* T0 = yb-yd */
+ T0 = __SSAT(T0 - U0, 16);
+ /* T1 = xb-xd */
+ T1 = __SSAT(T1 - U1, 16);
+
+ /* R1 = (ya-yc) + (xb- xd), R0 = (xa-xc) - (yb-yd)) */
+ R0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
+ R1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
+
+ /* S1 = (ya-yc) - (xb- xd), S0 = (xa-xc) + (yb-yd)) */
+ S0 = (q15_t) __SSAT(((q31_t) S0 + T1), 16U);
+ S1 = (q15_t) __SSAT(((q31_t) S1 - T0), 16U);
+
+ /* co1 & si1 are read from Coefficient pointer */
+ Co1 = pCoef16[ic * 2U];
+ Si1 = pCoef16[(ic * 2U) + 1];
+ /* Butterfly process for the i0+fftLen/2 sample */
+ /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+ out1 = (q15_t) ((Si1 * S1 + Co1 * S0) >> 16);
+ /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+ out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16);
+
+ /* writing output(xb', yb') in little endian format */
+ pSrc16[i2 * 2U] = out1;
+ pSrc16[(i2 * 2U) + 1] = out2;
+
+ /* Co3 & si3 are read from Coefficient pointer */
+ Co3 = pCoef16[3U * (ic * 2U)];
+ Si3 = pCoef16[(3U * (ic * 2U)) + 1];
+ /* Butterfly process for the i0+3fftLen/4 sample */
+ /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
+ out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
+ /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
+ out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
+ /* writing output(xd', yd') in little endian format */
+ pSrc16[i3 * 2U] = out1;
+ pSrc16[(i3 * 2U) + 1] = out2;
+
+ /* Twiddle coefficients index modifier */
+ ic = ic + twidCoefModifier;
+
+ /* Updating input index */
+ i0 = i0 + 1U;
+
+ } while (--j);
+ /* data is in 4.11(q11) format */
+
+ /* end of first stage process */
+
+
+ /* start of middle stage process */
+
+ /* Twiddle coefficients index modifier */
+ twidCoefModifier <<= 2U;
+
+ /* Calculation of Middle stage */
+ for (k = fftLen / 4U; k > 4U; k >>= 2U)
+ {
+ /* Initializations for the middle stage */
+ n1 = n2;
+ n2 >>= 2U;
+ ic = 0U;
+
+ for (j = 0U; j <= (n2 - 1U); j++)
+ {
+ /* index calculation for the coefficients */
+ Co1 = pCoef16[ic * 2U];
+ Si1 = pCoef16[(ic * 2U) + 1U];
+ Co2 = pCoef16[2U * (ic * 2U)];
+ Si2 = pCoef16[(2U * (ic * 2U)) + 1U];
+ Co3 = pCoef16[3U * (ic * 2U)];
+ Si3 = pCoef16[(3U * (ic * 2U)) + 1U];
+
+ /* Twiddle coefficients index modifier */
+ ic = ic + twidCoefModifier;
+
+ /* Butterfly implementation */
+ for (i0 = j; i0 < fftLen; i0 += n1)
+ {
+ /* index calculation for the input as, */
+ /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* Reading i0, i0+fftLen/2 inputs */
+ /* Read ya (real), xa(imag) input */
+ T0 = pSrc16[i0 * 2U];
+ T1 = pSrc16[(i0 * 2U) + 1U];
+
+ /* Read yc (real), xc(imag) input */
+ S0 = pSrc16[i2 * 2U];
+ S1 = pSrc16[(i2 * 2U) + 1U];
+
+ /* R0 = (ya + yc), R1 = (xa + xc) */
+ R0 = __SSAT(T0 + S0, 16);
+ R1 = __SSAT(T1 + S1, 16);
+
+ /* S0 = (ya - yc), S1 =(xa - xc) */
+ S0 = __SSAT(T0 - S0, 16);
+ S1 = __SSAT(T1 - S1, 16);
+
+ /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+ /* Read yb (real), xb(imag) input */
+ T0 = pSrc16[i1 * 2U];
+ T1 = pSrc16[(i1 * 2U) + 1U];
+
+ /* Read yd (real), xd(imag) input */
+ U0 = pSrc16[i3 * 2U];
+ U1 = pSrc16[(i3 * 2U) + 1U];
+
+
+ /* T0 = (yb + yd), T1 = (xb + xd) */
+ T0 = __SSAT(T0 + U0, 16);
+ T1 = __SSAT(T1 + U1, 16);
+
+ /* writing the butterfly processed i0 sample */
+
+ /* xa' = xa + xb + xc + xd */
+ /* ya' = ya + yb + yc + yd */
+ out1 = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
+ out2 = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
+
+ pSrc16[i0 * 2U] = out1;
+ pSrc16[(2U * i0) + 1U] = out2;
+
+ /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
+ R0 = (R0 >> 1U) - (T0 >> 1U);
+ R1 = (R1 >> 1U) - (T1 >> 1U);
+
+ /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
+ out1 = (q15_t) ((Co2 * R0 + Si2 * R1) >> 16U);
+
+ /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+ out2 = (q15_t) ((-Si2 * R0 + Co2 * R1) >> 16U);
+
+ /* Reading i0+3fftLen/4 */
+ /* Read yb (real), xb(imag) input */
+ T0 = pSrc16[i1 * 2U];
+ T1 = pSrc16[(i1 * 2U) + 1U];
+
+ /* writing the butterfly processed i0 + fftLen/4 sample */
+ /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+ /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+ pSrc16[i1 * 2U] = out1;
+ pSrc16[(i1 * 2U) + 1U] = out2;
+
+ /* Butterfly calculations */
+
+ /* Read yd (real), xd(imag) input */
+ U0 = pSrc16[i3 * 2U];
+ U1 = pSrc16[(i3 * 2U) + 1U];
+
+ /* T0 = yb-yd, T1 = xb-xd */
+ T0 = __SSAT(T0 - U0, 16);
+ T1 = __SSAT(T1 - U1, 16);
+
+ /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd)) */
+ R0 = (S0 >> 1U) - (T1 >> 1U);
+ R1 = (S1 >> 1U) + (T0 >> 1U);
+
+ /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd)) */
+ S0 = (S0 >> 1U) + (T1 >> 1U);
+ S1 = (S1 >> 1U) - (T0 >> 1U);
+
+ /* Butterfly process for the i0+fftLen/2 sample */
+ out1 = (q15_t) ((Co1 * S0 + Si1 * S1) >> 16U);
+
+ out2 = (q15_t) ((-Si1 * S0 + Co1 * S1) >> 16U);
+
+ /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+ /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+ pSrc16[i2 * 2U] = out1;
+ pSrc16[(i2 * 2U) + 1U] = out2;
+
+ /* Butterfly process for the i0+3fftLen/4 sample */
+ out1 = (q15_t) ((Si3 * R1 + Co3 * R0) >> 16U);
+
+ out2 = (q15_t) ((-Si3 * R0 + Co3 * R1) >> 16U);
+ /* xd' = (xa-yb-xc+yd)* Co3 + (ya+xb-yc-xd)* (si3) */
+ /* yd' = (ya+xb-yc-xd)* Co3 - (xa-yb-xc+yd)* (si3) */
+ pSrc16[i3 * 2U] = out1;
+ pSrc16[(i3 * 2U) + 1U] = out2;
+ }
+ }
+ /* Twiddle coefficients index modifier */
+ twidCoefModifier <<= 2U;
+ }
+ /* end of middle stage process */
+
+
+ /* data is in 10.6(q6) format for the 1024 point */
+ /* data is in 8.8(q8) format for the 256 point */
+ /* data is in 6.10(q10) format for the 64 point */
+ /* data is in 4.12(q12) format for the 16 point */
+
+ /* Initializations for the last stage */
+ n1 = n2;
+ n2 >>= 2U;
+
+ /* start of last stage process */
+
+ /* Butterfly implementation */
+ for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
+ {
+ /* index calculation for the input as, */
+ /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* Reading i0, i0+fftLen/2 inputs */
+ /* Read ya (real), xa(imag) input */
+ T0 = pSrc16[i0 * 2U];
+ T1 = pSrc16[(i0 * 2U) + 1U];
+
+ /* Read yc (real), xc(imag) input */
+ S0 = pSrc16[i2 * 2U];
+ S1 = pSrc16[(i2 * 2U) + 1U];
+
+ /* R0 = (ya + yc), R1 = (xa + xc) */
+ R0 = __SSAT(T0 + S0, 16U);
+ R1 = __SSAT(T1 + S1, 16U);
+
+ /* S0 = (ya - yc), S1 = (xa - xc) */
+ S0 = __SSAT(T0 - S0, 16U);
+ S1 = __SSAT(T1 - S1, 16U);
+
+ /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+ /* Read yb (real), xb(imag) input */
+ T0 = pSrc16[i1 * 2U];
+ T1 = pSrc16[(i1 * 2U) + 1U];
+ /* Read yd (real), xd(imag) input */
+ U0 = pSrc16[i3 * 2U];
+ U1 = pSrc16[(i3 * 2U) + 1U];
+
+ /* T0 = (yb + yd), T1 = (xb + xd)) */
+ T0 = __SSAT(T0 + U0, 16U);
+ T1 = __SSAT(T1 + U1, 16U);
+
+ /* writing the butterfly processed i0 sample */
+ /* xa' = xa + xb + xc + xd */
+ /* ya' = ya + yb + yc + yd */
+ pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
+ pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
+
+ /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
+ R0 = (R0 >> 1U) - (T0 >> 1U);
+ R1 = (R1 >> 1U) - (T1 >> 1U);
+ /* Read yb (real), xb(imag) input */
+ T0 = pSrc16[i1 * 2U];
+ T1 = pSrc16[(i1 * 2U) + 1U];
+
+ /* writing the butterfly processed i0 + fftLen/4 sample */
+ /* xc' = (xa-xb+xc-xd) */
+ /* yc' = (ya-yb+yc-yd) */
+ pSrc16[i1 * 2U] = R0;
+ pSrc16[(i1 * 2U) + 1U] = R1;
+
+ /* Read yd (real), xd(imag) input */
+ U0 = pSrc16[i3 * 2U];
+ U1 = pSrc16[(i3 * 2U) + 1U];
+ /* T0 = (yb - yd), T1 = (xb - xd) */
+ T0 = __SSAT(T0 - U0, 16U);
+ T1 = __SSAT(T1 - U1, 16U);
+
+ /* writing the butterfly processed i0 + fftLen/2 sample */
+ /* xb' = (xa+yb-xc-yd) */
+ /* yb' = (ya-xb-yc+xd) */
+ pSrc16[i2 * 2U] = (S0 >> 1U) + (T1 >> 1U);
+ pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
+
+ /* writing the butterfly processed i0 + 3fftLen/4 sample */
+ /* xd' = (xa-yb-xc+yd) */
+ /* yd' = (ya+xb-yc-xd) */
+ pSrc16[i3 * 2U] = (S0 >> 1U) - (T1 >> 1U);
+ pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
+
+ }
+
+ /* end of last stage process */
+
+ /* output is in 11.5(q5) format for the 1024 point */
+ /* output is in 9.7(q7) format for the 256 point */
+ /* output is in 7.9(q9) format for the 64 point */
+ /* output is in 5.11(q11) format for the 16 point */
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
+
+
+/**
+ @brief Core function for the Q15 CIFFT butterfly process.
+ @param[in,out] pSrc16 points to the in-place buffer of Q15 data type
+ @param[in] fftLen length of the FFT
+ @param[in] pCoef16 points to twiddle coefficient buffer
+ @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+ @return none
+ */
+
+/*
+ * Radix-4 IFFT algorithm used is :
+ *
+ * CIFFT uses same twiddle coefficients as CFFT function
+ * x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
+ *
+ *
+ * IFFT is implemented with following changes in equations from FFT
+ *
+ * Input real and imaginary data:
+ * x(n) = xa + j * ya
+ * x(n+N/4 ) = xb + j * yb
+ * x(n+N/2 ) = xc + j * yc
+ * x(n+3N/4) = xd + j * yd
+ *
+ *
+ * Output real and imaginary data:
+ * x(4r) = xa'+ j * ya'
+ * x(4r+1) = xb'+ j * yb'
+ * x(4r+2) = xc'+ j * yc'
+ * x(4r+3) = xd'+ j * yd'
+ *
+ *
+ * Twiddle factors for radix-4 IFFT:
+ * Wn = co1 + j * (si1)
+ * W2n = co2 + j * (si2)
+ * W3n = co3 + j * (si3)
+
+ * The real and imaginary output values for the radix-4 butterfly are
+ * xa' = xa + xb + xc + xd
+ * ya' = ya + yb + yc + yd
+ * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
+ * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
+ * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
+ * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
+ * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
+ * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
+ *
+ */
+
+void arm_radix4_butterfly_inverse_q15(
+ q15_t * pSrc16,
+ uint32_t fftLen,
+ const q15_t * pCoef16,
+ uint32_t twidCoefModifier)
+{
+
+#if defined (ARM_MATH_DSP)
+
+ q31_t R, S, T, U;
+ q31_t C1, C2, C3, out1, out2;
+ uint32_t n1, n2, ic, i0, j, k;
+
+ q15_t *ptr1;
+ q15_t *pSi0;
+ q15_t *pSi1;
+ q15_t *pSi2;
+ q15_t *pSi3;
+
+ q31_t xaya, xbyb, xcyc, xdyd;
+
+ /* Total process is divided into three stages */
+
+ /* process first stage, middle stages, & last stage */
+
+ /* Initializations for the first stage */
+ n2 = fftLen;
+ n1 = n2;
+
+ /* n2 = fftLen/4 */
+ n2 >>= 2U;
+
+ /* Index for twiddle coefficient */
+ ic = 0U;
+
+ /* Index for input read and output write */
+ j = n2;
+
+ pSi0 = pSrc16;
+ pSi1 = pSi0 + 2 * n2;
+ pSi2 = pSi1 + 2 * n2;
+ pSi3 = pSi2 + 2 * n2;
+
+ /* Input is in 1.15(q15) format */
+
+ /* start of first stage process */
+ do
+ {
+ /* Butterfly implementation */
+
+ /* Reading i0, i0+fftLen/2 inputs */
+ /* Read ya (real), xa(imag) input */
+ T = read_q15x2 (pSi0);
+ T = __SHADD16(T, 0);
+ T = __SHADD16(T, 0);
+
+ /* Read yc (real), xc(imag) input */
+ S = read_q15x2 (pSi2);
+ S = __SHADD16(S, 0);
+ S = __SHADD16(S, 0);
+
+ /* R = packed((ya + yc), (xa + xc) ) */
+ R = __QADD16(T, S);
+
+ /* S = packed((ya - yc), (xa - xc) ) */
+ S = __QSUB16(T, S);
+
+ /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+ /* Read yb (real), xb(imag) input */
+ T = read_q15x2 (pSi1);
+ T = __SHADD16(T, 0);
+ T = __SHADD16(T, 0);
+
+ /* Read yd (real), xd(imag) input */
+ U = read_q15x2 (pSi3);
+ U = __SHADD16(U, 0);
+ U = __SHADD16(U, 0);
+
+ /* T = packed((yb + yd), (xb + xd) ) */
+ T = __QADD16(T, U);
+
+ /* writing the butterfly processed i0 sample */
+ /* xa' = xa + xb + xc + xd */
+ /* ya' = ya + yb + yc + yd */
+ write_q15x2_ia (&pSi0, __SHADD16(R, T));
+
+ /* R = packed((ya + yc) - (yb + yd), (xa + xc)- (xb + xd)) */
+ R = __QSUB16(R, T);
+
+ /* co2 & si2 are read from SIMD Coefficient pointer */
+ C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+ out1 = __SMUSD(C2, R) >> 16U;
+ /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+ out2 = __SMUADX(C2, R);
+#else
+ /* xc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+ out1 = __SMUADX(C2, R) >> 16U;
+ /* yc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+ out2 = __SMUSD(__QSUB16(0, C2), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Reading i0+fftLen/4 */
+ /* T = packed(yb, xb) */
+ T = read_q15x2 (pSi1);
+ T = __SHADD16(T, 0);
+ T = __SHADD16(T, 0);
+
+ /* writing the butterfly processed i0 + fftLen/4 sample */
+ /* writing output(xc', yc') in little endian format */
+ write_q15x2_ia (&pSi1, (q31_t) __PKHBT( out1, out2, 0 ));
+
+ /* Butterfly calculations */
+ /* U = packed(yd, xd) */
+ U = read_q15x2 (pSi3);
+ U = __SHADD16(U, 0);
+ U = __SHADD16(U, 0);
+
+ /* T = packed(yb-yd, xb-xd) */
+ T = __QSUB16(T, U);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
+ R = __QSAX(S, T);
+ /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
+ S = __QASX(S, T);
+#else
+ /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
+ R = __QASX(S, T);
+ /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
+ S = __QSAX(S, T);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* co1 & si1 are read from SIMD Coefficient pointer */
+ C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
+ /* Butterfly process for the i0+fftLen/2 sample */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+ out1 = __SMUSD(C1, S) >> 16U;
+ /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+ out2 = __SMUADX(C1, S);
+#else
+ /* xb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+ out1 = __SMUADX(C1, S) >> 16U;
+ /* yb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+ out2 = __SMUSD(__QSUB16(0, C1), S);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* writing output(xb', yb') in little endian format */
+ write_q15x2_ia (&pSi2, __PKHBT( out1, out2, 0 ));
+
+ /* co3 & si3 are read from SIMD Coefficient pointer */
+ C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
+ /* Butterfly process for the i0+3fftLen/4 sample */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
+ out1 = __SMUSD(C3, R) >> 16U;
+ /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
+ out2 = __SMUADX(C3, R);
+#else
+ /* xd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
+ out1 = __SMUADX(C3, R) >> 16U;
+ /* yd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
+ out2 = __SMUSD(__QSUB16(0, C3), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* writing output(xd', yd') in little endian format */
+ write_q15x2_ia (&pSi3, __PKHBT( out1, out2, 0 ));
+
+ /* Twiddle coefficients index modifier */
+ ic = ic + twidCoefModifier;
+
+ } while (--j);
+ /* data is in 4.11(q11) format */
+
+ /* end of first stage process */
+
+
+ /* start of middle stage process */
+
+ /* Twiddle coefficients index modifier */
+ twidCoefModifier <<= 2U;
+
+ /* Calculation of Middle stage */
+ for (k = fftLen / 4U; k > 4U; k >>= 2U)
+ {
+ /* Initializations for the middle stage */
+ n1 = n2;
+ n2 >>= 2U;
+ ic = 0U;
+
+ for (j = 0U; j <= (n2 - 1U); j++)
+ {
+ /* index calculation for the coefficients */
+ C1 = read_q15x2 ((q15_t *) pCoef16 + (2U * ic));
+ C2 = read_q15x2 ((q15_t *) pCoef16 + (4U * ic));
+ C3 = read_q15x2 ((q15_t *) pCoef16 + (6U * ic));
+
+ /* Twiddle coefficients index modifier */
+ ic = ic + twidCoefModifier;
+
+ pSi0 = pSrc16 + 2 * j;
+ pSi1 = pSi0 + 2 * n2;
+ pSi2 = pSi1 + 2 * n2;
+ pSi3 = pSi2 + 2 * n2;
+
+ /* Butterfly implementation */
+ for (i0 = j; i0 < fftLen; i0 += n1)
+ {
+ /* Reading i0, i0+fftLen/2 inputs */
+ /* Read ya (real), xa(imag) input */
+ T = read_q15x2 (pSi0);
+
+ /* Read yc (real), xc(imag) input */
+ S = read_q15x2 (pSi2);
+
+ /* R = packed( (ya + yc), (xa + xc)) */
+ R = __QADD16(T, S);
+
+ /* S = packed((ya - yc), (xa - xc)) */
+ S = __QSUB16(T, S);
+
+ /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+ /* Read yb (real), xb(imag) input */
+ T = read_q15x2 (pSi1);
+
+ /* Read yd (real), xd(imag) input */
+ U = read_q15x2 (pSi3);
+
+ /* T = packed( (yb + yd), (xb + xd)) */
+ T = __QADD16(T, U);
+
+ /* writing the butterfly processed i0 sample */
+
+ /* xa' = xa + xb + xc + xd */
+ /* ya' = ya + yb + yc + yd */
+ out1 = __SHADD16(R, T);
+ out1 = __SHADD16(out1, 0);
+ write_q15x2 (pSi0, out1);
+ pSi0 += 2 * n1;
+
+ /* R = packed( (ya + yc) - (yb + yd), (xa + xc) - (xb + xd)) */
+ R = __SHSUB16(R, T);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
+ out1 = __SMUSD(C2, R) >> 16U;
+
+ /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+ out2 = __SMUADX(C2, R);
+#else
+ /* (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+ out1 = __SMUADX(R, C2) >> 16U;
+
+ /* (ya-yb+yc-yd)* (si2) + (xa-xb+xc-xd)* co2 */
+ out2 = __SMUSD(__QSUB16(0, C2), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Reading i0+3fftLen/4 */
+ /* Read yb (real), xb(imag) input */
+ T = read_q15x2 (pSi1);
+
+ /* writing the butterfly processed i0 + fftLen/4 sample */
+ /* xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2) */
+ /* yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2) */
+ write_q15x2 (pSi1, __PKHBT( out1, out2, 0 ));
+ pSi1 += 2 * n1;
+
+ /* Butterfly calculations */
+
+ /* Read yd (real), xd(imag) input */
+ U = read_q15x2 (pSi3);
+
+ /* T = packed(yb-yd, xb-xd) */
+ T = __QSUB16(T, U);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
+ R = __SHSAX(S, T);
+
+ /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
+ S = __SHASX(S, T);
+
+ /* Butterfly process for the i0+fftLen/2 sample */
+ out1 = __SMUSD(C1, S) >> 16U;
+ out2 = __SMUADX(C1, S);
+#else
+ /* R = packed((ya-yc) + (xb- xd) , (xa-xc) - (yb-yd)) */
+ R = __SHASX(S, T);
+
+ /* S = packed((ya-yc) - (xb- xd), (xa-xc) + (yb-yd)) */
+ S = __SHSAX(S, T);
+
+ /* Butterfly process for the i0+fftLen/2 sample */
+ out1 = __SMUADX(S, C1) >> 16U;
+ out2 = __SMUSD(__QSUB16(0, C1), S);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1) */
+ /* yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1) */
+ write_q15x2 (pSi2, __PKHBT( out1, out2, 0 ));
+ pSi2 += 2 * n1;
+
+ /* Butterfly process for the i0+3fftLen/4 sample */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ out1 = __SMUSD(C3, R) >> 16U;
+ out2 = __SMUADX(C3, R);
+#else
+ out1 = __SMUADX(C3, R) >> 16U;
+ out2 = __SMUSD(__QSUB16(0, C3), R);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3) */
+ /* yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3) */
+ write_q15x2 (pSi3, __PKHBT( out1, out2, 0 ));
+ pSi3 += 2 * n1;
+ }
+ }
+ /* Twiddle coefficients index modifier */
+ twidCoefModifier <<= 2U;
+ }
+ /* end of middle stage process */
+
+ /* data is in 10.6(q6) format for the 1024 point */
+ /* data is in 8.8(q8) format for the 256 point */
+ /* data is in 6.10(q10) format for the 64 point */
+ /* data is in 4.12(q12) format for the 16 point */
+
+ /* Initializations for the last stage */
+ j = fftLen >> 2;
+
+ ptr1 = &pSrc16[0];
+
+ /* start of last stage process */
+
+ /* Butterfly implementation */
+ do
+ {
+ /* Read xa (real), ya(imag) input */
+ xaya = read_q15x2_ia ((q15_t **) &ptr1);
+
+ /* Read xb (real), yb(imag) input */
+ xbyb = read_q15x2_ia ((q15_t **) &ptr1);
+
+ /* Read xc (real), yc(imag) input */
+ xcyc = read_q15x2_ia ((q15_t **) &ptr1);
+
+ /* Read xd (real), yd(imag) input */
+ xdyd = read_q15x2_ia ((q15_t **) &ptr1);
+
+ /* R = packed((ya + yc), (xa + xc)) */
+ R = __QADD16(xaya, xcyc);
+
+ /* T = packed((yb + yd), (xb + xd)) */
+ T = __QADD16(xbyb, xdyd);
+
+ /* pointer updation for writing */
+ ptr1 = ptr1 - 8U;
+
+
+ /* xa' = xa + xb + xc + xd */
+ /* ya' = ya + yb + yc + yd */
+ write_q15x2_ia (&ptr1, __SHADD16(R, T));
+
+ /* T = packed((yb + yd), (xb + xd)) */
+ T = __QADD16(xbyb, xdyd);
+
+ /* xc' = (xa-xb+xc-xd) */
+ /* yc' = (ya-yb+yc-yd) */
+ write_q15x2_ia (&ptr1, __SHSUB16(R, T));
+
+ /* S = packed((ya - yc), (xa - xc)) */
+ S = __QSUB16(xaya, xcyc);
+
+ /* Read yd (real), xd(imag) input */
+ /* T = packed( (yb - yd), (xb - xd)) */
+ U = __QSUB16(xbyb, xdyd);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* xb' = (xa+yb-xc-yd) */
+ /* yb' = (ya-xb-yc+xd) */
+ write_q15x2_ia (&ptr1, __SHASX(S, U));
+
+ /* xd' = (xa-yb-xc+yd) */
+ /* yd' = (ya+xb-yc-xd) */
+ write_q15x2_ia (&ptr1, __SHSAX(S, U));
+#else
+ /* xb' = (xa+yb-xc-yd) */
+ /* yb' = (ya-xb-yc+xd) */
+ write_q15x2_ia (&ptr1, __SHSAX(S, U));
+
+ /* xd' = (xa-yb-xc+yd) */
+ /* yd' = (ya+xb-yc-xd) */
+ write_q15x2_ia (&ptr1, __SHASX(S, U));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ } while (--j);
+
+ /* end of last stage process */
+
+ /* output is in 11.5(q5) format for the 1024 point */
+ /* output is in 9.7(q7) format for the 256 point */
+ /* output is in 7.9(q9) format for the 64 point */
+ /* output is in 5.11(q11) format for the 16 point */
+
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+ q15_t R0, R1, S0, S1, T0, T1, U0, U1;
+ q15_t Co1, Si1, Co2, Si2, Co3, Si3, out1, out2;
+ uint32_t n1, n2, ic, i0, i1, i2, i3, j, k;
+
+ /* Total process is divided into three stages */
+
+ /* process first stage, middle stages, & last stage */
+
+ /* Initializations for the first stage */
+ n2 = fftLen;
+ n1 = n2;
+
+ /* n2 = fftLen/4 */
+ n2 >>= 2U;
+
+ /* Index for twiddle coefficient */
+ ic = 0U;
+
+ /* Index for input read and output write */
+ i0 = 0U;
+
+ j = n2;
+
+ /* Input is in 1.15(q15) format */
+
+ /* Start of first stage process */
+ do
+ {
+ /* Butterfly implementation */
+
+ /* index calculation for the input as, */
+ /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* Reading i0, i0+fftLen/2 inputs */
+ /* input is down scale by 4 to avoid overflow */
+ /* Read ya (real), xa(imag) input */
+ T0 = pSrc16[i0 * 2U] >> 2U;
+ T1 = pSrc16[(i0 * 2U) + 1U] >> 2U;
+ /* input is down scale by 4 to avoid overflow */
+ /* Read yc (real), xc(imag) input */
+ S0 = pSrc16[i2 * 2U] >> 2U;
+ S1 = pSrc16[(i2 * 2U) + 1U] >> 2U;
+
+ /* R0 = (ya + yc), R1 = (xa + xc) */
+ R0 = __SSAT(T0 + S0, 16U);
+ R1 = __SSAT(T1 + S1, 16U);
+ /* S0 = (ya - yc), S1 = (xa - xc) */
+ S0 = __SSAT(T0 - S0, 16U);
+ S1 = __SSAT(T1 - S1, 16U);
+
+ /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+ /* input is down scale by 4 to avoid overflow */
+ /* Read yb (real), xb(imag) input */
+ T0 = pSrc16[i1 * 2U] >> 2U;
+ T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
+ /* Read yd (real), xd(imag) input */
+ /* input is down scale by 4 to avoid overflow */
+ U0 = pSrc16[i3 * 2U] >> 2U;
+ U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
+
+ /* T0 = (yb + yd), T1 = (xb + xd) */
+ T0 = __SSAT(T0 + U0, 16U);
+ T1 = __SSAT(T1 + U1, 16U);
+
+ /* writing the butterfly processed i0 sample */
+ /* xa' = xa + xb + xc + xd */
+ /* ya' = ya + yb + yc + yd */
+ pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
+ pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
+
+ /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc)- (xb + xd) */
+ R0 = __SSAT(R0 - T0, 16U);
+ R1 = __SSAT(R1 - T1, 16U);
+ /* co2 & si2 are read from Coefficient pointer */
+ Co2 = pCoef16[2U * ic * 2U];
+ Si2 = pCoef16[(2U * ic * 2U) + 1U];
+ /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
+ out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16U);
+ /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
+ out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16U);
+
+ /* Reading i0+fftLen/4 */
+ /* input is down scale by 4 to avoid overflow */
+ /* T0 = yb, T1 = xb */
+ T0 = pSrc16[i1 * 2U] >> 2U;
+ T1 = pSrc16[(i1 * 2U) + 1U] >> 2U;
+
+ /* writing the butterfly processed i0 + fftLen/4 sample */
+ /* writing output(xc', yc') in little endian format */
+ pSrc16[i1 * 2U] = out1;
+ pSrc16[(i1 * 2U) + 1U] = out2;
+
+ /* Butterfly calculations */
+ /* input is down scale by 4 to avoid overflow */
+ /* U0 = yd, U1 = xd */
+ U0 = pSrc16[i3 * 2U] >> 2U;
+ U1 = pSrc16[(i3 * 2U) + 1U] >> 2U;
+
+ /* T0 = yb-yd, T1 = xb-xd */
+ T0 = __SSAT(T0 - U0, 16U);
+ T1 = __SSAT(T1 - U1, 16U);
+ /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd) */
+ R0 = (q15_t) __SSAT((q31_t) (S0 + T1), 16);
+ R1 = (q15_t) __SSAT((q31_t) (S1 - T0), 16);
+ /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd) */
+ S0 = (q15_t) __SSAT((q31_t) (S0 - T1), 16);
+ S1 = (q15_t) __SSAT((q31_t) (S1 + T0), 16);
+
+ /* co1 & si1 are read from Coefficient pointer */
+ Co1 = pCoef16[ic * 2U];
+ Si1 = pCoef16[(ic * 2U) + 1U];
+ /* Butterfly process for the i0+fftLen/2 sample */
+ /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
+ out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
+ /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
+ out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
+ /* writing output(xb', yb') in little endian format */
+ pSrc16[i2 * 2U] = out1;
+ pSrc16[(i2 * 2U) + 1U] = out2;
+
+ /* Co3 & si3 are read from Coefficient pointer */
+ Co3 = pCoef16[3U * ic * 2U];
+ Si3 = pCoef16[(3U * ic * 2U) + 1U];
+ /* Butterfly process for the i0+3fftLen/4 sample */
+ /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
+ out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
+ /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
+ out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
+ /* writing output(xd', yd') in little endian format */
+ pSrc16[i3 * 2U] = out1;
+ pSrc16[(i3 * 2U) + 1U] = out2;
+
+ /* Twiddle coefficients index modifier */
+ ic = ic + twidCoefModifier;
+
+ /* Updating input index */
+ i0 = i0 + 1U;
+
+ } while (--j);
+
+ /* End of first stage process */
+
+ /* data is in 4.11(q11) format */
+
+
+ /* Start of Middle stage process */
+
+ /* Twiddle coefficients index modifier */
+ twidCoefModifier <<= 2U;
+
+ /* Calculation of Middle stage */
+ for (k = fftLen / 4U; k > 4U; k >>= 2U)
+ {
+ /* Initializations for the middle stage */
+ n1 = n2;
+ n2 >>= 2U;
+ ic = 0U;
+
+ for (j = 0U; j <= (n2 - 1U); j++)
+ {
+ /* index calculation for the coefficients */
+ Co1 = pCoef16[ic * 2U];
+ Si1 = pCoef16[(ic * 2U) + 1U];
+ Co2 = pCoef16[2U * ic * 2U];
+ Si2 = pCoef16[2U * ic * 2U + 1U];
+ Co3 = pCoef16[3U * ic * 2U];
+ Si3 = pCoef16[(3U * ic * 2U) + 1U];
+
+ /* Twiddle coefficients index modifier */
+ ic = ic + twidCoefModifier;
+
+ /* Butterfly implementation */
+ for (i0 = j; i0 < fftLen; i0 += n1)
+ {
+ /* index calculation for the input as, */
+ /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* Reading i0, i0+fftLen/2 inputs */
+ /* Read ya (real), xa(imag) input */
+ T0 = pSrc16[i0 * 2U];
+ T1 = pSrc16[(i0 * 2U) + 1U];
+
+ /* Read yc (real), xc(imag) input */
+ S0 = pSrc16[i2 * 2U];
+ S1 = pSrc16[(i2 * 2U) + 1U];
+
+
+ /* R0 = (ya + yc), R1 = (xa + xc) */
+ R0 = __SSAT(T0 + S0, 16U);
+ R1 = __SSAT(T1 + S1, 16U);
+ /* S0 = (ya - yc), S1 = (xa - xc) */
+ S0 = __SSAT(T0 - S0, 16U);
+ S1 = __SSAT(T1 - S1, 16U);
+
+ /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+ /* Read yb (real), xb(imag) input */
+ T0 = pSrc16[i1 * 2U];
+ T1 = pSrc16[(i1 * 2U) + 1U];
+
+ /* Read yd (real), xd(imag) input */
+ U0 = pSrc16[i3 * 2U];
+ U1 = pSrc16[(i3 * 2U) + 1U];
+
+ /* T0 = (yb + yd), T1 = (xb + xd) */
+ T0 = __SSAT(T0 + U0, 16U);
+ T1 = __SSAT(T1 + U1, 16U);
+
+ /* writing the butterfly processed i0 sample */
+ /* xa' = xa + xb + xc + xd */
+ /* ya' = ya + yb + yc + yd */
+ pSrc16[i0 * 2U] = ((R0 >> 1U) + (T0 >> 1U)) >> 1U;
+ pSrc16[(i0 * 2U) + 1U] = ((R1 >> 1U) + (T1 >> 1U)) >> 1U;
+
+ /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
+ R0 = (R0 >> 1U) - (T0 >> 1U);
+ R1 = (R1 >> 1U) - (T1 >> 1U);
+
+ /* (ya-yb+yc-yd)* (si2) - (xa-xb+xc-xd)* co2 */
+ out1 = (q15_t) ((Co2 * R0 - Si2 * R1) >> 16);
+ /* (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
+ out2 = (q15_t) ((Si2 * R0 + Co2 * R1) >> 16);
+
+ /* Reading i0+3fftLen/4 */
+ /* Read yb (real), xb(imag) input */
+ T0 = pSrc16[i1 * 2U];
+ T1 = pSrc16[(i1 * 2U) + 1U];
+
+ /* writing the butterfly processed i0 + fftLen/4 sample */
+ /* xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2) */
+ /* yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2) */
+ pSrc16[i1 * 2U] = out1;
+ pSrc16[(i1 * 2U) + 1U] = out2;
+
+ /* Butterfly calculations */
+ /* Read yd (real), xd(imag) input */
+ U0 = pSrc16[i3 * 2U];
+ U1 = pSrc16[(i3 * 2U) + 1U];
+
+ /* T0 = yb-yd, T1 = xb-xd */
+ T0 = __SSAT(T0 - U0, 16U);
+ T1 = __SSAT(T1 - U1, 16U);
+
+ /* R0 = (ya-yc) + (xb- xd), R1 = (xa-xc) - (yb-yd) */
+ R0 = (S0 >> 1U) + (T1 >> 1U);
+ R1 = (S1 >> 1U) - (T0 >> 1U);
+
+ /* S0 = (ya-yc) - (xb- xd), S1 = (xa-xc) + (yb-yd) */
+ S0 = (S0 >> 1U) - (T1 >> 1U);
+ S1 = (S1 >> 1U) + (T0 >> 1U);
+
+ /* Butterfly process for the i0+fftLen/2 sample */
+ out1 = (q15_t) ((Co1 * S0 - Si1 * S1) >> 16U);
+ out2 = (q15_t) ((Si1 * S0 + Co1 * S1) >> 16U);
+ /* xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1) */
+ /* yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1) */
+ pSrc16[i2 * 2U] = out1;
+ pSrc16[(i2 * 2U) + 1U] = out2;
+
+ /* Butterfly process for the i0+3fftLen/4 sample */
+ out1 = (q15_t) ((Co3 * R0 - Si3 * R1) >> 16U);
+
+ out2 = (q15_t) ((Si3 * R0 + Co3 * R1) >> 16U);
+ /* xd' = (xa+yb-xc-yd)* Co3 - (ya-xb-yc+xd)* (si3) */
+ /* yd' = (ya-xb-yc+xd)* Co3 + (xa+yb-xc-yd)* (si3) */
+ pSrc16[i3 * 2U] = out1;
+ pSrc16[(i3 * 2U) + 1U] = out2;
+
+
+ }
+ }
+ /* Twiddle coefficients index modifier */
+ twidCoefModifier <<= 2U;
+ }
+ /* End of Middle stages process */
+
+
+ /* data is in 10.6(q6) format for the 1024 point */
+ /* data is in 8.8(q8) format for the 256 point */
+ /* data is in 6.10(q10) format for the 64 point */
+ /* data is in 4.12(q12) format for the 16 point */
+
+ /* start of last stage process */
+
+
+ /* Initializations for the last stage */
+ n1 = n2;
+ n2 >>= 2U;
+
+ /* Butterfly implementation */
+ for (i0 = 0U; i0 <= (fftLen - n1); i0 += n1)
+ {
+ /* index calculation for the input as, */
+ /* pSrc16[i0 + 0], pSrc16[i0 + fftLen/4], pSrc16[i0 + fftLen/2], pSrc16[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* Reading i0, i0+fftLen/2 inputs */
+ /* Read ya (real), xa(imag) input */
+ T0 = pSrc16[i0 * 2U];
+ T1 = pSrc16[(i0 * 2U) + 1U];
+ /* Read yc (real), xc(imag) input */
+ S0 = pSrc16[i2 * 2U];
+ S1 = pSrc16[(i2 * 2U) + 1U];
+
+ /* R0 = (ya + yc), R1 = (xa + xc) */
+ R0 = __SSAT(T0 + S0, 16U);
+ R1 = __SSAT(T1 + S1, 16U);
+ /* S0 = (ya - yc), S1 = (xa - xc) */
+ S0 = __SSAT(T0 - S0, 16U);
+ S1 = __SSAT(T1 - S1, 16U);
+
+ /* Reading i0+fftLen/4 , i0+3fftLen/4 inputs */
+ /* Read yb (real), xb(imag) input */
+ T0 = pSrc16[i1 * 2U];
+ T1 = pSrc16[(i1 * 2U) + 1U];
+ /* Read yd (real), xd(imag) input */
+ U0 = pSrc16[i3 * 2U];
+ U1 = pSrc16[(i3 * 2U) + 1U];
+
+ /* T0 = (yb + yd), T1 = (xb + xd) */
+ T0 = __SSAT(T0 + U0, 16U);
+ T1 = __SSAT(T1 + U1, 16U);
+
+ /* writing the butterfly processed i0 sample */
+ /* xa' = xa + xb + xc + xd */
+ /* ya' = ya + yb + yc + yd */
+ pSrc16[i0 * 2U] = (R0 >> 1U) + (T0 >> 1U);
+ pSrc16[(i0 * 2U) + 1U] = (R1 >> 1U) + (T1 >> 1U);
+
+ /* R0 = (ya + yc) - (yb + yd), R1 = (xa + xc) - (xb + xd) */
+ R0 = (R0 >> 1U) - (T0 >> 1U);
+ R1 = (R1 >> 1U) - (T1 >> 1U);
+
+ /* Read yb (real), xb(imag) input */
+ T0 = pSrc16[i1 * 2U];
+ T1 = pSrc16[(i1 * 2U) + 1U];
+
+ /* writing the butterfly processed i0 + fftLen/4 sample */
+ /* xc' = (xa-xb+xc-xd) */
+ /* yc' = (ya-yb+yc-yd) */
+ pSrc16[i1 * 2U] = R0;
+ pSrc16[(i1 * 2U) + 1U] = R1;
+
+ /* Read yd (real), xd(imag) input */
+ U0 = pSrc16[i3 * 2U];
+ U1 = pSrc16[(i3 * 2U) + 1U];
+ /* T0 = (yb - yd), T1 = (xb - xd) */
+ T0 = __SSAT(T0 - U0, 16U);
+ T1 = __SSAT(T1 - U1, 16U);
+
+ /* writing the butterfly processed i0 + fftLen/2 sample */
+ /* xb' = (xa-yb-xc+yd) */
+ /* yb' = (ya+xb-yc-xd) */
+ pSrc16[i2 * 2U] = (S0 >> 1U) - (T1 >> 1U);
+ pSrc16[(i2 * 2U) + 1U] = (S1 >> 1U) + (T0 >> 1U);
+
+
+ /* writing the butterfly processed i0 + 3fftLen/4 sample */
+ /* xd' = (xa+yb-xc-yd) */
+ /* yd' = (ya-xb-yc+xd) */
+ pSrc16[i3 * 2U] = (S0 >> 1U) + (T1 >> 1U);
+ pSrc16[(i3 * 2U) + 1U] = (S1 >> 1U) - (T0 >> 1U);
+ }
+ /* end of last stage process */
+
+ /* output is in 11.5(q5) format for the 1024 point */
+ /* output is in 9.7(q7) format for the 256 point */
+ /* output is in 7.9(q9) format for the 64 point */
+ /* output is in 5.11(q11) format for the 16 point */
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+}
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c
new file mode 100644
index 0000000000..d6f5f29a45
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix4_q31.c
@@ -0,0 +1,827 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix4_q31.c
+ * Description: This file has function definition of Radix-4 FFT & IFFT function and
+ * In-place bit reversal using bit reversal table
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+void arm_radix4_butterfly_inverse_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint32_t twidCoefModifier);
+
+void arm_radix4_butterfly_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint32_t twidCoefModifier);
+
+void arm_bitreversal_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab);
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup ComplexFFT
+ @{
+ */
+
+/**
+ @brief Processing function for the Q31 CFFT/CIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_cfft_q31 and will be removed in the future.
+ @param[in] S points to an instance of the Q31 CFFT/CIFFT structure
+ @param[in,out] pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place
+ @return none
+
+ @par Input and output formats:
+ Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
+ Hence the output format is different for different FFT sizes.
+ The input and output formats for different FFT sizes and number of bits to upscale are mentioned in the tables below for CFFT and CIFFT:
+ @par
+ \image html CFFTQ31.gif "Input and Output Formats for Q31 CFFT"
+ \image html CIFFTQ31.gif "Input and Output Formats for Q31 CIFFT"
+ */
+
+void arm_cfft_radix4_q31(
+ const arm_cfft_radix4_instance_q31 * S,
+ q31_t * pSrc)
+{
+ if (S->ifftFlag == 1U)
+ {
+ /* Complex IFFT radix-4 */
+ arm_radix4_butterfly_inverse_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
+ }
+ else
+ {
+ /* Complex FFT radix-4 */
+ arm_radix4_butterfly_q31(pSrc, S->fftLen, S->pTwiddle, S->twidCoefModifier);
+ }
+
+ if (S->bitReverseFlag == 1U)
+ {
+ /* Bit Reversal: reorder the in-place butterfly output into natural order */
+ arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+ }
+
+}
+
+/**
+ @} end of ComplexFFT group
+ */
+
+/*
+ * Radix-4 FFT algorithm used is :
+ *
+ * Input real and imaginary data:
+ * x(n) = xa + j * ya
+ * x(n+N/4 ) = xb + j * yb
+ * x(n+N/2 ) = xc + j * yc
+ * x(n+3N/4) = xd + j * yd
+ *
+ *
+ * Output real and imaginary data:
+ * x(4r) = xa'+ j * ya'
+ * x(4r+1) = xb'+ j * yb'
+ * x(4r+2) = xc'+ j * yc'
+ * x(4r+3) = xd'+ j * yd'
+ *
+ *
+ * Twiddle factors for radix-4 FFT:
+ * Wn = co1 + j * (- si1)
+ * W2n = co2 + j * (- si2)
+ * W3n = co3 + j * (- si3)
+ *
+ * Butterfly implementation:
+ * xa' = xa + xb + xc + xd
+ * ya' = ya + yb + yc + yd
+ * xb' = (xa+yb-xc-yd)* co1 + (ya-xb-yc+xd)* (si1)
+ * yb' = (ya-xb-yc+xd)* co1 - (xa+yb-xc-yd)* (si1)
+ * xc' = (xa-xb+xc-xd)* co2 + (ya-yb+yc-yd)* (si2)
+ * yc' = (ya-yb+yc-yd)* co2 - (xa-xb+xc-xd)* (si2)
+ * xd' = (xa-yb-xc+yd)* co3 + (ya+xb-yc-xd)* (si3)
+ * yd' = (ya+xb-yc-xd)* co3 - (xa-yb-xc+yd)* (si3)
+ *
+ */
+
+/**
+ @brief Core function for the Q31 CFFT butterfly process.
+ @param[in,out] pSrc points to the in-place buffer of Q31 data type.
+ @param[in] fftLen length of the FFT.
+ @param[in] pCoef points to twiddle coefficient buffer.
+ @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+ @return none
+ */
+
+void arm_radix4_butterfly_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint32_t twidCoefModifier)
+{
+ uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
+ q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
+
+ q31_t xa, xb, xc, xd;
+ q31_t ya, yb, yc, yd;
+ q31_t xa_out, xb_out, xc_out, xd_out;
+ q31_t ya_out, yb_out, yc_out, yd_out;
+
+ q31_t *ptr1;
+
+ /* Total process is divided into three stages */
+
+ /* process first stage, middle stages, & last stage */
+
+
+ /* start of first stage process */
+
+ /* Initializations for the first stage */
+ n2 = fftLen;
+ n1 = n2;
+ /* n2 = fftLen/4 */
+ n2 >>= 2U;
+ i0 = 0U;
+ ia1 = 0U;
+
+ j = n2;
+
+ /* Calculation of first stage */
+ do
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* input is in 1.31(q31) format; >> 4U provides 4 guard bits for the additions */
+
+ /* Butterfly implementation */
+ /* xa + xc */
+ r1 = (pSrc[(2U * i0)] >> 4U) + (pSrc[(2U * i2)] >> 4U);
+ /* xa - xc */
+ r2 = (pSrc[(2U * i0)] >> 4U) - (pSrc[(2U * i2)] >> 4U);
+
+ /* xb + xd */
+ t1 = (pSrc[(2U * i1)] >> 4U) + (pSrc[(2U * i3)] >> 4U);
+
+ /* ya + yc */
+ s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
+ /* ya - yc */
+ s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
+
+ /* xa' = xa + xb + xc + xd */
+ pSrc[2U * i0] = (r1 + t1);
+ /* (xa + xc) - (xb + xd) */
+ r1 = r1 - t1;
+ /* yb + yd */
+ t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
+
+ /* ya' = ya + yb + yc + yd */
+ pSrc[(2U * i0) + 1U] = (s1 + t2);
+
+ /* (ya + yc) - (yb + yd) */
+ s1 = s1 - t2;
+
+ /* yb - yd */
+ t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
+ /* xb - xd */
+ t2 = (pSrc[(2U * i1)] >> 4U) - (pSrc[(2U * i3)] >> 4U);
+
+ /* index calculation for the coefficients */
+ ia2 = 2U * ia1;
+ co2 = pCoef[(ia2 * 2U)];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+
+ /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
+ ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
+
+ /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+ pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
+ ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
+
+ /* (xa - xc) + (yb - yd) */
+ r1 = r2 + t1;
+ /* (xa - xc) - (yb - yd) */
+ r2 = r2 - t1;
+
+ /* (ya - yc) - (xb - xd) */
+ s1 = s2 - t2;
+ /* (ya - yc) + (xb - xd) */
+ s2 = s2 + t2;
+
+ co1 = pCoef[(ia1 * 2U)];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+
+ /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+ pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
+ ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
+
+ /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+ pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
+ ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
+
+ /* index calculation for the coefficients */
+ ia3 = 3U * ia1;
+ co3 = pCoef[(ia3 * 2U)];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+
+ /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+ pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
+ ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
+
+ /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+ pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
+ ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
+
+ /* Twiddle coefficients index modifier */
+ ia1 = ia1 + twidCoefModifier;
+
+ /* Updating input index */
+ i0 = i0 + 1U;
+
+ } while (--j);
+
+ /* end of first stage process */
+
+ /* data is in 5.27(q27) format */
+
+
+ /* start of Middle stages process */
+
+
+ /* each stage in middle stages provides two down scaling of the input */
+
+ twidCoefModifier <<= 2U;
+
+
+ for (k = fftLen / 4U; k > 4U; k >>= 2U)
+ {
+ /* Initializations for the first stage */
+ n1 = n2;
+ n2 >>= 2U;
+ ia1 = 0U;
+
+ /* Calculation of first stage */
+ for (j = 0U; j <= (n2 - 1U); j++)
+ {
+ /* index calculation for the coefficients */
+ ia2 = ia1 + ia1;
+ ia3 = ia2 + ia1;
+ co1 = pCoef[(ia1 * 2U)];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+ co2 = pCoef[(ia2 * 2U)];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+ co3 = pCoef[(ia3 * 2U)];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+ /* Twiddle coefficients index modifier */
+ ia1 = ia1 + twidCoefModifier;
+
+ for (i0 = j; i0 < fftLen; i0 += n1)
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* Butterfly implementation */
+ /* xa + xc */
+ r1 = pSrc[2U * i0] + pSrc[2U * i2];
+ /* xa - xc */
+ r2 = pSrc[2U * i0] - pSrc[2U * i2];
+
+ /* ya + yc */
+ s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+ /* ya - yc */
+ s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+ /* xb + xd */
+ t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+ /* xa' = xa + xb + xc + xd */
+ pSrc[2U * i0] = (r1 + t1) >> 2U;
+ /* xa + xc -(xb + xd) */
+ r1 = r1 - t1;
+
+ /* yb + yd */
+ t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+ /* ya' = ya + yb + yc + yd */
+ pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
+
+ /* (ya + yc) - (yb + yd) */
+ s1 = s1 - t2;
+
+ /* (yb - yd) */
+ t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+ /* (xb - xd) */
+ t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+ /* xc' = (xa-xb+xc-xd)co2 + (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) +
+ ((int32_t) (((q63_t) s1 * si2) >> 32))) >> 1U;
+
+ /* yc' = (ya-yb+yc-yd)co2 - (xa-xb+xc-xd)(si2) */
+ pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) -
+ ((int32_t) (((q63_t) r1 * si2) >> 32))) >> 1U;
+
+ /* (xa - xc) + (yb - yd) */
+ r1 = r2 + t1;
+ /* (xa - xc) - (yb - yd) */
+ r2 = r2 - t1;
+
+ /* (ya - yc) - (xb - xd) */
+ s1 = s2 - t2;
+ /* (ya - yc) + (xb - xd) */
+ s2 = s2 + t2;
+
+ /* xb' = (xa+yb-xc-yd)co1 + (ya-xb-yc+xd)(si1) */
+ pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) +
+ ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
+
+ /* yb' = (ya-xb-yc+xd)co1 - (xa+yb-xc-yd)(si1) */
+ pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) -
+ ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
+
+ /* xd' = (xa-yb-xc+yd)co3 + (ya+xb-yc-xd)(si3) */
+ pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) +
+ ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
+
+ /* yd' = (ya+xb-yc-xd)co3 - (xa-yb-xc+yd)(si3) */
+ pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) -
+ ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
+ }
+ }
+ twidCoefModifier <<= 2U;
+ }
+
+ /* End of Middle stages process */
+
+ /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
+ /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
+ /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
+ /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
+
+
+ /* start of Last stage process */
+ /* Initializations for the last stage */
+ j = fftLen >> 2;
+ ptr1 = &pSrc[0];
+
+ /* Calculations of last stage */
+ do
+ {
+ /* Read xa (real), ya(imag) input */
+ xa = *ptr1++;
+ ya = *ptr1++;
+
+ /* Read xb (real), yb(imag) input */
+ xb = *ptr1++;
+ yb = *ptr1++;
+
+ /* Read xc (real), yc(imag) input */
+ xc = *ptr1++;
+ yc = *ptr1++;
+
+ /* Read xd (real), yd(imag) input */
+ xd = *ptr1++;
+ yd = *ptr1++;
+
+ /* xa' = xa + xb + xc + xd */
+ xa_out = xa + xb + xc + xd;
+
+ /* ya' = ya + yb + yc + yd */
+ ya_out = ya + yb + yc + yd;
+
+ /* rewind by 8 values to overwrite the 4 complex inputs just read */
+ ptr1 = ptr1 - 8U;
+
+ /* writing xa' and ya' */
+ *ptr1++ = xa_out;
+ *ptr1++ = ya_out;
+
+ xc_out = (xa - xb + xc - xd);
+ yc_out = (ya - yb + yc - yd);
+
+ /* writing xc' and yc' */
+ *ptr1++ = xc_out;
+ *ptr1++ = yc_out;
+
+ xb_out = (xa + yb - xc - yd);
+ yb_out = (ya - xb - yc + xd);
+
+ /* writing xb' and yb' */
+ *ptr1++ = xb_out;
+ *ptr1++ = yb_out;
+
+ xd_out = (xa - yb - xc + yd);
+ yd_out = (ya + xb - yc - xd);
+
+ /* writing xd' and yd' */
+ *ptr1++ = xd_out;
+ *ptr1++ = yd_out;
+
+
+ } while (--j);
+
+ /* output is in 11.21(q21) format for the 1024 point */
+ /* output is in 9.23(q23) format for the 256 point */
+ /* output is in 7.25(q25) format for the 64 point */
+ /* output is in 5.27(q27) format for the 16 point */
+
+ /* End of last stage process */
+
+}
+
+
+/**
+ @brief Core function for the Q31 CIFFT butterfly process.
+ @param[in,out] pSrc points to the in-place buffer of Q31 data type.
+ @param[in] fftLen length of the FFT.
+ @param[in] pCoef points to twiddle coefficient buffer.
+ @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+ @return none
+ */
+
+/*
+ * Radix-4 IFFT algorithm used is :
+ *
+ * CIFFT uses same twiddle coefficients as CFFT Function
+ * x[k] = x[n] + (j)k * x[n + fftLen/4] + (-1)k * x[n+fftLen/2] + (-j)k * x[n+3*fftLen/4]
+ *
+ *
+ * IFFT is implemented with following changes in equations from FFT
+ *
+ * Input real and imaginary data:
+ * x(n) = xa + j * ya
+ * x(n+N/4 ) = xb + j * yb
+ * x(n+N/2 ) = xc + j * yc
+ * x(n+3N/4) = xd + j * yd
+ *
+ *
+ * Output real and imaginary data:
+ * x(4r) = xa'+ j * ya'
+ * x(4r+1) = xb'+ j * yb'
+ * x(4r+2) = xc'+ j * yc'
+ * x(4r+3) = xd'+ j * yd'
+ *
+ *
+ * Twiddle factors for radix-4 IFFT:
+ * Wn = co1 + j * (si1)
+ * W2n = co2 + j * (si2)
+ * W3n = co3 + j * (si3)
+
+ * The real and imaginary output values for the radix-4 butterfly are
+ * xa' = xa + xb + xc + xd
+ * ya' = ya + yb + yc + yd
+ * xb' = (xa-yb-xc+yd)* co1 - (ya+xb-yc-xd)* (si1)
+ * yb' = (ya+xb-yc-xd)* co1 + (xa-yb-xc+yd)* (si1)
+ * xc' = (xa-xb+xc-xd)* co2 - (ya-yb+yc-yd)* (si2)
+ * yc' = (ya-yb+yc-yd)* co2 + (xa-xb+xc-xd)* (si2)
+ * xd' = (xa+yb-xc-yd)* co3 - (ya-xb-yc+xd)* (si3)
+ * yd' = (ya-xb-yc+xd)* co3 + (xa+yb-xc-yd)* (si3)
+ *
+ */
+
+void arm_radix4_butterfly_inverse_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pCoef,
+ uint32_t twidCoefModifier)
+{
+ uint32_t n1, n2, ia1, ia2, ia3, i0, i1, i2, i3, j, k;
+ q31_t t1, t2, r1, r2, s1, s2, co1, co2, co3, si1, si2, si3;
+ q31_t xa, xb, xc, xd;
+ q31_t ya, yb, yc, yd;
+ q31_t xa_out, xb_out, xc_out, xd_out;
+ q31_t ya_out, yb_out, yc_out, yd_out;
+
+ q31_t *ptr1;
+
+ /* input is in 1.31(q31) format for all FFT sizes */
+ /* Total process is divided into three stages */
+ /* process first stage, middle stages, & last stage */
+
+ /* Start of first stage process */
+
+ /* Initializations for the first stage */
+ n2 = fftLen;
+ n1 = n2;
+ /* n2 = fftLen/4 */
+ n2 >>= 2U;
+ i0 = 0U;
+ ia1 = 0U;
+
+ j = n2;
+
+ do
+ {
+ /* input is in 1.31(q31) format; >> 4U provides 4 guard bits for the additions */
+
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* Butterfly implementation */
+ /* xa + xc */
+ r1 = (pSrc[2U * i0] >> 4U) + (pSrc[2U * i2] >> 4U);
+ /* xa - xc */
+ r2 = (pSrc[2U * i0] >> 4U) - (pSrc[2U * i2] >> 4U);
+
+ /* xb + xd */
+ t1 = (pSrc[2U * i1] >> 4U) + (pSrc[2U * i3] >> 4U);
+
+ /* ya + yc */
+ s1 = (pSrc[(2U * i0) + 1U] >> 4U) + (pSrc[(2U * i2) + 1U] >> 4U);
+ /* ya - yc */
+ s2 = (pSrc[(2U * i0) + 1U] >> 4U) - (pSrc[(2U * i2) + 1U] >> 4U);
+
+ /* xa' = xa + xb + xc + xd */
+ pSrc[2U * i0] = (r1 + t1);
+ /* (xa + xc) - (xb + xd) */
+ r1 = r1 - t1;
+ /* yb + yd */
+ t2 = (pSrc[(2U * i1) + 1U] >> 4U) + (pSrc[(2U * i3) + 1U] >> 4U);
+ /* ya' = ya + yb + yc + yd */
+ pSrc[(2U * i0) + 1U] = (s1 + t2);
+
+ /* (ya + yc) - (yb + yd) */
+ s1 = s1 - t2;
+
+ /* yb - yd */
+ t1 = (pSrc[(2U * i1) + 1U] >> 4U) - (pSrc[(2U * i3) + 1U] >> 4U);
+ /* xb - xd */
+ t2 = (pSrc[2U * i1] >> 4U) - (pSrc[2U * i3] >> 4U);
+
+ /* index calculation for the coefficients */
+ ia2 = 2U * ia1;
+ co2 = pCoef[ia2 * 2U];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32)) -
+ ((int32_t) (((q63_t) s1 * si2) >> 32))) << 1U;
+
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ pSrc[2U * i1 + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32)) +
+ ((int32_t) (((q63_t) r1 * si2) >> 32))) << 1U;
+
+ /* (xa - xc) - (yb - yd) */
+ r1 = r2 - t1;
+ /* (xa - xc) + (yb - yd) */
+ r2 = r2 + t1;
+
+ /* (ya - yc) + (xb - xd) */
+ s1 = s2 + t2;
+ /* (ya - yc) - (xb - xd) */
+ s2 = s2 - t2;
+
+ co1 = pCoef[ia1 * 2U];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+
+ /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+ pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
+ ((int32_t) (((q63_t) s1 * si1) >> 32))) << 1U;
+
+ /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+ pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
+ ((int32_t) (((q63_t) r1 * si1) >> 32))) << 1U;
+
+ /* index calculation for the coefficients */
+ ia3 = 3U * ia1;
+ co3 = pCoef[ia3 * 2U];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+
+ /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+ pSrc[2U * i3] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
+ ((int32_t) (((q63_t) s2 * si3) >> 32))) << 1U;
+
+ /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+ pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
+ ((int32_t) (((q63_t) r2 * si3) >> 32))) << 1U;
+
+ /* Twiddle coefficients index modifier */
+ ia1 = ia1 + twidCoefModifier;
+
+ /* Updating input index */
+ i0 = i0 + 1U;
+
+ } while (--j);
+
+ /* data is in 5.27(q27) format */
+ /* each stage provides two down scaling of the input */
+
+
+ /* Start of Middle stages process */
+
+ twidCoefModifier <<= 2U;
+
+ /* Calculation of second stage up to (excluding) the last stage */
+ for (k = fftLen / 4U; k > 4U; k >>= 2U)
+ {
+ /* Initializations for the first stage */
+ n1 = n2;
+ n2 >>= 2U;
+ ia1 = 0U;
+
+ for (j = 0; j <= (n2 - 1U); j++)
+ {
+ /* index calculation for the coefficients */
+ ia2 = ia1 + ia1;
+ ia3 = ia2 + ia1;
+ co1 = pCoef[(ia1 * 2U)];
+ si1 = pCoef[(ia1 * 2U) + 1U];
+ co2 = pCoef[(ia2 * 2U)];
+ si2 = pCoef[(ia2 * 2U) + 1U];
+ co3 = pCoef[(ia3 * 2U)];
+ si3 = pCoef[(ia3 * 2U) + 1U];
+ /* Twiddle coefficients index modifier */
+ ia1 = ia1 + twidCoefModifier;
+
+ for (i0 = j; i0 < fftLen; i0 += n1)
+ {
+ /* index calculation for the input as, */
+ /* pSrc[i0 + 0], pSrc[i0 + fftLen/4], pSrc[i0 + fftLen/2U], pSrc[i0 + 3fftLen/4] */
+ i1 = i0 + n2;
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+
+ /* Butterfly implementation */
+ /* xa + xc */
+ r1 = pSrc[2U * i0] + pSrc[2U * i2];
+ /* xa - xc */
+ r2 = pSrc[2U * i0] - pSrc[2U * i2];
+
+ /* ya + yc */
+ s1 = pSrc[(2U * i0) + 1U] + pSrc[(2U * i2) + 1U];
+ /* ya - yc */
+ s2 = pSrc[(2U * i0) + 1U] - pSrc[(2U * i2) + 1U];
+
+ /* xb + xd */
+ t1 = pSrc[2U * i1] + pSrc[2U * i3];
+
+ /* xa' = xa + xb + xc + xd */
+ pSrc[2U * i0] = (r1 + t1) >> 2U;
+ /* xa + xc -(xb + xd) */
+ r1 = r1 - t1;
+ /* yb + yd */
+ t2 = pSrc[(2U * i1) + 1U] + pSrc[(2U * i3) + 1U];
+ /* ya' = ya + yb + yc + yd */
+ pSrc[(2U * i0) + 1U] = (s1 + t2) >> 2U;
+
+ /* (ya + yc) - (yb + yd) */
+ s1 = s1 - t2;
+
+ /* (yb - yd) */
+ t1 = pSrc[(2U * i1) + 1U] - pSrc[(2U * i3) + 1U];
+ /* (xb - xd) */
+ t2 = pSrc[2U * i1] - pSrc[2U * i3];
+
+ /* xc' = (xa-xb+xc-xd)co2 - (ya-yb+yc-yd)(si2) */
+ pSrc[2U * i1] = (((int32_t) (((q63_t) r1 * co2) >> 32U)) -
+ ((int32_t) (((q63_t) s1 * si2) >> 32U))) >> 1U;
+
+ /* yc' = (ya-yb+yc-yd)co2 + (xa-xb+xc-xd)(si2) */
+ pSrc[(2U * i1) + 1U] = (((int32_t) (((q63_t) s1 * co2) >> 32U)) +
+ ((int32_t) (((q63_t) r1 * si2) >> 32U))) >> 1U;
+
+ /* (xa - xc) - (yb - yd) */
+ r1 = r2 - t1;
+ /* (xa - xc) + (yb - yd) */
+ r2 = r2 + t1;
+
+ /* (ya - yc) + (xb - xd) */
+ s1 = s2 + t2;
+ /* (ya - yc) - (xb - xd) */
+ s2 = s2 - t2;
+
+ /* xb' = (xa+yb-xc-yd)co1 - (ya-xb-yc+xd)(si1) */
+ pSrc[2U * i2] = (((int32_t) (((q63_t) r1 * co1) >> 32)) -
+ ((int32_t) (((q63_t) s1 * si1) >> 32))) >> 1U;
+
+ /* yb' = (ya-xb-yc+xd)co1 + (xa+yb-xc-yd)(si1) */
+ pSrc[(2U * i2) + 1U] = (((int32_t) (((q63_t) s1 * co1) >> 32)) +
+ ((int32_t) (((q63_t) r1 * si1) >> 32))) >> 1U;
+
+ /* xd' = (xa-yb-xc+yd)co3 - (ya+xb-yc-xd)(si3) */
+ pSrc[(2U * i3)] = (((int32_t) (((q63_t) r2 * co3) >> 32)) -
+ ((int32_t) (((q63_t) s2 * si3) >> 32))) >> 1U;
+
+ /* yd' = (ya+xb-yc-xd)co3 + (xa-yb-xc+yd)(si3) */
+ pSrc[(2U * i3) + 1U] = (((int32_t) (((q63_t) s2 * co3) >> 32)) +
+ ((int32_t) (((q63_t) r2 * si3) >> 32))) >> 1U;
+ }
+ }
+ twidCoefModifier <<= 2U;
+ }
+
+ /* End of Middle stages process */
+
+ /* data is in 11.21(q21) format for the 1024 point as there are 3 middle stages */
+ /* data is in 9.23(q23) format for the 256 point as there are 2 middle stages */
+ /* data is in 7.25(q25) format for the 64 point as there are 1 middle stage */
+ /* data is in 5.27(q27) format for the 16 point as there are no middle stages */
+
+
+ /* Start of last stage process */
+
+
+ /* Initializations for the last stage */
+ j = fftLen >> 2;
+ ptr1 = &pSrc[0];
+
+ /* Calculations of last stage */
+ do
+ {
+ /* Read xa (real), ya(imag) input */
+ xa = *ptr1++;
+ ya = *ptr1++;
+
+ /* Read xb (real), yb(imag) input */
+ xb = *ptr1++;
+ yb = *ptr1++;
+
+ /* Read xc (real), yc(imag) input */
+ xc = *ptr1++;
+ yc = *ptr1++;
+
+ /* Read xd (real), yd(imag) input */
+ xd = *ptr1++;
+ yd = *ptr1++;
+
+ /* xa' = xa + xb + xc + xd */
+ xa_out = xa + xb + xc + xd;
+
+ /* ya' = ya + yb + yc + yd */
+ ya_out = ya + yb + yc + yd;
+
+ /* rewind by 8 values to overwrite the 4 complex inputs just read */
+ ptr1 = ptr1 - 8U;
+
+ /* writing xa' and ya' */
+ *ptr1++ = xa_out;
+ *ptr1++ = ya_out;
+
+ xc_out = (xa - xb + xc - xd);
+ yc_out = (ya - yb + yc - yd);
+
+ /* writing xc' and yc' */
+ *ptr1++ = xc_out;
+ *ptr1++ = yc_out;
+
+ xb_out = (xa - yb - xc + yd);
+ yb_out = (ya + xb - yc - xd);
+
+ /* writing xb' and yb' */
+ *ptr1++ = xb_out;
+ *ptr1++ = yb_out;
+
+ xd_out = (xa + yb - xc - yd);
+ yd_out = (ya - xb - yc + xd);
+
+ /* writing xd' and yd' */
+ *ptr1++ = xd_out;
+ *ptr1++ = yd_out;
+
+ } while (--j);
+
+ /* output is in 11.21(q21) format for the 1024 point */
+ /* output is in 9.23(q23) format for the 256 point */
+ /* output is in 7.25(q25) format for the 64 point */
+ /* output is in 5.27(q27) format for the 16 point */
+
+ /* End of last stage process */
+}
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f16.c
new file mode 100644
index 0000000000..a429d40fe2
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f16.c
@@ -0,0 +1,289 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix8_f16.c
+ * Description: Radix-8 Decimation in Frequency CFFT & CIFFT Floating point processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+/* ----------------------------------------------------------------------
+ * Internal helper function used by the FFTs
+ * -------------------------------------------------------------------- */
+
+/**
+ brief Core function for the floating-point CFFT butterfly process.
+ param[in,out] pSrc points to the in-place buffer of floating-point data type.
+ param[in] fftLen length of the FFT.
+ param[in] pCoef points to the twiddle coefficient buffer.
+ param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+ return none
+*/
+
+void arm_radix8_butterfly_f16(
+ float16_t * pSrc,
+ uint16_t fftLen,
+ const float16_t * pCoef,
+ uint16_t twidCoefModifier)
+{
+ uint32_t ia1, ia2, ia3, ia4, ia5, ia6, ia7;
+ uint32_t i1, i2, i3, i4, i5, i6, i7, i8;
+ uint32_t id;
+ uint32_t n1, n2, j;
+
+ float16_t r1, r2, r3, r4, r5, r6, r7, r8;
+ float16_t t1, t2;
+ float16_t s1, s2, s3, s4, s5, s6, s7, s8;
+ float16_t p1, p2, p3, p4;
+ float16_t co2, co3, co4, co5, co6, co7, co8;
+ float16_t si2, si3, si4, si5, si6, si7, si8;
+ const float16_t C81 = 0.70710678118f; /* 1/sqrt(2) = cos(pi/4) */
+
+ n2 = fftLen;
+
+ do
+ {
+ n1 = n2;
+ n2 = n2 >> 3; /* sub-FFT length for this stage: n1/8 */
+ i1 = 0;
+
+ /* first butterfly group of the stage: twiddles are all 1, so none applied */
+ do
+ {
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+ i4 = i3 + n2;
+ i5 = i4 + n2;
+ i6 = i5 + n2;
+ i7 = i6 + n2;
+ i8 = i7 + n2;
+ r1 = pSrc[2 * i1] + pSrc[2 * i5];
+ r5 = pSrc[2 * i1] - pSrc[2 * i5];
+ r2 = pSrc[2 * i2] + pSrc[2 * i6];
+ r6 = pSrc[2 * i2] - pSrc[2 * i6];
+ r3 = pSrc[2 * i3] + pSrc[2 * i7];
+ r7 = pSrc[2 * i3] - pSrc[2 * i7];
+ r4 = pSrc[2 * i4] + pSrc[2 * i8];
+ r8 = pSrc[2 * i4] - pSrc[2 * i8];
+ t1 = r1 - r3;
+ r1 = r1 + r3;
+ r3 = r2 - r4;
+ r2 = r2 + r4;
+ pSrc[2 * i1] = r1 + r2;
+ pSrc[2 * i5] = r1 - r2;
+ r1 = pSrc[2 * i1 + 1] + pSrc[2 * i5 + 1];
+ s5 = pSrc[2 * i1 + 1] - pSrc[2 * i5 + 1];
+ r2 = pSrc[2 * i2 + 1] + pSrc[2 * i6 + 1];
+ s6 = pSrc[2 * i2 + 1] - pSrc[2 * i6 + 1];
+ s3 = pSrc[2 * i3 + 1] + pSrc[2 * i7 + 1];
+ s7 = pSrc[2 * i3 + 1] - pSrc[2 * i7 + 1];
+ r4 = pSrc[2 * i4 + 1] + pSrc[2 * i8 + 1];
+ s8 = pSrc[2 * i4 + 1] - pSrc[2 * i8 + 1];
+ t2 = r1 - s3;
+ r1 = r1 + s3;
+ s3 = r2 - r4;
+ r2 = r2 + r4;
+ pSrc[2 * i1 + 1] = r1 + r2;
+ pSrc[2 * i5 + 1] = r1 - r2;
+ pSrc[2 * i3] = t1 + s3;
+ pSrc[2 * i7] = t1 - s3;
+ pSrc[2 * i3 + 1] = t2 - r3;
+ pSrc[2 * i7 + 1] = t2 + r3;
+ r1 = (r6 - r8) * C81;
+ r6 = (r6 + r8) * C81;
+ r2 = (s6 - s8) * C81;
+ s6 = (s6 + s8) * C81;
+ t1 = r5 - r1;
+ r5 = r5 + r1;
+ r8 = r7 - r6;
+ r7 = r7 + r6;
+ t2 = s5 - r2;
+ s5 = s5 + r2;
+ s8 = s7 - s6;
+ s7 = s7 + s6;
+ pSrc[2 * i2] = r5 + s7;
+ pSrc[2 * i8] = r5 - s7;
+ pSrc[2 * i6] = t1 + s8;
+ pSrc[2 * i4] = t1 - s8;
+ pSrc[2 * i2 + 1] = s5 - r7;
+ pSrc[2 * i8 + 1] = s5 + r7;
+ pSrc[2 * i6 + 1] = t2 - r8;
+ pSrc[2 * i4 + 1] = t2 + r8;
+
+ i1 += n1;
+ } while (i1 < fftLen);
+
+ if (n2 < 8)
+ break; /* the stage just processed was the last: cannot split by 8 again */
+
+ ia1 = 0;
+ j = 1;
+
+ /* remaining butterfly groups of the stage: outputs multiplied by twiddles */
+ do
+ {
+ /* index calculation for the coefficients */
+ id = ia1 + twidCoefModifier;
+ ia1 = id;
+ ia2 = ia1 + id;
+ ia3 = ia2 + id;
+ ia4 = ia3 + id;
+ ia5 = ia4 + id;
+ ia6 = ia5 + id;
+ ia7 = ia6 + id;
+
+ co2 = pCoef[2 * ia1];
+ co3 = pCoef[2 * ia2];
+ co4 = pCoef[2 * ia3];
+ co5 = pCoef[2 * ia4];
+ co6 = pCoef[2 * ia5];
+ co7 = pCoef[2 * ia6];
+ co8 = pCoef[2 * ia7];
+ si2 = pCoef[2 * ia1 + 1];
+ si3 = pCoef[2 * ia2 + 1];
+ si4 = pCoef[2 * ia3 + 1];
+ si5 = pCoef[2 * ia4 + 1];
+ si6 = pCoef[2 * ia5 + 1];
+ si7 = pCoef[2 * ia6 + 1];
+ si8 = pCoef[2 * ia7 + 1];
+
+ i1 = j;
+
+ do
+ {
+ /* index calculation for the input */
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+ i4 = i3 + n2;
+ i5 = i4 + n2;
+ i6 = i5 + n2;
+ i7 = i6 + n2;
+ i8 = i7 + n2;
+ r1 = pSrc[2 * i1] + pSrc[2 * i5];
+ r5 = pSrc[2 * i1] - pSrc[2 * i5];
+ r2 = pSrc[2 * i2] + pSrc[2 * i6];
+ r6 = pSrc[2 * i2] - pSrc[2 * i6];
+ r3 = pSrc[2 * i3] + pSrc[2 * i7];
+ r7 = pSrc[2 * i3] - pSrc[2 * i7];
+ r4 = pSrc[2 * i4] + pSrc[2 * i8];
+ r8 = pSrc[2 * i4] - pSrc[2 * i8];
+ t1 = r1 - r3;
+ r1 = r1 + r3;
+ r3 = r2 - r4;
+ r2 = r2 + r4;
+ pSrc[2 * i1] = r1 + r2;
+ r2 = r1 - r2;
+ s1 = pSrc[2 * i1 + 1] + pSrc[2 * i5 + 1];
+ s5 = pSrc[2 * i1 + 1] - pSrc[2 * i5 + 1];
+ s2 = pSrc[2 * i2 + 1] + pSrc[2 * i6 + 1];
+ s6 = pSrc[2 * i2 + 1] - pSrc[2 * i6 + 1];
+ s3 = pSrc[2 * i3 + 1] + pSrc[2 * i7 + 1];
+ s7 = pSrc[2 * i3 + 1] - pSrc[2 * i7 + 1];
+ s4 = pSrc[2 * i4 + 1] + pSrc[2 * i8 + 1];
+ s8 = pSrc[2 * i4 + 1] - pSrc[2 * i8 + 1];
+ t2 = s1 - s3;
+ s1 = s1 + s3;
+ s3 = s2 - s4;
+ s2 = s2 + s4;
+ r1 = t1 + s3;
+ t1 = t1 - s3;
+ pSrc[2 * i1 + 1] = s1 + s2;
+ s2 = s1 - s2;
+ s1 = t2 - r3;
+ t2 = t2 + r3;
+ p1 = co5 * r2;
+ p2 = si5 * s2;
+ p3 = co5 * s2;
+ p4 = si5 * r2;
+ pSrc[2 * i5] = p1 + p2;
+ pSrc[2 * i5 + 1] = p3 - p4;
+ p1 = co3 * r1;
+ p2 = si3 * s1;
+ p3 = co3 * s1;
+ p4 = si3 * r1;
+ pSrc[2 * i3] = p1 + p2;
+ pSrc[2 * i3 + 1] = p3 - p4;
+ p1 = co7 * t1;
+ p2 = si7 * t2;
+ p3 = co7 * t2;
+ p4 = si7 * t1;
+ pSrc[2 * i7] = p1 + p2;
+ pSrc[2 * i7 + 1] = p3 - p4;
+ r1 = (r6 - r8) * C81;
+ r6 = (r6 + r8) * C81;
+ s1 = (s6 - s8) * C81;
+ s6 = (s6 + s8) * C81;
+ t1 = r5 - r1;
+ r5 = r5 + r1;
+ r8 = r7 - r6;
+ r7 = r7 + r6;
+ t2 = s5 - s1;
+ s5 = s5 + s1;
+ s8 = s7 - s6;
+ s7 = s7 + s6;
+ r1 = r5 + s7;
+ r5 = r5 - s7;
+ r6 = t1 + s8;
+ t1 = t1 - s8;
+ s1 = s5 - r7;
+ s5 = s5 + r7;
+ s6 = t2 - r8;
+ t2 = t2 + r8;
+ p1 = co2 * r1;
+ p2 = si2 * s1;
+ p3 = co2 * s1;
+ p4 = si2 * r1;
+ pSrc[2 * i2] = p1 + p2;
+ pSrc[2 * i2 + 1] = p3 - p4;
+ p1 = co8 * r5;
+ p2 = si8 * s5;
+ p3 = co8 * s5;
+ p4 = si8 * r5;
+ pSrc[2 * i8] = p1 + p2;
+ pSrc[2 * i8 + 1] = p3 - p4;
+ p1 = co6 * r6;
+ p2 = si6 * s6;
+ p3 = co6 * s6;
+ p4 = si6 * r6;
+ pSrc[2 * i6] = p1 + p2;
+ pSrc[2 * i6 + 1] = p3 - p4;
+ p1 = co4 * t1;
+ p2 = si4 * t2;
+ p3 = co4 * t2;
+ p4 = si4 * t1;
+ pSrc[2 * i4] = p1 + p2;
+ pSrc[2 * i4 + 1] = p3 - p4;
+
+ i1 += n1;
+ } while (i1 < fftLen);
+
+ j++;
+ } while (j < n2);
+
+ twidCoefModifier <<= 3; /* next stage uses every 8th twiddle of this stage */
+ } while (n2 > 7);
+}
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c
new file mode 100644
index 0000000000..7990d4797c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_cfft_radix8_f32.c
@@ -0,0 +1,285 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_cfft_radix8_f32.c
+ * Description: Radix-8 Decimation in Frequency CFFT & CIFFT Floating point processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+
+/* ----------------------------------------------------------------------
+ * Internal helper function used by the FFTs
+ * -------------------------------------------------------------------- */
+
+/**
+ brief Core function for the floating-point CFFT butterfly process.
+ param[in,out] pSrc points to the in-place buffer of floating-point data type.
+ param[in] fftLen length of the FFT.
+ param[in] pCoef points to the twiddle coefficient buffer.
+ param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+ return none
+*/
+
+void arm_radix8_butterfly_f32(
+ float32_t * pSrc,
+ uint16_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier)
+{
+ uint32_t ia1, ia2, ia3, ia4, ia5, ia6, ia7;
+ uint32_t i1, i2, i3, i4, i5, i6, i7, i8;
+ uint32_t id;
+ uint32_t n1, n2, j;
+
+ float32_t r1, r2, r3, r4, r5, r6, r7, r8;
+ float32_t t1, t2;
+ float32_t s1, s2, s3, s4, s5, s6, s7, s8;
+ float32_t p1, p2, p3, p4;
+ float32_t co2, co3, co4, co5, co6, co7, co8;
+ float32_t si2, si3, si4, si5, si6, si7, si8;
+ /* C81 = cos(pi/4) = sin(pi/4) = 1/sqrt(2): the fixed rotation constant of the radix-8 kernel */
+ const float32_t C81 = 0.70710678118f;
+
+ n2 = fftLen;
+
+ /* One pass of this outer loop computes one radix-8 decimation-in-frequency
+ * stage: n1 is the span of a butterfly and n2 = n1 / 8 is the distance
+ * between its eight inputs. pSrc holds interleaved complex data
+ * (real part at pSrc[2*k], imaginary part at pSrc[2*k + 1]). */
+ do
+ {
+ n1 = n2;
+ n2 = n2 >> 3;
+ i1 = 0;
+
+ /* Butterfly column j = 0: every twiddle factor is 1, so the butterflies
+ * are computed without any twiddle multiplication. */
+ do
+ {
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+ i4 = i3 + n2;
+ i5 = i4 + n2;
+ i6 = i5 + n2;
+ i7 = i6 + n2;
+ i8 = i7 + n2;
+ /* real-part butterfly: sums/differences of the four input pairs */
+ r1 = pSrc[2 * i1] + pSrc[2 * i5];
+ r5 = pSrc[2 * i1] - pSrc[2 * i5];
+ r2 = pSrc[2 * i2] + pSrc[2 * i6];
+ r6 = pSrc[2 * i2] - pSrc[2 * i6];
+ r3 = pSrc[2 * i3] + pSrc[2 * i7];
+ r7 = pSrc[2 * i3] - pSrc[2 * i7];
+ r4 = pSrc[2 * i4] + pSrc[2 * i8];
+ r8 = pSrc[2 * i4] - pSrc[2 * i8];
+ t1 = r1 - r3;
+ r1 = r1 + r3;
+ r3 = r2 - r4;
+ r2 = r2 + r4;
+ pSrc[2 * i1] = r1 + r2;
+ pSrc[2 * i5] = r1 - r2;
+ /* imaginary-part butterfly (the i1/i5 real outputs were already stored above) */
+ r1 = pSrc[2 * i1 + 1] + pSrc[2 * i5 + 1];
+ s5 = pSrc[2 * i1 + 1] - pSrc[2 * i5 + 1];
+ r2 = pSrc[2 * i2 + 1] + pSrc[2 * i6 + 1];
+ s6 = pSrc[2 * i2 + 1] - pSrc[2 * i6 + 1];
+ s3 = pSrc[2 * i3 + 1] + pSrc[2 * i7 + 1];
+ s7 = pSrc[2 * i3 + 1] - pSrc[2 * i7 + 1];
+ r4 = pSrc[2 * i4 + 1] + pSrc[2 * i8 + 1];
+ s8 = pSrc[2 * i4 + 1] - pSrc[2 * i8 + 1];
+ t2 = r1 - s3;
+ r1 = r1 + s3;
+ s3 = r2 - r4;
+ r2 = r2 + r4;
+ pSrc[2 * i1 + 1] = r1 + r2;
+ pSrc[2 * i5 + 1] = r1 - r2;
+ pSrc[2 * i3] = t1 + s3;
+ pSrc[2 * i7] = t1 - s3;
+ pSrc[2 * i3 + 1] = t2 - r3;
+ pSrc[2 * i7 + 1] = t2 + r3;
+ /* odd-index terms: apply the C81 (45-degree) rotation and combine */
+ r1 = (r6 - r8) * C81;
+ r6 = (r6 + r8) * C81;
+ r2 = (s6 - s8) * C81;
+ s6 = (s6 + s8) * C81;
+ t1 = r5 - r1;
+ r5 = r5 + r1;
+ r8 = r7 - r6;
+ r7 = r7 + r6;
+ t2 = s5 - r2;
+ s5 = s5 + r2;
+ s8 = s7 - s6;
+ s7 = s7 + s6;
+ pSrc[2 * i2] = r5 + s7;
+ pSrc[2 * i8] = r5 - s7;
+ pSrc[2 * i6] = t1 + s8;
+ pSrc[2 * i4] = t1 - s8;
+ pSrc[2 * i2 + 1] = s5 - r7;
+ pSrc[2 * i8 + 1] = s5 + r7;
+ pSrc[2 * i6 + 1] = t2 - r8;
+ pSrc[2 * i4 + 1] = t2 + r8;
+
+ i1 += n1;
+ } while (i1 < fftLen);
+
+ /* n2 < 8: no twiddled columns remain and the stage just computed was the last */
+ if (n2 < 8)
+ break;
+
+ /* Columns j = 1 .. n2-1: same butterfly kernel followed by twiddle rotations */
+ ia1 = 0;
+ j = 1;
+
+ do
+ {
+ /* index calculation for the coefficients */
+ id = ia1 + twidCoefModifier;
+ ia1 = id;
+ ia2 = ia1 + id;
+ ia3 = ia2 + id;
+ ia4 = ia3 + id;
+ ia5 = ia4 + id;
+ ia6 = ia5 + id;
+ ia7 = ia6 + id;
+
+ /* coK/siK: real/imaginary parts of the twiddle applied to butterfly
+ * output K for this column */
+ co2 = pCoef[2 * ia1];
+ co3 = pCoef[2 * ia2];
+ co4 = pCoef[2 * ia3];
+ co5 = pCoef[2 * ia4];
+ co6 = pCoef[2 * ia5];
+ co7 = pCoef[2 * ia6];
+ co8 = pCoef[2 * ia7];
+ si2 = pCoef[2 * ia1 + 1];
+ si3 = pCoef[2 * ia2 + 1];
+ si4 = pCoef[2 * ia3 + 1];
+ si5 = pCoef[2 * ia4 + 1];
+ si6 = pCoef[2 * ia5 + 1];
+ si7 = pCoef[2 * ia6 + 1];
+ si8 = pCoef[2 * ia7 + 1];
+
+ i1 = j;
+
+ do
+ {
+ /* index calculation for the input */
+ i2 = i1 + n2;
+ i3 = i2 + n2;
+ i4 = i3 + n2;
+ i5 = i4 + n2;
+ i6 = i5 + n2;
+ i7 = i6 + n2;
+ i8 = i7 + n2;
+ r1 = pSrc[2 * i1] + pSrc[2 * i5];
+ r5 = pSrc[2 * i1] - pSrc[2 * i5];
+ r2 = pSrc[2 * i2] + pSrc[2 * i6];
+ r6 = pSrc[2 * i2] - pSrc[2 * i6];
+ r3 = pSrc[2 * i3] + pSrc[2 * i7];
+ r7 = pSrc[2 * i3] - pSrc[2 * i7];
+ r4 = pSrc[2 * i4] + pSrc[2 * i8];
+ r8 = pSrc[2 * i4] - pSrc[2 * i8];
+ t1 = r1 - r3;
+ r1 = r1 + r3;
+ r3 = r2 - r4;
+ r2 = r2 + r4;
+ /* output 0 of the butterfly is stored untwiddled; the rest are rotated below */
+ pSrc[2 * i1] = r1 + r2;
+ r2 = r1 - r2;
+ s1 = pSrc[2 * i1 + 1] + pSrc[2 * i5 + 1];
+ s5 = pSrc[2 * i1 + 1] - pSrc[2 * i5 + 1];
+ s2 = pSrc[2 * i2 + 1] + pSrc[2 * i6 + 1];
+ s6 = pSrc[2 * i2 + 1] - pSrc[2 * i6 + 1];
+ s3 = pSrc[2 * i3 + 1] + pSrc[2 * i7 + 1];
+ s7 = pSrc[2 * i3 + 1] - pSrc[2 * i7 + 1];
+ s4 = pSrc[2 * i4 + 1] + pSrc[2 * i8 + 1];
+ s8 = pSrc[2 * i4 + 1] - pSrc[2 * i8 + 1];
+ t2 = s1 - s3;
+ s1 = s1 + s3;
+ s3 = s2 - s4;
+ s2 = s2 + s4;
+ r1 = t1 + s3;
+ t1 = t1 - s3;
+ pSrc[2 * i1 + 1] = s1 + s2;
+ s2 = s1 - s2;
+ s1 = t2 - r3;
+ t2 = t2 + r3;
+ /* each remaining output (re, im) is rotated by its twiddle:
+ * re' = co*re + si*im, im' = co*im - si*re */
+ p1 = co5 * r2;
+ p2 = si5 * s2;
+ p3 = co5 * s2;
+ p4 = si5 * r2;
+ pSrc[2 * i5] = p1 + p2;
+ pSrc[2 * i5 + 1] = p3 - p4;
+ p1 = co3 * r1;
+ p2 = si3 * s1;
+ p3 = co3 * s1;
+ p4 = si3 * r1;
+ pSrc[2 * i3] = p1 + p2;
+ pSrc[2 * i3 + 1] = p3 - p4;
+ p1 = co7 * t1;
+ p2 = si7 * t2;
+ p3 = co7 * t2;
+ p4 = si7 * t1;
+ pSrc[2 * i7] = p1 + p2;
+ pSrc[2 * i7 + 1] = p3 - p4;
+ r1 = (r6 - r8) * C81;
+ r6 = (r6 + r8) * C81;
+ s1 = (s6 - s8) * C81;
+ s6 = (s6 + s8) * C81;
+ t1 = r5 - r1;
+ r5 = r5 + r1;
+ r8 = r7 - r6;
+ r7 = r7 + r6;
+ t2 = s5 - s1;
+ s5 = s5 + s1;
+ s8 = s7 - s6;
+ s7 = s7 + s6;
+ r1 = r5 + s7;
+ r5 = r5 - s7;
+ r6 = t1 + s8;
+ t1 = t1 - s8;
+ s1 = s5 - r7;
+ s5 = s5 + r7;
+ s6 = t2 - r8;
+ t2 = t2 + r8;
+ p1 = co2 * r1;
+ p2 = si2 * s1;
+ p3 = co2 * s1;
+ p4 = si2 * r1;
+ pSrc[2 * i2] = p1 + p2;
+ pSrc[2 * i2 + 1] = p3 - p4;
+ p1 = co8 * r5;
+ p2 = si8 * s5;
+ p3 = co8 * s5;
+ p4 = si8 * r5;
+ pSrc[2 * i8] = p1 + p2;
+ pSrc[2 * i8 + 1] = p3 - p4;
+ p1 = co6 * r6;
+ p2 = si6 * s6;
+ p3 = co6 * s6;
+ p4 = si6 * r6;
+ pSrc[2 * i6] = p1 + p2;
+ pSrc[2 * i6 + 1] = p3 - p4;
+ p1 = co4 * t1;
+ p2 = si4 * t2;
+ p3 = co4 * t2;
+ p4 = si4 * t1;
+ pSrc[2 * i4] = p1 + p2;
+ pSrc[2 * i4 + 1] = p3 - p4;
+
+ i1 += n1;
+ } while (i1 < fftLen);
+
+ j++;
+ } while (j < n2);
+
+ /* twiddle-table stride grows by a factor of 8 for the next, shorter stage */
+ twidCoefModifier <<= 3;
+ } while (n2 > 7);
+}
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c
new file mode 100644
index 0000000000..2c3c3752ea
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_f32.c
@@ -0,0 +1,448 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_dct4_f32.c
+ * Description: Processing function of DCT4 & IDCT4 F32
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @defgroup DCT4_IDCT4 DCT Type IV Functions
+
+ Representation of signals by minimum number of values is important for storage and transmission.
+ The possibility of large discontinuity between the beginning and end of a period of a signal
+ in DFT can be avoided by extending the signal so that it is even-symmetric.
+ Discrete Cosine Transform (DCT) is constructed such that its energy is heavily concentrated in the lower part of the
+ spectrum and is very widely used in signal and image coding applications.
+ The family of DCTs (DCT type- 1,2,3,4) is the outcome of different combinations of homogeneous boundary conditions.
+ DCT has an excellent energy-packing capability, hence has many applications and in data compression in particular.
+
+ DCT is essentially the Discrete Fourier Transform(DFT) of an even-extended real signal.
+ Reordering of the input data makes the computation of DCT just a problem of
+ computing the DFT of a real signal with a few additional operations.
+ This approach provides regular, simple, and very efficient DCT algorithms for practical hardware and software implementations.
+
+ DCT type-II can be implemented using the Fast Fourier Transform (FFT) internally; as the transform is applied on real values, a Real FFT can be used.
+ DCT4 is implemented using DCT2 as their implementations are similar except with some added pre-processing and post-processing.
+ DCT2 implementation can be described in the following steps:
+ - Re-ordering input
+ - Calculating Real FFT
+ - Multiplication of weights and Real FFT output and getting real part from the product.
+
+ This process is explained by the block diagram below:
+ \image html DCT4.gif "Discrete Cosine Transform - type-IV"
+
+ @par Algorithm
+ The N-point type-IV DCT is defined as a real, linear transformation by the formula:
+ \image html DCT4Equation.gif
+ where k = 0, 1, 2, ..., N-1
+ @par
+ Its inverse is defined as follows:
+ \image html IDCT4Equation.gif
+ where n = 0, 1, 2, ..., N-1
+ @par
+ The DCT4 matrices become involutory (i.e. they are self-inverse) by multiplying with an overall scale factor of sqrt(2/N).
+ The symmetry of the transform matrix indicates that the fast algorithms for the forward
+ and inverse transform computation are identical.
+ Note that the implementation of Inverse DCT4 and DCT4 is same, hence same process function can be used for both.
+
+ @par Lengths supported by the transform:
+ As DCT4 internally uses Real FFT, it supports all the lengths 128, 512, 2048 and 8192.
+ The library provides separate functions for Q15, Q31, and floating-point data types.
+
+ @par Instance Structure
+ The instances for Real FFT and FFT, cosine values table and twiddle factor table are stored in an instance data structure.
+ A separate instance structure must be defined for each transform.
+ There are separate instance structure declarations for each of the 3 supported data types.
+
+ @par Initialization Functions
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Initializes Real FFT as its process function is used internally in DCT4, by calling \ref arm_rfft_init_f32().
+ @par
+ Use of the initialization function is optional.
+ However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ To place an instance structure into a const data section, the instance structure must be manually initialized.
+ Manually initialize the instance structure as follows:
+
+ arm_dct4_instance_f32 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft};
+ arm_dct4_instance_q31 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft};
+ arm_dct4_instance_q15 S = {N, Nby2, normalize, pTwiddle, pCosFactor, pRfft, pCfft};
+
+ where \c N is the length of the DCT4; \c Nby2 is half of the length of the DCT4;
+ \c normalize is normalizing factor used and is equal to sqrt(2/N);
+ \c pTwiddle points to the twiddle factor table;
+ \c pCosFactor points to the cosFactor table;
+ \c pRfft points to the real FFT instance;
+ \c pCfft points to the complex FFT instance;
+ The CFFT and RFFT structures also need to be initialized; refer to arm_cfft_radix4_f32()
+ and arm_rfft_f32() respectively for details regarding static initialization.
+
+ @par Fixed-Point Behavior
+ Care must be taken when using the fixed-point versions of the DCT4 transform functions.
+ In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ Refer to the function specific documentation below for usage guidelines.
+ */
+
+ /**
+ @addtogroup DCT4_IDCT4
+ @{
+ */
+
+/**
+ @brief Processing function for the floating-point DCT4/IDCT4.
+ @param[in] S points to an instance of the floating-point DCT4/IDCT4 structure
+ @param[in] pState points to state buffer
+ @param[in,out] pInlineBuffer points to the in-place input and output buffer
+ @return none
+ */
+
+void arm_dct4_f32(
+ const arm_dct4_instance_f32 * S,
+ float32_t * pState,
+ float32_t * pInlineBuffer)
+{
+ const float32_t *weights = S->pTwiddle; /* Pointer to the Weights table */
+ const float32_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */
+ float32_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */
+ float32_t in; /* Temporary variable */
+ uint32_t i; /* Loop counter */
+
+
+ /* DCT4 computation involves DCT2 (which is calculated using RFFT)
+ * along with some pre-processing and post-processing.
+ * Computational procedure is explained as follows:
+ * (a) Pre-processing involves multiplying input with cos factor,
+ * r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*N))
+ * where,
+ * r(n) -- output of preprocessing
+ * u(n) -- input to preprocessing(actual Source buffer)
+ * (b) Calculation of DCT2 using FFT is divided into three steps:
+ * Step1: Re-ordering of even and odd elements of input.
+ * Step2: Calculating FFT of the re-ordered input.
+ * Step3: Taking the real part of the product of FFT output and weights.
+ * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
+ * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+ * where,
+ * Y4 -- DCT4 output, Y2 -- DCT2 output
+ * (d) Multiplying the output with the normalizing factor sqrt(2/N).
+ */
+
+ /*-------- Pre-processing ------------*/
+ /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*N)) */
+ arm_scale_f32(pInlineBuffer, 2.0f, pInlineBuffer, S->N);
+ arm_mult_f32(pInlineBuffer, cosFact, pInlineBuffer, S->N);
+
+ /* ----------------------------------------------------------------
+ * Step1: Re-ordering of even and odd elements as
+ * pState[i] = pInlineBuffer[2*i] and
+ * pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
+ ---------------------------------------------------------------------*/
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
+ pS2 = pState + (S->N - 1U);
+
+ /* pbuff initialized to input buffer */
+ pbuff = pInlineBuffer;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
+ i = S->Nby2 >> 2U;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ /* NOTE(review): a do-while body runs at least once, so this assumes Nby2 >= 4;
+ ** this holds for every supported length (N >= 128) -- confirm if other N are ever used. */
+ do
+ {
+ /* Re-ordering of even and odd elements */
+ /* pState[i] = pInlineBuffer[2*i] */
+ *pS1++ = *pbuff++;
+ /* pState[N-i-1] = pInlineBuffer[2*i+1] */
+ *pS2-- = *pbuff++;
+
+ *pS1++ = *pbuff++;
+ *pS2-- = *pbuff++;
+
+ *pS1++ = *pbuff++;
+ *pS2-- = *pbuff++;
+
+ *pS1++ = *pbuff++;
+ *pS2-- = *pbuff++;
+
+ /* Decrement loop counter */
+ i--;
+ } while (i > 0U);
+
+ /* pbuff initialized to input buffer */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Initializing the loop counter to N/4 instead of N for loop unrolling */
+ i = S->N >> 2U;
+
+ /* Processing with loop unrolling 4 times as N is always multiple of 4.
+ * Compute 4 outputs at a time */
+ do
+ {
+ /* Writing the re-ordered output back to inplace input buffer */
+ *pbuff++ = *pS1++;
+ *pbuff++ = *pS1++;
+ *pbuff++ = *pS1++;
+ *pbuff++ = *pS1++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while (i > 0U);
+
+
+ /* ---------------------------------------------------------
+ * Step2: Calculate RFFT for N-point input
+ * ---------------------------------------------------------- */
+ /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
+ arm_rfft_f32 (S->pRfft, pInlineBuffer, pState);
+
+ /*----------------------------------------------------------------------
+ * Step3: Multiply the FFT output with the weights.
+ *----------------------------------------------------------------------*/
+ arm_cmplx_mult_cmplx_f32 (pState, weights, pState, S->N);
+
+ /* ----------- Post-processing ---------- */
+ /* DCT-IV can be obtained from DCT-II by the equation,
+ * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+ * Hence, Y4(0) = Y2(0)/2 */
+ /* Getting only real part from the output and Converting to DCT-IV */
+
+ /* Initializing the loop counter to (N - 1) >> 2 for loop unrolling by 4 */
+ i = (S->N - 1U) >> 2U;
+
+ /* pbuff initialized to input buffer. */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
+ in = *pS1++ * (float32_t) 0.5;
+ /* input buffer acts as inplace, so output values are stored in the input itself. */
+ *pbuff++ = in;
+
+ /* pState pointer is incremented twice as the real values are located alternatively in the array */
+ pS1++;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ do
+ {
+ /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+ /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ /* points to the next real value */
+ pS1++;
+
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ pS1++;
+
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ pS1++;
+
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ pS1++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while (i > 0U);
+
+ /* If (N - 1) is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ i = (S->N - 1U) % 0x4U;
+
+ while (i > 0U)
+ {
+ /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+ /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+ in = *pS1++ - in;
+ *pbuff++ = in;
+
+ /* points to the next real value */
+ pS1++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+
+ /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+
+ /* Initializing the loop counter to N/4 instead of N for loop unrolling */
+ i = S->N >> 2U;
+
+ /* pbuff initialized to the pInlineBuffer(now contains the output values) */
+ pbuff = pInlineBuffer;
+
+ /* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */
+ do
+ {
+ /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
+ in = *pbuff;
+ *pbuff++ = in * S->normalize;
+
+ in = *pbuff;
+ *pbuff++ = in * S->normalize;
+
+ in = *pbuff;
+ *pbuff++ = in * S->normalize;
+
+ in = *pbuff;
+ *pbuff++ = in * S->normalize;
+
+ /* Decrement the loop counter */
+ i--;
+ } while (i > 0U);
+
+
+#else
+
+ /* Initializing the loop counter to N/2 */
+ i = S->Nby2;
+
+ do
+ {
+ /* Re-ordering of even and odd elements */
+ /* pState[i] = pInlineBuffer[2*i] */
+ *pS1++ = *pbuff++;
+ /* pState[N-i-1] = pInlineBuffer[2*i+1] */
+ *pS2-- = *pbuff++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while (i > 0U);
+
+ /* pbuff initialized to input buffer */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Initializing the loop counter */
+ i = S->N;
+
+ do
+ {
+ /* Writing the re-ordered output back to inplace input buffer */
+ *pbuff++ = *pS1++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while (i > 0U);
+
+
+ /* ---------------------------------------------------------
+ * Step2: Calculate RFFT for N-point input
+ * ---------------------------------------------------------- */
+ /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
+ arm_rfft_f32 (S->pRfft, pInlineBuffer, pState);
+
+ /*----------------------------------------------------------------------
+ * Step3: Multiply the FFT output with the weights.
+ *----------------------------------------------------------------------*/
+ arm_cmplx_mult_cmplx_f32 (pState, weights, pState, S->N);
+
+ /* ----------- Post-processing ---------- */
+ /* DCT-IV can be obtained from DCT-II by the equation,
+ * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+ * Hence, Y4(0) = Y2(0)/2 */
+ /* Getting only real part from the output and Converting to DCT-IV */
+
+ /* pbuff initialized to input buffer. */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
+ in = *pS1++ * (float32_t) 0.5;
+ /* input buffer acts as inplace, so output values are stored in the input itself. */
+ *pbuff++ = in;
+
+ /* pState pointer is incremented twice as the real values are located alternatively in the array */
+ pS1++;
+
+ /* Initializing the loop counter */
+ i = (S->N - 1U);
+
+ do
+ {
+ /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+ /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+ in = *pS1++ - in;
+ *pbuff++ = in;
+
+ /* points to the next real value */
+ pS1++;
+
+ /* Decrement loop counter */
+ i--;
+ } while (i > 0U);
+
+ /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+
+ /* Initializing loop counter */
+ i = S->N;
+
+ /* pbuff initialized to the pInlineBuffer (now contains the output values) */
+ pbuff = pInlineBuffer;
+
+ do
+ {
+ /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
+ in = *pbuff;
+ *pbuff++ = in * S->normalize;
+
+ /* Decrement loop counter */
+ i--;
+ } while (i > 0U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+}
+
+/**
+ @} end of DCT4_IDCT4 group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_f32.c
new file mode 100644
index 0000000000..97c26cafdb
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_f32.c
@@ -0,0 +1,130 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_dct4_init_f32.c
+ * Description: Initialization function of DCT-4 & IDCT4 F32
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+
+/**
+ @ingroup groupTransforms
+ */
+
+ /**
+ @addtogroup DCT4_IDCT4
+ @{
+ */
+
+/**
+ @brief Initialization function for the floating-point DCT4/IDCT4.
+ @param[in,out] S points to an instance of floating-point DCT4/IDCT4 structure
+ @param[in] S_RFFT points to an instance of floating-point RFFT/RIFFT structure
+ @param[in] S_CFFT points to an instance of floating-point CFFT/CIFFT structure
+ @param[in] N length of the DCT4
+ @param[in] Nby2 half of the length of the DCT4
+ @param[in] normalize normalizing factor.
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : N is not a supported transform length
+
+ @par Normalizing factor
+ The normalizing factor is sqrt(2/N), which depends on the size of transform N.
+ Floating-point normalizing factors are mentioned in the table below for different DCT sizes:
+
+ \image html dct4NormalizingF32Table.gif
+ */
+
+arm_status arm_dct4_init_f32(
+ arm_dct4_instance_f32 * S,
+ arm_rfft_instance_f32 * S_RFFT,
+ arm_cfft_radix4_instance_f32 * S_CFFT,
+ uint16_t N,
+ uint16_t Nby2,
+ float32_t normalize)
+{
+ /* Initialize the default arm status */
+ arm_status status = ARM_MATH_SUCCESS;
+
+
+ /* Initialize the DCT4 length */
+ S->N = N;
+
+ /* Initialize the half of DCT4 length */
+ S->Nby2 = Nby2;
+
+ /* Initialize the DCT4 Normalizing factor */
+ S->normalize = normalize;
+
+ /* Initialize Real FFT Instance */
+ S->pRfft = S_RFFT;
+
+ /* Initialize Complex FFT Instance */
+ S->pCfft = S_CFFT;
+
+ switch (N)
+ {
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_8192)
+ /* Initialize the table modifier values */
+ case 8192U:
+ S->pTwiddle = Weights_8192;
+ S->pCosFactor = cos_factors_8192;
+ break;
+ #endif
+
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_2048)
+ case 2048U:
+ S->pTwiddle = Weights_2048;
+ S->pCosFactor = cos_factors_2048;
+ break;
+ #endif
+
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_512)
+ case 512U:
+ S->pTwiddle = Weights_512;
+ S->pCosFactor = cos_factors_512;
+ break;
+ #endif
+
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_F32_128)
+ case 128U:
+ S->pTwiddle = Weights_128;
+ S->pCosFactor = cos_factors_128;
+ break;
+ #endif
+ default:
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+
+ /* Initialize the RFFT/RIFFT Function */
+ arm_rfft_init_f32(S->pRfft, S->pCfft, S->N, 0U, 1U);
+
+ /* return the status of DCT4 Init function */
+ return (status);
+}
+
+/**
+ @} end of DCT4_IDCT4 group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q15.c
new file mode 100644
index 0000000000..8d9f070001
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q15.c
@@ -0,0 +1,130 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_dct4_init_q15.c
+ * Description: Initialization function of DCT-4 & IDCT4 Q15
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+
+/**
+ @ingroup groupTransforms
+ */
+
+ /**
+ @addtogroup DCT4_IDCT4
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q15 DCT4/IDCT4.
+ @param[in,out] S points to an instance of Q15 DCT4/IDCT4 structure
+ @param[in] S_RFFT points to an instance of Q15 RFFT/RIFFT structure
+ @param[in] S_CFFT points to an instance of Q15 CFFT/CIFFT structure
+ @param[in] N length of the DCT4
+ @param[in] Nby2 half of the length of the DCT4
+ @param[in] normalize normalizing factor
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : N is not a supported transform length
+
+ @par Normalizing factor
+ The normalizing factor is sqrt(2/N), which depends on the size of transform N.
+ Normalizing factors in 1.15 format are mentioned in the table below for different DCT sizes:
+
+ \image html dct4NormalizingQ15Table.gif
+ */
+
+arm_status arm_dct4_init_q15(
+ arm_dct4_instance_q15 * S,
+ arm_rfft_instance_q15 * S_RFFT,
+ arm_cfft_radix4_instance_q15 * S_CFFT,
+ uint16_t N,
+ uint16_t Nby2,
+ q15_t normalize)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_SUCCESS;
+
+ /* Initialize the DCT4 length */
+ S->N = N;
+
+ /* Initialize the half of DCT4 length */
+ S->Nby2 = Nby2;
+
+ /* Initialize the DCT4 Normalizing factor */
+ S->normalize = normalize;
+
+ /* Initialize Real FFT Instance */
+ S->pRfft = S_RFFT;
+
+ /* Initialize Complex FFT Instance */
+ S->pCfft = S_CFFT;
+
+ switch (N)
+ {
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_8192)
+ /* Initialize the table modifier values */
+ case 8192U:
+ S->pTwiddle = WeightsQ15_8192;
+ S->pCosFactor = cos_factorsQ15_8192;
+ break;
+ #endif
+
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_2048)
+ case 2048U:
+ S->pTwiddle = WeightsQ15_2048;
+ S->pCosFactor = cos_factorsQ15_2048;
+ break;
+ #endif
+
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_512)
+ case 512U:
+ S->pTwiddle = WeightsQ15_512;
+ S->pCosFactor = cos_factorsQ15_512;
+ break;
+ #endif
+
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q15_128)
+ case 128U:
+ S->pTwiddle = WeightsQ15_128;
+ S->pCosFactor = cos_factorsQ15_128;
+ break;
+ #endif
+
+ default:
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+
+ /* Initialize the RFFT/RIFFT */
+ arm_rfft_init_q15(S->pRfft, S->N, 0U, 1U);
+
+ /* return the status of DCT4 Init function */
+ return (status);
+}
+
+/**
+ @} end of DCT4_IDCT4 group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q31.c
new file mode 100644
index 0000000000..f927419804
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_init_q31.c
@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_dct4_init_q31.c
+ * Description: Initialization function of DCT-4 & IDCT4 Q31
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+
+/**
+ @ingroup groupTransforms
+ */
+
+ /**
+ @addtogroup DCT4_IDCT4
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q31 DCT4/IDCT4.
+ @param[in,out] S points to an instance of Q31 DCT4/IDCT4 structure.
+ @param[in] S_RFFT points to an instance of Q31 RFFT/RIFFT structure
+ @param[in] S_CFFT points to an instance of Q31 CFFT/CIFFT structure
+ @param[in] N length of the DCT4.
+ @param[in] Nby2 half of the length of the DCT4.
+ @param[in] normalize normalizing factor.
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : N is not a supported transform length
+
+ @par Normalizing factor:
+ The normalizing factor is sqrt(2/N), which depends on the size of transform N.
+ Normalizing factors in 1.31 format are mentioned in the table below for different DCT sizes:
+
+ \image html dct4NormalizingQ31Table.gif
+ */
+
+arm_status arm_dct4_init_q31(
+ arm_dct4_instance_q31 * S,
+ arm_rfft_instance_q31 * S_RFFT,
+ arm_cfft_radix4_instance_q31 * S_CFFT,
+ uint16_t N,
+ uint16_t Nby2,
+ q31_t normalize)
+{
+ /* Assume success; set to ARM_MATH_ARGUMENT_ERROR below if N is not a supported length */
+ arm_status status = ARM_MATH_SUCCESS;
+
+ /* Initialize the DCT4 length */
+ S->N = N;
+
+ /* Initialize the half of DCT4 length */
+ S->Nby2 = Nby2;
+
+ /* Initialize the DCT4 Normalizing factor */
+ S->normalize = normalize;
+
+ /* Initialize Real FFT Instance */
+ S->pRfft = S_RFFT;
+
+ /* Initialize Complex FFT Instance */
+ S->pCfft = S_CFFT;
+
+ switch (N)
+ {
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_8192)
+ /* Select the twiddle weights and cos factors tables matching the transform length N */
+ case 8192U:
+ S->pTwiddle = WeightsQ31_8192;
+ S->pCosFactor = cos_factorsQ31_8192;
+ break;
+ #endif
+
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_2048)
+ case 2048U:
+ S->pTwiddle = WeightsQ31_2048;
+ S->pCosFactor = cos_factorsQ31_2048;
+ break;
+ #endif
+
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_512)
+ case 512U:
+ S->pTwiddle = WeightsQ31_512;
+ S->pCosFactor = cos_factorsQ31_512;
+ break;
+ #endif
+
+ #if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_DCT4_Q31_128)
+ case 128U:
+ S->pTwiddle = WeightsQ31_128;
+ S->pCosFactor = cos_factorsQ31_128;
+ break;
+ #endif
+ default:
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+
+ /* Initialize the RFFT/RIFFT (note: performed unconditionally, even when status is an error) */
+ arm_rfft_init_q31(S->pRfft, S->N, 0U, 1U);
+
+ /* return the status of DCT4 Init function */
+ return (status);
+}
+
+/**
+ @} end of DCT4_IDCT4 group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q15.c
new file mode 100644
index 0000000000..80805210bd
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q15.c
@@ -0,0 +1,381 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_dct4_q15.c
+ * Description: Processing function of DCT4 & IDCT4 Q15
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+/**
+ @addtogroup DCT4_IDCT4
+ @{
+ */
+
+/**
+ @brief Processing function for the Q15 DCT4/IDCT4.
+ @param[in] S points to an instance of the Q15 DCT4 structure.
+ @param[in] pState points to state buffer.
+ @param[in,out] pInlineBuffer points to the in-place input and output buffer.
+ @return none
+
+ @par Input and output formats
+ Internally inputs are downscaled in the RFFT process function to avoid overflows.
+ Number of bits downscaled, depends on the size of the transform. The input and output
+ formats for different DCT sizes and number of bits to upscale are mentioned in the table below:
+
+ \image html dct4FormatsQ15Table.gif
+ */
+
+void arm_dct4_q15(
+ const arm_dct4_instance_q15 * S,
+ q15_t * pState,
+ q15_t * pInlineBuffer)
+{
+ const q15_t *weights = S->pTwiddle; /* Pointer to the Weights table */
+ const q15_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */
+ q15_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */
+ q15_t in; /* Temporary variable */
+ uint32_t i; /* Loop counter */
+
+
+ /* DCT4 computation involves DCT2 (which is calculated using RFFT)
+ * along with some pre-processing and post-processing.
+ * Computational procedure is explained as follows:
+ * (a) Pre-processing involves multiplying input with cos factor,
+ * r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*N))
+ * where,
+ * r(n) -- output of preprocessing
+ * u(n) -- input to preprocessing(actual Source buffer)
+ * (b) Calculation of DCT2 using FFT is divided into three steps:
+ * Step1: Re-ordering of even and odd elements of input.
+ * Step2: Calculating FFT of the re-ordered input.
+ * Step3: Taking the real part of the product of FFT output and weights.
+ * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
+ * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+ * where,
+ * Y4 -- DCT4 output, Y2 -- DCT2 output
+ * (d) Multiplying the output with the normalizing factor sqrt(2/N).
+ */
+
+ /*-------- Pre-processing ------------*/
+ /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*N)) */
+ arm_mult_q15 (pInlineBuffer, cosFact, pInlineBuffer, S->N);
+ arm_shift_q15 (pInlineBuffer, 1, pInlineBuffer, S->N);
+
+ /* ----------------------------------------------------------------
+ * Step1: Re-ordering of even and odd elements as
+ * pState[i] = pInlineBuffer[2*i] and
+ * pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
+ ---------------------------------------------------------------------*/
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
+ pS2 = pState + (S->N - 1U);
+
+ /* pbuff initialized to input buffer */
+ pbuff = pInlineBuffer;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
+ i = S->Nby2 >> 2U;
+
+ /* Processing with loop unrolling. Compute 4 outputs at a time.
+ ** N/2 is a multiple of 4 for all supported lengths, so no tail loop is needed here. */
+ do
+ {
+ /* Re-ordering of even and odd elements */
+ /* pState[i] = pInlineBuffer[2*i] */
+ *pS1++ = *pbuff++;
+ /* pState[N-i-1] = pInlineBuffer[2*i+1] */
+ *pS2-- = *pbuff++;
+
+ *pS1++ = *pbuff++;
+ *pS2-- = *pbuff++;
+
+ *pS1++ = *pbuff++;
+ *pS2-- = *pbuff++;
+
+ *pS1++ = *pbuff++;
+ *pS2-- = *pbuff++;
+
+ /* Decrement loop counter */
+ i--;
+ } while (i > 0U);
+
+ /* pbuff initialized to input buffer */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Initializing the loop counter to N/4 instead of N for loop unrolling */
+ i = S->N >> 2U;
+
+ /* Processing with loop unrolling 4 times as N is always multiple of 4.
+ * Compute 4 outputs at a time */
+ do
+ {
+ /* Writing the re-ordered output back to inplace input buffer */
+ *pbuff++ = *pS1++;
+ *pbuff++ = *pS1++;
+ *pbuff++ = *pS1++;
+ *pbuff++ = *pS1++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while (i > 0U);
+
+
+ /* ---------------------------------------------------------
+ * Step2: Calculate RFFT for N-point input
+ * ---------------------------------------------------------- */
+ /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
+ arm_rfft_q15 (S->pRfft, pInlineBuffer, pState);
+
+ /*----------------------------------------------------------------------
+ * Step3: Multiply the FFT output with the weights.
+ *----------------------------------------------------------------------*/
+ arm_cmplx_mult_cmplx_q15 (pState, weights, pState, S->N);
+
+ /* The output of complex multiplication is in 3.13 format.
+ * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
+ arm_shift_q15 (pState, 2, pState, S->N * 2);
+
+ /* ----------- Post-processing ---------- */
+ /* DCT-IV can be obtained from DCT-II by the equation,
+ * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+ * Hence, Y4(0) = Y2(0)/2 */
+ /* Getting only real part from the output and Converting to DCT-IV */
+
+ /* Initializing the loop counter to (N - 1) >> 2 for loop unrolling by 4 */
+ i = (S->N - 1U) >> 2U;
+
+ /* pbuff initialized to input buffer. */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
+ in = *pS1++ >> 1U;
+ /* input buffer acts as inplace, so output values are stored in the input itself. */
+ *pbuff++ = in;
+
+ /* pState pointer is incremented twice as the real values are located alternatively in the array */
+ pS1++;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ do
+ {
+ /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+ /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ /* points to the next real value */
+ pS1++;
+
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ pS1++;
+
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ pS1++;
+
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ pS1++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while (i > 0U);
+
+ /* If (N - 1) is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ i = (S->N - 1U) % 0x4U;
+
+ while (i > 0U)
+ {
+ /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+ /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+ in = *pS1++ - in;
+ *pbuff++ = in;
+
+ /* points to the next real value */
+ pS1++;
+
+ /* Decrement loop counter */
+ i--;
+ }
+
+
+ /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+
+ /* Initializing the loop counter to N/4 instead of N for loop unrolling */
+ i = S->N >> 2U;
+
+ /* pbuff initialized to the pInlineBuffer(now contains the output values) */
+ pbuff = pInlineBuffer;
+
+ /* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */
+ do
+ {
+ /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
+ in = *pbuff;
+ *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
+
+ in = *pbuff;
+ *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
+
+ in = *pbuff;
+ *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
+
+ in = *pbuff;
+ *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
+
+ /* Decrement loop counter */
+ i--;
+ } while (i > 0U);
+
+
+#else
+
+ /* Initializing the loop counter to N/2 */
+ i = S->Nby2;
+
+ do
+ {
+ /* Re-ordering of even and odd elements */
+ /* pState[i] = pInlineBuffer[2*i] */
+ *pS1++ = *pbuff++;
+ /* pState[N-i-1] = pInlineBuffer[2*i+1] */
+ *pS2-- = *pbuff++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while (i > 0U);
+
+ /* pbuff initialized to input buffer */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Initializing the loop counter */
+ i = S->N;
+
+ do
+ {
+ /* Writing the re-ordered output back to inplace input buffer */
+ *pbuff++ = *pS1++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while (i > 0U);
+
+
+ /* ---------------------------------------------------------
+ * Step2: Calculate RFFT for N-point input
+ * ---------------------------------------------------------- */
+ /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
+ arm_rfft_q15 (S->pRfft, pInlineBuffer, pState);
+
+ /*----------------------------------------------------------------------
+ * Step3: Multiply the FFT output with the weights.
+ *----------------------------------------------------------------------*/
+ arm_cmplx_mult_cmplx_q15 (pState, weights, pState, S->N);
+
+ /* The output of complex multiplication is in 3.13 format.
+ * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.15 format by shifting left by 2 bits. */
+ arm_shift_q15 (pState, 2, pState, S->N * 2);
+
+ /* ----------- Post-processing ---------- */
+ /* DCT-IV can be obtained from DCT-II by the equation,
+ * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+ * Hence, Y4(0) = Y2(0)/2 */
+ /* Getting only real part from the output and Converting to DCT-IV */
+
+ /* pbuff initialized to input buffer. */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
+ in = *pS1++ >> 1U;
+ /* input buffer acts as inplace, so output values are stored in the input itself. */
+ *pbuff++ = in;
+
+ /* pState pointer is incremented twice as the real values are located alternatively in the array */
+ pS1++;
+
+ /* Initializing the loop counter */
+ i = (S->N - 1U);
+
+ do
+ {
+ /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+ /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+ in = *pS1++ - in;
+ *pbuff++ = in;
+
+ /* points to the next real value */
+ pS1++;
+
+ /* Decrement loop counter */
+ i--;
+ } while (i > 0U);
+
+ /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+
+ /* Initializing loop counter */
+ i = S->N;
+
+ /* pbuff initialized to the pInlineBuffer (now contains the output values) */
+ pbuff = pInlineBuffer;
+
+ do
+ {
+ /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
+ in = *pbuff;
+ *pbuff++ = ((q15_t) (((q31_t) in * S->normalize) >> 15));
+
+ /* Decrement loop counter */
+ i--;
+
+ } while (i > 0U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+}
+
+/**
+ @} end of DCT4_IDCT4 group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c
new file mode 100644
index 0000000000..6cbccff81f
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_dct4_q31.c
@@ -0,0 +1,383 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_dct4_q31.c
+ * Description: Processing function of DCT4 & IDCT4 Q31
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+/**
+ @addtogroup DCT4_IDCT4
+ @{
+ */
+
+/**
+ @brief Processing function for the Q31 DCT4/IDCT4.
+ @param[in] S points to an instance of the Q31 DCT4 structure.
+ @param[in] pState points to state buffer.
+ @param[in,out] pInlineBuffer points to the in-place input and output buffer.
+ @return none
+
+ @par Input and output formats
+ Input samples need to be downscaled by 1 bit to avoid saturations in the Q31 DCT process,
+ as the conversion from DCT2 to DCT4 involves one subtraction.
+ Internally inputs are downscaled in the RFFT process function to avoid overflows.
+ Number of bits downscaled, depends on the size of the transform.
+ The input and output formats for different DCT sizes and number of bits to upscale are
+ mentioned in the table below:
+
+ \image html dct4FormatsQ31Table.gif
+ */
+
+void arm_dct4_q31(
+ const arm_dct4_instance_q31 * S,
+ q31_t * pState,
+ q31_t * pInlineBuffer)
+{
+ const q31_t *weights = S->pTwiddle; /* Pointer to the Weights table */
+ const q31_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */
+ q31_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */
+ q31_t in; /* Temporary variable */
+ uint32_t i; /* Loop counter */
+
+
+ /* DCT4 computation involves DCT2 (which is calculated using RFFT)
+ * along with some pre-processing and post-processing.
+ * Computational procedure is explained as follows:
+ * (a) Pre-processing involves multiplying input with cos factor,
+ * r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*N))
+ * where,
+ * r(n) -- output of preprocessing
+ * u(n) -- input to preprocessing(actual Source buffer)
+ * (b) Calculation of DCT2 using FFT is divided into three steps:
+ * Step1: Re-ordering of even and odd elements of input.
+ * Step2: Calculating FFT of the re-ordered input.
+ * Step3: Taking the real part of the product of FFT output and weights.
+ * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
+ * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+ * where,
+ * Y4 -- DCT4 output, Y2 -- DCT2 output
+ * (d) Multiplying the output with the normalizing factor sqrt(2/N).
+ */
+
+ /*-------- Pre-processing ------------*/
+ /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*N)) */
+ arm_mult_q31 (pInlineBuffer, cosFact, pInlineBuffer, S->N);
+ arm_shift_q31 (pInlineBuffer, 1, pInlineBuffer, S->N);
+
+ /* ----------------------------------------------------------------
+ * Step1: Re-ordering of even and odd elements as
+ * pState[i] = pInlineBuffer[2*i] and
+ * pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
+ ---------------------------------------------------------------------*/
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
+ pS2 = pState + (S->N - 1U);
+
+ /* pbuff initialized to input buffer */
+ pbuff = pInlineBuffer;
+
+
+#if defined (ARM_MATH_LOOPUNROLL)
+
+ /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
+ i = S->Nby2 >> 2U;
+
+ /* Processing with loop unrolling. Compute 4 outputs at a time.
+ ** N/2 is a multiple of 4 for all supported lengths, so no tail loop is needed here. */
+ do
+ {
+ /* Re-ordering of even and odd elements */
+ /* pState[i] = pInlineBuffer[2*i] */
+ *pS1++ = *pbuff++;
+ /* pState[N-i-1] = pInlineBuffer[2*i+1] */
+ *pS2-- = *pbuff++;
+
+ *pS1++ = *pbuff++;
+ *pS2-- = *pbuff++;
+
+ *pS1++ = *pbuff++;
+ *pS2-- = *pbuff++;
+
+ *pS1++ = *pbuff++;
+ *pS2-- = *pbuff++;
+
+ /* Decrement loop counter */
+ i--;
+ } while (i > 0U);
+
+ /* pbuff initialized to input buffer */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Initializing the loop counter to N/4 instead of N for loop unrolling */
+ i = S->N >> 2U;
+
+ /* Processing with loop unrolling 4 times as N is always multiple of 4.
+ * Compute 4 outputs at a time */
+ do
+ {
+ /* Writing the re-ordered output back to inplace input buffer */
+ *pbuff++ = *pS1++;
+ *pbuff++ = *pS1++;
+ *pbuff++ = *pS1++;
+ *pbuff++ = *pS1++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while (i > 0U);
+
+
+ /* ---------------------------------------------------------
+ * Step2: Calculate RFFT for N-point input
+ * ---------------------------------------------------------- */
+ /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
+ arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
+
+ /*----------------------------------------------------------------------
+ * Step3: Multiply the FFT output with the weights.
+ *----------------------------------------------------------------------*/
+ arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
+
+ /* The output of complex multiplication is in 3.29 format.
+ * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
+ arm_shift_q31 (pState, 2, pState, S->N * 2);
+
+ /* ----------- Post-processing ---------- */
+ /* DCT-IV can be obtained from DCT-II by the equation,
+ * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+ * Hence, Y4(0) = Y2(0)/2 */
+ /* Getting only real part from the output and Converting to DCT-IV */
+
+ /* Initializing the loop counter to (N - 1) >> 2 for loop unrolling by 4 */
+ i = (S->N - 1U) >> 2U;
+
+ /* pbuff initialized to input buffer. */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
+ in = *pS1++ >> 1U;
+ /* input buffer acts as inplace, so output values are stored in the input itself. */
+ *pbuff++ = in;
+
+ /* pState pointer is incremented twice as the real values are located alternatively in the array */
+ pS1++;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ do
+ {
+ /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+ /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ /* points to the next real value */
+ pS1++;
+
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ pS1++;
+
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ pS1++;
+
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ pS1++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while (i > 0U);
+
+ /* If (N - 1) is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ i = (S->N - 1U) % 0x4U;
+
+ while (i > 0U)
+ {
+ /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+ /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+ in = *pS1++ - in;
+ *pbuff++ = in;
+
+ /* points to the next real value */
+ pS1++;
+
+ /* Decrement loop counter */
+ i--;
+ }
+
+
+ /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+
+ /* Initializing the loop counter to N/4 instead of N for loop unrolling */
+ i = S->N >> 2U;
+
+ /* pbuff initialized to the pInlineBuffer(now contains the output values) */
+ pbuff = pInlineBuffer;
+
+ /* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */
+ do
+ {
+ /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
+ in = *pbuff;
+ *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
+
+ in = *pbuff;
+ *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
+
+ in = *pbuff;
+ *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
+
+ in = *pbuff;
+ *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
+
+ /* Decrement loop counter */
+ i--;
+ } while (i > 0U);
+
+
+#else
+
+ /* Initializing the loop counter to N/2 */
+ i = S->Nby2;
+
+ do
+ {
+ /* Re-ordering of even and odd elements */
+ /* pState[i] = pInlineBuffer[2*i] */
+ *pS1++ = *pbuff++;
+ /* pState[N-i-1] = pInlineBuffer[2*i+1] */
+ *pS2-- = *pbuff++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while (i > 0U);
+
+ /* pbuff initialized to input buffer */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Initializing the loop counter */
+ i = S->N;
+
+ do
+ {
+ /* Writing the re-ordered output back to inplace input buffer */
+ *pbuff++ = *pS1++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while (i > 0U);
+
+
+ /* ---------------------------------------------------------
+ * Step2: Calculate RFFT for N-point input
+ * ---------------------------------------------------------- */
+ /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
+ arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
+
+ /*----------------------------------------------------------------------
+ * Step3: Multiply the FFT output with the weights.
+ *----------------------------------------------------------------------*/
+ arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
+
+ /* The output of complex multiplication is in 3.29 format.
+ * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
+ arm_shift_q31(pState, 2, pState, S->N * 2);
+
+ /* ----------- Post-processing ---------- */
+ /* DCT-IV can be obtained from DCT-II by the equation,
+ * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+ * Hence, Y4(0) = Y2(0)/2 */
+ /* Getting only real part from the output and Converting to DCT-IV */
+
+ /* pbuff initialized to input buffer. */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
+ in = *pS1++ >> 1U;
+ /* input buffer acts as inplace, so output values are stored in the input itself. */
+ *pbuff++ = in;
+
+ /* pState pointer is incremented twice as the real values are located alternatively in the array */
+ pS1++;
+
+ /* Initializing the loop counter */
+ i = (S->N - 1U);
+
+ while (i > 0U)
+ {
+ /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+ /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+ in = *pS1++ - in;
+ *pbuff++ = in;
+
+ /* points to the next real value */
+ pS1++;
+
+ /* Decrement loop counter */
+ i--;
+ }
+
+ /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+
+ /* Initializing loop counter */
+ i = S->N;
+
+ /* pbuff initialized to the pInlineBuffer (now contains the output values) */
+ pbuff = pInlineBuffer;
+
+ do
+ {
+ /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
+ in = *pbuff;
+ *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
+
+ /* Decrement loop counter */
+ i--;
+ } while (i > 0U);
+
+#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
+
+}
+
+/**
+ @} end of DCT4_IDCT4 group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_f32.c
new file mode 100644
index 0000000000..856388503b
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_f32.c
@@ -0,0 +1,318 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rfft_f32.c
+ * Description: RFFT & RIFFT Floating point process function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+/* ----------------------------------------------------------------------
+ * Internal functions prototypes
+ * -------------------------------------------------------------------- */
+
+extern void arm_radix4_butterfly_f32(
+ float32_t * pSrc,
+ uint16_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier);
+
+extern void arm_radix4_butterfly_inverse_f32(
+ float32_t * pSrc,
+ uint16_t fftLen,
+ const float32_t * pCoef,
+ uint16_t twidCoefModifier,
+ float32_t onebyfftLen);
+
+extern void arm_bitreversal_f32(
+ float32_t * pSrc,
+ uint16_t fftSize,
+ uint16_t bitRevFactor,
+ const uint16_t * pBitRevTab);
+
+void arm_split_rfft_f32(
+ float32_t * pSrc,
+ uint32_t fftLen,
+ const float32_t * pATable,
+ const float32_t * pBTable,
+ float32_t * pDst,
+ uint32_t modifier);
+
+void arm_split_rifft_f32(
+ float32_t * pSrc,
+ uint32_t fftLen,
+ const float32_t * pATable,
+ const float32_t * pBTable,
+ float32_t * pDst,
+ uint32_t modifier);
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup RealFFT
+ @{
+ */
+
+/**
+ @brief Processing function for the floating-point RFFT/RIFFT.
+ Source buffer is modified by this function.
+
+ @deprecated Do not use this function. It has been superseded by \ref arm_rfft_fast_f32 and will be removed in the future.
+ @param[in] S points to an instance of the floating-point RFFT/RIFFT structure
+ @param[in] pSrc points to the input buffer
+ @param[out] pDst points to the output buffer
+ @return none
+
+ @par
+ For the RIFFT, the source buffer must at least have length
+ fftLenReal + 2.
+ The last two elements must be equal to what would be generated
+ by the RFFT:
+ (pSrc[0] - pSrc[1]) and 0.0f
+ */
+
+void arm_rfft_f32(
+ const arm_rfft_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst)
+{
+ const arm_cfft_radix4_instance_f32 *S_CFFT = S->pCfft;
+
+ /* Calculation of Real IFFT of input */
+ if (S->ifftFlagR == 1U)
+ {
+ /* Real IFFT core process: expand the packed half spectrum in pSrc into
+ the complex sequence in pDst ahead of the complex inverse transform */
+ arm_split_rifft_f32 (pSrc, S->fftLenBy2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+
+
+ /* Complex radix-4 IFFT process, in place on pDst (onebyfftLen carries the 1/N scaling) */
+ arm_radix4_butterfly_inverse_f32 (pDst, S_CFFT->fftLen, S_CFFT->pTwiddle, S_CFFT->twidCoefModifier, S_CFFT->onebyfftLen);
+
+ /* Bit reversal process (optional, controlled by the instance flag) */
+ if (S->bitReverseFlagR == 1U)
+ {
+ arm_bitreversal_f32 (pDst, S_CFFT->fftLen, S_CFFT->bitRevFactor, S_CFFT->pBitRevTable);
+ }
+ }
+ else
+ {
+ /* Calculation of RFFT of input */
+
+ /* Complex radix-4 FFT process, in place on the source buffer */
+ arm_radix4_butterfly_f32 (pSrc, S_CFFT->fftLen, S_CFFT->pTwiddle, S_CFFT->twidCoefModifier);
+
+ /* Bit reversal process (optional, controlled by the instance flag) */
+ if (S->bitReverseFlagR == 1U)
+ {
+ arm_bitreversal_f32 (pSrc, S_CFFT->fftLen, S_CFFT->bitRevFactor, S_CFFT->pBitRevTable);
+ }
+
+ /* Real FFT core process: derive the real spectrum from the complex FFT result */
+ arm_split_rfft_f32 (pSrc, S->fftLenBy2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+ }
+
+}
+
+/**
+ @} end of RealFFT group
+ */
+
+/**
+ @brief Core Real FFT process
+ @param[in] pSrc points to input buffer
+ @param[in] fftLen length of FFT
+ @param[in] pATable points to twiddle Coef A buffer
+ @param[in] pBTable points to twiddle Coef B buffer
+ @param[out] pDst points to output buffer
+ @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+ @return none
+ */
+
+void arm_split_rfft_f32(
+ float32_t * pSrc,
+ uint32_t fftLen,
+ const float32_t * pATable,
+ const float32_t * pBTable,
+ float32_t * pDst,
+ uint32_t modifier)
+{
+ uint32_t i; /* Loop Counter */
+ float32_t outR, outI; /* Temporary variables for output */
+ const float32_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
+ float32_t CoefA1, CoefA2, CoefB1; /* Temporary variables for twiddle coefficients */
+ float32_t *pDst1 = &pDst[2], *pDst2 = &pDst[(4U * fftLen) - 1U]; /* temp pointers for output buffer */
+ float32_t *pSrc1 = &pSrc[2], *pSrc2 = &pSrc[(2U * fftLen) - 1U]; /* temp pointers for input buffer */
+
+ /* Init coefficient pointers */
+ pCoefA = &pATable[modifier * 2];
+ pCoefB = &pBTable[modifier * 2];
+
+ /* fftLen - 1 butterflies; the DC / mid-spectrum pair is written after the loop */
+ i = fftLen - 1U;
+
+ while (i > 0U)
+ {
+ /*
+ outR = ( pSrc[2 * i] * pATable[2 * i]
+ - pSrc[2 * i + 1] * pATable[2 * i + 1]
+ + pSrc[2 * n - 2 * i] * pBTable[2 * i]
+ + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+
+ outI = ( pIn[2 * i + 1] * pATable[2 * i]
+ + pIn[2 * i] * pATable[2 * i + 1]
+ + pIn[2 * n - 2 * i] * pBTable[2 * i + 1]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
+ */
+
+ /* NOTE(review): the post-increment/decrement order below is load-bearing;
+ pSrc1 walks forward and pSrc2 walks backward through the same buffer. */
+
+ /* read pATable[2 * i] */
+ CoefA1 = *pCoefA++;
+ /* pATable[2 * i + 1] */
+ CoefA2 = *pCoefA;
+
+ /* pSrc[2 * i] * pATable[2 * i] */
+ outR = *pSrc1 * CoefA1;
+ /* pSrc[2 * i] * CoefA2 */
+ outI = *pSrc1++ * CoefA2;
+
+ /* (pSrc[2 * i + 1] + pSrc[2 * fftLen - 2 * i + 1]) * CoefA2 */
+ outR -= (*pSrc1 + *pSrc2) * CoefA2;
+ /* pSrc[2 * i + 1] * CoefA1 */
+ outI += *pSrc1++ * CoefA1;
+
+ CoefB1 = *pCoefB;
+
+ /* pSrc[2 * fftLen - 2 * i + 1] * CoefB1 */
+ outI -= *pSrc2-- * CoefB1;
+ /* pSrc[2 * fftLen - 2 * i] * CoefA2 */
+ outI -= *pSrc2 * CoefA2;
+
+ /* pSrc[2 * fftLen - 2 * i] * CoefB1 */
+ outR += *pSrc2-- * CoefB1;
+
+ /* write output */
+ *pDst1++ = outR;
+ *pDst1++ = outI;
+
+ /* write complex conjugate output, mirrored from the top of pDst downwards */
+ *pDst2-- = -outI;
+ *pDst2-- = outR;
+
+ /* update coefficient pointers (pCoefA was already advanced by one above) */
+ pCoefB = pCoefB + (modifier * 2U);
+ pCoefA = pCoefA + ((modifier * 2U) - 1U);
+
+ i--;
+
+ }
+
+ /* Mid-spectrum bin: real part pSrc[0] - pSrc[1], imaginary part zero */
+ pDst[2U * fftLen] = pSrc[0] - pSrc[1];
+ pDst[(2U * fftLen) + 1U] = 0.0f;
+
+ /* DC bin: real part pSrc[0] + pSrc[1], imaginary part zero */
+ pDst[0] = pSrc[0] + pSrc[1];
+ pDst[1] = 0.0f;
+
+}
+
+
+/**
+ @brief Core Real IFFT process
+ @param[in] pSrc points to input buffer
+ @param[in] fftLen length of FFT
+ @param[in] pATable points to twiddle Coef A buffer
+ @param[in] pBTable points to twiddle Coef B buffer
+ @param[out] pDst points to output buffer
+ @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+ @return none
+ */
+
+void arm_split_rifft_f32(
+ float32_t * pSrc,
+ uint32_t fftLen,
+ const float32_t * pATable,
+ const float32_t * pBTable,
+ float32_t * pDst,
+ uint32_t modifier)
+{
+ float32_t outR, outI; /* Temporary variables for output */
+ const float32_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
+ float32_t CoefA1, CoefA2, CoefB1; /* Temporary variables for twiddle coefficients */
+ float32_t *pSrc1 = &pSrc[0], *pSrc2 = &pSrc[(2U * fftLen) + 1U]; /* forward / backward read pointers */
+
+ pCoefA = &pATable[0];
+ pCoefB = &pBTable[0];
+
+ /* NOTE(review): pSrc2 starts at index 2*fftLen + 1, so the source buffer must
+ hold fftLenReal + 2 values (see the arm_rfft_f32 function header above).
+ fftLen is a by-value copy, reused here as the loop counter. */
+ while (fftLen > 0U)
+ {
+ /*
+ outR = ( pIn[2 * i] * pATable[2 * i]
+ + pIn[2 * i + 1] * pATable[2 * i + 1]
+ + pIn[2 * n - 2 * i] * pBTable[2 * i]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+
+ outI = ( pIn[2 * i + 1] * pATable[2 * i]
+ - pIn[2 * i] * pATable[2 * i + 1]
+ - pIn[2 * n - 2 * i] * pBTable[2 * i + 1]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
+ */
+
+ CoefA1 = *pCoefA++;
+ CoefA2 = *pCoefA;
+
+ /* outR = (pSrc[2 * i] * CoefA1 */
+ outR = *pSrc1 * CoefA1;
+
+ /* - pSrc[2 * i] * CoefA2 */
+ outI = -(*pSrc1++) * CoefA2;
+
+ /* (pSrc[2 * i + 1] + pSrc[2 * fftLen - 2 * i + 1]) * CoefA2 */
+ outR += (*pSrc1 + *pSrc2) * CoefA2;
+
+ /* pSrc[2 * i + 1] * CoefA1 */
+ outI += (*pSrc1++) * CoefA1;
+
+ CoefB1 = *pCoefB;
+
+ /* NOTE(review): post-decrement order on pSrc2 is load-bearing below */
+ /* - pSrc[2 * fftLen - 2 * i + 1] * CoefB1 */
+ outI -= *pSrc2-- * CoefB1;
+
+ /* pSrc[2 * fftLen - 2 * i] * CoefB1 */
+ outR += *pSrc2 * CoefB1;
+
+ /* pSrc[2 * fftLen - 2 * i] * CoefA2 */
+ outI += *pSrc2-- * CoefA2;
+
+ /* write output */
+ *pDst++ = outR;
+ *pDst++ = outI;
+
+ /* update coefficient pointers (pCoefA was already advanced by one above) */
+ pCoefB = pCoefB + (modifier * 2);
+ pCoefA = pCoefA + (modifier * 2 - 1);
+
+ /* Decrement loop count */
+ fftLen--;
+ }
+
+}
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f16.c
new file mode 100644
index 0000000000..d75740d513
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f16.c
@@ -0,0 +1,612 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rfft_fast_f16.c
+ * Description: RFFT & RIFFT Floating point process function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+#include "arm_common_tables_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+
+#if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void stage_rfft_f16(
+ const arm_rfft_fast_instance_f16 * S,
+ float16_t * p,
+ float16_t * pOut)
+{
+ /* Forward split stage (Helium/MVE): converts the in-place complex FFT result
+ in p into the packed real-FFT half spectrum in pOut. The vector main loop
+ produces 8 output bins per iteration; a scalar tail handles the rest. */
+ int32_t k; /* Loop Counter */
+ float16_t twR, twI; /* RFFT Twiddle coefficients */
+ const float16_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
+ float16_t *pA = p; /* increasing pointer */
+ float16_t *pB = p; /* decreasing pointer */
+ float16_t xAR, xAI, xBR, xBI; /* temporary variables */
+ float16_t t1a, t1b; /* temporary variables */
+ float16_t p0, p1, p2, p3; /* temporary variables */
+
+ float16x8x2_t tw,xA,xB;
+ float16x8x2_t tmp1, tmp2, res;
+
+ uint16x8_t vecStridesBkwd;
+
+ /* Decrementing gather offsets {14, 12, ..., 0}: read 8 complex values in
+ reverse order from pB (the mirrored half of the spectrum) */
+ vecStridesBkwd = vddupq_u16((uint16_t)14, 2);
+
+
+ int blockCnt;
+
+
+ k = (S->Sint).fftLen - 1;
+
+ /* Pack first and last sample of the frequency domain together */
+
+ xBR = pB[0];
+ xBI = pB[1];
+ xAR = pA[0];
+ xAI = pA[1];
+
+ twR = *pCoeff++ ;
+ twI = *pCoeff++ ;
+
+ // U1 = XA(1) + XB(1); % It is real
+ t1a = xBR + xAR ;
+
+ // U2 = XB(1) - XA(1); % It is imaginary
+ t1b = xBI + xAI ;
+
+ // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+ // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+ *pOut++ = 0.5f * ( t1a + t1b );
+ *pOut++ = 0.5f * ( t1a - t1b );
+
+ // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) ));
+ /* Position pB so the backward gather covers the last 8 complex bins */
+ pB = p + 2*k - 14;
+ pA += 2;
+
+ blockCnt = k >> 3;
+ while (blockCnt > 0)
+ {
+ /*
+ function X = my_split_rfft(X, ifftFlag)
+ % X is a series of real numbers
+ L = length(X);
+ XC = X(1:2:end) +i*X(2:2:end);
+ XA = fft(XC);
+ XB = conj(XA([1 end:-1:2]));
+ TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
+ for l = 2:L/2
+ XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
+ end
+ XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
+ X = XA;
+ */
+
+
+ xA = vld2q_f16(pA);
+ pA += 16;
+
+ /* Gather the mirrored half directly. (A redundant vld2q of pB that was
+ immediately overwritten by these gathers has been removed.) */
+ xB.val[0] = vldrhq_gather_shifted_offset_f16(pB, vecStridesBkwd);
+ xB.val[1] = vldrhq_gather_shifted_offset_f16(&pB[1], vecStridesBkwd);
+
+ /* Conjugate the mirrored half */
+ xB.val[1] = vnegq_f16(xB.val[1]);
+ pB -= 16;
+
+
+ tw = vld2q_f16(pCoeff);
+ pCoeff += 16;
+
+
+ tmp1.val[0] = vaddq_f16(xA.val[0],xB.val[0]);
+ tmp1.val[1] = vaddq_f16(xA.val[1],xB.val[1]);
+
+ tmp2.val[0] = vsubq_f16(xB.val[0],xA.val[0]);
+ tmp2.val[1] = vsubq_f16(xB.val[1],xA.val[1]);
+
+ /* res = tw * (xB - xA), complex multiply */
+ res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
+ res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
+
+ res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
+ res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
+
+ /* res = 0.5 * (xA + xB + tw * (xB - xA)) */
+ res.val[0] = vaddq_f16(res.val[0],tmp1.val[0] );
+ res.val[1] = vaddq_f16(res.val[1],tmp1.val[1] );
+
+ res.val[0] = vmulq_n_f16(res.val[0], 0.5f);
+ res.val[1] = vmulq_n_f16(res.val[1], 0.5f);
+
+
+ vst2q_f16(pOut, res);
+ pOut += 16;
+
+
+ blockCnt--;
+ }
+
+ /* Scalar tail: remaining k % 8 bins */
+ pB += 14;
+ blockCnt = k & 7;
+ while (blockCnt > 0)
+ {
+ /*
+ function X = my_split_rfft(X, ifftFlag)
+ % X is a series of real numbers
+ L = length(X);
+ XC = X(1:2:end) +i*X(2:2:end);
+ XA = fft(XC);
+ XB = conj(XA([1 end:-1:2]));
+ TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
+ for l = 2:L/2
+ XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
+ end
+ XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
+ X = XA;
+ */
+
+ xBI = pB[1];
+ xBR = pB[0];
+ xAR = pA[0];
+ xAI = pA[1];
+
+ twR = *pCoeff++;
+ twI = *pCoeff++;
+
+ t1a = xBR - xAR ;
+ t1b = xBI + xAI ;
+
+ // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+ // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+ p0 = twR * t1a;
+ p1 = twI * t1a;
+ p2 = twR * t1b;
+ p3 = twI * t1b;
+
+ *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
+ *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
+
+ pA += 2;
+ pB -= 2;
+ blockCnt--;
+ }
+}
+
+/* Prepares data for inverse cfft */
+void merge_rfft_f16(
+ const arm_rfft_fast_instance_f16 * S,
+ float16_t * p,
+ float16_t * pOut)
+{
+ int32_t k; /* Loop Counter */
+ float16_t twR, twI; /* RFFT Twiddle coefficients */
+ const float16_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
+ float16_t *pA = p; /* increasing pointer */
+ float16_t *pB = p; /* decreasing pointer */
+ float16_t xAR, xAI, xBR, xBI; /* temporary variables */
+ float16_t t1a, t1b, r, s, t, u; /* temporary variables */
+
+ float16x8x2_t tw,xA,xB;
+ float16x8x2_t tmp1, tmp2, res;
+ uint16x8_t vecStridesBkwd;
+
+ vecStridesBkwd = vddupq_u16((uint16_t)14, 2);
+
+ int blockCnt;
+
+
+ k = (S->Sint).fftLen - 1;
+
+ xAR = pA[0];
+ xAI = pA[1];
+
+ pCoeff += 2 ;
+
+ *pOut++ = 0.5f * ( xAR + xAI );
+ *pOut++ = 0.5f * ( xAR - xAI );
+
+ pB = p + 2*k - 14;
+ pA += 2 ;
+
+ blockCnt = k >> 3;
+ while (blockCnt > 0)
+ {
+ /* G is half of the frequency complex spectrum */
+ //for k = 2:N
+ // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
+ xA = vld2q_f16(pA);
+ pA += 16;
+
+ xB = vld2q_f16(pB);
+
+ xB.val[0] = vldrhq_gather_shifted_offset_f16(pB, vecStridesBkwd);
+ xB.val[1] = vldrhq_gather_shifted_offset_f16(&pB[1], vecStridesBkwd);
+
+ xB.val[1] = vnegq_f16(xB.val[1]);
+ pB -= 16;
+
+
+ tw = vld2q_f16(pCoeff);
+ tw.val[1] = vnegq_f16(tw.val[1]);
+ pCoeff += 16;
+
+
+ tmp1.val[0] = vaddq_f16(xA.val[0],xB.val[0]);
+ tmp1.val[1] = vaddq_f16(xA.val[1],xB.val[1]);
+
+ tmp2.val[0] = vsubq_f16(xB.val[0],xA.val[0]);
+ tmp2.val[1] = vsubq_f16(xB.val[1],xA.val[1]);
+
+ res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
+ res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
+
+ res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
+ res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
+
+ res.val[0] = vaddq_f16(res.val[0],tmp1.val[0] );
+ res.val[1] = vaddq_f16(res.val[1],tmp1.val[1] );
+
+ res.val[0] = vmulq_n_f16(res.val[0], 0.5f);
+ res.val[1] = vmulq_n_f16(res.val[1], 0.5f);
+
+
+ vst2q_f16(pOut, res);
+ pOut += 16;
+
+
+ blockCnt--;
+ }
+
+ pB += 14;
+ blockCnt = k & 7;
+ while (blockCnt > 0)
+ {
+ /* G is half of the frequency complex spectrum */
+ //for k = 2:N
+ // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
+ xBI = pB[1] ;
+ xBR = pB[0] ;
+ xAR = pA[0];
+ xAI = pA[1];
+
+ twR = *pCoeff++;
+ twI = *pCoeff++;
+
+ t1a = xAR - xBR ;
+ t1b = xAI + xBI ;
+
+ r = twR * t1a;
+ s = twI * t1b;
+ t = twI * t1a;
+ u = twR * t1b;
+
+ // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
+ // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
+ *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR
+ *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI
+
+ pA += 2;
+ pB -= 2;
+ blockCnt--;
+ }
+
+}
+#else
+void stage_rfft_f16(
+ const arm_rfft_fast_instance_f16 * S,
+ float16_t * p,
+ float16_t * pOut)
+{
+ /* Forward split stage (scalar): converts the in-place complex FFT result in p
+ into the packed real-FFT half spectrum in pOut. */
+ int32_t k; /* Loop Counter */
+ float16_t twR, twI; /* RFFT Twiddle coefficients */
+ const float16_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
+ float16_t *pA = p; /* increasing pointer */
+ float16_t *pB = p; /* decreasing pointer */
+ float16_t xAR, xAI, xBR, xBI; /* temporary variables */
+ float16_t t1a, t1b; /* temporary variables */
+ float16_t p0, p1, p2, p3; /* temporary variables */
+
+
+ k = (S->Sint).fftLen - 1;
+
+ /* Pack first and last sample of the frequency domain together */
+
+ xBR = pB[0];
+ xBI = pB[1];
+ xAR = pA[0];
+ xAI = pA[1];
+
+ twR = *pCoeff++ ;
+ twI = *pCoeff++ ;
+
+
+ // U1 = XA(1) + XB(1); % It is real
+ t1a = xBR + xAR ;
+
+ // U2 = XB(1) - XA(1); % It is imaginary
+ t1b = xBI + xAI ;
+
+ // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+ // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+ /* Both values of the first bin are real; pack them into one complex sample */
+ *pOut++ = 0.5f * ( t1a + t1b );
+ *pOut++ = 0.5f * ( t1a - t1b );
+
+ // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) ));
+ /* pA walks forward from bin 1; pB walks backward from bin fftLen-1 */
+ pB = p + 2*k;
+ pA += 2;
+
+ /* NOTE(review): do/while runs at least once, so fftLen is assumed > 1 */
+ do
+ {
+ /*
+ function X = my_split_rfft(X, ifftFlag)
+ % X is a series of real numbers
+ L = length(X);
+ XC = X(1:2:end) +i*X(2:2:end);
+ XA = fft(XC);
+ XB = conj(XA([1 end:-1:2]));
+ TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
+ for l = 2:L/2
+ XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
+ end
+ XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
+ X = XA;
+ */
+
+ xBI = pB[1];
+ xBR = pB[0];
+ xAR = pA[0];
+ xAI = pA[1];
+
+ twR = *pCoeff++;
+ twI = *pCoeff++;
+
+ t1a = xBR - xAR ;
+ t1b = xBI + xAI ;
+
+ // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+ // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+ p0 = twR * t1a;
+ p1 = twI * t1a;
+ p2 = twR * t1b;
+ p3 = twI * t1b;
+
+ *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
+ *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
+
+
+ pA += 2;
+ pB -= 2;
+ k--;
+ } while (k > 0);
+}
+
+/* Prepares data for inverse cfft */
+void merge_rfft_f16(
+ const arm_rfft_fast_instance_f16 * S,
+ float16_t * p,
+ float16_t * pOut)
+{
+ /* Inverse split stage (scalar): expands the packed real-FFT half spectrum in
+ p into the complex sequence in pOut expected by the inverse CFFT. */
+ int32_t k; /* Loop Counter */
+ float16_t twR, twI; /* RFFT Twiddle coefficients */
+ const float16_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
+ float16_t *pA = p; /* increasing pointer */
+ float16_t *pB = p; /* decreasing pointer */
+ float16_t xAR, xAI, xBR, xBI; /* temporary variables */
+ float16_t t1a, t1b, r, s, t, u; /* temporary variables */
+
+ k = (S->Sint).fftLen - 1;
+
+ xAR = pA[0];
+ xAI = pA[1];
+
+ /* Skip the first twiddle pair; bin 0 is unpacked directly below */
+ pCoeff += 2 ;
+
+ /* Unpack the packed DC / mid-spectrum pair from the first complex bin */
+ *pOut++ = 0.5f * ( xAR + xAI );
+ *pOut++ = 0.5f * ( xAR - xAI );
+
+ /* pA walks forward from bin 1; pB walks backward from bin fftLen-1 */
+ pB = p + 2*k ;
+ pA += 2 ;
+
+ while (k > 0)
+ {
+ /* G is half of the frequency complex spectrum */
+ //for k = 2:N
+ // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
+ xBI = pB[1] ;
+ xBR = pB[0] ;
+ xAR = pA[0];
+ xAI = pA[1];
+
+ twR = *pCoeff++;
+ twI = *pCoeff++;
+
+ t1a = xAR - xBR ;
+ t1b = xAI + xBI ;
+
+ r = twR * t1a;
+ s = twI * t1b;
+ t = twI * t1a;
+ u = twR * t1b;
+
+ // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
+ // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
+ *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR
+ *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI
+
+ pA += 2;
+ pB -= 2;
+ k--;
+ }
+
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @ingroup groupTransforms
+*/
+
+/**
+ @defgroup RealFFT Real FFT Functions
+
+ @par
+ The CMSIS DSP library includes specialized algorithms for computing the
+ FFT of real data sequences. The FFT is defined over complex data but
+ in many applications the input is real. Real FFT algorithms take advantage
+ of the symmetry properties of the FFT and have a speed advantage over complex
+ algorithms of the same length.
+ @par
+ The Fast RFFT algorithm relies on the mixed radix CFFT, which saves processor usage.
+ @par
+ The real length N forward FFT of a sequence is computed using the steps shown below.
+ @par
+ \image html RFFT.gif "Real Fast Fourier Transform"
+ @par
+ The real sequence is initially treated as if it were complex to perform a CFFT.
+ Later, a processing stage reshapes the data to obtain half of the frequency spectrum
+ in complex format. Except the first complex number that contains the two real numbers
+ X[0] and X[N/2] all the data is complex. In other words, the first complex sample
+ contains two real values packed.
+ @par
+ The input for the inverse RFFT should keep the same format as the output of the
+ forward RFFT. A first processing stage pre-processes the data to later perform an
+ inverse CFFT.
+ @par
+ \image html RIFFT.gif "Real Inverse Fast Fourier Transform"
+ @par
+ The algorithms for floating-point, Q15, and Q31 data are slightly different
+ and we describe each algorithm in turn.
+ @par Floating-point
+ The main functions are \ref arm_rfft_fast_f16() and \ref arm_rfft_fast_init_f16().
+
+ @par
+ The FFT of a real N-point sequence has even symmetry in the frequency domain.
+ The second half of the data equals the conjugate of the first half flipped in frequency.
+ Looking at the data, we see that we can uniquely represent the FFT using only N/2 complex numbers.
+ These are packed into the output array in alternating real and imaginary components:
+ @par
+ X = { real[0], imag[0], real[1], imag[1], real[2], imag[2] ...
+ real[(N/2)-1], imag[(N/2)-1] }
+ @par
+ It happens that the first complex number (real[0], imag[0]) is actually
+ all real. real[0] represents the DC offset, and imag[0] should be 0.
+ (real[1], imag[1]) is the fundamental frequency, (real[2], imag[2]) is
+ the first harmonic and so on.
+ @par
+ The real FFT functions pack the frequency domain data in this fashion.
+ The forward transform outputs the data in this form and the inverse
+ transform expects input data in this form. The function always performs
+ the needed bitreversal so that the input and output data is always in
+ normal order. The functions support lengths of [32, 64, 128, ..., 4096]
+ samples.
+ @par Q15 and Q31
+ The real algorithms are defined in a similar manner and utilize N/2 complex
+ transforms behind the scenes.
+ @par
+ The complex transforms used internally include scaling to prevent fixed-point
+ overflows. The overall scaling equals 1/(fftLen/2).
+ Due to the use of complex transform internally, the source buffer is
+ modified by the rfft.
+ @par
+ A separate instance structure must be defined for each transform used but
+ twiddle factor and bit reversal tables can be reused.
+ @par
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Initializes twiddle factor table and bit reversal table pointers.
+ - Initializes the internal complex FFT data structure.
+ @par
+ Use of the initialization function is optional **except for MVE versions where it is mandatory**.
+ If you don't use the initialization functions, then the structures should be initialized with code
+ similar to the one below:
+
+ arm_rfft_instance_q31 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft};
+ arm_rfft_instance_q15 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft};
+
+ where fftLenReal is the length of the real transform;
+ fftLenBy2 length of the internal complex transform (fftLenReal/2).
+ ifftFlagR Selects forward (=0) or inverse (=1) transform.
+ bitReverseFlagR Selects bit reversed output (=0) or normal order
+ output (=1).
+ twidCoefRModifier stride modifier for the twiddle factor table.
+ The value is based on the FFT length;
+ pTwiddleAReal points to the A array of twiddle coefficients;
+ pTwiddleBReal points to the B array of twiddle coefficients;
+ pCfft points to the CFFT Instance structure. The CFFT structure
+ must also be initialized.
+@par
+ Note that with MVE versions you can't initialize instance structures directly and **must
+ use the initialization function**.
+ */
+
+/**
+ @addtogroup RealFFT
+ @{
+*/
+
+/**
+ @brief Processing function for the floating-point real FFT.
+ @param[in] S points to an arm_rfft_fast_instance_f16 structure
+ @param[in] p points to input buffer (Source buffer is modified by this function.)
+ @param[in] pOut points to output buffer
+ @param[in] ifftFlag
+ - value = 0: RFFT
+ - value = 1: RIFFT
+ @return none
+*/
+
+void arm_rfft_fast_f16(
+ const arm_rfft_fast_instance_f16 * S,
+ float16_t * p,
+ float16_t * pOut,
+ uint8_t ifftFlag)
+{
+ /* Dispatch between the forward RFFT and the inverse RIFFT paths. */
+ const arm_cfft_instance_f16 *cfft = &(S->Sint);
+
+ if (ifftFlag)
+ {
+ /* Inverse: expand the packed half spectrum into pOut, then run the
+ complex inverse FFT in place on pOut. */
+ merge_rfft_f16(S, p, pOut);
+ arm_cfft_f16(cfft, pOut, ifftFlag, 1);
+ return;
+ }
+
+ /* Forward: complex FFT in place on the input, then extract the packed
+ real spectrum into pOut. */
+ arm_cfft_f16(cfft, p, ifftFlag, 1);
+ stage_rfft_f16(S, p, pOut);
+}
+
+/**
+* @} end of RealFFT group
+*/
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f32.c
new file mode 100644
index 0000000000..a2a2f999b5
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f32.c
@@ -0,0 +1,603 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rfft_fast_f32.c
+ * Description: RFFT & RIFFT Floating point process function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
+/* MVE post-processing stage of the forward RFFT (f32): converts the N/2-point
+   complex CFFT output in p into the packed real-FFT spectrum in pOut.
+   The first output pair packs the two purely real bins X[0] and X[N/2]. */
+void stage_rfft_f32(
+ const arm_rfft_fast_instance_f32 * S,
+ float32_t * p,
+ float32_t * pOut)
+{
+ int32_t k; /* Loop Counter */
+ float32_t twR, twI; /* RFFT Twiddle coefficients */
+ const float32_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
+ float32_t *pA = p; /* increasing pointer */
+ float32_t *pB = p; /* decreasing pointer */
+ float32_t xAR, xAI, xBR, xBI; /* temporary variables */
+ float32_t t1a, t1b; /* temporary variables */
+ float32_t p0, p1, p2, p3; /* temporary variables */
+
+ float32x4x2_t tw,xA,xB;
+ float32x4x2_t tmp1, tmp2, res;
+
+ uint32x4_t vecStridesFwd, vecStridesBkwd;
+
+ /* vidupq gives {0,2,4,6}; the negation turns these into backward gather
+    offsets used to read the conjugate-symmetric half of the spectrum. */
+ vecStridesFwd = vidupq_u32((uint32_t)0, 2);
+ vecStridesBkwd = -vecStridesFwd;
+
+ int blockCnt;
+
+
+ k = (S->Sint).fftLen - 1;
+
+ /* Pack first and last sample of the frequency domain together */
+
+ xBR = pB[0];
+ xBI = pB[1];
+ xAR = pA[0];
+ xAI = pA[1];
+
+ twR = *pCoeff++ ;
+ twI = *pCoeff++ ;
+
+ // U1 = XA(1) + XB(1); % It is real
+ t1a = xBR + xAR ;
+
+ // U2 = XB(1) - XA(1); % It is imaginary
+ t1b = xBI + xAI ;
+
+ // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+ // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+ *pOut++ = 0.5f * ( t1a + t1b );
+ *pOut++ = 0.5f * ( t1a - t1b );
+
+ // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) ));
+ pB = p + 2*k;
+ pA += 2;
+
+ /* Vectorized main loop: 4 complex spectrum bins per iteration. */
+ blockCnt = k >> 2;
+ while (blockCnt > 0)
+ {
+ /*
+ function X = my_split_rfft(X, ifftFlag)
+ % X is a series of real numbers
+ L = length(X);
+ XC = X(1:2:end) +i*X(2:2:end);
+ XA = fft(XC);
+ XB = conj(XA([1 end:-1:2]));
+ TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
+ for l = 2:L/2
+ XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
+ end
+ XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
+ X = XA;
+ */
+
+
+ xA = vld2q_f32(pA);
+ pA += 8;
+
+ /* NOTE(review): both lanes of xB are overwritten by the gathers below,
+    so this deinterleaving load looks redundant - confirm before removing. */
+ xB = vld2q_f32(pB);
+
+ /* Gather the mirrored half backwards from pB; imaginary part is negated
+    to form the complex conjugate. */
+ xB.val[0] = vldrwq_gather_shifted_offset_f32(pB, vecStridesBkwd);
+ xB.val[1] = vldrwq_gather_shifted_offset_f32(&pB[1], vecStridesBkwd);
+
+ xB.val[1] = vnegq_f32(xB.val[1]);
+ pB -= 8;
+
+
+ tw = vld2q_f32(pCoeff);
+ pCoeff += 8;
+
+
+ tmp1.val[0] = vaddq_f32(xA.val[0],xB.val[0]);
+ tmp1.val[1] = vaddq_f32(xA.val[1],xB.val[1]);
+
+ tmp2.val[0] = vsubq_f32(xB.val[0],xA.val[0]);
+ tmp2.val[1] = vsubq_f32(xB.val[1],xA.val[1]);
+
+ /* res = tw * (xB - xA), complex multiply via fused multiply add/sub. */
+ res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
+ res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
+
+ res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
+ res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
+
+ res.val[0] = vaddq_f32(res.val[0],tmp1.val[0] );
+ res.val[1] = vaddq_f32(res.val[1],tmp1.val[1] );
+
+ res.val[0] = vmulq_n_f32(res.val[0], 0.5f);
+ res.val[1] = vmulq_n_f32(res.val[1], 0.5f);
+
+
+ vst2q_f32(pOut, res);
+ pOut += 8;
+
+
+ blockCnt--;
+ }
+
+ /* Tail: handle the remaining (k % 4) spectrum bins with scalar code. */
+ blockCnt = k & 3;
+ while (blockCnt > 0)
+ {
+ /*
+ function X = my_split_rfft(X, ifftFlag)
+ % X is a series of real numbers
+ L = length(X);
+ XC = X(1:2:end) +i*X(2:2:end);
+ XA = fft(XC);
+ XB = conj(XA([1 end:-1:2]));
+ TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
+ for l = 2:L/2
+ XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
+ end
+ XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
+ X = XA;
+ */
+
+ xBI = pB[1];
+ xBR = pB[0];
+ xAR = pA[0];
+ xAI = pA[1];
+
+ twR = *pCoeff++;
+ twI = *pCoeff++;
+
+ t1a = xBR - xAR ;
+ t1b = xBI + xAI ;
+
+ // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+ // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+ p0 = twR * t1a;
+ p1 = twI * t1a;
+ p2 = twR * t1b;
+ p3 = twI * t1b;
+
+ *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
+ *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
+
+ pA += 2;
+ pB -= 2;
+ blockCnt--;
+ }
+}
+
+/* Prepares data for inverse cfft */
+/* MVE pre-processing stage of the inverse RFFT (f32): expands the packed real
+   spectrum in p into the N/2-point complex input for the inverse CFFT. */
+void merge_rfft_f32(
+ const arm_rfft_fast_instance_f32 * S,
+ float32_t * p,
+ float32_t * pOut)
+{
+ int32_t k; /* Loop Counter */
+ float32_t twR, twI; /* RFFT Twiddle coefficients */
+ const float32_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
+ float32_t *pA = p; /* increasing pointer */
+ float32_t *pB = p; /* decreasing pointer */
+ float32_t xAR, xAI, xBR, xBI; /* temporary variables */
+ float32_t t1a, t1b, r, s, t, u; /* temporary variables */
+
+ float32x4x2_t tw,xA,xB;
+ float32x4x2_t tmp1, tmp2, res;
+ uint32x4_t vecStridesFwd, vecStridesBkwd;
+
+ /* vidupq gives {0,2,4,6}; negated, these are backward gather offsets for
+    reading the mirrored half of the spectrum. */
+ vecStridesFwd = vidupq_u32((uint32_t)0, 2);
+ vecStridesBkwd = -vecStridesFwd;
+
+ int blockCnt;
+
+
+ k = (S->Sint).fftLen - 1;
+
+ /* First input pair packs the two real bins X[0] and X[N/2]. */
+ xAR = pA[0];
+ xAI = pA[1];
+
+ pCoeff += 2 ;
+
+ *pOut++ = 0.5f * ( xAR + xAI );
+ *pOut++ = 0.5f * ( xAR - xAI );
+
+ pB = p + 2*k ;
+ pA += 2 ;
+
+ /* Vectorized main loop: 4 complex bins per iteration. */
+ blockCnt = k >> 2;
+ while (blockCnt > 0)
+ {
+ /* G is half of the frequency complex spectrum */
+ //for k = 2:N
+ // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
+ xA = vld2q_f32(pA);
+ pA += 8;
+
+ /* NOTE(review): both lanes of xB are overwritten by the gathers below,
+    so this deinterleaving load looks redundant - confirm before removing. */
+ xB = vld2q_f32(pB);
+
+ xB.val[0] = vldrwq_gather_shifted_offset_f32(pB, vecStridesBkwd);
+ xB.val[1] = vldrwq_gather_shifted_offset_f32(&pB[1], vecStridesBkwd);
+
+ /* Conjugate the mirrored half. */
+ xB.val[1] = vnegq_f32(xB.val[1]);
+ pB -= 8;
+
+
+ /* Twiddle imaginary part is negated: the inverse direction uses the
+    conjugated twiddle factors. */
+ tw = vld2q_f32(pCoeff);
+ tw.val[1] = vnegq_f32(tw.val[1]);
+ pCoeff += 8;
+
+
+ tmp1.val[0] = vaddq_f32(xA.val[0],xB.val[0]);
+ tmp1.val[1] = vaddq_f32(xA.val[1],xB.val[1]);
+
+ tmp2.val[0] = vsubq_f32(xB.val[0],xA.val[0]);
+ tmp2.val[1] = vsubq_f32(xB.val[1],xA.val[1]);
+
+ /* res = conj(tw) * (xB - xA), complex multiply via fused ops. */
+ res.val[0] = vmulq(tw.val[0], tmp2.val[0]);
+ res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]);
+
+ res.val[1] = vmulq(tw.val[0], tmp2.val[1]);
+ res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]);
+
+ res.val[0] = vaddq_f32(res.val[0],tmp1.val[0] );
+ res.val[1] = vaddq_f32(res.val[1],tmp1.val[1] );
+
+ res.val[0] = vmulq_n_f32(res.val[0], 0.5f);
+ res.val[1] = vmulq_n_f32(res.val[1], 0.5f);
+
+
+ vst2q_f32(pOut, res);
+ pOut += 8;
+
+
+ blockCnt--;
+ }
+
+ /* Tail: remaining (k % 4) bins handled with scalar code. */
+ blockCnt = k & 3;
+ while (blockCnt > 0)
+ {
+ /* G is half of the frequency complex spectrum */
+ //for k = 2:N
+ // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
+ xBI = pB[1] ;
+ xBR = pB[0] ;
+ xAR = pA[0];
+ xAI = pA[1];
+
+ twR = *pCoeff++;
+ twI = *pCoeff++;
+
+ t1a = xAR - xBR ;
+ t1b = xAI + xBI ;
+
+ r = twR * t1a;
+ s = twI * t1b;
+ t = twI * t1a;
+ u = twR * t1b;
+
+ // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
+ // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
+ *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR
+ *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI
+
+ pA += 2;
+ pB -= 2;
+ blockCnt--;
+ }
+
+}
+#else
+/* Scalar post-processing stage of the forward RFFT (f32): converts the
+   N/2-point complex CFFT output in p into the packed real-FFT spectrum in
+   pOut. The first output pair packs the two real bins X[0] and X[N/2]. */
+void stage_rfft_f32(
+ const arm_rfft_fast_instance_f32 * S,
+ float32_t * p,
+ float32_t * pOut)
+{
+ int32_t k; /* Loop Counter */
+ float32_t twR, twI; /* RFFT Twiddle coefficients */
+ const float32_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
+ float32_t *pA = p; /* increasing pointer */
+ float32_t *pB = p; /* decreasing pointer */
+ float32_t xAR, xAI, xBR, xBI; /* temporary variables */
+ float32_t t1a, t1b; /* temporary variables */
+ float32_t p0, p1, p2, p3; /* temporary variables */
+
+
+ k = (S->Sint).fftLen - 1;
+
+ /* Pack first and last sample of the frequency domain together */
+
+ xBR = pB[0];
+ xBI = pB[1];
+ xAR = pA[0];
+ xAI = pA[1];
+
+ twR = *pCoeff++ ;
+ twI = *pCoeff++ ;
+
+
+ // U1 = XA(1) + XB(1); % It is real
+ t1a = xBR + xAR ;
+
+ // U2 = XB(1) - XA(1); % It is imaginary
+ t1b = xBI + xAI ;
+
+ // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+ // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+ *pOut++ = 0.5f * ( t1a + t1b );
+ *pOut++ = 0.5f * ( t1a - t1b );
+
+ // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) ));
+ pB = p + 2*k;
+ pA += 2;
+
+ /* Remaining k bins: combine each bin with its mirrored conjugate. */
+ do
+ {
+ /*
+ function X = my_split_rfft(X, ifftFlag)
+ % X is a series of real numbers
+ L = length(X);
+ XC = X(1:2:end) +i*X(2:2:end);
+ XA = fft(XC);
+ XB = conj(XA([1 end:-1:2]));
+ TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
+ for l = 2:L/2
+ XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
+ end
+ XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
+ X = XA;
+ */
+
+ xBI = pB[1];
+ xBR = pB[0];
+ xAR = pA[0];
+ xAI = pA[1];
+
+ twR = *pCoeff++;
+ twI = *pCoeff++;
+
+ t1a = xBR - xAR ;
+ t1b = xBI + xAI ;
+
+ // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+ // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+ p0 = twR * t1a;
+ p1 = twI * t1a;
+ p2 = twR * t1b;
+ p3 = twI * t1b;
+
+ *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR
+ *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI
+
+
+ pA += 2;
+ pB -= 2;
+ k--;
+ } while (k > 0);
+}
+
+/* Prepares data for inverse cfft */
+/* Scalar pre-processing stage of the inverse RFFT (f32): expands the packed
+   real spectrum in p into the N/2-point complex input for the inverse CFFT. */
+void merge_rfft_f32(
+ const arm_rfft_fast_instance_f32 * S,
+ float32_t * p,
+ float32_t * pOut)
+{
+ int32_t k; /* Loop Counter */
+ float32_t twR, twI; /* RFFT Twiddle coefficients */
+ const float32_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
+ float32_t *pA = p; /* increasing pointer */
+ float32_t *pB = p; /* decreasing pointer */
+ float32_t xAR, xAI, xBR, xBI; /* temporary variables */
+ float32_t t1a, t1b, r, s, t, u; /* temporary variables */
+
+ k = (S->Sint).fftLen - 1;
+
+ /* First input pair packs the two real bins X[0] and X[N/2]. */
+ xAR = pA[0];
+ xAI = pA[1];
+
+ pCoeff += 2 ;
+
+ *pOut++ = 0.5f * ( xAR + xAI );
+ *pOut++ = 0.5f * ( xAR - xAI );
+
+ pB = p + 2*k ;
+ pA += 2 ;
+
+ while (k > 0)
+ {
+ /* G is half of the frequency complex spectrum */
+ //for k = 2:N
+ // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
+ xBI = pB[1] ;
+ xBR = pB[0] ;
+ xAR = pA[0];
+ xAI = pA[1];
+
+ twR = *pCoeff++;
+ twI = *pCoeff++;
+
+ t1a = xAR - xBR ;
+ t1b = xAI + xBI ;
+
+ r = twR * t1a;
+ s = twI * t1b;
+ t = twI * t1a;
+ u = twR * t1b;
+
+ // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
+ // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
+ *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR
+ *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI
+
+ pA += 2;
+ pB -= 2;
+ k--;
+ }
+
+}
+
+#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
+
+/**
+ @ingroup groupTransforms
+*/
+
+/**
+ @defgroup RealFFT Real FFT Functions
+
+ @par
+ The CMSIS DSP library includes specialized algorithms for computing the
+ FFT of real data sequences. The FFT is defined over complex data but
+ in many applications the input is real. Real FFT algorithms take advantage
+ of the symmetry properties of the FFT and have a speed advantage over complex
+ algorithms of the same length.
+ @par
+ The Fast RFFT algorithm relies on the mixed radix CFFT, which saves processor usage.
+ @par
+ The real length N forward FFT of a sequence is computed using the steps shown below.
+ @par
+ \image html RFFT.gif "Real Fast Fourier Transform"
+ @par
+ The real sequence is initially treated as if it were complex to perform a CFFT.
+ Later, a processing stage reshapes the data to obtain half of the frequency spectrum
+ in complex format. Except the first complex number that contains the two real numbers
+ X[0] and X[N/2] all the data is complex. In other words, the first complex sample
+ contains two real values packed.
+ @par
+ The input for the inverse RFFT should keep the same format as the output of the
+ forward RFFT. A first processing stage pre-processes the data to later perform an
+ inverse CFFT.
+ @par
+ \image html RIFFT.gif "Real Inverse Fast Fourier Transform"
+ @par
+ The algorithms for floating-point, Q15, and Q31 data are slightly different
+ and we describe each algorithm in turn.
+ @par Floating-point
+ The main functions are \ref arm_rfft_fast_f32() and \ref arm_rfft_fast_init_f32().
+ The older functions \ref arm_rfft_f32() and \ref arm_rfft_init_f32() have been deprecated
+ but are still documented.
+ @par
+ The FFT of a real N-point sequence has even symmetry in the frequency domain.
+ The second half of the data equals the conjugate of the first half flipped in frequency.
+ Looking at the data, we see that we can uniquely represent the FFT using only N/2 complex numbers.
+ These are packed into the output array in alternating real and imaginary components:
+ @par
+ X = { real[0], imag[0], real[1], imag[1], real[2], imag[2] ...
+ real[(N/2)-1], imag[(N/2)-1] }
+ @par
+ It happens that the first complex number (real[0], imag[0]) is actually
+ all real. real[0] represents the DC offset, and imag[0] should be 0.
+ (real[1], imag[1]) is the fundamental frequency, (real[2], imag[2]) is
+ the first harmonic and so on.
+ @par
+ The real FFT functions pack the frequency domain data in this fashion.
+ The forward transform outputs the data in this form and the inverse
+ transform expects input data in this form. The function always performs
+ the needed bitreversal so that the input and output data is always in
+ normal order. The functions support lengths of [32, 64, 128, ..., 4096]
+ samples.
+ @par Q15 and Q31
+ The real algorithms are defined in a similar manner and utilize N/2 complex
+ transforms behind the scenes.
+ @par
+ The complex transforms used internally include scaling to prevent fixed-point
+ overflows. The overall scaling equals 1/(fftLen/2).
+ Due to the use of complex transform internally, the source buffer is
+ modified by the rfft.
+ @par
+ A separate instance structure must be defined for each transform used but
+ twiddle factor and bit reversal tables can be reused.
+ @par
+ There is also an associated initialization function for each data type.
+ The initialization function performs the following operations:
+ - Sets the values of the internal structure fields.
+ - Initializes twiddle factor table and bit reversal table pointers.
+ - Initializes the internal complex FFT data structure.
+ @par
+ Use of the initialization function is optional **except for MVE versions where it is mandatory**.
+ If you don't use the initialization functions, then the structures should be initialized with code
+ similar to the one below:
+
+ arm_rfft_instance_q31 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft};
+ arm_rfft_instance_q15 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft};
+
+ where fftLenReal is the length of the real transform;
+ fftLenBy2 length of the internal complex transform (fftLenReal/2).
+ ifftFlagR Selects forward (=0) or inverse (=1) transform.
+ bitReverseFlagR Selects bit reversed output (=0) or normal order
+ output (=1).
+ twidCoefRModifier stride modifier for the twiddle factor table.
+ The value is based on the FFT length;
+ pTwiddleAReal points to the A array of twiddle coefficients;
+ pTwiddleBReal points to the B array of twiddle coefficients;
+ pCfft points to the CFFT Instance structure. The CFFT structure
+ must also be initialized.
+@par
+ Note that with MVE versions you can't initialize instance structures directly and **must
+ use the initialization function**.
+ */
+
+/**
+ @addtogroup RealFFT
+ @{
+*/
+
+/**
+ @brief Processing function for the floating-point real FFT.
+ @param[in] S points to an arm_rfft_fast_instance_f32 structure
+ @param[in] p points to input buffer (Source buffer is modified by this function.)
+ @param[in] pOut points to output buffer
+ @param[in] ifftFlag
+ - value = 0: RFFT
+ - value = 1: RIFFT
+ @return none
+*/
+
+/* Forward/inverse real FFT dispatcher (f32): ifftFlag selects RIFFT (1) or
+   RFFT (0). The source buffer p is used as working storage and is modified. */
+void arm_rfft_fast_f32(
+ const arm_rfft_fast_instance_f32 * S,
+ float32_t * p,
+ float32_t * pOut,
+ uint8_t ifftFlag)
+{
+ const arm_cfft_instance_f32 * Sint = &(S->Sint);
+
+ /* Calculation of Real FFT */
+ if (ifftFlag)
+ {
+ /* Real FFT compression: expand the packed spectrum before the inverse CFFT */
+ merge_rfft_f32(S, p, pOut);
+ /* Complex radix-4 IFFT process (in place on pOut) */
+ arm_cfft_f32( Sint, pOut, ifftFlag, 1);
+ }
+ else
+ {
+ /* Calculation of RFFT of input: N/2-point complex FFT on the real data */
+ arm_cfft_f32( Sint, p, ifftFlag, 1);
+
+ /* Real FFT extraction: split CFFT result into the packed real spectrum */
+ stage_rfft_f32(S, p, pOut);
+ }
+}
+
+/**
+* @} end of RealFFT group
+*/
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f64.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f64.c
new file mode 100644
index 0000000000..85ee5ad311
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_f64.c
@@ -0,0 +1,228 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rfft_fast_f64.c
+ * Description: RFFT & RIFFT Double precision Floating point process function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+/* Post-processing stage of the forward RFFT (f64): converts the N/2-point
+   complex CFFT output in p into the packed real-FFT spectrum in pOut.
+   The first output pair packs the two real bins X[0] and X[N/2]. */
+void stage_rfft_f64(
+ const arm_rfft_fast_instance_f64 * S,
+ float64_t * p,
+ float64_t * pOut)
+{
+ uint32_t k; /* Loop Counter */
+ float64_t twR, twI; /* RFFT Twiddle coefficients */
+ const float64_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
+ float64_t *pA = p; /* increasing pointer */
+ float64_t *pB = p; /* decreasing pointer */
+ float64_t xAR, xAI, xBR, xBI; /* temporary variables */
+ float64_t t1a, t1b; /* temporary variables */
+ float64_t p0, p1, p2, p3; /* temporary variables */
+
+
+ k = (S->Sint).fftLen - 1;
+
+ /* Pack first and last sample of the frequency domain together */
+
+ xBR = pB[0];
+ xBI = pB[1];
+ xAR = pA[0];
+ xAI = pA[1];
+
+ twR = *pCoeff++ ;
+ twI = *pCoeff++ ;
+
+ // U1 = XA(1) + XB(1); % It is real
+ t1a = xBR + xAR ;
+
+ // U2 = XB(1) - XA(1); % It is imaginary
+ t1b = xBI + xAI ;
+
+ // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+ // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+ *pOut++ = 0.5 * ( t1a + t1b );
+ *pOut++ = 0.5 * ( t1a - t1b );
+
+ // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) ));
+ pB = p + 2*k;
+ pA += 2;
+
+ /* Remaining k bins: combine each bin with its mirrored conjugate. */
+ do
+ {
+ /*
+ function X = my_split_rfft(X, ifftFlag)
+ % X is a series of real numbers
+ L = length(X);
+ XC = X(1:2:end) +i*X(2:2:end);
+ XA = fft(XC);
+ XB = conj(XA([1 end:-1:2]));
+ TW = i*exp(-2*pi*i*[0:L/2-1]/L).';
+ for l = 2:L/2
+ XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l)));
+ end
+ XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1))));
+ X = XA;
+ */
+
+ xBI = pB[1];
+ xBR = pB[0];
+ xAR = pA[0];
+ xAI = pA[1];
+
+ twR = *pCoeff++;
+ twI = *pCoeff++;
+
+ t1a = xBR - xAR ;
+ t1b = xBI + xAI ;
+
+ // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI);
+ // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI);
+ p0 = twR * t1a;
+ p1 = twI * t1a;
+ p2 = twR * t1b;
+ p3 = twI * t1b;
+
+ *pOut++ = 0.5 * (xAR + xBR + p0 + p3 ); //xAR
+ *pOut++ = 0.5 * (xAI - xBI + p1 - p2 ); //xAI
+
+ pA += 2;
+ pB -= 2;
+ k--;
+ } while (k > 0U);
+}
+
+/* Prepares data for inverse cfft */
+/* Pre-processing stage of the inverse RFFT (f64): expands the packed real
+   spectrum in p into the N/2-point complex input for the inverse CFFT. */
+void merge_rfft_f64(
+ const arm_rfft_fast_instance_f64 * S,
+ float64_t * p,
+ float64_t * pOut)
+{
+ uint32_t k; /* Loop Counter */
+ float64_t twR, twI; /* RFFT Twiddle coefficients */
+ const float64_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */
+ float64_t *pA = p; /* increasing pointer */
+ float64_t *pB = p; /* decreasing pointer */
+ float64_t xAR, xAI, xBR, xBI; /* temporary variables */
+ float64_t t1a, t1b, r, s, t, u; /* temporary variables */
+
+ k = (S->Sint).fftLen - 1;
+
+ /* First input pair packs the two real bins X[0] and X[N/2]. */
+ xAR = pA[0];
+ xAI = pA[1];
+
+ pCoeff += 2 ;
+
+ *pOut++ = 0.5 * ( xAR + xAI );
+ *pOut++ = 0.5 * ( xAR - xAI );
+
+ pB = p + 2*k ;
+ pA += 2 ;
+
+ while (k > 0U)
+ {
+ /* G is half of the frequency complex spectrum */
+ //for k = 2:N
+ // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2))));
+ xBI = pB[1] ;
+ xBR = pB[0] ;
+ xAR = pA[0];
+ xAI = pA[1];
+
+ twR = *pCoeff++;
+ twI = *pCoeff++;
+
+ t1a = xAR - xBR ;
+ t1b = xAI + xBI ;
+
+ r = twR * t1a;
+ s = twI * t1b;
+ t = twI * t1a;
+ u = twR * t1b;
+
+ // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI);
+ // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI);
+ *pOut++ = 0.5 * (xAR + xBR - r - s ); //xAR
+ *pOut++ = 0.5 * (xAI - xBI + t - u ); //xAI
+
+ pA += 2;
+ pB -= 2;
+ k--;
+ }
+
+}
+
+/**
+ @ingroup groupTransforms
+*/
+
+
+/**
+ @addtogroup RealFFT
+ @{
+*/
+
+/**
+ @brief Processing function for the Double Precision floating-point real FFT.
+ @param[in] S points to an arm_rfft_fast_instance_f64 structure
+ @param[in] p points to input buffer (Source buffer is modified by this function.)
+ @param[in] pOut points to output buffer
+ @param[in] ifftFlag
+ - value = 0: RFFT
+ - value = 1: RIFFT
+ @return none
+*/
+
+/* Forward/inverse real FFT dispatcher (f64). Unlike the f16/f32 variants,
+   S is non-const: the internal CFFT length is (re)derived from fftLenRFFT
+   on every call, so the instance is written. p is modified as working storage. */
+void arm_rfft_fast_f64(
+ arm_rfft_fast_instance_f64 * S,
+ float64_t * p,
+ float64_t * pOut,
+ uint8_t ifftFlag)
+{
+ arm_cfft_instance_f64 * Sint = &(S->Sint);
+ /* Internal complex transform operates on half the real length. */
+ Sint->fftLen = S->fftLenRFFT / 2;
+
+ /* Calculation of Real FFT */
+ if (ifftFlag)
+ {
+ /* Real FFT compression: expand the packed spectrum before the inverse CFFT */
+ merge_rfft_f64(S, p, pOut);
+
+ /* Complex radix-4 IFFT process (in place on pOut) */
+ arm_cfft_f64( Sint, pOut, ifftFlag, 1);
+ }
+ else
+ {
+ /* Calculation of RFFT of input: N/2-point complex FFT on the real data */
+ arm_cfft_f64( Sint, p, ifftFlag, 1);
+
+ /* Real FFT extraction: split CFFT result into the packed real spectrum */
+ stage_rfft_f64(S, p, pOut);
+ }
+}
+
+/**
+* @} end of RealFFT group
+*/
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f16.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f16.c
new file mode 100644
index 0000000000..f15d05e712
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f16.c
@@ -0,0 +1,357 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rfft_fast_init_f16.c
+ * Description: Split Radix Decimation in Frequency CFFT Floating point processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions_f16.h"
+#include "arm_common_tables_f16.h"
+#include "arm_const_structs_f16.h"
+
+#if defined(ARM_FLOAT16_SUPPORTED)
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup RealFFT
+ @{
+ */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_32))
+
+/**
+ @private
+ @brief Initialization function for the 32pt floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f16 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_32_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ /* A real FFT of length 32 uses an internal complex FFT of length 16. */
+ status=arm_cfft_init_f16(&(S->Sint),16);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+
+ S->fftLenRFFT = 32U;
+ S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_32;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_64))
+
+/**
+ @private
+ @brief Initialization function for the 64pt floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f16 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_64_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ /* A real FFT of length 64 uses an internal complex FFT of length 32. */
+ status=arm_cfft_init_f16(&(S->Sint),32);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ S->fftLenRFFT = 64U;
+
+ S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_64;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_128))
+
+/**
+ @private
+ @brief Initialization function for the 128pt floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f16 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_128_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ /* A real FFT of length 128 uses an internal complex FFT of length 64. */
+ status=arm_cfft_init_f16(&(S->Sint),64);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ S->fftLenRFFT = 128;
+
+ S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_128;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_256))
+
+/**
+ @private
+ @brief Initialization function for the 256pt floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f16 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+*/
+
+static arm_status arm_rfft_256_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ /* A real FFT of length 256 uses an internal complex FFT of length 128. */
+ status=arm_cfft_init_f16(&(S->Sint),128);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ S->fftLenRFFT = 256U;
+
+ S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_256;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_512))
+
+/**
+ @private
+ @brief Initialization function for the 512pt floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f16 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_512_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ /* A real FFT of length 512 uses an internal complex FFT of length 256. */
+ status=arm_cfft_init_f16(&(S->Sint),256);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ S->fftLenRFFT = 512U;
+
+ S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_512;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_1024))
+/**
+ @private
+ @brief Initialization function for the 1024pt floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f16 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_1024_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ /* A real FFT of length 1024 uses an internal complex FFT of length 512. */
+ status=arm_cfft_init_f16(&(S->Sint),512);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ S->fftLenRFFT = 1024U;
+
+ S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_1024;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_2048))
+/**
+ @private
+ @brief Initialization function for the 2048pt floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f16 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+static arm_status arm_rfft_2048_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ /* A real FFT of length 2048 uses an internal complex FFT of length 1024. */
+ status=arm_cfft_init_f16(&(S->Sint),1024);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ S->fftLenRFFT = 2048U;
+
+ S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_2048;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_4096))
+/**
+ @private
+* @brief Initialization function for the 4096pt floating-point real FFT.
+* @param[in,out] S points to an arm_rfft_fast_instance_f16 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_4096_fast_init_f16( arm_rfft_fast_instance_f16 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ /* A real FFT of length 4096 uses an internal complex FFT of length 2048. */
+ status=arm_cfft_init_f16(&(S->Sint),2048);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ S->fftLenRFFT = 4096U;
+
+ S->pTwiddleRFFT = (float16_t *) twiddleCoefF16_rfft_4096;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+/**
+ @brief Initialization function for the floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f16 structure
+ @param[in] fftLen length of the Real Sequence
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Description
+ The parameter fftLen specifies the length of RFFT/CIFFT process.
+ Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+ */
+
+arm_status arm_rfft_fast_init_f16(
+ arm_rfft_fast_instance_f16 * S,
+ uint16_t fftLen)
+{
+ typedef arm_status(*fft_init_ptr)( arm_rfft_fast_instance_f16 *);
+ /* Length-specific initializer, selected below; stays NULL when the
+    requested length's tables were not compiled in. */
+ fft_init_ptr fptr = 0x0;
+
+ switch (fftLen)
+ {
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_4096))
+ case 4096U:
+ fptr = arm_rfft_4096_fast_init_f16;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_2048))
+ case 2048U:
+ fptr = arm_rfft_2048_fast_init_f16;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_1024))
+ case 1024U:
+ fptr = arm_rfft_1024_fast_init_f16;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_512))
+ case 512U:
+ fptr = arm_rfft_512_fast_init_f16;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_256))
+ case 256U:
+ fptr = arm_rfft_256_fast_init_f16;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_128))
+ case 128U:
+ fptr = arm_rfft_128_fast_init_f16;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_64))
+ case 64U:
+ fptr = arm_rfft_64_fast_init_f16;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F16_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F16_32))
+ case 32U:
+ fptr = arm_rfft_32_fast_init_f16;
+ break;
+#endif
+ default:
+ /* Unsupported length (not a power of two in [32, 4096]). */
+ return ARM_MATH_ARGUMENT_ERROR;
+ }
+
+ /* Defensive: also fails when a case above was compiled out. */
+ if( ! fptr ) return ARM_MATH_ARGUMENT_ERROR;
+ return fptr( S );
+
+}
+
+/**
+ @} end of RealFFT group
+ */
+
+#endif /* #if defined(ARM_FLOAT16_SUPPORTED) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f32.c
new file mode 100644
index 0000000000..03443daeaa
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f32.c
@@ -0,0 +1,352 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rfft_fast_init_f32.c
+ * Description: Split Radix Decimation in Frequency CFFT Floating point processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup RealFFT
+ @{
+ */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
+
+/**
+ @private
+ @brief Initialization function for the 32pt floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f32 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_32_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ status=arm_cfft_init_f32(&(S->Sint),16);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+
+ S->fftLenRFFT = 32U;
+ S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_32;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
+
+/**
+ @private
+ @brief Initialization function for the 64pt floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f32 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_64_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ status=arm_cfft_init_f32(&(S->Sint),32);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ S->fftLenRFFT = 64U;
+
+ S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_64;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
+
+/**
+ @private
+ @brief Initialization function for the 128pt floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f32 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_128_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ status=arm_cfft_init_f32(&(S->Sint),64);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ S->fftLenRFFT = 128;
+
+ S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_128;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
+
+/**
+ @private
+ @brief Initialization function for the 256pt floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f32 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+*/
+
+static arm_status arm_rfft_256_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ status=arm_cfft_init_f32(&(S->Sint),128);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ S->fftLenRFFT = 256U;
+
+ S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_256;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
+
+/**
+ @private
+ @brief Initialization function for the 512pt floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f32 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_512_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ status=arm_cfft_init_f32(&(S->Sint),256);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ S->fftLenRFFT = 512U;
+
+ S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_512;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
+/**
+ @private
+ @brief Initialization function for the 1024pt floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f32 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_1024_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ status=arm_cfft_init_f32(&(S->Sint),512);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ S->fftLenRFFT = 1024U;
+
+ S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_1024;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
+/**
+ @private
+ @brief Initialization function for the 2048pt floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f32 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+static arm_status arm_rfft_2048_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ status=arm_cfft_init_f32(&(S->Sint),1024);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ S->fftLenRFFT = 2048U;
+
+ S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_2048;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
+/**
+ @private
+ @brief Initialization function for the 4096pt floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f32 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_4096_fast_init_f32( arm_rfft_fast_instance_f32 * S ) {
+
+ arm_status status;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ status=arm_cfft_init_f32(&(S->Sint),2048);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ S->fftLenRFFT = 4096U;
+
+ S->pTwiddleRFFT = (float32_t *) twiddleCoef_rfft_4096;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+/**
+ @brief Initialization function for the floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f32 structure
+ @param[in] fftLen length of the Real Sequence
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Description
+ The parameter fftLen specifies the length of RFFT/CIFFT process.
+ Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+ */
+
+arm_status arm_rfft_fast_init_f32(
+ arm_rfft_fast_instance_f32 * S,
+ uint16_t fftLen)
+{
+ typedef arm_status(*fft_init_ptr)( arm_rfft_fast_instance_f32 *);
+ fft_init_ptr fptr = 0x0;
+
+ switch (fftLen)
+ {
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_2048) && defined(ARM_TABLE_BITREVIDX_FLT_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096))
+ case 4096U:
+ fptr = arm_rfft_4096_fast_init_f32;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_1024) && defined(ARM_TABLE_BITREVIDX_FLT_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048))
+ case 2048U:
+ fptr = arm_rfft_2048_fast_init_f32;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_512) && defined(ARM_TABLE_BITREVIDX_FLT_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024))
+ case 1024U:
+ fptr = arm_rfft_1024_fast_init_f32;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_256) && defined(ARM_TABLE_BITREVIDX_FLT_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_512))
+ case 512U:
+ fptr = arm_rfft_512_fast_init_f32;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_128) && defined(ARM_TABLE_BITREVIDX_FLT_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_256))
+ case 256U:
+ fptr = arm_rfft_256_fast_init_f32;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_64) && defined(ARM_TABLE_BITREVIDX_FLT_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_128))
+ case 128U:
+ fptr = arm_rfft_128_fast_init_f32;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_32) && defined(ARM_TABLE_BITREVIDX_FLT_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_64))
+ case 64U:
+ fptr = arm_rfft_64_fast_init_f32;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F32_16) && defined(ARM_TABLE_BITREVIDX_FLT_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F32_32))
+ case 32U:
+ fptr = arm_rfft_32_fast_init_f32;
+ break;
+#endif
+ default:
+ return ARM_MATH_ARGUMENT_ERROR;
+ }
+
+ if( ! fptr ) return ARM_MATH_ARGUMENT_ERROR;
+ return fptr( S );
+
+}
+
+/**
+ @} end of RealFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f64.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f64.c
new file mode 100644
index 0000000000..97c4fb31e2
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_fast_init_f64.c
@@ -0,0 +1,344 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rfft_fast_init_f64.c
+ * Description: Split Radix Decimation in Frequency CFFT Double Precision Floating point processing function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+
+/**
+ @ingroup groupTransforms
+ */
+
+/**
+ @addtogroup RealFFT
+ @{
+ */
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_BITREVIDX_FLT64_16) && defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_32))
+
+/**
+ @brief Initialization function for the 32pt double precision floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f64 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_32_fast_init_f64( arm_rfft_fast_instance_f64 * S ) {
+
+ arm_cfft_instance_f64 * Sint;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ Sint = &(S->Sint);
+ Sint->fftLen = 16U;
+ S->fftLenRFFT = 32U;
+
+ Sint->bitRevLength = ARMBITREVINDEXTABLEF64_16_TABLE_LENGTH;
+ Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_16;
+ Sint->pTwiddle = (float64_t *) twiddleCoefF64_16;
+ S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_32;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_BITREVIDX_FLT64_32) && defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_64))
+
+/**
+ @brief Initialization function for the 64pt Double Precision floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f64 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_64_fast_init_f64( arm_rfft_fast_instance_f64 * S ) {
+
+ arm_cfft_instance_f64 * Sint;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ Sint = &(S->Sint);
+ Sint->fftLen = 32U;
+ S->fftLenRFFT = 64U;
+
+ Sint->bitRevLength = ARMBITREVINDEXTABLEF64_32_TABLE_LENGTH;
+ Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_32;
+ Sint->pTwiddle = (float64_t *) twiddleCoefF64_32;
+ S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_64;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_BITREVIDX_FLT64_64) && defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_128))
+
+/**
+ @brief Initialization function for the 128pt Double Precision floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f64 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_128_fast_init_f64( arm_rfft_fast_instance_f64 * S ) {
+
+ arm_cfft_instance_f64 * Sint;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ Sint = &(S->Sint);
+ Sint->fftLen = 64U;
+ S->fftLenRFFT = 128U;
+
+ Sint->bitRevLength = ARMBITREVINDEXTABLEF64_64_TABLE_LENGTH;
+ Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_64;
+ Sint->pTwiddle = (float64_t *) twiddleCoefF64_64;
+ S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_128;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_BITREVIDX_FLT64_128) && defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_256))
+
+/**
+ @brief Initialization function for the 256pt Double Precision floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f64 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+*/
+
+static arm_status arm_rfft_256_fast_init_f64( arm_rfft_fast_instance_f64 * S ) {
+
+ arm_cfft_instance_f64 * Sint;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ Sint = &(S->Sint);
+ Sint->fftLen = 128U;
+ S->fftLenRFFT = 256U;
+
+ Sint->bitRevLength = ARMBITREVINDEXTABLEF64_128_TABLE_LENGTH;
+ Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_128;
+ Sint->pTwiddle = (float64_t *) twiddleCoefF64_128;
+ S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_256;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_BITREVIDX_FLT64_256) && defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_512))
+
+/**
+ @brief Initialization function for the 512pt Double Precision floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f64 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_512_fast_init_f64( arm_rfft_fast_instance_f64 * S ) {
+
+ arm_cfft_instance_f64 * Sint;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ Sint = &(S->Sint);
+ Sint->fftLen = 256U;
+ S->fftLenRFFT = 512U;
+
+ Sint->bitRevLength = ARMBITREVINDEXTABLEF64_256_TABLE_LENGTH;
+ Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_256;
+ Sint->pTwiddle = (float64_t *) twiddleCoefF64_256;
+ S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_512;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_BITREVIDX_FLT64_512) && defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_1024))
+/**
+ @brief Initialization function for the 1024pt Double Precision floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f64 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_1024_fast_init_f64( arm_rfft_fast_instance_f64 * S ) {
+
+ arm_cfft_instance_f64 * Sint;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ Sint = &(S->Sint);
+ Sint->fftLen = 512U;
+ S->fftLenRFFT = 1024U;
+
+ Sint->bitRevLength = ARMBITREVINDEXTABLEF64_512_TABLE_LENGTH;
+ Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_512;
+ Sint->pTwiddle = (float64_t *) twiddleCoefF64_512;
+ S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_1024;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_BITREVIDX_FLT64_1024) && defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_2048))
+/**
+ @brief Initialization function for the 2048pt Double Precision floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f64 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+static arm_status arm_rfft_2048_fast_init_f64( arm_rfft_fast_instance_f64 * S ) {
+
+ arm_cfft_instance_f64 * Sint;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ Sint = &(S->Sint);
+ Sint->fftLen = 1024U;
+ S->fftLenRFFT = 2048U;
+
+ Sint->bitRevLength = ARMBITREVINDEXTABLEF64_1024_TABLE_LENGTH;
+ Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_1024;
+ Sint->pTwiddle = (float64_t *) twiddleCoefF64_1024;
+ S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_2048;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_BITREVIDX_FLT64_2048) && defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_4096))
+/**
+ @brief Initialization function for the 4096pt Double Precision floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f64 structure
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : an error is detected
+ */
+
+static arm_status arm_rfft_4096_fast_init_f64( arm_rfft_fast_instance_f64 * S ) {
+
+ arm_cfft_instance_f64 * Sint;
+
+ if( !S ) return ARM_MATH_ARGUMENT_ERROR;
+
+ Sint = &(S->Sint);
+ Sint->fftLen = 2048U;
+ S->fftLenRFFT = 4096U;
+
+ Sint->bitRevLength = ARMBITREVINDEXTABLEF64_2048_TABLE_LENGTH;
+ Sint->pBitRevTable = (uint16_t *)armBitRevIndexTableF64_2048;
+ Sint->pTwiddle = (float64_t *) twiddleCoefF64_2048;
+ S->pTwiddleRFFT = (float64_t *) twiddleCoefF64_rfft_4096;
+
+ return ARM_MATH_SUCCESS;
+}
+#endif
+
+/**
+ @brief Initialization function for the Double Precision floating-point real FFT.
+ @param[in,out] S points to an arm_rfft_fast_instance_f64 structure
+ @param[in] fftLen length of the Real Sequence
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLen is not a supported length
+
+ @par Description
+ The parameter fftLen specifies the length of RFFT/CIFFT process.
+ Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096.
+ @par
+ This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+ */
+
+arm_status arm_rfft_fast_init_f64(
+ arm_rfft_fast_instance_f64 * S,
+ uint16_t fftLen)
+{
+ typedef arm_status(*fft_init_ptr)( arm_rfft_fast_instance_f64 *);
+ fft_init_ptr fptr = 0x0;
+
+ switch (fftLen)
+ {
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_BITREVIDX_FLT64_2048) && defined(ARM_TABLE_TWIDDLECOEF_F64_2048) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_4096))
+ case 4096U:
+ fptr = arm_rfft_4096_fast_init_f64;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_BITREVIDX_FLT64_1024) && defined(ARM_TABLE_TWIDDLECOEF_F64_1024) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_2048))
+ case 2048U:
+ fptr = arm_rfft_2048_fast_init_f64;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_BITREVIDX_FLT64_512) && defined(ARM_TABLE_TWIDDLECOEF_F64_512) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_1024))
+ case 1024U:
+ fptr = arm_rfft_1024_fast_init_f64;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_BITREVIDX_FLT64_256) && defined(ARM_TABLE_TWIDDLECOEF_F64_256) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_512))
+ case 512U:
+ fptr = arm_rfft_512_fast_init_f64;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_BITREVIDX_FLT64_128) && defined(ARM_TABLE_TWIDDLECOEF_F64_128) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_256))
+ case 256U:
+ fptr = arm_rfft_256_fast_init_f64;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_BITREVIDX_FLT64_64) && defined(ARM_TABLE_TWIDDLECOEF_F64_64) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_128))
+ case 128U:
+ fptr = arm_rfft_128_fast_init_f64;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_BITREVIDX_FLT64_32) && defined(ARM_TABLE_TWIDDLECOEF_F64_32) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_64))
+ case 64U:
+ fptr = arm_rfft_64_fast_init_f64;
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_BITREVIDX_FLT64_16) && defined(ARM_TABLE_TWIDDLECOEF_F64_16) && defined(ARM_TABLE_TWIDDLECOEF_RFFT_F64_32))
+ case 32U:
+ fptr = arm_rfft_32_fast_init_f64;
+ break;
+#endif
+ default:
+ return ARM_MATH_ARGUMENT_ERROR;
+ }
+
+ if( ! fptr ) return ARM_MATH_ARGUMENT_ERROR;
+ return fptr( S );
+
+}
+
+/**
+ @} end of RealFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_f32.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_f32.c
new file mode 100644
index 0000000000..0a32da6617
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_f32.c
@@ -0,0 +1,147 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rfft_init_f32.c
+ * Description: RFFT & RIFFT Floating point initialisation function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+
+
+/**
+ @addtogroup RealFFT
+ @{
+ */
+
+/**
+ @brief Initialization function for the floating-point RFFT/RIFFT.
+ @deprecated Do not use this function. It has been superseded by \ref arm_rfft_fast_init_f32 and will be removed in the future.
+ @param[in,out] S points to an instance of the floating-point RFFT/RIFFT structure
+ @param[in,out] S_CFFT points to an instance of the floating-point CFFT/CIFFT structure
+ @param[in] fftLenReal length of the FFT.
+ @param[in] ifftFlagR flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLenReal is not a supported length
+
+ @par Description
+ The parameter fftLenReal specifies the length of the RFFT/RIFFT process.
+ Supported FFT Lengths are 128, 512, 2048.
+ @par
+ The parameter ifftFlagR controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlagR to calculate RIFFT, otherwise RFFT is calculated.
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+ @par
+ This function also initializes Twiddle factor table.
+ */
+
+arm_status arm_rfft_init_f32(
+ arm_rfft_instance_f32 * S,
+ arm_cfft_radix4_instance_f32 * S_CFFT,
+ uint32_t fftLenReal,
+ uint32_t ifftFlagR,
+ uint32_t bitReverseFlag)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_F32)
+
+ /* Required coefficient tables are available: mark status as success */
+ status = ARM_MATH_SUCCESS;
+
+ /* Initialize the Real FFT length */
+ S->fftLenReal = (uint16_t) fftLenReal;
+
+ /* Initialize the Complex FFT length */
+ S->fftLenBy2 = (uint16_t) fftLenReal / 2U;
+
+ /* Initialize the Twiddle coefficientA pointer */
+ S->pTwiddleAReal = (float32_t *) realCoefA;
+
+ /* Initialize the Twiddle coefficientB pointer */
+ S->pTwiddleBReal = (float32_t *) realCoefB;
+
+ /* Initialize the Flag for selection of RFFT or RIFFT */
+ S->ifftFlagR = (uint8_t) ifftFlagR;
+
+ /* Initialize the Flag for calculation Bit reversal or not */
+ S->bitReverseFlagR = (uint8_t) bitReverseFlag;
+
+ /* Initializations of structure parameters depending on the FFT length */
+ switch (S->fftLenReal)
+ {
+ /* Init table modifier value */
+ case 8192U:
+ S->twidCoefRModifier = 1U;
+ break;
+ case 2048U:
+ S->twidCoefRModifier = 4U;
+ break;
+ case 512U:
+ S->twidCoefRModifier = 16U;
+ break;
+ case 128U:
+ S->twidCoefRModifier = 64U;
+ break;
+ default:
+ /* Reporting argument error if rfftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+ /* Init Complex FFT Instance */
+ S->pCfft = S_CFFT;
+
+ if (S->ifftFlagR)
+ {
+ /* Initializes the CIFFT Module for fftLenreal/2 length */
+ arm_cfft_radix4_init_f32(S->pCfft, S->fftLenBy2, 1U, 0U);
+ }
+ else
+ {
+ /* Initializes the CFFT Module for fftLenreal/2 length */
+ arm_cfft_radix4_init_f32(S->pCfft, S->fftLenBy2, 0U, 0U);
+ }
+
+#endif
+#endif
+ /* return the status of RFFT Init function */
+ return (status);
+
+}
+
+/**
+ @} end of RealFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c
new file mode 100644
index 0000000000..2aff274589
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q15.c
@@ -0,0 +1,248 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rfft_init_q15.c
+ * Description: RFFT & RIFFT Q15 initialisation function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+#include "arm_const_structs.h"
+
+/**
+ @addtogroup RealFFT
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q15 RFFT/RIFFT.
+ @param[in,out] S points to an instance of the Q15 RFFT/RIFFT structure
+ @param[in] fftLenReal length of the FFT
+ @param[in] ifftFlagR flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLenReal is not a supported length
+
+ @par Details
+ The parameter fftLenReal specifies length of RFFT/RIFFT Process.
+ Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192.
+ @par
+ The parameter ifftFlagR controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlagR to calculate RIFFT, otherwise RFFT is calculated.
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set bitReverseFlag(=1) to enable the bit-reversal stage so that the output is produced in normal order; otherwise the output is left in bit-reversed order.
+ @par
+ This function also initializes Twiddle factor table.
+ */
+
+arm_status arm_rfft_init_q15(
+ arm_rfft_instance_q15 * S,
+ uint32_t fftLenReal,
+ uint32_t ifftFlagR,
+ uint32_t bitReverseFlag)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_Q15)
+
+ /* Initialise the default arm status */
+ status = ARM_MATH_SUCCESS;
+
+ /* Initialize the Real FFT length */
+ S->fftLenReal = (uint16_t) fftLenReal;
+
+ /* Initialize the Twiddle coefficientA pointer */
+ S->pTwiddleAReal = (q15_t *) realCoefAQ15;
+
+ /* Initialize the Twiddle coefficientB pointer */
+ S->pTwiddleBReal = (q15_t *) realCoefBQ15;
+
+ /* Initialize the Flag for selection of RFFT or RIFFT */
+ S->ifftFlagR = (uint8_t) ifftFlagR;
+
+ /* Initialize the Flag for calculation Bit reversal or not */
+ S->bitReverseFlagR = (uint8_t) bitReverseFlag;
+
+ /* Initialization of coef modifier depending on the FFT length */
+ switch (S->fftLenReal)
+ {
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
+ case 8192U:
+ S->twidCoefRModifier = 1U;
+
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q15(&(S->cfftInst),4096);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q15_len4096;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
+ case 4096U:
+ S->twidCoefRModifier = 2U;
+
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q15(&(S->cfftInst),2048);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q15_len2048;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
+ case 2048U:
+ S->twidCoefRModifier = 4U;
+
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q15(&(S->cfftInst),1024);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q15_len1024;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
+ case 1024U:
+ S->twidCoefRModifier = 8U;
+
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q15(&(S->cfftInst),512);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q15_len512;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
+ case 512U:
+ S->twidCoefRModifier = 16U;
+
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q15(&(S->cfftInst),256);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q15_len256;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
+ case 256U:
+ S->twidCoefRModifier = 32U;
+
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q15(&(S->cfftInst),128);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q15_len128;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
+ case 128U:
+ S->twidCoefRModifier = 64U;
+
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q15(&(S->cfftInst),64);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q15_len64;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
+ case 64U:
+ S->twidCoefRModifier = 128U;
+
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q15(&(S->cfftInst),32);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q15_len32;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q15_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
+ case 32U:
+ S->twidCoefRModifier = 256U;
+
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q15(&(S->cfftInst),16);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q15_len16;
+ #endif
+ break;
+#endif
+ default:
+ /* Reporting argument error if rfftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+#endif
+#endif
+ /* return the status of RFFT Init function */
+ return (status);
+}
+
+/**
+ @} end of RealFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q31.c
new file mode 100644
index 0000000000..a868121a0f
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_init_q31.c
@@ -0,0 +1,246 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rfft_init_q31.c
+ * Description: RFFT & RIFFT Q31 initialisation function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+#include "arm_common_tables.h"
+#include "arm_const_structs.h"
+
+
+
+/**
+ @addtogroup RealFFT
+ @{
+ */
+
+/**
+ @brief Initialization function for the Q31 RFFT/RIFFT.
+ @param[in,out] S points to an instance of the Q31 RFFT/RIFFT structure
+ @param[in] fftLenReal length of the FFT
+ @param[in] ifftFlagR flag that selects transform direction
+ - value = 0: forward transform
+ - value = 1: inverse transform
+ @param[in] bitReverseFlag flag that enables / disables bit reversal of output
+ - value = 0: disables bit reversal of output
+ - value = 1: enables bit reversal of output
+ @return execution status
+ - \ref ARM_MATH_SUCCESS : Operation successful
+ - \ref ARM_MATH_ARGUMENT_ERROR : fftLenReal is not a supported length
+
+ @par Details
+ The parameter fftLenReal specifies length of RFFT/RIFFT Process.
+ Supported FFT Lengths are 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192.
+ @par
+ The parameter ifftFlagR controls whether a forward or inverse transform is computed.
+ Set(=1) ifftFlagR to calculate RIFFT, otherwise RFFT is calculated.
+ @par
+ The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+ Set bitReverseFlag(=1) to enable the bit-reversal stage so that the output is produced in normal order; otherwise the output is left in bit-reversed order.
+ @par
+ This function also initializes Twiddle factor table.
+*/
+
+arm_status arm_rfft_init_q31(
+ arm_rfft_instance_q31 * S,
+ uint32_t fftLenReal,
+ uint32_t ifftFlagR,
+ uint32_t bitReverseFlag)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_ARGUMENT_ERROR;
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_FFT_ALLOW_TABLES)
+
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || defined(ARM_TABLE_REALCOEF_Q31)
+
+ /* Initialise the default arm status */
+ status = ARM_MATH_SUCCESS;
+
+ /* Initialize the Real FFT length */
+ S->fftLenReal = (uint16_t) fftLenReal;
+
+ /* Initialize the Twiddle coefficientA pointer */
+ S->pTwiddleAReal = (q31_t *) realCoefAQ31;
+
+ /* Initialize the Twiddle coefficientB pointer */
+ S->pTwiddleBReal = (q31_t *) realCoefBQ31;
+
+ /* Initialize the Flag for selection of RFFT or RIFFT */
+ S->ifftFlagR = (uint8_t) ifftFlagR;
+
+ /* Initialize the Flag for calculation Bit reversal or not */
+ S->bitReverseFlagR = (uint8_t) bitReverseFlag;
+
+ /* Initialization of coef modifier depending on the FFT length */
+ switch (S->fftLenReal)
+ {
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_4096) && defined(ARM_TABLE_BITREVIDX_FXT_4096))
+ case 8192U:
+
+
+ S->twidCoefRModifier = 1U;
+
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q31(&(S->cfftInst),4096);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q31_len4096;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_2048) && defined(ARM_TABLE_BITREVIDX_FXT_2048))
+ case 4096U:
+ S->twidCoefRModifier = 2U;
+
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q31(&(S->cfftInst),2048);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q31_len2048;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_1024) && defined(ARM_TABLE_BITREVIDX_FXT_1024))
+ case 2048U:
+ S->twidCoefRModifier = 4U;
+
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q31(&(S->cfftInst),1024);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q31_len1024;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_512) && defined(ARM_TABLE_BITREVIDX_FXT_512))
+ case 1024U:
+ S->twidCoefRModifier = 8U;
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q31(&(S->cfftInst),512);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q31_len512;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_256) && defined(ARM_TABLE_BITREVIDX_FXT_256))
+ case 512U:
+ S->twidCoefRModifier = 16U;
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q31(&(S->cfftInst),256);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q31_len256;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_128) && defined(ARM_TABLE_BITREVIDX_FXT_128))
+ case 256U:
+ S->twidCoefRModifier = 32U;
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q31(&(S->cfftInst),128);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q31_len128;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_64) && defined(ARM_TABLE_BITREVIDX_FXT_64))
+ case 128U:
+ S->twidCoefRModifier = 64U;
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q31(&(S->cfftInst),64);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q31_len64;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_32) && defined(ARM_TABLE_BITREVIDX_FXT_32))
+ case 64U:
+ S->twidCoefRModifier = 128U;
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q31(&(S->cfftInst),32);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q31_len32;
+ #endif
+ break;
+#endif
+#if !defined(ARM_DSP_CONFIG_TABLES) || defined(ARM_ALL_FFT_TABLES) || (defined(ARM_TABLE_TWIDDLECOEF_Q31_16) && defined(ARM_TABLE_BITREVIDX_FXT_16))
+ case 32U:
+ S->twidCoefRModifier = 256U;
+ #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ status=arm_cfft_init_q31(&(S->cfftInst),16);
+ if (status != ARM_MATH_SUCCESS)
+ {
+ return(status);
+ }
+ #else
+ S->pCfft = &arm_cfft_sR_q31_len16;
+ #endif
+ break;
+#endif
+ default:
+ /* Reporting argument error if rfftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+#endif
+#endif
+ /* return the status of RFFT Init function */
+ return (status);
+}
+
+/**
+ @} end of RealFFT group
+ */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c
new file mode 100644
index 0000000000..d321571d91
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q15.c
@@ -0,0 +1,526 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rfft_q15.c
+ * Description: RFFT & RIFFT Q15 process function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+/* ----------------------------------------------------------------------
+ * Internal functions prototypes
+ * -------------------------------------------------------------------- */
+
+void arm_split_rfft_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pATable,
+ const q15_t * pBTable,
+ q15_t * pDst,
+ uint32_t modifier);
+
+void arm_split_rifft_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pATable,
+ const q15_t * pBTable,
+ q15_t * pDst,
+ uint32_t modifier);
+
+/**
+ @addtogroup RealFFT
+ @{
+ */
+
+/**
+ @brief Processing function for the Q15 RFFT/RIFFT.
+ @param[in] S points to an instance of the Q15 RFFT/RIFFT structure
+ @param[in] pSrc points to input buffer (Source buffer is modified by this function.)
+ @param[out] pDst points to output buffer
+ @return none
+
+ @par Input and output formats
+ Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
+ Hence the output format is different for different RFFT sizes.
+ The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT:
+ @par
+ \image html RFFTQ15.gif "Input and Output Formats for Q15 RFFT"
+ @par
+ \image html RIFFTQ15.gif "Input and Output Formats for Q15 RIFFT"
+ @par
+ If the input buffer is of length N, the output buffer must have length 2*N.
+ The input buffer is modified by this function.
+ @par
+ For the RIFFT, the source buffer must at least have length
+ fftLenReal + 2.
+ The last two elements must be equal to what would be generated
+ by the RFFT:
+ (pSrc[0] - pSrc[1]) >> 1 and 0
+ */
+
+void arm_rfft_q15(
+ const arm_rfft_instance_q15 * S,
+ q15_t * pSrc,
+ q15_t * pDst)
+{
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ const arm_cfft_instance_q15 *S_CFFT = &(S->cfftInst);
+#else
+ const arm_cfft_instance_q15 *S_CFFT = S->pCfft;
+#endif
+ uint32_t L2 = S->fftLenReal >> 1U;
+
+ /* Calculation of RIFFT of input */
+ if (S->ifftFlagR == 1U)
+ {
+ /* Real IFFT core process */
+ arm_split_rifft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+
+ /* Complex IFFT process */
+ arm_cfft_q15 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
+
+ arm_shift_q15(pDst, 1, pDst, S->fftLenReal);
+ }
+ else
+ {
+ /* Calculation of RFFT of input */
+
+ /* Complex FFT process */
+ arm_cfft_q15 (S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);
+
+ /* Real FFT core process */
+ arm_split_rfft_q15 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+ }
+
+}
+
+/**
+ @} end of RealFFT group
+ */
+
+/**
+ @brief Core Real FFT process
+ @param[in] pSrc points to input buffer
+ @param[in] fftLen length of FFT
+ @param[in] pATable points to twiddle Coef A buffer
+ @param[in] pBTable points to twiddle Coef B buffer
+ @param[out] pDst points to output buffer
+ @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+ @return none
+
+ @par
+ The function implements a Real FFT
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_fft.h"
+
+#if defined(__CMSIS_GCC_H)
+#define MVE_CMPLX_MULT_FX_AxB_S16(A,B) vqdmladhxq_s16(vqdmlsdhq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B)
+#define MVE_CMPLX_MULT_FX_AxConjB_S16(A,B) vqdmladhq_s16(vqdmlsdhxq_s16((__typeof(A))vuninitializedq_s16(), A, B), A, B)
+
+#endif
+
+void arm_split_rfft_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pATable,
+ const q15_t * pBTable,
+ q15_t * pDst,
+ uint32_t modifier)
+{
+ uint32_t i; /* Loop Counter */
+ const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
+ q15_t *pOut1 = &pDst[2];
+ q15_t *pIn1 = &pSrc[2];
+ uint16x8_t offsetIn = { 6, 7, 4, 5, 2, 3, 0, 1 };
+ uint16x8_t offsetCoef;
+ const uint16_t offsetCoefArr[16] = {
+ 0, 0, 2, 2, 4, 4, 6, 6,
+ 0, 1, 0, 1, 0, 1, 0, 1
+ };
+
+ offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8);
+ offsetIn = vaddq_n_u16(offsetIn, (2 * fftLen - 8));
+
+ /* Init coefficient pointers */
+ pCoefA = &pATable[modifier * 2];
+ pCoefB = &pBTable[modifier * 2];
+
+ const q15_t *pCoefAb, *pCoefBb;
+ pCoefAb = pCoefA;
+ pCoefBb = pCoefB;
+
+ pIn1 = &pSrc[2];
+
+ i = fftLen - 1U;
+ i = i / 4 + 1;
+ while (i > 0U) {
+ q15x8_t in1 = vld1q_s16(pIn1);
+ q15x8_t in2 = vldrhq_gather_shifted_offset_s16(pSrc, offsetIn);
+ q15x8_t coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef);
+ q15x8_t coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef);
+
+#if defined(__CMSIS_GCC_H)
+ q15x8_t out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB_S16(in1, coefA),
+ MVE_CMPLX_MULT_FX_AxConjB_S16(coefB, in2));
+#else
+ q15x8_t out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxB(in1, coefA),
+ MVE_CMPLX_MULT_FX_AxConjB(coefB, in2));
+#endif
+ vst1q_s16(pOut1, out);
+ pOut1 += 8;
+
+ offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
+ offsetIn -= 8;
+ pIn1 += 8;
+ i -= 1;
+ }
+
+ pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
+ pDst[2 * fftLen + 1] = 0;
+
+ pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
+ pDst[1] = 0;
+}
+#else
+void arm_split_rfft_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pATable,
+ const q15_t * pBTable,
+ q15_t * pDst,
+ uint32_t modifier)
+{
+ uint32_t i; /* Loop Counter */
+ q31_t outR, outI; /* Temporary variables for output */
+ const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
+ q15_t *pSrc1, *pSrc2;
+#if defined (ARM_MATH_DSP)
+ q15_t *pD1, *pD2;
+#endif
+
+ /* Init coefficient pointers */
+ pCoefA = &pATable[modifier * 2];
+ pCoefB = &pBTable[modifier * 2];
+
+ pSrc1 = &pSrc[2];
+ pSrc2 = &pSrc[(2U * fftLen) - 2U];
+
+#if defined (ARM_MATH_DSP)
+
+ i = 1U;
+ pD1 = pDst + 2;
+ pD2 = pDst + (4U * fftLen) - 2;
+
+ for (i = fftLen - 1; i > 0; i--)
+ {
+ /*
+ outR = ( pSrc[2 * i] * pATable[2 * i]
+ - pSrc[2 * i + 1] * pATable[2 * i + 1]
+ + pSrc[2 * n - 2 * i] * pBTable[2 * i]
+ + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+
+ outI = ( pIn[2 * i + 1] * pATable[2 * i]
+ + pIn[2 * i] * pATable[2 * i + 1]
+ + pIn[2 * n - 2 * i] * pBTable[2 * i + 1]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i])
+ */
+
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* pSrc[2 * i] * pATable[2 * i] - pSrc[2 * i + 1] * pATable[2 * i + 1] */
+ outR = __SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA));
+#else
+ /* -(pSrc[2 * i + 1] * pATable[2 * i + 1] - pSrc[2 * i] * pATable[2 * i]) */
+ outR = -(__SMUSD(read_q15x2 (pSrc1), read_q15x2((q15_t *) pCoefA)));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* pSrc[2 * n - 2 * i] * pBTable[2 * i] + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
+ outR = __SMLAD(read_q15x2 (pSrc2), read_q15x2((q15_t *) pCoefB), outR) >> 16U;
+
+ /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
+#ifndef ARM_MATH_BIG_ENDIAN
+ outI = __SMUSDX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));
+#else
+ outI = __SMUSDX(read_q15x2 ((q15_t *) pCoefB), read_q15x2_da (&pSrc2));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* (pIn[2 * i + 1] * pATable[2 * i] + pIn[2 * i] * pATable[2 * i + 1] */
+ outI = __SMLADX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), outI);
+
+ /* write output */
+ *pD1++ = (q15_t) outR;
+ *pD1++ = outI >> 16U;
+
+ /* write complex conjugate output */
+ pD2[0] = (q15_t) outR;
+ pD2[1] = -(outI >> 16U);
+ pD2 -= 2;
+
+ /* update coefficient pointer */
+ pCoefB = pCoefB + (2U * modifier);
+ pCoefA = pCoefA + (2U * modifier);
+ }
+
+ pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
+ pDst[2U * fftLen + 1U] = 0;
+
+ pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
+ pDst[1] = 0;
+
+#else
+
+ i = 1U;
+
+ while (i < fftLen)
+ {
+ /*
+ outR = ( pSrc[2 * i] * pATable[2 * i]
+ - pSrc[2 * i + 1] * pATable[2 * i + 1]
+ + pSrc[2 * n - 2 * i] * pBTable[2 * i]
+ + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+ */
+
+ outR = *pSrc1 * *pCoefA;
+ outR = outR - (*(pSrc1 + 1) * *(pCoefA + 1));
+ outR = outR + (*pSrc2 * *pCoefB);
+ outR = (outR + (*(pSrc2 + 1) * *(pCoefB + 1))) >> 16;
+
+ /*
+ outI = ( pIn[2 * i + 1] * pATable[2 * i]
+ + pIn[2 * i] * pATable[2 * i + 1]
+ + pIn[2 * n - 2 * i] * pBTable[2 * i + 1]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
+ */
+
+ outI = *pSrc2 * *(pCoefB + 1);
+ outI = outI - (*(pSrc2 + 1) * *pCoefB);
+ outI = outI + (*(pSrc1 + 1) * *pCoefA);
+ outI = outI + (*pSrc1 * *(pCoefA + 1));
+
+ /* update input pointers */
+ pSrc1 += 2U;
+ pSrc2 -= 2U;
+
+ /* write output */
+ pDst[2U * i] = (q15_t) outR;
+ pDst[2U * i + 1U] = outI >> 16U;
+
+ /* write complex conjugate output */
+ pDst[(4U * fftLen) - (2U * i)] = (q15_t) outR;
+ pDst[((4U * fftLen) - (2U * i)) + 1U] = -(outI >> 16U);
+
+ /* update coefficient pointer */
+ pCoefB = pCoefB + (2U * modifier);
+ pCoefA = pCoefA + (2U * modifier);
+
+ i++;
+ }
+
+ pDst[2U * fftLen] = (pSrc[0] - pSrc[1]) >> 1;
+ pDst[2U * fftLen + 1U] = 0;
+
+ pDst[0] = (pSrc[0] + pSrc[1]) >> 1;
+ pDst[1] = 0;
+
+#endif /* #if defined (ARM_MATH_DSP) */
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @brief Core Real IFFT process
+ @param[in] pSrc points to input buffer
+ @param[in] fftLen length of FFT
+ @param[in] pATable points to twiddle Coef A buffer
+ @param[in] pBTable points to twiddle Coef B buffer
+ @param[out] pDst points to output buffer
+ @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+ @return none
+
+ @par
+ The function implements a Real IFFT
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_fft.h"
+
+void arm_split_rifft_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pATable,
+ const q15_t * pBTable,
+ q15_t * pDst,
+ uint32_t modifier)
+{
+ uint32_t i; /* Loop Counter */
+ const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
+ q15_t *pIn1;
+ uint16x8_t offset = { 6, 7, 4, 5, 2, 3, 0, 1 };
+ uint16x8_t offsetCoef;
+ int16x8_t conj = { 1, -1, 1, -1, 1, -1, 1, -1 }; /* conjugate */
+ const uint16_t offsetCoefArr[16] = {
+ 0, 0, 2, 2, 4, 4, 6, 6,
+ 0, 1, 0, 1, 0, 1, 0, 1
+ };
+
+ offsetCoef = vmulq_n_u16(vld1q_u16(offsetCoefArr), modifier) + vld1q_u16(offsetCoefArr + 8);
+
+ offset = vaddq_n_u16(offset, (2 * fftLen - 6));
+
+ /* Init coefficient pointers */
+ pCoefA = &pATable[0];
+ pCoefB = &pBTable[0];
+
+ const q15_t *pCoefAb, *pCoefBb;
+ pCoefAb = pCoefA;
+ pCoefBb = pCoefB;
+
+ pIn1 = &pSrc[0];
+
+ i = fftLen;
+ i = i / 4;
+
+ while (i > 0U) {
+ q15x8_t in1 = vld1q_s16(pIn1);
+ q15x8_t in2 = vldrhq_gather_shifted_offset_s16(pSrc, offset);
+ q15x8_t coefA = vldrhq_gather_shifted_offset_s16(pCoefAb, offsetCoef);
+ q15x8_t coefB = vldrhq_gather_shifted_offset_s16(pCoefBb, offsetCoef);
+
+ /* can we avoid the conjugate here ? */
+ q15x8_t out = vhaddq_s16(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA),
+ vmulq(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB)));
+
+ vst1q_s16(pDst, out);
+ pDst += 8;
+
+ offsetCoef = vaddq_n_u16(offsetCoef, modifier * 8);
+ offset -= 8;
+
+ pIn1 += 8;
+ i -= 1;
+ }
+}
+#else
+void arm_split_rifft_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ const q15_t * pATable,
+ const q15_t * pBTable,
+ q15_t * pDst,
+ uint32_t modifier)
+{
+ uint32_t i; /* Loop Counter */
+ q31_t outR, outI; /* Temporary variables for output */
+ const q15_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
+ q15_t *pSrc1, *pSrc2;
+ q15_t *pDst1 = &pDst[0];
+
+ pCoefA = &pATable[0];
+ pCoefB = &pBTable[0];
+
+ pSrc1 = &pSrc[0];
+ pSrc2 = &pSrc[2 * fftLen];
+
+ i = fftLen;
+ while (i > 0U)
+ {
+ /*
+ outR = ( pIn[2 * i] * pATable[2 * i]
+ + pIn[2 * i + 1] * pATable[2 * i + 1]
+ + pIn[2 * n - 2 * i] * pBTable[2 * i]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+
+ outI = ( pIn[2 * i + 1] * pATable[2 * i]
+ - pIn[2 * i] * pATable[2 * i + 1]
+ - pIn[2 * n - 2 * i] * pBTable[2 * i + 1]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
+ */
+
+#if defined (ARM_MATH_DSP)
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ /* pIn[2 * n - 2 * i] * pBTable[2 * i] - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]) */
+ outR = __SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB));
+#else
+ /* -(-pIn[2 * n - 2 * i] * pBTable[2 * i] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1])) */
+ outR = -(__SMUSD(read_q15x2(pSrc2), read_q15x2((q15_t *) pCoefB)));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* pIn[2 * i] * pATable[2 * i] + pIn[2 * i + 1] * pATable[2 * i + 1] + pIn[2 * n - 2 * i] * pBTable[2 * i] */
+ outR = __SMLAD(read_q15x2(pSrc1), read_q15x2 ((q15_t *) pCoefA), outR) >> 16U;
+
+ /* -pIn[2 * n - 2 * i] * pBTable[2 * i + 1] + pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
+ outI = __SMUADX(read_q15x2_da (&pSrc2), read_q15x2((q15_t *) pCoefB));
+
+ /* pIn[2 * i + 1] * pATable[2 * i] - pIn[2 * i] * pATable[2 * i + 1] */
+#ifndef ARM_MATH_BIG_ENDIAN
+ outI = __SMLSDX(read_q15x2 ((q15_t *) pCoefA), read_q15x2_ia (&pSrc1), -outI);
+#else
+ outI = __SMLSDX(read_q15x2_ia (&pSrc1), read_q15x2 ((q15_t *) pCoefA), -outI);
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* write output */
+#ifndef ARM_MATH_BIG_ENDIAN
+ write_q15x2_ia (&pDst1, __PKHBT(outR, (outI >> 16U), 16));
+#else
+ write_q15x2_ia (&pDst1, __PKHBT((outI >> 16U), outR, 16));
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+
+#else /* #if defined (ARM_MATH_DSP) */
+
+ outR = *pSrc2 * *pCoefB;
+ outR = outR - (*(pSrc2 + 1) * *(pCoefB + 1));
+ outR = outR + (*pSrc1 * *pCoefA);
+ outR = (outR + (*(pSrc1 + 1) * *(pCoefA + 1))) >> 16;
+
+ outI = *(pSrc1 + 1) * *pCoefA;
+ outI = outI - (*pSrc1 * *(pCoefA + 1));
+ outI = outI - (*pSrc2 * *(pCoefB + 1));
+ outI = outI - (*(pSrc2 + 1) * *(pCoefB));
+
+ /* update input pointers */
+ pSrc1 += 2U;
+ pSrc2 -= 2U;
+
+ /* write output */
+ *pDst1++ = (q15_t) outR;
+ *pDst1++ = (q15_t) (outI >> 16);
+
+#endif /* #if defined (ARM_MATH_DSP) */
+
+ /* update coefficient pointer */
+ pCoefB = pCoefB + (2 * modifier);
+ pCoefA = pCoefA + (2 * modifier);
+
+ i--;
+ }
+
+}
+#endif /* defined(ARM_MATH_MVEI) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q31.c b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q31.c
new file mode 100644
index 0000000000..f06c109122
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/TransformFunctions/arm_rfft_q31.c
@@ -0,0 +1,429 @@
+/* ----------------------------------------------------------------------
+ * Project: CMSIS DSP Library
+ * Title: arm_rfft_q31.c
+ * Description: FFT & RIFFT Q31 process function
+ *
+ * $Date: 23 April 2021
+ * $Revision: V1.9.0
+ *
+ * Target Processor: Cortex-M and Cortex-A cores
+ * -------------------------------------------------------------------- */
+/*
+ * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dsp/transform_functions.h"
+
+/* ----------------------------------------------------------------------
+ * Internal functions prototypes
+ * -------------------------------------------------------------------- */
+
+void arm_split_rfft_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pATable,
+ const q31_t * pBTable,
+ q31_t * pDst,
+ uint32_t modifier);
+
+void arm_split_rifft_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pATable,
+ const q31_t * pBTable,
+ q31_t * pDst,
+ uint32_t modifier);
+
+/**
+ @addtogroup RealFFT
+ @{
+ */
+
+/**
+ @brief Processing function for the Q31 RFFT/RIFFT.
+ @param[in] S points to an instance of the Q31 RFFT/RIFFT structure
+ @param[in] pSrc points to input buffer (Source buffer is modified by this function)
+ @param[out] pDst points to output buffer
+ @return none
+
+  @par           Input and output formats
+ Internally input is downscaled by 2 for every stage to avoid saturations inside CFFT/CIFFT process.
+ Hence the output format is different for different RFFT sizes.
+ The input and output formats for different RFFT sizes and number of bits to upscale are mentioned in the tables below for RFFT and RIFFT:
+ @par
+ \image html RFFTQ31.gif "Input and Output Formats for Q31 RFFT"
+ @par
+ \image html RIFFTQ31.gif "Input and Output Formats for Q31 RIFFT"
+ @par
+ If the input buffer is of length N, the output buffer must have length 2*N.
+ The input buffer is modified by this function.
+ @par
+ For the RIFFT, the source buffer must at least have length
+ fftLenReal + 2.
+ The last two elements must be equal to what would be generated
+ by the RFFT:
+ (pSrc[0] - pSrc[1]) >> 1 and 0
+
+ */
+
+void arm_rfft_q31(
+ const arm_rfft_instance_q31 * S,
+ q31_t * pSrc,
+ q31_t * pDst)
+{
+ /* With Helium (MVEI) builds and no forced auto-vectorization, the CFFT
+ instance is embedded in the RFFT instance; otherwise it is referenced
+ through the pCfft pointer. */
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+ const arm_cfft_instance_q31 *S_CFFT = &(S->cfftInst);
+#else
+ const arm_cfft_instance_q31 *S_CFFT = S->pCfft;
+#endif
+ /* The complex FFT/IFFT core operates on fftLenReal/2 complex samples */
+ uint32_t L2 = S->fftLenReal >> 1U;
+
+ /* Calculation of RIFFT of input */
+ if (S->ifftFlagR == 1U)
+ {
+ /* Real IFFT core process */
+ arm_split_rifft_q31 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+
+ /* Complex IFFT process */
+ arm_cfft_q31 (S_CFFT, pDst, S->ifftFlagR, S->bitReverseFlagR);
+
+ /* Rescale the IFFT result by one bit (shift amount = 1); presumably
+ compensates internal CFFT downscaling - see CMSIS-DSP docs above. */
+ arm_shift_q31(pDst, 1, pDst, S->fftLenReal);
+ }
+ else
+ {
+ /* Calculation of RFFT of input */
+
+ /* Complex FFT process */
+ arm_cfft_q31 (S_CFFT, pSrc, S->ifftFlagR, S->bitReverseFlagR);
+
+ /* Real FFT core process: split the complex spectrum into the real
+ spectrum written to pDst */
+ arm_split_rfft_q31 (pSrc, L2, S->pTwiddleAReal, S->pTwiddleBReal, pDst, S->twidCoefRModifier);
+ }
+
+}
+
+/**
+ @} end of RealFFT group
+ */
+
+/**
+ @brief Core Real FFT process
+ @param[in] pSrc points to input buffer
+ @param[in] fftLen length of FFT
+ @param[in] pATable points to twiddle Coef A buffer
+ @param[in] pBTable points to twiddle Coef B buffer
+ @param[out] pDst points to output buffer
+ @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+#include "arm_helium_utils.h"
+#include "arm_vec_fft.h"
+
+#if defined(__CMSIS_GCC_H)
+
+#define MVE_CMPLX_MULT_FX_AxB_S32(A,B) vqdmladhxq_s32(vqdmlsdhq_s32((__typeof(A))vuninitializedq_s32(), A, B), A, B)
+#define MVE_CMPLX_MULT_FX_AxConjB_S32(A,B) vqdmladhq_s32(vqdmlsdhxq_s32((__typeof(A))vuninitializedq_s32(), A, B), A, B)
+
+#endif
+
+void arm_split_rfft_q31(
+ q31_t *pSrc,
+ uint32_t fftLen,
+ const q31_t *pATable,
+ const q31_t *pBTable,
+ q31_t *pDst,
+ uint32_t modifier)
+{
+ uint32_t i; /* Loop Counter */
+ const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
+ q31_t *pOut1 = &pDst[2];
+ q31_t *pIn1 = &pSrc[2];
+ /* Gather pattern {2, 3, 0, 1}: loads two complex samples with their order
+ swapped inside the vector, so the mirrored (end-of-spectrum) operands
+ line up lane-for-lane with the forward operands in in1. */
+ uint32x4_t offset = { 2, 3, 0, 1 };
+ /* Coefficient gather: lanes 0/1 take one complex twiddle, lanes 2/3 the
+ twiddle `modifier` entries further on (two butterflies per iteration). */
+ uint32x4_t offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 };
+
+ /* Start the mirrored gather near the end of the complex spectrum; the
+ offsets are stepped down by 4 each iteration. */
+ offset = offset + (2 * fftLen - 4);
+
+
+ /* Init coefficient pointers */
+ pCoefA = &pATable[modifier * 2];
+ pCoefB = &pBTable[modifier * 2];
+
+ const q31_t *pCoefAb, *pCoefBb;
+ pCoefAb = pCoefA;
+ pCoefBb = pCoefB;
+
+ pIn1 = &pSrc[2];
+
+ /* fftLen - 1 output bins, two per vector iteration (rounded up) */
+ i = fftLen - 1U;
+ i = i / 2 + 1;
+ while (i > 0U) {
+ q31x4_t in1 = vld1q_s32(pIn1);
+ q31x4_t in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset);
+ q31x4_t coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef);
+ q31x4_t coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef);
+ /* out = (in1 * coefA + coefB * conj(in2)) / 2 via halving add */
+#if defined(__CMSIS_GCC_H)
+ q31x4_t out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxB_S32(in1, coefA),MVE_CMPLX_MULT_FX_AxConjB_S32(coefB, in2));
+#else
+ q31x4_t out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxB(in1, coefA),MVE_CMPLX_MULT_FX_AxConjB(coefB, in2));
+#endif
+ vst1q(pOut1, out);
+ pOut1 += 4;
+
+ offsetCoef += modifier * 4;
+ offset -= 4;
+
+ pIn1 += 4;
+ i -= 1;
+ }
+
+ /* Nyquist bin: real-valued, (Re{X[0]} - Im{X[0]}) / 2 */
+ pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
+ pDst[2 * fftLen + 1] = 0;
+
+ /* DC bin: real-valued, (Re{X[0]} + Im{X[0]}) / 2 */
+ pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
+ pDst[1] = 0;
+}
+#else
+void arm_split_rfft_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pATable,
+ const q31_t * pBTable,
+ q31_t * pDst,
+ uint32_t modifier)
+{
+ uint32_t i; /* Loop Counter */
+ q31_t outR, outI; /* Temporary variables for output */
+ const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
+ q31_t CoefA1, CoefA2, CoefB1; /* Temporary variables for twiddle coefficients */
+ /* pOut1 walks forward from bin 1; pOut2 walks backward writing the
+ complex-conjugate mirror half of the output spectrum. */
+ q31_t *pOut1 = &pDst[2], *pOut2 = &pDst[4 * fftLen - 1];
+ /* pIn1 walks forward from X[1]; pIn2 walks backward from the end of the
+ complex spectrum (the mirrored operand X[N-i]). */
+ q31_t *pIn1 = &pSrc[2], *pIn2 = &pSrc[2 * fftLen - 1];
+
+ /* Init coefficient pointers */
+ pCoefA = &pATable[modifier * 2];
+ pCoefB = &pBTable[modifier * 2];
+
+ i = fftLen - 1U;
+
+ while (i > 0U)
+ {
+ /*
+ outR = ( pSrc[2 * i] * pATable[2 * i]
+ - pSrc[2 * i + 1] * pATable[2 * i + 1]
+ + pSrc[2 * n - 2 * i] * pBTable[2 * i]
+ + pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+
+ outI = ( pIn[2 * i + 1] * pATable[2 * i]
+ + pIn[2 * i] * pATable[2 * i + 1]
+ + pIn[2 * n - 2 * i] * pBTable[2 * i + 1]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
+ */
+
+ CoefA1 = *pCoefA++;
+ CoefA2 = *pCoefA;
+
+ /* outR = (pSrc[2 * i] * pATable[2 * i] */
+ mult_32x32_keep32_R (outR, *pIn1, CoefA1);
+
+ /* outI = pIn[2 * i] * pATable[2 * i + 1] */
+ mult_32x32_keep32_R (outI, *pIn1++, CoefA2);
+
+ /* - pSrc[2 * i + 1] * pATable[2 * i + 1] */
+ multSub_32x32_keep32_R (outR, *pIn1, CoefA2);
+
+ /* (pIn[2 * i + 1] * pATable[2 * i] */
+ multAcc_32x32_keep32_R (outI, *pIn1++, CoefA1);
+
+ /* pSrc[2 * n - 2 * i] * pBTable[2 * i] */
+ /* NOTE(review): CoefA2 is reused here (and below) although the comment
+ refers to pBTable; this matches the upstream CMSIS-DSP implementation
+ and presumably relies on a symmetry between the A and B real-twiddle
+ tables - verify against arm_common_tables. */
+ multSub_32x32_keep32_R (outR, *pIn2, CoefA2);
+ CoefB1 = *pCoefB;
+
+ /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] */
+ multSub_32x32_keep32_R (outI, *pIn2--, CoefB1);
+
+ /* pSrc[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
+ multAcc_32x32_keep32_R (outR, *pIn2, CoefB1);
+
+ /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
+ multSub_32x32_keep32_R (outI, *pIn2--, CoefA2);
+
+ /* write output */
+ *pOut1++ = outR;
+ *pOut1++ = outI;
+
+ /* write complex conjugate output (imag part written first because pOut2
+ walks backwards) */
+ *pOut2-- = -outI;
+ *pOut2-- = outR;
+
+ /* update coefficient pointer (pCoefA was already advanced by one above) */
+ pCoefB = pCoefB + (2 * modifier);
+ pCoefA = pCoefA + (2 * modifier - 1);
+
+ /* Decrement loop count */
+ i--;
+ }
+
+ /* Nyquist bin: real-valued, (Re{X[0]} - Im{X[0]}) / 2 */
+ pDst[2 * fftLen] = (pSrc[0] - pSrc[1]) >> 1U;
+ pDst[2 * fftLen + 1] = 0;
+
+ /* DC bin: real-valued, (Re{X[0]} + Im{X[0]}) / 2 */
+ pDst[0] = (pSrc[0] + pSrc[1]) >> 1U;
+ pDst[1] = 0;
+}
+#endif /* defined(ARM_MATH_MVEI) */
+
+/**
+ @brief Core Real IFFT process
+ @param[in] pSrc points to input buffer
+ @param[in] fftLen length of FFT
+ @param[in] pATable points to twiddle Coef A buffer
+ @param[in] pBTable points to twiddle Coef B buffer
+ @param[out] pDst points to output buffer
+ @param[in] modifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table
+ @return none
+ */
+
+#if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
+
+void arm_split_rifft_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pATable,
+ const q31_t * pBTable,
+ q31_t * pDst,
+ uint32_t modifier)
+{
+ uint32_t i; /* Loop Counter */
+ const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
+ q31_t *pIn1;
+ /* Gather pattern {2, 3, 0, 1}: loads two complex samples with their order
+ swapped so the mirrored operands align with the forward ones in in1. */
+ uint32x4_t offset = { 2, 3, 0, 1 };
+ uint32x4_t offsetCoef = { 0, 1, modifier * 2, modifier * 2 + 1 };
+ /* Lane-wise sign mask that negates imaginary parts, i.e. applies a
+ complex conjugate to the second product term. */
+ int32x4_t conj = { 1, -1, 1, -1 };
+
+ /* Start the mirrored gather at the end of the (fftLen + 1)-bin input;
+ stepped down by 4 each iteration. */
+ offset = offset + (2 * fftLen - 2);
+
+ /* Init coefficient pointers */
+ pCoefA = &pATable[0];
+ pCoefB = &pBTable[0];
+
+ const q31_t *pCoefAb, *pCoefBb;
+ pCoefAb = pCoefA;
+ pCoefBb = pCoefB;
+
+ pIn1 = &pSrc[0];
+
+ /* Two complex outputs per iteration -> fftLen / 2 iterations */
+ i = fftLen;
+ i = i >> 1;
+ while (i > 0U) {
+ q31x4_t in1 = vld1q_s32(pIn1);
+ q31x4_t in2 = vldrwq_gather_shifted_offset_s32(pSrc, offset);
+ q31x4_t coefA = vldrwq_gather_shifted_offset_s32(pCoefAb, offsetCoef);
+ q31x4_t coefB = vldrwq_gather_shifted_offset_s32(pCoefBb, offsetCoef);
+
+ /* can we avoid the conjugate here ? */
+ /* out = (in1 * conj(coefA) + conj(in2 * coefB)) / 2 via halving add */
+#if defined(__CMSIS_GCC_H)
+ q31x4_t out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxConjB_S32(in1, coefA),
+ vmulq_s32(conj, MVE_CMPLX_MULT_FX_AxB_S32(in2, coefB)));
+#else
+ q31x4_t out = vhaddq_s32(MVE_CMPLX_MULT_FX_AxConjB(in1, coefA),
+ vmulq_s32(conj, MVE_CMPLX_MULT_FX_AxB(in2, coefB)));
+#endif
+ vst1q_s32(pDst, out);
+ pDst += 4;
+
+ offsetCoef += modifier * 4;
+ offset -= 4;
+
+ pIn1 += 4;
+ i -= 1;
+ }
+}
+#else
+void arm_split_rifft_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ const q31_t * pATable,
+ const q31_t * pBTable,
+ q31_t * pDst,
+ uint32_t modifier)
+{
+ q31_t outR, outI; /* Temporary variables for output */
+ const q31_t *pCoefA, *pCoefB; /* Temporary pointers for twiddle factors */
+ q31_t CoefA1, CoefA2, CoefB1; /* Temporary variables for twiddle coefficients */
+ /* pIn2 starts at pSrc[2*fftLen + 1]: this is why the RIFFT source buffer
+ must provide fftLenReal + 2 elements (see function doc above). */
+ q31_t *pIn1 = &pSrc[0], *pIn2 = &pSrc[2 * fftLen + 1];
+
+ pCoefA = &pATable[0];
+ pCoefB = &pBTable[0];
+
+ /* NOTE: fftLen itself is used (and consumed) as the loop counter */
+ while (fftLen > 0U)
+ {
+ /*
+ outR = ( pIn[2 * i] * pATable[2 * i]
+ + pIn[2 * i + 1] * pATable[2 * i + 1]
+ + pIn[2 * n - 2 * i] * pBTable[2 * i]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1]);
+
+ outI = ( pIn[2 * i + 1] * pATable[2 * i]
+ - pIn[2 * i] * pATable[2 * i + 1]
+ - pIn[2 * n - 2 * i] * pBTable[2 * i + 1]
+ - pIn[2 * n - 2 * i + 1] * pBTable[2 * i]);
+ */
+
+ CoefA1 = *pCoefA++;
+ CoefA2 = *pCoefA;
+
+ /* outR = (pIn[2 * i] * pATable[2 * i] */
+ mult_32x32_keep32_R (outR, *pIn1, CoefA1);
+
+ /* - pIn[2 * i] * pATable[2 * i + 1] */
+ mult_32x32_keep32_R (outI, *pIn1++, -CoefA2);
+
+ /* pIn[2 * i + 1] * pATable[2 * i + 1] */
+ multAcc_32x32_keep32_R (outR, *pIn1, CoefA2);
+
+ /* pIn[2 * i + 1] * pATable[2 * i] */
+ multAcc_32x32_keep32_R (outI, *pIn1++, CoefA1);
+
+ /* pIn[2 * n - 2 * i] * pBTable[2 * i] */
+ /* NOTE(review): CoefA2 is reused here (and below) although the comment
+ refers to pBTable; this matches the upstream CMSIS-DSP implementation
+ and presumably relies on a symmetry between the A and B real-twiddle
+ tables - verify against arm_common_tables. */
+ multAcc_32x32_keep32_R (outR, *pIn2, CoefA2);
+ CoefB1 = *pCoefB;
+
+ /* pIn[2 * n - 2 * i] * pBTable[2 * i + 1] */
+ multSub_32x32_keep32_R (outI, *pIn2--, CoefB1);
+
+ /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i + 1] */
+ multAcc_32x32_keep32_R (outR, *pIn2, CoefB1);
+
+ /* pIn[2 * n - 2 * i + 1] * pBTable[2 * i] */
+ multAcc_32x32_keep32_R (outI, *pIn2--, CoefA2);
+
+ /* write output */
+ *pDst++ = outR;
+ *pDst++ = outI;
+
+ /* update coefficient pointer (pCoefA was already advanced by one above) */
+ pCoefB = pCoefB + (modifier * 2);
+ pCoefA = pCoefA + (modifier * 2 - 1);
+
+ /* Decrement loop count */
+ fftLen--;
+ }
+
+}
+
+#endif /* defined(ARM_MATH_MVEI) */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/configDsp.cmake b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/configDsp.cmake
new file mode 100644
index 0000000000..3b3a1348c6
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/configDsp.cmake
@@ -0,0 +1,47 @@
+function(configDsp project root)
+
+# Configure compile definitions and include paths for a CMSIS-DSP build
+# target from the cache options below (HOST, CONFIGTABLE, LOOPUNROLL, ...).
+
+# Host (non-embedded) build: emulate the Python/GCC host environment.
+if (HOST)
+ target_compile_definitions(${project} PUBLIC __GNUC_PYTHON__)
+endif()
+
+if (CONFIGTABLE)
+ # Public because initialization for FFT may be defined in client code
+ # and needs access to the table.
+ target_compile_definitions(${project} PUBLIC ARM_DSP_CONFIG_TABLES)
+endif()
+
+# Enable manual loop unrolling inside the DSP kernels.
+if (LOOPUNROLL)
+ target_compile_definitions(${project} PRIVATE ARM_MATH_LOOPUNROLL)
+endif()
+
+# Enable rounding in fixed-point arithmetic.
+if (ROUNDING)
+ target_compile_definitions(${project} PRIVATE ARM_MATH_ROUNDING)
+endif()
+
+# Enable run-time matrix size checking in the matrix functions.
+if (MATRIXCHECK)
+ target_compile_definitions(${project} PRIVATE ARM_MATH_MATRIX_CHECK)
+endif()
+
+# Prefer compiler auto-vectorization over hand-written intrinsics.
+if (AUTOVECTORIZE)
+ target_compile_definitions(${project} PRIVATE ARM_MATH_AUTOVECTORIZE)
+endif()
+
+if (NEON OR NEONEXPERIMENTAL)
+ # Used in arm_vec_math.h
+ target_include_directories(${project} PUBLIC "${root}/CMSIS/DSP/ComputeLibrary/Include")
+endif()
+
+# Enable float16 MVE (Helium) kernels.
+if (MVEFLOAT16)
+ target_compile_definitions(${project} PRIVATE ARM_MATH_MVE_FLOAT16)
+endif()
+
+# Helium / MVE float builds need the private headers (e.g. arm_vec_fft.h).
+if (HELIUM OR MVEF OR SUPPORT)
+ target_include_directories(${project} PRIVATE "${root}/CMSIS/DSP/PrivateInclude")
+endif()
+
+# Build without any float16 support.
+if (DISABLEFLOAT16)
+ target_compile_definitions(${project} PRIVATE DISABLEFLOAT16)
+endif()
+
+
+endfunction()
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/fft.cmake b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/fft.cmake
new file mode 100644
index 0000000000..05290ce815
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/fft.cmake
@@ -0,0 +1,859 @@
+function(fft PROJECT)
+#######################################
+#
+# CFFT F32
+#
+
+
+if (CONFIGTABLE AND CFFT_F32_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_16)
+if (HELIUM OR MVEF)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_16)
+else()
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_16)
+endif()
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_32)
+if (HELIUM OR MVEF)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_32)
+else()
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_32)
+endif()
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_64)
+if (HELIUM OR MVEF)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_64)
+else()
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_64)
+endif()
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_128)
+if (HELIUM OR MVEF)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_128)
+else()
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_128)
+endif()
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_256)
+if (HELIUM OR MVEF)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_256)
+else()
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_256)
+endif()
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_512)
+if (HELIUM OR MVEF)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_512)
+else()
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_512)
+endif()
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_1024)
+if (HELIUM OR MVEF)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_1024)
+else()
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_1024)
+endif()
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_2048)
+if (HELIUM OR MVEF)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_2048)
+else()
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_2048)
+endif()
+endif()
+
+if (CONFIGTABLE AND CFFT_F32_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+if (HELIUM OR MVEF)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_4096)
+else()
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_4096)
+endif()
+endif()
+
+#######################################
+#
+# CFFT F16
+#
+
+
+if (CONFIGTABLE AND CFFT_F16_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_16)
+endif()
+
+if (CONFIGTABLE AND CFFT_F16_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_32)
+endif()
+
+if (CONFIGTABLE AND CFFT_F16_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_64)
+endif()
+
+if (CONFIGTABLE AND CFFT_F16_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_128)
+endif()
+
+if (CONFIGTABLE AND CFFT_F16_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_256)
+endif()
+
+if (CONFIGTABLE AND CFFT_F16_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_512)
+endif()
+
+if (CONFIGTABLE AND CFFT_F16_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_1024)
+endif()
+
+if (CONFIGTABLE AND CFFT_F16_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_2048)
+endif()
+
+if (CONFIGTABLE AND CFFT_F16_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_4096)
+endif()
+
+#######################################
+#
+# CFFT F64
+#
+
+
+if (CONFIGTABLE AND CFFT_F64_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_16)
+endif()
+
+if (CONFIGTABLE AND CFFT_F64_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_32)
+ # Fixed: was ARM_TABLE_BITREVIDX_FLT64_64. A 32-point F64 CFFT needs the
+ # 32-entry bit-reversal table, matching the N-point pairing used by every
+ # other CFFT_F64_<N> case in this file.
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_32)
+endif()
+
+if (CONFIGTABLE AND CFFT_F64_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_64)
+endif()
+
+if (CONFIGTABLE AND CFFT_F64_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_128)
+endif()
+
+if (CONFIGTABLE AND CFFT_F64_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_256)
+endif()
+
+if (CONFIGTABLE AND CFFT_F64_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_512)
+endif()
+
+if (CONFIGTABLE AND CFFT_F64_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_1024)
+endif()
+
+if (CONFIGTABLE AND CFFT_F64_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_2048)
+endif()
+
+if (CONFIGTABLE AND CFFT_F64_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_4096)
+endif()
+
+#######################################
+#
+# CFFT Q31
+#
+
+if (CONFIGTABLE AND CFFT_Q31_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_16)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_32)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_64)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_128)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_256)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_512)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_1024)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_2048)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q31_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_4096)
+endif()
+
+#######################################
+#
+# CFFT Q15
+#
+
+if (CONFIGTABLE AND CFFT_Q15_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_16)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_32)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_64)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_128)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_256)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_512)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_1024)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_2048)
+endif()
+
+if (CONFIGTABLE AND CFFT_Q15_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_4096)
+endif()
+
+#######################################
+#
+# RFFT FAST F64
+#
+
+if (CONFIGTABLE AND RFFT_FAST_F64_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F64_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_16)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F64_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F64_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_32)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F64_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F64_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_64)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F64_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F64_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_128)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F64_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F64_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_256)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F64_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F64_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_512)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F64_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F64_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_1024)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F64_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT64_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F64_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F64_2048)
+endif()
+
+#######################################
+#
+# RFFT FAST F32
+#
+
+if (CONFIGTABLE AND RFFT_FAST_F32_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_32)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F32_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_64)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F32_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_128)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F32_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_256)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F32_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_512)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F32_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_1024)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F32_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_2048)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F32_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F32_4096)
+endif()
+
+#######################################
+#
+# RFFT FAST F16
+#
+
+if (CONFIGTABLE AND RFFT_FAST_F16_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F16_32)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F16_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F16_64)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F16_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F16_128)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F16_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F16_256)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F16_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F16_512)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F16_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F16_1024)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F16_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F16_2048)
+endif()
+
+if (CONFIGTABLE AND RFFT_FAST_F16_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FLT_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_RFFT_F16_4096)
+endif()
+
+#######################################
+#
+# RFFT F32
+#
+
+if (CONFIGTABLE AND RFFT_F32_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F32_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+
+#######################################
+#
+# RFFT F16 (for radix4 and radix2)
+#
+
+if (CONFIGTABLE AND RFFT_F16_32)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F16_64)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F16_128)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F16_256)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F16_512)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F16_1024)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F16_2048)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F16_4096)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_4096)
+endif()
+
+if (CONFIGTABLE AND RFFT_F16_8192)
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F16_4096)
+endif()
+
+#######################################
+#
+# RFFT Q31
+#
+
+if (CONFIGTABLE AND RFFT_Q31_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_16)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_32)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_64)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_128)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_256)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_512)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_1024)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_2048)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q31_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_4096)
+endif()
+
+#######################################
+#
+# RFFT FAST Q15
+#
+
+if (CONFIGTABLE AND RFFT_Q15_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_16)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_16)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_32)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_64)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_64)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_128)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_256)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_256)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_512)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_1024)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_2048)
+endif()
+
+if (CONFIGTABLE AND RFFT_Q15_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREVIDX_FXT_4096)
+endif()
+
+#######################################
+#
+# DCT4 F32
+#
+
+if (CONFIGTABLE AND DCT4_F32_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_F32_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_F32_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_F32_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_F32_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_F32_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_F32_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_F32_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_F32)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_F32_4096)
+endif()
+
+#######################################
+#
+# DCT4 Q31
+#
+
+if (CONFIGTABLE AND DCT4_Q31_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q31_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_Q31_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q31_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_Q31_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q31_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_Q31_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q31_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q31)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_4096)
+endif()
+
+#######################################
+#
+# DCT4 Q15
+#
+
+if (CONFIGTABLE AND DCT4_Q15_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q15_128)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_Q15_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q15_512)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_Q15_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q15_2048)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_4096)
+endif()
+
+if (CONFIGTABLE AND DCT4_Q15_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_DCT4_Q15_8192)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_REALCOEF_Q15)
+
+ # For cfft_radix4_init
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_4096)
+endif()
+
+#######################################
+#
+# Deprecated radix2 and radix4 cfft
+#
+
+if (CONFIGTABLE AND (ARM_CFFT_RADIX2_Q15 OR ARM_CFFT_RADIX4_Q15))
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q15_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+endif()
+
+if (CONFIGTABLE AND (ARM_CFFT_RADIX2_Q31 OR ARM_CFFT_RADIX4_Q31))
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_TWIDDLECOEF_Q31_4096)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_BITREV_1024)
+endif()
+
+endfunction()
\ No newline at end of file
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/interpol.cmake b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/interpol.cmake
new file mode 100644
index 0000000000..2538d91b41
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/DSP/Source/interpol.cmake
@@ -0,0 +1,51 @@
+function(interpol PROJECT)
+
+if (CONFIGTABLE AND ARM_COS_F32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_F32)
+endif()
+
+if (CONFIGTABLE AND ARM_COS_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_Q31)
+endif()
+
+if (CONFIGTABLE AND ARM_COS_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_Q15)
+endif()
+
+if (CONFIGTABLE AND ARM_SIN_F32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_F32)
+endif()
+
+if (CONFIGTABLE AND ARM_SIN_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_Q31)
+endif()
+
+if (CONFIGTABLE AND ARM_SIN_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_Q15)
+endif()
+
+if (CONFIGTABLE AND ARM_SIN_COS_F32)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_F32)
+endif()
+
+if (CONFIGTABLE AND ARM_SIN_COS_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_SIN_Q31)
+endif()
+
+if (CONFIGTABLE AND ARM_LMS_NORM_Q31)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_RECIP_Q31)
+endif()
+
+if (CONFIGTABLE AND ARM_LMS_NORM_Q15)
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_RECIP_Q15)
+endif()
+
+if (CONFIGTABLE AND ARM_CMPLX_MAG_Q31 AND (MVEI OR HELIUM))
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_FAST_SQRT_Q31_MVE)
+endif()
+
+if (CONFIGTABLE AND ARM_CMPLX_MAG_Q15 AND (MVEI OR HELIUM))
+ target_compile_definitions(${PROJECT} PUBLIC ARM_TABLE_FAST_SQRT_Q15_MVE)
+endif()
+
+endfunction()
\ No newline at end of file
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/LICENSE.txt b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/LICENSE.txt
new file mode 100644
index 0000000000..f0cd2d9ccf
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/LICENSE.txt
@@ -0,0 +1,176 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
\ No newline at end of file
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/cmsis_armcc.h b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/cmsis_armcc.h
new file mode 100644
index 0000000000..a955d47139
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/cmsis_armcc.h
@@ -0,0 +1,888 @@
+/**************************************************************************//**
+ * @file cmsis_armcc.h
+ * @brief CMSIS compiler ARMCC (Arm Compiler 5) header file
+ * @version V5.3.2
+ * @date 27. May 2021
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2021 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CMSIS_ARMCC_H
+#define __CMSIS_ARMCC_H
+
+
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 400677)
+ #error "Please use Arm Compiler Toolchain V4.0.677 or later!"
+#endif
+
+/* CMSIS compiler control architecture macros */
+#if ((defined (__TARGET_ARCH_6_M ) && (__TARGET_ARCH_6_M == 1)) || \
+ (defined (__TARGET_ARCH_6S_M ) && (__TARGET_ARCH_6S_M == 1)) )
+ #define __ARM_ARCH_6M__ 1
+#endif
+
+#if (defined (__TARGET_ARCH_7_M ) && (__TARGET_ARCH_7_M == 1))
+ #define __ARM_ARCH_7M__ 1
+#endif
+
+#if (defined (__TARGET_ARCH_7E_M) && (__TARGET_ARCH_7E_M == 1))
+ #define __ARM_ARCH_7EM__ 1
+#endif
+
+ /* __ARM_ARCH_8M_BASE__ not applicable */
+ /* __ARM_ARCH_8M_MAIN__ not applicable */
+ /* __ARM_ARCH_8_1M_MAIN__ not applicable */
+
+/* CMSIS compiler control DSP macros */
+#if ((defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) )
+ #define __ARM_FEATURE_DSP 1
+#endif
+
+/* CMSIS compiler specific defines */
+#ifndef __ASM
+ #define __ASM __asm
+#endif
+#ifndef __INLINE
+ #define __INLINE __inline
+#endif
+#ifndef __STATIC_INLINE
+ #define __STATIC_INLINE static __inline
+#endif
+#ifndef __STATIC_FORCEINLINE
+ #define __STATIC_FORCEINLINE static __forceinline
+#endif
+#ifndef __NO_RETURN
+ #define __NO_RETURN __declspec(noreturn)
+#endif
+#ifndef __USED
+ #define __USED __attribute__((used))
+#endif
+#ifndef __WEAK
+ #define __WEAK __attribute__((weak))
+#endif
+#ifndef __PACKED
+ #define __PACKED __attribute__((packed))
+#endif
+#ifndef __PACKED_STRUCT
+ #define __PACKED_STRUCT __packed struct
+#endif
+#ifndef __PACKED_UNION
+ #define __PACKED_UNION __packed union
+#endif
+#ifndef __UNALIGNED_UINT32 /* deprecated */
+ #define __UNALIGNED_UINT32(x) (*((__packed uint32_t *)(x)))
+#endif
+#ifndef __UNALIGNED_UINT16_WRITE
+ #define __UNALIGNED_UINT16_WRITE(addr, val) ((*((__packed uint16_t *)(addr))) = (val))
+#endif
+#ifndef __UNALIGNED_UINT16_READ
+ #define __UNALIGNED_UINT16_READ(addr) (*((const __packed uint16_t *)(addr)))
+#endif
+#ifndef __UNALIGNED_UINT32_WRITE
+ #define __UNALIGNED_UINT32_WRITE(addr, val) ((*((__packed uint32_t *)(addr))) = (val))
+#endif
+#ifndef __UNALIGNED_UINT32_READ
+ #define __UNALIGNED_UINT32_READ(addr) (*((const __packed uint32_t *)(addr)))
+#endif
+#ifndef __ALIGNED
+ #define __ALIGNED(x) __attribute__((aligned(x)))
+#endif
+#ifndef __RESTRICT
+ #define __RESTRICT __restrict
+#endif
+#ifndef __COMPILER_BARRIER
+ #define __COMPILER_BARRIER() __memory_changed()
+#endif
+
+/* ######################### Startup and Lowlevel Init ######################## */
+
+#ifndef __PROGRAM_START
+#define __PROGRAM_START __main
+#endif
+
+#ifndef __INITIAL_SP
+#define __INITIAL_SP Image$$ARM_LIB_STACK$$ZI$$Limit
+#endif
+
+#ifndef __STACK_LIMIT
+#define __STACK_LIMIT Image$$ARM_LIB_STACK$$ZI$$Base
+#endif
+
+#ifndef __VECTOR_TABLE
+#define __VECTOR_TABLE __Vectors
+#endif
+
+#ifndef __VECTOR_TABLE_ATTRIBUTE
+#define __VECTOR_TABLE_ATTRIBUTE __attribute__((used, section("RESET")))
+#endif
+
+/* ########################## Core Instruction Access ######################### */
+/** \defgroup CMSIS_Core_InstructionInterface CMSIS Core Instruction Interface
+ Access to dedicated instructions
+ @{
+*/
+
+/**
+ \brief No Operation
+ \details No Operation does nothing. This instruction can be used for code alignment purposes.
+ */
+#define __NOP __nop
+
+
+/**
+ \brief Wait For Interrupt
+ \details Wait For Interrupt is a hint instruction that suspends execution until one of a number of events occurs.
+ */
+#define __WFI __wfi
+
+
+/**
+ \brief Wait For Event
+ \details Wait For Event is a hint instruction that permits the processor to enter
+ a low-power state until one of a number of events occurs.
+ */
+#define __WFE __wfe
+
+
+/**
+ \brief Send Event
+ \details Send Event is a hint instruction. It causes an event to be signaled to the CPU.
+ */
+#define __SEV __sev
+
+
+/**
+ \brief Instruction Synchronization Barrier
+ \details Instruction Synchronization Barrier flushes the pipeline in the processor,
+ so that all instructions following the ISB are fetched from cache or memory,
+ after the instruction has been completed.
+ */
+#define __ISB() __isb(0xF)
+
+/**
+ \brief Data Synchronization Barrier
+ \details Acts as a special kind of Data Memory Barrier.
+ It completes when all explicit memory accesses before this instruction complete.
+ */
+#define __DSB() __dsb(0xF)
+
+/**
+ \brief Data Memory Barrier
+ \details Ensures the apparent order of the explicit memory operations before
+ and after the instruction, without ensuring their completion.
+ */
+#define __DMB() __dmb(0xF)
+
+
+/**
+ \brief Reverse byte order (32 bit)
+ \details Reverses the byte order in unsigned integer value. For example, 0x12345678 becomes 0x78563412.
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+#define __REV __rev
+
+
+/**
+ \brief Reverse byte order (16 bit)
+ \details Reverses the byte order within each halfword of a word. For example, 0x12345678 becomes 0x34127856.
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+#ifndef __NO_EMBEDDED_ASM
+__attribute__((section(".rev16_text"))) __STATIC_INLINE __ASM uint32_t __REV16(uint32_t value)
+{
+ rev16 r0, r0
+ bx lr
+}
+#endif
+
+
+/**
+ \brief Reverse byte order (16 bit)
+ \details Reverses the byte order in a 16-bit value and returns the signed 16-bit result. For example, 0x0080 becomes 0x8000.
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+#ifndef __NO_EMBEDDED_ASM
+__attribute__((section(".revsh_text"))) __STATIC_INLINE __ASM int16_t __REVSH(int16_t value)
+{
+ revsh r0, r0
+ bx lr
+}
+#endif
+
+
+/**
+ \brief Rotate Right in unsigned value (32 bit)
+ \details Rotate Right (immediate) provides the value of the contents of a register rotated by a variable number of bits.
+ \param [in] op1 Value to rotate
+ \param [in] op2 Number of Bits to rotate
+ \return Rotated value
+ */
+#define __ROR __ror
+
+
+/**
+ \brief Breakpoint
+ \details Causes the processor to enter Debug state.
+ Debug tools can use this to investigate system state when the instruction at a particular address is reached.
+ \param [in] value is ignored by the processor.
+ If required, a debugger can use it to store additional information about the breakpoint.
+ */
+#define __BKPT(value) __breakpoint(value)
+
+
+/**
+ \brief Reverse bit order of value
+ \details Reverses the bit order of the given value.
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \
+ (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) )
+ #define __RBIT __rbit
+#else
+__attribute__((always_inline)) __STATIC_INLINE uint32_t __RBIT(uint32_t value)
+{
+ uint32_t result;
+ uint32_t s = (4U /*sizeof(v)*/ * 8U) - 1U; /* extra shift needed at end */
+
+ result = value; /* r will be reversed bits of v; first get LSB of v */
+ for (value >>= 1U; value != 0U; value >>= 1U)
+ {
+ result <<= 1U;
+ result |= value & 1U;
+ s--;
+ }
+ result <<= s; /* shift when v's highest bits are zero */
+ return result;
+}
+#endif
+
+
+/**
+ \brief Count leading zeros
+ \details Counts the number of leading zeros of a data value.
+ \param [in] value Value to count the leading zeros
+ \return number of leading zeros in value
+ */
+#define __CLZ __clz
+
+
+#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \
+ (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) )
+
+/**
+ \brief LDR Exclusive (8 bit)
+ \details Executes a exclusive LDR instruction for 8 bit value.
+ \param [in] ptr Pointer to data
+ \return value of type uint8_t at (*ptr)
+ */
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
+ #define __LDREXB(ptr) ((uint8_t ) __ldrex(ptr))
+#else
+ #define __LDREXB(ptr) _Pragma("push") _Pragma("diag_suppress 3731") ((uint8_t ) __ldrex(ptr)) _Pragma("pop")
+#endif
+
+
+/**
+ \brief LDR Exclusive (16 bit)
+ \details Executes a exclusive LDR instruction for 16 bit values.
+ \param [in] ptr Pointer to data
+ \return value of type uint16_t at (*ptr)
+ */
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
+ #define __LDREXH(ptr) ((uint16_t) __ldrex(ptr))
+#else
+ #define __LDREXH(ptr) _Pragma("push") _Pragma("diag_suppress 3731") ((uint16_t) __ldrex(ptr)) _Pragma("pop")
+#endif
+
+
+/**
+ \brief LDR Exclusive (32 bit)
+ \details Executes a exclusive LDR instruction for 32 bit values.
+ \param [in] ptr Pointer to data
+ \return value of type uint32_t at (*ptr)
+ */
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
+ #define __LDREXW(ptr) ((uint32_t ) __ldrex(ptr))
+#else
+ #define __LDREXW(ptr) _Pragma("push") _Pragma("diag_suppress 3731") ((uint32_t ) __ldrex(ptr)) _Pragma("pop")
+#endif
+
+
+/**
+ \brief STR Exclusive (8 bit)
+ \details Executes a exclusive STR instruction for 8 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ \return 0 Function succeeded
+ \return 1 Function failed
+ */
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
+ #define __STREXB(value, ptr) __strex(value, ptr)
+#else
+ #define __STREXB(value, ptr) _Pragma("push") _Pragma("diag_suppress 3731") __strex(value, ptr) _Pragma("pop")
+#endif
+
+
+/**
+ \brief STR Exclusive (16 bit)
+ \details Executes a exclusive STR instruction for 16 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ \return 0 Function succeeded
+ \return 1 Function failed
+ */
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
+ #define __STREXH(value, ptr) __strex(value, ptr)
+#else
+ #define __STREXH(value, ptr) _Pragma("push") _Pragma("diag_suppress 3731") __strex(value, ptr) _Pragma("pop")
+#endif
+
+
+/**
+ \brief STR Exclusive (32 bit)
+ \details Executes a exclusive STR instruction for 32 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ \return 0 Function succeeded
+ \return 1 Function failed
+ */
+#if defined(__ARMCC_VERSION) && (__ARMCC_VERSION < 5060020)
+ #define __STREXW(value, ptr) __strex(value, ptr)
+#else
+ #define __STREXW(value, ptr) _Pragma("push") _Pragma("diag_suppress 3731") __strex(value, ptr) _Pragma("pop")
+#endif
+
+
+/**
+ \brief Remove the exclusive lock
+ \details Removes the exclusive lock which is created by LDREX.
+ */
+#define __CLREX __clrex
+
+
+/**
+ \brief Signed Saturate
+ \details Saturates a signed value.
+ \param [in] value Value to be saturated
+ \param [in] sat Bit position to saturate to (1..32)
+ \return Saturated value
+ */
+#define __SSAT __ssat
+
+
+/**
+ \brief Unsigned Saturate
+ \details Saturates an unsigned value.
+ \param [in] value Value to be saturated
+ \param [in] sat Bit position to saturate to (0..31)
+ \return Saturated value
+ */
+#define __USAT __usat
+
+
+/**
+ \brief Rotate Right with Extend (32 bit)
+ \details Moves each bit of a bitstring right by one bit.
+ The carry input is shifted in at the left end of the bitstring.
+ \param [in] value Value to rotate
+ \return Rotated value
+ */
+#ifndef __NO_EMBEDDED_ASM
+__attribute__((section(".rrx_text"))) __STATIC_INLINE __ASM uint32_t __RRX(uint32_t value)
+{
+ rrx r0, r0
+ bx lr
+}
+#endif
+
+
+/**
+ \brief LDRT Unprivileged (8 bit)
+ \details Executes a Unprivileged LDRT instruction for 8 bit value.
+ \param [in] ptr Pointer to data
+ \return value of type uint8_t at (*ptr)
+ */
+#define __LDRBT(ptr) ((uint8_t ) __ldrt(ptr))
+
+
+/**
+ \brief LDRT Unprivileged (16 bit)
+ \details Executes a Unprivileged LDRT instruction for 16 bit values.
+ \param [in] ptr Pointer to data
+ \return value of type uint16_t at (*ptr)
+ */
+#define __LDRHT(ptr) ((uint16_t) __ldrt(ptr))
+
+
+/**
+ \brief LDRT Unprivileged (32 bit)
+ \details Executes a Unprivileged LDRT instruction for 32 bit values.
+ \param [in] ptr Pointer to data
+ \return value of type uint32_t at (*ptr)
+ */
+#define __LDRT(ptr) ((uint32_t ) __ldrt(ptr))
+
+
+/**
+ \brief STRT Unprivileged (8 bit)
+ \details Executes a Unprivileged STRT instruction for 8 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ */
+#define __STRBT(value, ptr) __strt(value, ptr)
+
+
+/**
+ \brief STRT Unprivileged (16 bit)
+ \details Executes a Unprivileged STRT instruction for 16 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ */
+#define __STRHT(value, ptr) __strt(value, ptr)
+
+
+/**
+ \brief STRT Unprivileged (32 bit)
+ \details Executes a Unprivileged STRT instruction for 32 bit values.
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ */
+#define __STRT(value, ptr) __strt(value, ptr)
+
+#else /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \
+ (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) ) */
+
+/**
+ \brief Signed Saturate
+ \details Saturates a signed value.
+ \param [in] value Value to be saturated
+ \param [in] sat Bit position to saturate to (1..32)
+ \return Saturated value
+ */
+__attribute__((always_inline)) __STATIC_INLINE int32_t __SSAT(int32_t val, uint32_t sat)
+{
+ if ((sat >= 1U) && (sat <= 32U))
+ {
+ const int32_t max = (int32_t)((1U << (sat - 1U)) - 1U);
+ const int32_t min = -1 - max ;
+ if (val > max)
+ {
+ return max;
+ }
+ else if (val < min)
+ {
+ return min;
+ }
+ }
+ return val;
+}
+
+/**
+ \brief Unsigned Saturate
+ \details Saturates an unsigned value.
+ \param [in] value Value to be saturated
+ \param [in] sat Bit position to saturate to (0..31)
+ \return Saturated value
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint32_t __USAT(int32_t val, uint32_t sat)
+{
+ if (sat <= 31U)
+ {
+ const uint32_t max = ((1U << sat) - 1U);
+ if (val > (int32_t)max)
+ {
+ return max;
+ }
+ else if (val < 0)
+ {
+ return 0U;
+ }
+ }
+ return (uint32_t)val;
+}
+
+#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \
+ (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) ) */
+
+/*@}*/ /* end of group CMSIS_Core_InstructionInterface */
+
+
+/* ########################### Core Function Access ########################### */
+/** \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_Core_RegAccFunctions CMSIS Core Register Access Functions
+ @{
+ */
+
+/**
+ \brief Enable IRQ Interrupts
+ \details Enables IRQ interrupts by clearing special-purpose register PRIMASK.
+ Can only be executed in Privileged modes.
+ */
+/* intrinsic void __enable_irq(); */
+
+
+/**
+ \brief Disable IRQ Interrupts
+ \details Disables IRQ interrupts by setting special-purpose register PRIMASK.
+ Can only be executed in Privileged modes.
+ */
+/* intrinsic void __disable_irq(); */
+
+/**
+ \brief Get Control Register
+ \details Returns the content of the Control Register.
+ \return Control Register value
+ */
+__STATIC_INLINE uint32_t __get_CONTROL(void)
+{
+ register uint32_t __regControl __ASM("control");
+ return(__regControl);
+}
+
+
+/**
+ \brief Set Control Register
+ \details Writes the given value to the Control Register.
+ \param [in] control Control Register value to set
+ */
+__STATIC_INLINE void __set_CONTROL(uint32_t control)
+{
+ register uint32_t __regControl __ASM("control");
+ __regControl = control;
+ __ISB();
+}
+
+
+/**
+ \brief Get IPSR Register
+ \details Returns the content of the IPSR Register.
+ \return IPSR Register value
+ */
+__STATIC_INLINE uint32_t __get_IPSR(void)
+{
+ register uint32_t __regIPSR __ASM("ipsr");
+ return(__regIPSR);
+}
+
+
+/**
+ \brief Get APSR Register
+ \details Returns the content of the APSR Register.
+ \return APSR Register value
+ */
+__STATIC_INLINE uint32_t __get_APSR(void)
+{
+ register uint32_t __regAPSR __ASM("apsr");
+ return(__regAPSR);
+}
+
+
+/**
+ \brief Get xPSR Register
+ \details Returns the content of the xPSR Register.
+ \return xPSR Register value
+ */
+__STATIC_INLINE uint32_t __get_xPSR(void)
+{
+ register uint32_t __regXPSR __ASM("xpsr");
+ return(__regXPSR);
+}
+
+
+/**
+ \brief Get Process Stack Pointer
+ \details Returns the current value of the Process Stack Pointer (PSP).
+ \return PSP Register value
+ */
+__STATIC_INLINE uint32_t __get_PSP(void)
+{
+ register uint32_t __regProcessStackPointer __ASM("psp");
+ return(__regProcessStackPointer);
+}
+
+
+/**
+ \brief Set Process Stack Pointer
+ \details Assigns the given value to the Process Stack Pointer (PSP).
+ \param [in] topOfProcStack Process Stack Pointer value to set
+ */
+__STATIC_INLINE void __set_PSP(uint32_t topOfProcStack)
+{
+ register uint32_t __regProcessStackPointer __ASM("psp");
+ __regProcessStackPointer = topOfProcStack;
+}
+
+
+/**
+ \brief Get Main Stack Pointer
+ \details Returns the current value of the Main Stack Pointer (MSP).
+ \return MSP Register value
+ */
+__STATIC_INLINE uint32_t __get_MSP(void)
+{
+ register uint32_t __regMainStackPointer __ASM("msp");
+ return(__regMainStackPointer);
+}
+
+
+/**
+ \brief Set Main Stack Pointer
+ \details Assigns the given value to the Main Stack Pointer (MSP).
+ \param [in] topOfMainStack Main Stack Pointer value to set
+ */
+__STATIC_INLINE void __set_MSP(uint32_t topOfMainStack)
+{
+ register uint32_t __regMainStackPointer __ASM("msp");
+ __regMainStackPointer = topOfMainStack;
+}
+
+
+/**
+ \brief Get Priority Mask
+ \details Returns the current state of the priority mask bit from the Priority Mask Register.
+ \return Priority Mask value
+ */
+__STATIC_INLINE uint32_t __get_PRIMASK(void)
+{
+ register uint32_t __regPriMask __ASM("primask");
+ return(__regPriMask);
+}
+
+
+/**
+ \brief Set Priority Mask
+ \details Assigns the given value to the Priority Mask Register.
+ \param [in] priMask Priority Mask
+ */
+__STATIC_INLINE void __set_PRIMASK(uint32_t priMask)
+{
+ register uint32_t __regPriMask __ASM("primask");
+ __regPriMask = (priMask);
+}
+
+
+#if ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \
+ (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) )
+
+/**
+ \brief Enable FIQ
+ \details Enables FIQ interrupts by clearing special-purpose register FAULTMASK.
+ Can only be executed in Privileged modes.
+ */
+#define __enable_fault_irq __enable_fiq
+
+
+/**
+ \brief Disable FIQ
+ \details Disables FIQ interrupts by setting special-purpose register FAULTMASK.
+ Can only be executed in Privileged modes.
+ */
+#define __disable_fault_irq __disable_fiq
+
+
+/**
+ \brief Get Base Priority
+ \details Returns the current value of the Base Priority register.
+ \return Base Priority register value
+ */
+__STATIC_INLINE uint32_t __get_BASEPRI(void)
+{
+ register uint32_t __regBasePri __ASM("basepri");
+ return(__regBasePri);
+}
+
+
+/**
+ \brief Set Base Priority
+ \details Assigns the given value to the Base Priority register.
+ \param [in] basePri Base Priority value to set
+ */
+__STATIC_INLINE void __set_BASEPRI(uint32_t basePri)
+{
+ register uint32_t __regBasePri __ASM("basepri");
+ __regBasePri = (basePri & 0xFFU);
+}
+
+
+/**
+ \brief Set Base Priority with condition
+ \details Assigns the given value to the Base Priority register only if BASEPRI masking is disabled,
+ or the new value increases the BASEPRI priority level.
+ \param [in] basePri Base Priority value to set
+ */
+__STATIC_INLINE void __set_BASEPRI_MAX(uint32_t basePri)
+{
+ register uint32_t __regBasePriMax __ASM("basepri_max");
+ __regBasePriMax = (basePri & 0xFFU);
+}
+
+
+/**
+ \brief Get Fault Mask
+ \details Returns the current value of the Fault Mask register.
+ \return Fault Mask register value
+ */
+__STATIC_INLINE uint32_t __get_FAULTMASK(void)
+{
+ register uint32_t __regFaultMask __ASM("faultmask");
+ return(__regFaultMask);
+}
+
+
+/**
+ \brief Set Fault Mask
+ \details Assigns the given value to the Fault Mask register.
+ \param [in] faultMask Fault Mask value to set
+ */
+__STATIC_INLINE void __set_FAULTMASK(uint32_t faultMask)
+{
+ register uint32_t __regFaultMask __ASM("faultmask");
+ __regFaultMask = (faultMask & (uint32_t)1U);
+}
+
+#endif /* ((defined (__ARM_ARCH_7M__ ) && (__ARM_ARCH_7M__ == 1)) || \
+ (defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) ) */
+
+
+/**
+ \brief Get FPSCR
+ \details Returns the current value of the Floating Point Status/Control register.
+ \return Floating Point Status/Control register value
+ */
+__STATIC_INLINE uint32_t __get_FPSCR(void)
+{
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+ (defined (__FPU_USED ) && (__FPU_USED == 1U)) )
+ register uint32_t __regfpscr __ASM("fpscr");
+ return(__regfpscr);
+#else
+ return(0U);
+#endif
+}
+
+
+/**
+ \brief Set FPSCR
+ \details Assigns the given value to the Floating Point Status/Control register.
+ \param [in] fpscr Floating Point Status/Control value to set
+ */
+__STATIC_INLINE void __set_FPSCR(uint32_t fpscr)
+{
+#if ((defined (__FPU_PRESENT) && (__FPU_PRESENT == 1U)) && \
+ (defined (__FPU_USED ) && (__FPU_USED == 1U)) )
+ register uint32_t __regfpscr __ASM("fpscr");
+ __regfpscr = (fpscr);
+#else
+ (void)fpscr;
+#endif
+}
+
+
+/*@} end of CMSIS_Core_RegAccFunctions */
+
+
+/* ################### Compiler specific Intrinsics ########################### */
+/** \defgroup CMSIS_SIMD_intrinsics CMSIS SIMD Intrinsics
+ Access to dedicated SIMD instructions
+ @{
+*/
+
+#if ((defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) )
+
+#define __SADD8 __sadd8
+#define __QADD8 __qadd8
+#define __SHADD8 __shadd8
+#define __UADD8 __uadd8
+#define __UQADD8 __uqadd8
+#define __UHADD8 __uhadd8
+#define __SSUB8 __ssub8
+#define __QSUB8 __qsub8
+#define __SHSUB8 __shsub8
+#define __USUB8 __usub8
+#define __UQSUB8 __uqsub8
+#define __UHSUB8 __uhsub8
+#define __SADD16 __sadd16
+#define __QADD16 __qadd16
+#define __SHADD16 __shadd16
+#define __UADD16 __uadd16
+#define __UQADD16 __uqadd16
+#define __UHADD16 __uhadd16
+#define __SSUB16 __ssub16
+#define __QSUB16 __qsub16
+#define __SHSUB16 __shsub16
+#define __USUB16 __usub16
+#define __UQSUB16 __uqsub16
+#define __UHSUB16 __uhsub16
+#define __SASX __sasx
+#define __QASX __qasx
+#define __SHASX __shasx
+#define __UASX __uasx
+#define __UQASX __uqasx
+#define __UHASX __uhasx
+#define __SSAX __ssax
+#define __QSAX __qsax
+#define __SHSAX __shsax
+#define __USAX __usax
+#define __UQSAX __uqsax
+#define __UHSAX __uhsax
+#define __USAD8 __usad8
+#define __USADA8 __usada8
+#define __SSAT16 __ssat16
+#define __USAT16 __usat16
+#define __UXTB16 __uxtb16
+#define __UXTAB16 __uxtab16
+#define __SXTB16 __sxtb16
+#define __SXTAB16 __sxtab16
+#define __SMUAD __smuad
+#define __SMUADX __smuadx
+#define __SMLAD __smlad
+#define __SMLADX __smladx
+#define __SMLALD __smlald
+#define __SMLALDX __smlaldx
+#define __SMUSD __smusd
+#define __SMUSDX __smusdx
+#define __SMLSD __smlsd
+#define __SMLSDX __smlsdx
+#define __SMLSLD __smlsld
+#define __SMLSLDX __smlsldx
+#define __SEL __sel
+#define __QADD __qadd
+#define __QSUB __qsub
+
+#define __PKHBT(ARG1,ARG2,ARG3) ( ((((uint32_t)(ARG1)) ) & 0x0000FFFFUL) | \
+ ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL) )
+
+#define __PKHTB(ARG1,ARG2,ARG3) ( ((((uint32_t)(ARG1)) ) & 0xFFFF0000UL) | \
+ ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL) )
+
+#define __SMMLA(ARG1,ARG2,ARG3) ( (int32_t)((((int64_t)(ARG1) * (ARG2)) + \
+ ((int64_t)(ARG3) << 32U) ) >> 32U))
+
+#define __SXTB16_RORn(ARG1, ARG2) __SXTB16(__ROR(ARG1, ARG2))
+
+#define __SXTAB16_RORn(ARG1, ARG2, ARG3) __SXTAB16(ARG1, __ROR(ARG2, ARG3))
+
+#endif /* ((defined (__ARM_ARCH_7EM__) && (__ARM_ARCH_7EM__ == 1)) ) */
+/*@} end of group CMSIS_SIMD_intrinsics */
+
+
+#endif /* __CMSIS_ARMCC_H */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/cmsis_compiler.h b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/cmsis_compiler.h
new file mode 100644
index 0000000000..adbf296f15
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/cmsis_compiler.h
@@ -0,0 +1,283 @@
+/**************************************************************************//**
+ * @file cmsis_compiler.h
+ * @brief CMSIS compiler generic header file
+ * @version V5.1.0
+ * @date 09. October 2018
+ ******************************************************************************/
+/*
+ * Copyright (c) 2009-2018 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the License); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an AS IS BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef __CMSIS_COMPILER_H
+#define __CMSIS_COMPILER_H
+
+#include <stdint.h>
+
+/*
+ * Arm Compiler 4/5
+ */
+#if defined ( __CC_ARM )
+ #include "cmsis_armcc.h"
+
+
+/*
+ * Arm Compiler 6.6 LTM (armclang)
+ */
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6010050) && (__ARMCC_VERSION < 6100100)
+ #include "cmsis_armclang_ltm.h"
+
+ /*
+ * Arm Compiler above 6.10.1 (armclang)
+ */
+#elif defined (__ARMCC_VERSION) && (__ARMCC_VERSION >= 6100100)
+ #include "cmsis_armclang.h"
+
+
+/*
+ * GNU Compiler
+ */
+#elif defined ( __GNUC__ )
+ #include "cmsis_gcc.h"
+
+
+/*
+ * IAR Compiler
+ */
+#elif defined ( __ICCARM__ )
+ #include <cmsis_iccarm.h>
+
+
+/*
+ * TI Arm Compiler
+ */
+#elif defined ( __TI_ARM__ )
+ #include <cmsis_ccs.h>
+
+ #ifndef __ASM
+ #define __ASM __asm
+ #endif
+ #ifndef __INLINE
+ #define __INLINE inline
+ #endif
+ #ifndef __STATIC_INLINE
+ #define __STATIC_INLINE static inline
+ #endif
+ #ifndef __STATIC_FORCEINLINE
+ #define __STATIC_FORCEINLINE __STATIC_INLINE
+ #endif
+ #ifndef __NO_RETURN
+ #define __NO_RETURN __attribute__((noreturn))
+ #endif
+ #ifndef __USED
+ #define __USED __attribute__((used))
+ #endif
+ #ifndef __WEAK
+ #define __WEAK __attribute__((weak))
+ #endif
+ #ifndef __PACKED
+ #define __PACKED __attribute__((packed))
+ #endif
+ #ifndef __PACKED_STRUCT
+ #define __PACKED_STRUCT struct __attribute__((packed))
+ #endif
+ #ifndef __PACKED_UNION
+ #define __PACKED_UNION union __attribute__((packed))
+ #endif
+ #ifndef __UNALIGNED_UINT32 /* deprecated */
+ struct __attribute__((packed)) T_UINT32 { uint32_t v; };
+ #define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v)
+ #endif
+ #ifndef __UNALIGNED_UINT16_WRITE
+ __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+ #define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void*)(addr))->v) = (val))
+ #endif
+ #ifndef __UNALIGNED_UINT16_READ
+ __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+ #define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+ #endif
+ #ifndef __UNALIGNED_UINT32_WRITE
+ __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+ #define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+ #endif
+ #ifndef __UNALIGNED_UINT32_READ
+ __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+ #define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+ #endif
+ #ifndef __ALIGNED
+ #define __ALIGNED(x) __attribute__((aligned(x)))
+ #endif
+ #ifndef __RESTRICT
+ #define __RESTRICT __restrict
+ #endif
+ #ifndef __COMPILER_BARRIER
+ #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+ #define __COMPILER_BARRIER() (void)0
+ #endif
+
+
+/*
+ * TASKING Compiler
+ */
+#elif defined ( __TASKING__ )
+ /*
+ * The CMSIS functions have been implemented as intrinsics in the compiler.
+ * Please use "carm -?i" to get an up to date list of all intrinsics,
+ * Including the CMSIS ones.
+ */
+
+ #ifndef __ASM
+ #define __ASM __asm
+ #endif
+ #ifndef __INLINE
+ #define __INLINE inline
+ #endif
+ #ifndef __STATIC_INLINE
+ #define __STATIC_INLINE static inline
+ #endif
+ #ifndef __STATIC_FORCEINLINE
+ #define __STATIC_FORCEINLINE __STATIC_INLINE
+ #endif
+ #ifndef __NO_RETURN
+ #define __NO_RETURN __attribute__((noreturn))
+ #endif
+ #ifndef __USED
+ #define __USED __attribute__((used))
+ #endif
+ #ifndef __WEAK
+ #define __WEAK __attribute__((weak))
+ #endif
+ #ifndef __PACKED
+ #define __PACKED __packed__
+ #endif
+ #ifndef __PACKED_STRUCT
+ #define __PACKED_STRUCT struct __packed__
+ #endif
+ #ifndef __PACKED_UNION
+ #define __PACKED_UNION union __packed__
+ #endif
+ #ifndef __UNALIGNED_UINT32 /* deprecated */
+ struct __packed__ T_UINT32 { uint32_t v; };
+ #define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v)
+ #endif
+ #ifndef __UNALIGNED_UINT16_WRITE
+ __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+ #define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+ #endif
+ #ifndef __UNALIGNED_UINT16_READ
+ __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+ #define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+ #endif
+ #ifndef __UNALIGNED_UINT32_WRITE
+ __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+ #define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+ #endif
+ #ifndef __UNALIGNED_UINT32_READ
+ __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+ #define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+ #endif
+ #ifndef __ALIGNED
+ #define __ALIGNED(x) __align(x)
+ #endif
+ #ifndef __RESTRICT
+ #warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored.
+ #define __RESTRICT
+ #endif
+ #ifndef __COMPILER_BARRIER
+ #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+ #define __COMPILER_BARRIER() (void)0
+ #endif
+
+
+/*
+ * COSMIC Compiler
+ */
+#elif defined ( __CSMC__ )
+ #include <cmsis_csm.h>
+
+ #ifndef __ASM
+ #define __ASM _asm
+ #endif
+ #ifndef __INLINE
+ #define __INLINE inline
+ #endif
+ #ifndef __STATIC_INLINE
+ #define __STATIC_INLINE static inline
+ #endif
+ #ifndef __STATIC_FORCEINLINE
+ #define __STATIC_FORCEINLINE __STATIC_INLINE
+ #endif
+ #ifndef __NO_RETURN
+ // NO RETURN is automatically detected hence no warning here
+ #define __NO_RETURN
+ #endif
+ #ifndef __USED
+ #warning No compiler specific solution for __USED. __USED is ignored.
+ #define __USED
+ #endif
+ #ifndef __WEAK
+ #define __WEAK __weak
+ #endif
+ #ifndef __PACKED
+ #define __PACKED @packed
+ #endif
+ #ifndef __PACKED_STRUCT
+ #define __PACKED_STRUCT @packed struct
+ #endif
+ #ifndef __PACKED_UNION
+ #define __PACKED_UNION @packed union
+ #endif
+ #ifndef __UNALIGNED_UINT32 /* deprecated */
+ @packed struct T_UINT32 { uint32_t v; };
+ #define __UNALIGNED_UINT32(x) (((struct T_UINT32 *)(x))->v)
+ #endif
+ #ifndef __UNALIGNED_UINT16_WRITE
+ __PACKED_STRUCT T_UINT16_WRITE { uint16_t v; };
+ #define __UNALIGNED_UINT16_WRITE(addr, val) (void)((((struct T_UINT16_WRITE *)(void *)(addr))->v) = (val))
+ #endif
+ #ifndef __UNALIGNED_UINT16_READ
+ __PACKED_STRUCT T_UINT16_READ { uint16_t v; };
+ #define __UNALIGNED_UINT16_READ(addr) (((const struct T_UINT16_READ *)(const void *)(addr))->v)
+ #endif
+ #ifndef __UNALIGNED_UINT32_WRITE
+ __PACKED_STRUCT T_UINT32_WRITE { uint32_t v; };
+ #define __UNALIGNED_UINT32_WRITE(addr, val) (void)((((struct T_UINT32_WRITE *)(void *)(addr))->v) = (val))
+ #endif
+ #ifndef __UNALIGNED_UINT32_READ
+ __PACKED_STRUCT T_UINT32_READ { uint32_t v; };
+ #define __UNALIGNED_UINT32_READ(addr) (((const struct T_UINT32_READ *)(const void *)(addr))->v)
+ #endif
+ #ifndef __ALIGNED
+ #warning No compiler specific solution for __ALIGNED. __ALIGNED is ignored.
+ #define __ALIGNED(x)
+ #endif
+ #ifndef __RESTRICT
+ #warning No compiler specific solution for __RESTRICT. __RESTRICT is ignored.
+ #define __RESTRICT
+ #endif
+ #ifndef __COMPILER_BARRIER
+ #warning No compiler specific solution for __COMPILER_BARRIER. __COMPILER_BARRIER is ignored.
+ #define __COMPILER_BARRIER() (void)0
+ #endif
+
+
+#else
+ #error Unknown compiler.
+#endif
+
+
+#endif /* __CMSIS_COMPILER_H */
+
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/core_cm4.h b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/core_cm4.h
new file mode 100644
index 0000000000..1951049396
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/core_cm4.h
@@ -0,0 +1,1858 @@
+/**************************************************************************//**
+ * @file core_cm4.h
+ * @brief CMSIS Cortex-M4 Core Peripheral Access Layer Header File
+ * @version V4.10
+ * @date 18. March 2015
+ *
+ * @note
+ *
+ ******************************************************************************/
+/* Copyright (c) 2009 - 2015 ARM LIMITED
+
+ All rights reserved.
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of ARM nor the names of its contributors may be used
+ to endorse or promote products derived from this software without
+ specific prior written permission.
+ *
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS AND CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+ ---------------------------------------------------------------------------*/
+
+
+#if defined ( __ICCARM__ )
+ #pragma system_include /* treat file as system include file for MISRA check */
+#endif
+
+#ifndef __CORE_CM4_H_GENERIC
+#define __CORE_CM4_H_GENERIC
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/** \page CMSIS_MISRA_Exceptions MISRA-C:2004 Compliance Exceptions
+ CMSIS violates the following MISRA-C:2004 rules:
+
+ \li Required Rule 8.5, object/function definition in header file.
+ Function definitions in header files are used to allow 'inlining'.
+
+ \li Required Rule 18.4, declaration of union type or object of union type: '{...}'.
+ Unions are used for effective representation of core registers.
+
+ \li Advisory Rule 19.7, Function-like macro defined.
+ Function-like macros are used to allow more efficient code.
+ */
+
+
+/*******************************************************************************
+ * CMSIS definitions
+ ******************************************************************************/
+/** \ingroup Cortex_M4
+ @{
+ */
+
+/* CMSIS CM4 definitions */
+#define __CM4_CMSIS_VERSION_MAIN (0x04) /*!< [31:16] CMSIS HAL main version */
+#define __CM4_CMSIS_VERSION_SUB (0x00) /*!< [15:0] CMSIS HAL sub version */
+#define __CM4_CMSIS_VERSION ((__CM4_CMSIS_VERSION_MAIN << 16) | \
+ __CM4_CMSIS_VERSION_SUB ) /*!< CMSIS HAL version number */
+
+#define __CORTEX_M (0x04) /*!< Cortex-M Core */
+
+
+#if defined ( __CC_ARM )
+ #define __ASM __asm /*!< asm keyword for ARM Compiler */
+ #define __INLINE __inline /*!< inline keyword for ARM Compiler */
+ #define __STATIC_INLINE static __inline
+
+#elif defined ( __GNUC__ )
+ #define __ASM __asm /*!< asm keyword for GNU Compiler */
+ #define __INLINE inline /*!< inline keyword for GNU Compiler */
+ #define __STATIC_INLINE static inline
+
+#elif defined ( __ICCARM__ )
+ #define __ASM __asm /*!< asm keyword for IAR Compiler */
+ #define __INLINE inline /*!< inline keyword for IAR Compiler. Only available in High optimization mode! */
+ #define __STATIC_INLINE static inline
+
+#elif defined ( __TMS470__ )
+ #define __ASM __asm /*!< asm keyword for TI CCS Compiler */
+ #define __STATIC_INLINE static inline
+
+#elif defined ( __TASKING__ )
+ #define __ASM __asm /*!< asm keyword for TASKING Compiler */
+ #define __INLINE inline /*!< inline keyword for TASKING Compiler */
+ #define __STATIC_INLINE static inline
+
+#elif defined ( __CSMC__ )
+ #define __packed
+ #define __ASM _asm /*!< asm keyword for COSMIC Compiler */
+ #define __INLINE inline /*!< inline keyword for COSMIC Compiler. Use -pc99 on compile line */
+ #define __STATIC_INLINE static inline
+
+#endif
+
+/** __FPU_USED indicates whether an FPU is used or not.
+ For this, __FPU_PRESENT has to be checked prior to making use of FPU specific registers and functions.
+*/
+#if defined ( __CC_ARM )
+ #if defined __TARGET_FPU_VFP
+ #if (__FPU_PRESENT == 1)
+ #define __FPU_USED 1
+ #else
+ #warning "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0
+ #endif
+ #else
+ #define __FPU_USED 0
+ #endif
+
+#elif defined ( __GNUC__ )
+ #if defined (__VFP_FP__) && !defined(__SOFTFP__)
+ #if (__FPU_PRESENT == 1)
+ #define __FPU_USED 1
+ #else
+ #warning "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0
+ #endif
+ #else
+ #define __FPU_USED 0
+ #endif
+
+#elif defined ( __ICCARM__ )
+ #if defined __ARMVFP__
+ #if (__FPU_PRESENT == 1)
+ #define __FPU_USED 1
+ #else
+ #warning "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0
+ #endif
+ #else
+ #define __FPU_USED 0
+ #endif
+
+#elif defined ( __TMS470__ )
+ #if defined __TI_VFP_SUPPORT__
+ #if (__FPU_PRESENT == 1)
+ #define __FPU_USED 1
+ #else
+ #warning "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0
+ #endif
+ #else
+ #define __FPU_USED 0
+ #endif
+
+#elif defined ( __TASKING__ )
+ #if defined __FPU_VFP__
+ #if (__FPU_PRESENT == 1)
+ #define __FPU_USED 1
+ #else
+ #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0
+ #endif
+ #else
+ #define __FPU_USED 0
+ #endif
+
+#elif defined ( __CSMC__ ) /* Cosmic */
+ #if ( __CSMC__ & 0x400) // FPU present for parser
+ #if (__FPU_PRESENT == 1)
+ #define __FPU_USED 1
+ #else
+ #error "Compiler generates FPU instructions for a device without an FPU (check __FPU_PRESENT)"
+ #define __FPU_USED 0
+ #endif
+ #else
+ #define __FPU_USED 0
+ #endif
+#endif
+
+#include <stdint.h>                      /* standard types definitions                      */
+#include <core_cmInstr.h>                /* Core Instruction Access                         */
+#include <core_cmFunc.h>                 /* Core Function Access                            */
+#include <core_cmSimd.h>                 /* Compiler specific SIMD Intrinsics               */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CORE_CM4_H_GENERIC */
+
+#ifndef __CMSIS_GENERIC
+
+#ifndef __CORE_CM4_H_DEPENDANT
+#define __CORE_CM4_H_DEPENDANT
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/* check device defines and use defaults */
+#if defined __CHECK_DEVICE_DEFINES
+ #ifndef __CM4_REV
+ #define __CM4_REV 0x0000
+ #warning "__CM4_REV not defined in device header file; using default!"
+ #endif
+
+ #ifndef __FPU_PRESENT
+ #define __FPU_PRESENT 0
+ #warning "__FPU_PRESENT not defined in device header file; using default!"
+ #endif
+
+ #ifndef __MPU_PRESENT
+ #define __MPU_PRESENT 0
+ #warning "__MPU_PRESENT not defined in device header file; using default!"
+ #endif
+
+ #ifndef __NVIC_PRIO_BITS
+ #define __NVIC_PRIO_BITS 4
+ #warning "__NVIC_PRIO_BITS not defined in device header file; using default!"
+ #endif
+
+ #ifndef __Vendor_SysTickConfig
+ #define __Vendor_SysTickConfig 0
+ #warning "__Vendor_SysTickConfig not defined in device header file; using default!"
+ #endif
+#endif
+
+/* IO definitions (access restrictions to peripheral registers) */
+/**
+ \defgroup CMSIS_glob_defs CMSIS Global Defines
+
+ IO Type Qualifiers are used
+ \li to specify the access to peripheral variables.
+ \li for automatic generation of peripheral register debug information.
+*/
+#ifdef __cplusplus
+ #define __I volatile /*!< Defines 'read only' permissions */
+#else
+ #define __I volatile const /*!< Defines 'read only' permissions */
+#endif
+#define __O volatile /*!< Defines 'write only' permissions */
+#define __IO volatile /*!< Defines 'read / write' permissions */
+
+/*@} end of group Cortex_M4 */
+
+
+
+/*******************************************************************************
+ * Register Abstraction
+ Core Register contain:
+ - Core Register
+ - Core NVIC Register
+ - Core SCB Register
+ - Core SysTick Register
+ - Core Debug Register
+ - Core MPU Register
+ - Core FPU Register
+ ******************************************************************************/
+/** \defgroup CMSIS_core_register Defines and Type Definitions
+ \brief Type definitions and defines for Cortex-M processor based devices.
+*/
+
+/** \ingroup CMSIS_core_register
+ \defgroup CMSIS_CORE Status and Control Registers
+ \brief Core Register type definitions.
+ @{
+ */
+
+/** \brief Union type to access the Application Program Status Register (APSR).
+ */
+typedef union
+{
+ struct
+ {
+ uint32_t _reserved0:16; /*!< bit: 0..15 Reserved */
+ uint32_t GE:4; /*!< bit: 16..19 Greater than or Equal flags */
+ uint32_t _reserved1:7; /*!< bit: 20..26 Reserved */
+ uint32_t Q:1; /*!< bit: 27 Saturation condition flag */
+ uint32_t V:1; /*!< bit: 28 Overflow condition code flag */
+ uint32_t C:1; /*!< bit: 29 Carry condition code flag */
+ uint32_t Z:1; /*!< bit: 30 Zero condition code flag */
+ uint32_t N:1; /*!< bit: 31 Negative condition code flag */
+ } b; /*!< Structure used for bit access */
+ uint32_t w; /*!< Type used for word access */
+} APSR_Type;
+
+/* APSR Register Definitions */
+#define APSR_N_Pos 31 /*!< APSR: N Position */
+#define APSR_N_Msk (1UL << APSR_N_Pos) /*!< APSR: N Mask */
+
+#define APSR_Z_Pos 30 /*!< APSR: Z Position */
+#define APSR_Z_Msk (1UL << APSR_Z_Pos) /*!< APSR: Z Mask */
+
+#define APSR_C_Pos 29 /*!< APSR: C Position */
+#define APSR_C_Msk (1UL << APSR_C_Pos) /*!< APSR: C Mask */
+
+#define APSR_V_Pos 28 /*!< APSR: V Position */
+#define APSR_V_Msk (1UL << APSR_V_Pos) /*!< APSR: V Mask */
+
+#define APSR_Q_Pos 27 /*!< APSR: Q Position */
+#define APSR_Q_Msk (1UL << APSR_Q_Pos) /*!< APSR: Q Mask */
+
+#define APSR_GE_Pos 16 /*!< APSR: GE Position */
+#define APSR_GE_Msk (0xFUL << APSR_GE_Pos) /*!< APSR: GE Mask */
+
+
+/** \brief Union type to access the Interrupt Program Status Register (IPSR).
+ */
+typedef union
+{
+ struct
+ {
+ uint32_t ISR:9; /*!< bit: 0.. 8 Exception number */
+ uint32_t _reserved0:23; /*!< bit: 9..31 Reserved */
+ } b; /*!< Structure used for bit access */
+ uint32_t w; /*!< Type used for word access */
+} IPSR_Type;
+
+/* IPSR Register Definitions */
+#define IPSR_ISR_Pos 0 /*!< IPSR: ISR Position */
+#define IPSR_ISR_Msk (0x1FFUL /*<< IPSR_ISR_Pos*/) /*!< IPSR: ISR Mask */
+
+
+/** \brief Union type to access the Special-Purpose Program Status Registers (xPSR).
+ */
+typedef union
+{
+ struct
+ {
+ uint32_t ISR:9; /*!< bit: 0.. 8 Exception number */
+ uint32_t _reserved0:7; /*!< bit: 9..15 Reserved */
+ uint32_t GE:4; /*!< bit: 16..19 Greater than or Equal flags */
+ uint32_t _reserved1:4; /*!< bit: 20..23 Reserved */
+ uint32_t T:1; /*!< bit: 24 Thumb bit (read 0) */
+ uint32_t IT:2; /*!< bit: 25..26 saved IT state (read 0) */
+ uint32_t Q:1; /*!< bit: 27 Saturation condition flag */
+ uint32_t V:1; /*!< bit: 28 Overflow condition code flag */
+ uint32_t C:1; /*!< bit: 29 Carry condition code flag */
+ uint32_t Z:1; /*!< bit: 30 Zero condition code flag */
+ uint32_t N:1; /*!< bit: 31 Negative condition code flag */
+ } b; /*!< Structure used for bit access */
+ uint32_t w; /*!< Type used for word access */
+} xPSR_Type;
+
+/* xPSR Register Definitions */
+#define xPSR_N_Pos 31 /*!< xPSR: N Position */
+#define xPSR_N_Msk (1UL << xPSR_N_Pos) /*!< xPSR: N Mask */
+
+#define xPSR_Z_Pos 30 /*!< xPSR: Z Position */
+#define xPSR_Z_Msk (1UL << xPSR_Z_Pos) /*!< xPSR: Z Mask */
+
+#define xPSR_C_Pos 29 /*!< xPSR: C Position */
+#define xPSR_C_Msk (1UL << xPSR_C_Pos) /*!< xPSR: C Mask */
+
+#define xPSR_V_Pos 28 /*!< xPSR: V Position */
+#define xPSR_V_Msk (1UL << xPSR_V_Pos) /*!< xPSR: V Mask */
+
+#define xPSR_Q_Pos 27 /*!< xPSR: Q Position */
+#define xPSR_Q_Msk (1UL << xPSR_Q_Pos) /*!< xPSR: Q Mask */
+
+#define xPSR_IT_Pos 25 /*!< xPSR: IT Position */
+#define xPSR_IT_Msk (3UL << xPSR_IT_Pos) /*!< xPSR: IT Mask */
+
+#define xPSR_T_Pos 24 /*!< xPSR: T Position */
+#define xPSR_T_Msk (1UL << xPSR_T_Pos) /*!< xPSR: T Mask */
+
+#define xPSR_GE_Pos 16 /*!< xPSR: GE Position */
+#define xPSR_GE_Msk (0xFUL << xPSR_GE_Pos) /*!< xPSR: GE Mask */
+
+#define xPSR_ISR_Pos 0 /*!< xPSR: ISR Position */
+#define xPSR_ISR_Msk (0x1FFUL /*<< xPSR_ISR_Pos*/) /*!< xPSR: ISR Mask */
+
+
+/** \brief Union type to access the Control Registers (CONTROL).
+ */
+typedef union
+{
+ struct
+ {
+ uint32_t nPRIV:1; /*!< bit: 0 Execution privilege in Thread mode */
+ uint32_t SPSEL:1; /*!< bit: 1 Stack to be used */
+ uint32_t FPCA:1; /*!< bit: 2 FP extension active flag */
+ uint32_t _reserved0:29; /*!< bit: 3..31 Reserved */
+ } b; /*!< Structure used for bit access */
+ uint32_t w; /*!< Type used for word access */
+} CONTROL_Type;
+
+/* CONTROL Register Definitions */
+#define CONTROL_FPCA_Pos 2 /*!< CONTROL: FPCA Position */
+#define CONTROL_FPCA_Msk (1UL << CONTROL_FPCA_Pos) /*!< CONTROL: FPCA Mask */
+
+#define CONTROL_SPSEL_Pos 1 /*!< CONTROL: SPSEL Position */
+#define CONTROL_SPSEL_Msk (1UL << CONTROL_SPSEL_Pos) /*!< CONTROL: SPSEL Mask */
+
+#define CONTROL_nPRIV_Pos 0 /*!< CONTROL: nPRIV Position */
+#define CONTROL_nPRIV_Msk (1UL /*<< CONTROL_nPRIV_Pos*/) /*!< CONTROL: nPRIV Mask */
+
+/*@} end of group CMSIS_CORE */
+
+
+/** \ingroup CMSIS_core_register
+ \defgroup CMSIS_NVIC Nested Vectored Interrupt Controller (NVIC)
+ \brief Type definitions for the NVIC Registers
+ @{
+ */
+
+/** \brief Structure type to access the Nested Vectored Interrupt Controller (NVIC).
+ */
+typedef struct
+{
+ __IO uint32_t ISER[8]; /*!< Offset: 0x000 (R/W) Interrupt Set Enable Register */
+ uint32_t RESERVED0[24];
+ __IO uint32_t ICER[8]; /*!< Offset: 0x080 (R/W) Interrupt Clear Enable Register */
+ uint32_t RSERVED1[24];
+ __IO uint32_t ISPR[8]; /*!< Offset: 0x100 (R/W) Interrupt Set Pending Register */
+ uint32_t RESERVED2[24];
+ __IO uint32_t ICPR[8]; /*!< Offset: 0x180 (R/W) Interrupt Clear Pending Register */
+ uint32_t RESERVED3[24];
+ __IO uint32_t IABR[8]; /*!< Offset: 0x200 (R/W) Interrupt Active bit Register */
+ uint32_t RESERVED4[56];
+ __IO uint8_t IP[240]; /*!< Offset: 0x300 (R/W) Interrupt Priority Register (8Bit wide) */
+ uint32_t RESERVED5[644];
+ __O uint32_t STIR; /*!< Offset: 0xE00 ( /W) Software Trigger Interrupt Register */
+} NVIC_Type;
+
+/* Software Triggered Interrupt Register Definitions */
+#define NVIC_STIR_INTID_Pos 0 /*!< STIR: INTLINESNUM Position */
+#define NVIC_STIR_INTID_Msk (0x1FFUL /*<< NVIC_STIR_INTID_Pos*/) /*!< STIR: INTLINESNUM Mask */
+
+/*@} end of group CMSIS_NVIC */
+
+
+/** \ingroup CMSIS_core_register
+ \defgroup CMSIS_SCB System Control Block (SCB)
+ \brief Type definitions for the System Control Block Registers
+ @{
+ */
+
+/** \brief Structure type to access the System Control Block (SCB).
+ */
+typedef struct
+{
+ __I uint32_t CPUID; /*!< Offset: 0x000 (R/ ) CPUID Base Register */
+ __IO uint32_t ICSR; /*!< Offset: 0x004 (R/W) Interrupt Control and State Register */
+ __IO uint32_t VTOR; /*!< Offset: 0x008 (R/W) Vector Table Offset Register */
+ __IO uint32_t AIRCR; /*!< Offset: 0x00C (R/W) Application Interrupt and Reset Control Register */
+ __IO uint32_t SCR; /*!< Offset: 0x010 (R/W) System Control Register */
+ __IO uint32_t CCR; /*!< Offset: 0x014 (R/W) Configuration Control Register */
+ __IO uint8_t SHP[12]; /*!< Offset: 0x018 (R/W) System Handlers Priority Registers (4-7, 8-11, 12-15) */
+ __IO uint32_t SHCSR; /*!< Offset: 0x024 (R/W) System Handler Control and State Register */
+ __IO uint32_t CFSR; /*!< Offset: 0x028 (R/W) Configurable Fault Status Register */
+ __IO uint32_t HFSR; /*!< Offset: 0x02C (R/W) HardFault Status Register */
+ __IO uint32_t DFSR; /*!< Offset: 0x030 (R/W) Debug Fault Status Register */
+ __IO uint32_t MMFAR; /*!< Offset: 0x034 (R/W) MemManage Fault Address Register */
+ __IO uint32_t BFAR; /*!< Offset: 0x038 (R/W) BusFault Address Register */
+ __IO uint32_t AFSR; /*!< Offset: 0x03C (R/W) Auxiliary Fault Status Register */
+ __I uint32_t PFR[2]; /*!< Offset: 0x040 (R/ ) Processor Feature Register */
+ __I uint32_t DFR; /*!< Offset: 0x048 (R/ ) Debug Feature Register */
+ __I uint32_t ADR; /*!< Offset: 0x04C (R/ ) Auxiliary Feature Register */
+ __I uint32_t MMFR[4]; /*!< Offset: 0x050 (R/ ) Memory Model Feature Register */
+ __I uint32_t ISAR[5]; /*!< Offset: 0x060 (R/ ) Instruction Set Attributes Register */
+ uint32_t RESERVED0[5];
+ __IO uint32_t CPACR; /*!< Offset: 0x088 (R/W) Coprocessor Access Control Register */
+} SCB_Type;
+
+/* SCB CPUID Register Definitions */
+#define SCB_CPUID_IMPLEMENTER_Pos 24 /*!< SCB CPUID: IMPLEMENTER Position */
+#define SCB_CPUID_IMPLEMENTER_Msk (0xFFUL << SCB_CPUID_IMPLEMENTER_Pos) /*!< SCB CPUID: IMPLEMENTER Mask */
+
+#define SCB_CPUID_VARIANT_Pos 20 /*!< SCB CPUID: VARIANT Position */
+#define SCB_CPUID_VARIANT_Msk (0xFUL << SCB_CPUID_VARIANT_Pos) /*!< SCB CPUID: VARIANT Mask */
+
+#define SCB_CPUID_ARCHITECTURE_Pos 16 /*!< SCB CPUID: ARCHITECTURE Position */
+#define SCB_CPUID_ARCHITECTURE_Msk (0xFUL << SCB_CPUID_ARCHITECTURE_Pos) /*!< SCB CPUID: ARCHITECTURE Mask */
+
+#define SCB_CPUID_PARTNO_Pos 4 /*!< SCB CPUID: PARTNO Position */
+#define SCB_CPUID_PARTNO_Msk (0xFFFUL << SCB_CPUID_PARTNO_Pos) /*!< SCB CPUID: PARTNO Mask */
+
+#define SCB_CPUID_REVISION_Pos 0 /*!< SCB CPUID: REVISION Position */
+#define SCB_CPUID_REVISION_Msk (0xFUL /*<< SCB_CPUID_REVISION_Pos*/) /*!< SCB CPUID: REVISION Mask */
+
+/* SCB Interrupt Control State Register Definitions */
+#define SCB_ICSR_NMIPENDSET_Pos 31 /*!< SCB ICSR: NMIPENDSET Position */
+#define SCB_ICSR_NMIPENDSET_Msk (1UL << SCB_ICSR_NMIPENDSET_Pos) /*!< SCB ICSR: NMIPENDSET Mask */
+
+#define SCB_ICSR_PENDSVSET_Pos 28 /*!< SCB ICSR: PENDSVSET Position */
+#define SCB_ICSR_PENDSVSET_Msk (1UL << SCB_ICSR_PENDSVSET_Pos) /*!< SCB ICSR: PENDSVSET Mask */
+
+#define SCB_ICSR_PENDSVCLR_Pos 27 /*!< SCB ICSR: PENDSVCLR Position */
+#define SCB_ICSR_PENDSVCLR_Msk (1UL << SCB_ICSR_PENDSVCLR_Pos) /*!< SCB ICSR: PENDSVCLR Mask */
+
+#define SCB_ICSR_PENDSTSET_Pos 26 /*!< SCB ICSR: PENDSTSET Position */
+#define SCB_ICSR_PENDSTSET_Msk (1UL << SCB_ICSR_PENDSTSET_Pos) /*!< SCB ICSR: PENDSTSET Mask */
+
+#define SCB_ICSR_PENDSTCLR_Pos 25 /*!< SCB ICSR: PENDSTCLR Position */
+#define SCB_ICSR_PENDSTCLR_Msk (1UL << SCB_ICSR_PENDSTCLR_Pos) /*!< SCB ICSR: PENDSTCLR Mask */
+
+#define SCB_ICSR_ISRPREEMPT_Pos 23 /*!< SCB ICSR: ISRPREEMPT Position */
+#define SCB_ICSR_ISRPREEMPT_Msk (1UL << SCB_ICSR_ISRPREEMPT_Pos) /*!< SCB ICSR: ISRPREEMPT Mask */
+
+#define SCB_ICSR_ISRPENDING_Pos 22 /*!< SCB ICSR: ISRPENDING Position */
+#define SCB_ICSR_ISRPENDING_Msk (1UL << SCB_ICSR_ISRPENDING_Pos) /*!< SCB ICSR: ISRPENDING Mask */
+
+#define SCB_ICSR_VECTPENDING_Pos 12 /*!< SCB ICSR: VECTPENDING Position */
+#define SCB_ICSR_VECTPENDING_Msk (0x1FFUL << SCB_ICSR_VECTPENDING_Pos) /*!< SCB ICSR: VECTPENDING Mask */
+
+#define SCB_ICSR_RETTOBASE_Pos 11 /*!< SCB ICSR: RETTOBASE Position */
+#define SCB_ICSR_RETTOBASE_Msk (1UL << SCB_ICSR_RETTOBASE_Pos) /*!< SCB ICSR: RETTOBASE Mask */
+
+#define SCB_ICSR_VECTACTIVE_Pos 0 /*!< SCB ICSR: VECTACTIVE Position */
+#define SCB_ICSR_VECTACTIVE_Msk (0x1FFUL /*<< SCB_ICSR_VECTACTIVE_Pos*/) /*!< SCB ICSR: VECTACTIVE Mask */
+
+/* SCB Vector Table Offset Register Definitions */
+#define SCB_VTOR_TBLOFF_Pos 7 /*!< SCB VTOR: TBLOFF Position */
+#define SCB_VTOR_TBLOFF_Msk (0x1FFFFFFUL << SCB_VTOR_TBLOFF_Pos) /*!< SCB VTOR: TBLOFF Mask */
+
+/* SCB Application Interrupt and Reset Control Register Definitions */
+#define SCB_AIRCR_VECTKEY_Pos 16 /*!< SCB AIRCR: VECTKEY Position */
+#define SCB_AIRCR_VECTKEY_Msk (0xFFFFUL << SCB_AIRCR_VECTKEY_Pos) /*!< SCB AIRCR: VECTKEY Mask */
+
+#define SCB_AIRCR_VECTKEYSTAT_Pos 16 /*!< SCB AIRCR: VECTKEYSTAT Position */
+#define SCB_AIRCR_VECTKEYSTAT_Msk (0xFFFFUL << SCB_AIRCR_VECTKEYSTAT_Pos) /*!< SCB AIRCR: VECTKEYSTAT Mask */
+
+#define SCB_AIRCR_ENDIANESS_Pos 15 /*!< SCB AIRCR: ENDIANESS Position */
+#define SCB_AIRCR_ENDIANESS_Msk (1UL << SCB_AIRCR_ENDIANESS_Pos) /*!< SCB AIRCR: ENDIANESS Mask */
+
+#define SCB_AIRCR_PRIGROUP_Pos 8 /*!< SCB AIRCR: PRIGROUP Position */
+#define SCB_AIRCR_PRIGROUP_Msk (7UL << SCB_AIRCR_PRIGROUP_Pos) /*!< SCB AIRCR: PRIGROUP Mask */
+
+#define SCB_AIRCR_SYSRESETREQ_Pos 2 /*!< SCB AIRCR: SYSRESETREQ Position */
+#define SCB_AIRCR_SYSRESETREQ_Msk (1UL << SCB_AIRCR_SYSRESETREQ_Pos) /*!< SCB AIRCR: SYSRESETREQ Mask */
+
+#define SCB_AIRCR_VECTCLRACTIVE_Pos 1 /*!< SCB AIRCR: VECTCLRACTIVE Position */
+#define SCB_AIRCR_VECTCLRACTIVE_Msk (1UL << SCB_AIRCR_VECTCLRACTIVE_Pos) /*!< SCB AIRCR: VECTCLRACTIVE Mask */
+
+#define SCB_AIRCR_VECTRESET_Pos 0 /*!< SCB AIRCR: VECTRESET Position */
+#define SCB_AIRCR_VECTRESET_Msk (1UL /*<< SCB_AIRCR_VECTRESET_Pos*/) /*!< SCB AIRCR: VECTRESET Mask */
+
+/* SCB System Control Register Definitions */
+#define SCB_SCR_SEVONPEND_Pos 4 /*!< SCB SCR: SEVONPEND Position */
+#define SCB_SCR_SEVONPEND_Msk (1UL << SCB_SCR_SEVONPEND_Pos) /*!< SCB SCR: SEVONPEND Mask */
+
+#define SCB_SCR_SLEEPDEEP_Pos 2 /*!< SCB SCR: SLEEPDEEP Position */
+#define SCB_SCR_SLEEPDEEP_Msk (1UL << SCB_SCR_SLEEPDEEP_Pos) /*!< SCB SCR: SLEEPDEEP Mask */
+
+#define SCB_SCR_SLEEPONEXIT_Pos 1 /*!< SCB SCR: SLEEPONEXIT Position */
+#define SCB_SCR_SLEEPONEXIT_Msk (1UL << SCB_SCR_SLEEPONEXIT_Pos) /*!< SCB SCR: SLEEPONEXIT Mask */
+
+/* SCB Configuration Control Register Definitions */
+#define SCB_CCR_STKALIGN_Pos 9 /*!< SCB CCR: STKALIGN Position */
+#define SCB_CCR_STKALIGN_Msk (1UL << SCB_CCR_STKALIGN_Pos) /*!< SCB CCR: STKALIGN Mask */
+
+#define SCB_CCR_BFHFNMIGN_Pos 8 /*!< SCB CCR: BFHFNMIGN Position */
+#define SCB_CCR_BFHFNMIGN_Msk (1UL << SCB_CCR_BFHFNMIGN_Pos) /*!< SCB CCR: BFHFNMIGN Mask */
+
+#define SCB_CCR_DIV_0_TRP_Pos 4 /*!< SCB CCR: DIV_0_TRP Position */
+#define SCB_CCR_DIV_0_TRP_Msk (1UL << SCB_CCR_DIV_0_TRP_Pos) /*!< SCB CCR: DIV_0_TRP Mask */
+
+#define SCB_CCR_UNALIGN_TRP_Pos 3 /*!< SCB CCR: UNALIGN_TRP Position */
+#define SCB_CCR_UNALIGN_TRP_Msk (1UL << SCB_CCR_UNALIGN_TRP_Pos) /*!< SCB CCR: UNALIGN_TRP Mask */
+
+#define SCB_CCR_USERSETMPEND_Pos 1 /*!< SCB CCR: USERSETMPEND Position */
+#define SCB_CCR_USERSETMPEND_Msk (1UL << SCB_CCR_USERSETMPEND_Pos) /*!< SCB CCR: USERSETMPEND Mask */
+
+#define SCB_CCR_NONBASETHRDENA_Pos 0 /*!< SCB CCR: NONBASETHRDENA Position */
+#define SCB_CCR_NONBASETHRDENA_Msk (1UL /*<< SCB_CCR_NONBASETHRDENA_Pos*/) /*!< SCB CCR: NONBASETHRDENA Mask */
+
+/* SCB System Handler Control and State Register Definitions */
+#define SCB_SHCSR_USGFAULTENA_Pos 18 /*!< SCB SHCSR: USGFAULTENA Position */
+#define SCB_SHCSR_USGFAULTENA_Msk (1UL << SCB_SHCSR_USGFAULTENA_Pos) /*!< SCB SHCSR: USGFAULTENA Mask */
+
+#define SCB_SHCSR_BUSFAULTENA_Pos 17 /*!< SCB SHCSR: BUSFAULTENA Position */
+#define SCB_SHCSR_BUSFAULTENA_Msk (1UL << SCB_SHCSR_BUSFAULTENA_Pos) /*!< SCB SHCSR: BUSFAULTENA Mask */
+
+#define SCB_SHCSR_MEMFAULTENA_Pos 16 /*!< SCB SHCSR: MEMFAULTENA Position */
+#define SCB_SHCSR_MEMFAULTENA_Msk (1UL << SCB_SHCSR_MEMFAULTENA_Pos) /*!< SCB SHCSR: MEMFAULTENA Mask */
+
+#define SCB_SHCSR_SVCALLPENDED_Pos 15 /*!< SCB SHCSR: SVCALLPENDED Position */
+#define SCB_SHCSR_SVCALLPENDED_Msk (1UL << SCB_SHCSR_SVCALLPENDED_Pos) /*!< SCB SHCSR: SVCALLPENDED Mask */
+
+#define SCB_SHCSR_BUSFAULTPENDED_Pos 14 /*!< SCB SHCSR: BUSFAULTPENDED Position */
+#define SCB_SHCSR_BUSFAULTPENDED_Msk (1UL << SCB_SHCSR_BUSFAULTPENDED_Pos) /*!< SCB SHCSR: BUSFAULTPENDED Mask */
+
+#define SCB_SHCSR_MEMFAULTPENDED_Pos 13 /*!< SCB SHCSR: MEMFAULTPENDED Position */
+#define SCB_SHCSR_MEMFAULTPENDED_Msk (1UL << SCB_SHCSR_MEMFAULTPENDED_Pos) /*!< SCB SHCSR: MEMFAULTPENDED Mask */
+
+#define SCB_SHCSR_USGFAULTPENDED_Pos 12 /*!< SCB SHCSR: USGFAULTPENDED Position */
+#define SCB_SHCSR_USGFAULTPENDED_Msk (1UL << SCB_SHCSR_USGFAULTPENDED_Pos) /*!< SCB SHCSR: USGFAULTPENDED Mask */
+
+#define SCB_SHCSR_SYSTICKACT_Pos 11 /*!< SCB SHCSR: SYSTICKACT Position */
+#define SCB_SHCSR_SYSTICKACT_Msk (1UL << SCB_SHCSR_SYSTICKACT_Pos) /*!< SCB SHCSR: SYSTICKACT Mask */
+
+#define SCB_SHCSR_PENDSVACT_Pos 10 /*!< SCB SHCSR: PENDSVACT Position */
+#define SCB_SHCSR_PENDSVACT_Msk (1UL << SCB_SHCSR_PENDSVACT_Pos) /*!< SCB SHCSR: PENDSVACT Mask */
+
+#define SCB_SHCSR_MONITORACT_Pos 8 /*!< SCB SHCSR: MONITORACT Position */
+#define SCB_SHCSR_MONITORACT_Msk (1UL << SCB_SHCSR_MONITORACT_Pos) /*!< SCB SHCSR: MONITORACT Mask */
+
+#define SCB_SHCSR_SVCALLACT_Pos 7 /*!< SCB SHCSR: SVCALLACT Position */
+#define SCB_SHCSR_SVCALLACT_Msk (1UL << SCB_SHCSR_SVCALLACT_Pos) /*!< SCB SHCSR: SVCALLACT Mask */
+
+#define SCB_SHCSR_USGFAULTACT_Pos 3 /*!< SCB SHCSR: USGFAULTACT Position */
+#define SCB_SHCSR_USGFAULTACT_Msk (1UL << SCB_SHCSR_USGFAULTACT_Pos) /*!< SCB SHCSR: USGFAULTACT Mask */
+
+#define SCB_SHCSR_BUSFAULTACT_Pos 1 /*!< SCB SHCSR: BUSFAULTACT Position */
+#define SCB_SHCSR_BUSFAULTACT_Msk (1UL << SCB_SHCSR_BUSFAULTACT_Pos) /*!< SCB SHCSR: BUSFAULTACT Mask */
+
+#define SCB_SHCSR_MEMFAULTACT_Pos 0 /*!< SCB SHCSR: MEMFAULTACT Position */
+#define SCB_SHCSR_MEMFAULTACT_Msk (1UL /*<< SCB_SHCSR_MEMFAULTACT_Pos*/) /*!< SCB SHCSR: MEMFAULTACT Mask */
+
+/* SCB Configurable Fault Status Registers Definitions */
+#define SCB_CFSR_USGFAULTSR_Pos 16 /*!< SCB CFSR: Usage Fault Status Register Position */
+#define SCB_CFSR_USGFAULTSR_Msk (0xFFFFUL << SCB_CFSR_USGFAULTSR_Pos) /*!< SCB CFSR: Usage Fault Status Register Mask */
+
+#define SCB_CFSR_BUSFAULTSR_Pos 8 /*!< SCB CFSR: Bus Fault Status Register Position */
+#define SCB_CFSR_BUSFAULTSR_Msk (0xFFUL << SCB_CFSR_BUSFAULTSR_Pos) /*!< SCB CFSR: Bus Fault Status Register Mask */
+
+#define SCB_CFSR_MEMFAULTSR_Pos 0 /*!< SCB CFSR: Memory Manage Fault Status Register Position */
+#define SCB_CFSR_MEMFAULTSR_Msk (0xFFUL /*<< SCB_CFSR_MEMFAULTSR_Pos*/) /*!< SCB CFSR: Memory Manage Fault Status Register Mask */
+
+/* SCB Hard Fault Status Registers Definitions */
+#define SCB_HFSR_DEBUGEVT_Pos 31 /*!< SCB HFSR: DEBUGEVT Position */
+#define SCB_HFSR_DEBUGEVT_Msk (1UL << SCB_HFSR_DEBUGEVT_Pos) /*!< SCB HFSR: DEBUGEVT Mask */
+
+#define SCB_HFSR_FORCED_Pos 30 /*!< SCB HFSR: FORCED Position */
+#define SCB_HFSR_FORCED_Msk (1UL << SCB_HFSR_FORCED_Pos) /*!< SCB HFSR: FORCED Mask */
+
+#define SCB_HFSR_VECTTBL_Pos 1 /*!< SCB HFSR: VECTTBL Position */
+#define SCB_HFSR_VECTTBL_Msk (1UL << SCB_HFSR_VECTTBL_Pos) /*!< SCB HFSR: VECTTBL Mask */
+
+/* SCB Debug Fault Status Register Definitions */
+#define SCB_DFSR_EXTERNAL_Pos 4 /*!< SCB DFSR: EXTERNAL Position */
+#define SCB_DFSR_EXTERNAL_Msk (1UL << SCB_DFSR_EXTERNAL_Pos) /*!< SCB DFSR: EXTERNAL Mask */
+
+#define SCB_DFSR_VCATCH_Pos 3 /*!< SCB DFSR: VCATCH Position */
+#define SCB_DFSR_VCATCH_Msk (1UL << SCB_DFSR_VCATCH_Pos) /*!< SCB DFSR: VCATCH Mask */
+
+#define SCB_DFSR_DWTTRAP_Pos 2 /*!< SCB DFSR: DWTTRAP Position */
+#define SCB_DFSR_DWTTRAP_Msk (1UL << SCB_DFSR_DWTTRAP_Pos) /*!< SCB DFSR: DWTTRAP Mask */
+
+#define SCB_DFSR_BKPT_Pos 1 /*!< SCB DFSR: BKPT Position */
+#define SCB_DFSR_BKPT_Msk (1UL << SCB_DFSR_BKPT_Pos) /*!< SCB DFSR: BKPT Mask */
+
+#define SCB_DFSR_HALTED_Pos 0 /*!< SCB DFSR: HALTED Position */
+#define SCB_DFSR_HALTED_Msk (1UL /*<< SCB_DFSR_HALTED_Pos*/) /*!< SCB DFSR: HALTED Mask */
+
+/*@} end of group CMSIS_SCB */
+
+
+/** \ingroup CMSIS_core_register
+ \defgroup CMSIS_SCnSCB System Controls not in SCB (SCnSCB)
+ \brief Type definitions for the System Control and ID Register not in the SCB
+ @{
+ */
+
+/** \brief Structure type to access the System Control and ID Register not in the SCB.
+ */
+typedef struct
+{
+ uint32_t RESERVED0[1];
+ __I uint32_t ICTR; /*!< Offset: 0x004 (R/ ) Interrupt Controller Type Register */
+ __IO uint32_t ACTLR; /*!< Offset: 0x008 (R/W) Auxiliary Control Register */
+} SCnSCB_Type;
+
+/* Interrupt Controller Type Register Definitions */
+#define SCnSCB_ICTR_INTLINESNUM_Pos 0 /*!< ICTR: INTLINESNUM Position */
+#define SCnSCB_ICTR_INTLINESNUM_Msk (0xFUL /*<< SCnSCB_ICTR_INTLINESNUM_Pos*/) /*!< ICTR: INTLINESNUM Mask */
+
+/* Auxiliary Control Register Definitions */
+#define SCnSCB_ACTLR_DISOOFP_Pos 9 /*!< ACTLR: DISOOFP Position */
+#define SCnSCB_ACTLR_DISOOFP_Msk (1UL << SCnSCB_ACTLR_DISOOFP_Pos) /*!< ACTLR: DISOOFP Mask */
+
+#define SCnSCB_ACTLR_DISFPCA_Pos 8 /*!< ACTLR: DISFPCA Position */
+#define SCnSCB_ACTLR_DISFPCA_Msk (1UL << SCnSCB_ACTLR_DISFPCA_Pos) /*!< ACTLR: DISFPCA Mask */
+
+#define SCnSCB_ACTLR_DISFOLD_Pos 2 /*!< ACTLR: DISFOLD Position */
+#define SCnSCB_ACTLR_DISFOLD_Msk (1UL << SCnSCB_ACTLR_DISFOLD_Pos) /*!< ACTLR: DISFOLD Mask */
+
+#define SCnSCB_ACTLR_DISDEFWBUF_Pos 1 /*!< ACTLR: DISDEFWBUF Position */
+#define SCnSCB_ACTLR_DISDEFWBUF_Msk (1UL << SCnSCB_ACTLR_DISDEFWBUF_Pos) /*!< ACTLR: DISDEFWBUF Mask */
+
+#define SCnSCB_ACTLR_DISMCYCINT_Pos 0 /*!< ACTLR: DISMCYCINT Position */
+#define SCnSCB_ACTLR_DISMCYCINT_Msk (1UL /*<< SCnSCB_ACTLR_DISMCYCINT_Pos*/) /*!< ACTLR: DISMCYCINT Mask */
+
+/*@} end of group CMSIS_SCnotSCB */
+
+
+/** \ingroup CMSIS_core_register
+ \defgroup CMSIS_SysTick System Tick Timer (SysTick)
+ \brief Type definitions for the System Timer Registers.
+ @{
+ */
+
+/** \brief Structure type to access the System Timer (SysTick).
+ */
+typedef struct
+{
+ __IO uint32_t CTRL; /*!< Offset: 0x000 (R/W) SysTick Control and Status Register */
+ __IO uint32_t LOAD; /*!< Offset: 0x004 (R/W) SysTick Reload Value Register */
+ __IO uint32_t VAL; /*!< Offset: 0x008 (R/W) SysTick Current Value Register */
+ __I uint32_t CALIB; /*!< Offset: 0x00C (R/ ) SysTick Calibration Register */
+} SysTick_Type;
+
+/* SysTick Control / Status Register Definitions */
+#define SysTick_CTRL_COUNTFLAG_Pos 16 /*!< SysTick CTRL: COUNTFLAG Position */
+#define SysTick_CTRL_COUNTFLAG_Msk (1UL << SysTick_CTRL_COUNTFLAG_Pos) /*!< SysTick CTRL: COUNTFLAG Mask */
+
+#define SysTick_CTRL_CLKSOURCE_Pos 2 /*!< SysTick CTRL: CLKSOURCE Position */
+#define SysTick_CTRL_CLKSOURCE_Msk (1UL << SysTick_CTRL_CLKSOURCE_Pos) /*!< SysTick CTRL: CLKSOURCE Mask */
+
+#define SysTick_CTRL_TICKINT_Pos 1 /*!< SysTick CTRL: TICKINT Position */
+#define SysTick_CTRL_TICKINT_Msk (1UL << SysTick_CTRL_TICKINT_Pos) /*!< SysTick CTRL: TICKINT Mask */
+
+#define SysTick_CTRL_ENABLE_Pos 0 /*!< SysTick CTRL: ENABLE Position */
+#define SysTick_CTRL_ENABLE_Msk (1UL /*<< SysTick_CTRL_ENABLE_Pos*/) /*!< SysTick CTRL: ENABLE Mask */
+
+/* SysTick Reload Register Definitions */
+#define SysTick_LOAD_RELOAD_Pos 0 /*!< SysTick LOAD: RELOAD Position */
+#define SysTick_LOAD_RELOAD_Msk (0xFFFFFFUL /*<< SysTick_LOAD_RELOAD_Pos*/) /*!< SysTick LOAD: RELOAD Mask */
+
+/* SysTick Current Register Definitions */
+#define SysTick_VAL_CURRENT_Pos 0 /*!< SysTick VAL: CURRENT Position */
+#define SysTick_VAL_CURRENT_Msk (0xFFFFFFUL /*<< SysTick_VAL_CURRENT_Pos*/) /*!< SysTick VAL: CURRENT Mask */
+
+/* SysTick Calibration Register Definitions */
+#define SysTick_CALIB_NOREF_Pos 31 /*!< SysTick CALIB: NOREF Position */
+#define SysTick_CALIB_NOREF_Msk (1UL << SysTick_CALIB_NOREF_Pos) /*!< SysTick CALIB: NOREF Mask */
+
+#define SysTick_CALIB_SKEW_Pos 30 /*!< SysTick CALIB: SKEW Position */
+#define SysTick_CALIB_SKEW_Msk (1UL << SysTick_CALIB_SKEW_Pos) /*!< SysTick CALIB: SKEW Mask */
+
+#define SysTick_CALIB_TENMS_Pos 0 /*!< SysTick CALIB: TENMS Position */
+#define SysTick_CALIB_TENMS_Msk (0xFFFFFFUL /*<< SysTick_CALIB_TENMS_Pos*/) /*!< SysTick CALIB: TENMS Mask */
+
+/*@} end of group CMSIS_SysTick */
+
+
+/** \ingroup CMSIS_core_register
+ \defgroup CMSIS_ITM Instrumentation Trace Macrocell (ITM)
+ \brief Type definitions for the Instrumentation Trace Macrocell (ITM)
+ @{
+ */
+
+/** \brief Structure type to access the Instrumentation Trace Macrocell Register (ITM).
+ */
+typedef struct
+{
+ __O union
+ {
+ __O uint8_t u8; /*!< Offset: 0x000 ( /W) ITM Stimulus Port 8-bit */
+ __O uint16_t u16; /*!< Offset: 0x000 ( /W) ITM Stimulus Port 16-bit */
+ __O uint32_t u32; /*!< Offset: 0x000 ( /W) ITM Stimulus Port 32-bit */
+ } PORT [32]; /*!< Offset: 0x000 ( /W) ITM Stimulus Port Registers */
+ uint32_t RESERVED0[864]; /* padding: 0x080 .. 0xDFF (next register at 0xE00) */
+ __IO uint32_t TER; /*!< Offset: 0xE00 (R/W) ITM Trace Enable Register */
+ uint32_t RESERVED1[15]; /* padding: 0xE04 .. 0xE3F */
+ __IO uint32_t TPR; /*!< Offset: 0xE40 (R/W) ITM Trace Privilege Register */
+ uint32_t RESERVED2[15]; /* padding: 0xE44 .. 0xE7F */
+ __IO uint32_t TCR; /*!< Offset: 0xE80 (R/W) ITM Trace Control Register */
+ uint32_t RESERVED3[29]; /* padding: 0xE84 .. 0xEF7 */
+ __O uint32_t IWR; /*!< Offset: 0xEF8 ( /W) ITM Integration Write Register */
+ __I uint32_t IRR; /*!< Offset: 0xEFC (R/ ) ITM Integration Read Register */
+ __IO uint32_t IMCR; /*!< Offset: 0xF00 (R/W) ITM Integration Mode Control Register */
+ uint32_t RESERVED4[43]; /* padding: 0xF04 .. 0xFAF */
+ __O uint32_t LAR; /*!< Offset: 0xFB0 ( /W) ITM Lock Access Register */
+ __I uint32_t LSR; /*!< Offset: 0xFB4 (R/ ) ITM Lock Status Register */
+ uint32_t RESERVED5[6]; /* padding: 0xFB8 .. 0xFCF */
+ __I uint32_t PID4; /*!< Offset: 0xFD0 (R/ ) ITM Peripheral Identification Register #4 */
+ __I uint32_t PID5; /*!< Offset: 0xFD4 (R/ ) ITM Peripheral Identification Register #5 */
+ __I uint32_t PID6; /*!< Offset: 0xFD8 (R/ ) ITM Peripheral Identification Register #6 */
+ __I uint32_t PID7; /*!< Offset: 0xFDC (R/ ) ITM Peripheral Identification Register #7 */
+ __I uint32_t PID0; /*!< Offset: 0xFE0 (R/ ) ITM Peripheral Identification Register #0 */
+ __I uint32_t PID1; /*!< Offset: 0xFE4 (R/ ) ITM Peripheral Identification Register #1 */
+ __I uint32_t PID2; /*!< Offset: 0xFE8 (R/ ) ITM Peripheral Identification Register #2 */
+ __I uint32_t PID3; /*!< Offset: 0xFEC (R/ ) ITM Peripheral Identification Register #3 */
+ __I uint32_t CID0; /*!< Offset: 0xFF0 (R/ ) ITM Component Identification Register #0 */
+ __I uint32_t CID1; /*!< Offset: 0xFF4 (R/ ) ITM Component Identification Register #1 */
+ __I uint32_t CID2; /*!< Offset: 0xFF8 (R/ ) ITM Component Identification Register #2 */
+ __I uint32_t CID3; /*!< Offset: 0xFFC (R/ ) ITM Component Identification Register #3 */
+} ITM_Type;
+
+/* ITM Trace Privilege Register Definitions */
+#define ITM_TPR_PRIVMASK_Pos 0 /*!< ITM TPR: PRIVMASK Position */
+#define ITM_TPR_PRIVMASK_Msk (0xFUL /*<< ITM_TPR_PRIVMASK_Pos*/) /*!< ITM TPR: PRIVMASK Mask (4-bit field) */
+
+/* ITM Trace Control Register Definitions */
+#define ITM_TCR_BUSY_Pos 23 /*!< ITM TCR: BUSY Position */
+#define ITM_TCR_BUSY_Msk (1UL << ITM_TCR_BUSY_Pos) /*!< ITM TCR: BUSY Mask */
+
+#define ITM_TCR_TraceBusID_Pos 16 /*!< ITM TCR: ATBID Position */
+#define ITM_TCR_TraceBusID_Msk (0x7FUL << ITM_TCR_TraceBusID_Pos) /*!< ITM TCR: ATBID Mask (7-bit field) */
+
+#define ITM_TCR_GTSFREQ_Pos 10 /*!< ITM TCR: Global timestamp frequency Position */
+#define ITM_TCR_GTSFREQ_Msk (3UL << ITM_TCR_GTSFREQ_Pos) /*!< ITM TCR: Global timestamp frequency Mask (2-bit field) */
+
+#define ITM_TCR_TSPrescale_Pos 8 /*!< ITM TCR: TSPrescale Position */
+#define ITM_TCR_TSPrescale_Msk (3UL << ITM_TCR_TSPrescale_Pos) /*!< ITM TCR: TSPrescale Mask (2-bit field) */
+
+#define ITM_TCR_SWOENA_Pos 4 /*!< ITM TCR: SWOENA Position */
+#define ITM_TCR_SWOENA_Msk (1UL << ITM_TCR_SWOENA_Pos) /*!< ITM TCR: SWOENA Mask */
+
+#define ITM_TCR_DWTENA_Pos 3 /*!< ITM TCR: DWTENA Position */
+#define ITM_TCR_DWTENA_Msk (1UL << ITM_TCR_DWTENA_Pos) /*!< ITM TCR: DWTENA Mask */
+
+#define ITM_TCR_SYNCENA_Pos 2 /*!< ITM TCR: SYNCENA Position */
+#define ITM_TCR_SYNCENA_Msk (1UL << ITM_TCR_SYNCENA_Pos) /*!< ITM TCR: SYNCENA Mask */
+
+#define ITM_TCR_TSENA_Pos 1 /*!< ITM TCR: TSENA Position */
+#define ITM_TCR_TSENA_Msk (1UL << ITM_TCR_TSENA_Pos) /*!< ITM TCR: TSENA Mask */
+
+#define ITM_TCR_ITMENA_Pos 0 /*!< ITM TCR: ITM Enable bit Position */
+#define ITM_TCR_ITMENA_Msk (1UL /*<< ITM_TCR_ITMENA_Pos*/) /*!< ITM TCR: ITM Enable bit Mask */
+
+/* ITM Integration Write Register Definitions */
+#define ITM_IWR_ATVALIDM_Pos 0 /*!< ITM IWR: ATVALIDM Position */
+#define ITM_IWR_ATVALIDM_Msk (1UL /*<< ITM_IWR_ATVALIDM_Pos*/) /*!< ITM IWR: ATVALIDM Mask */
+
+/* ITM Integration Read Register Definitions */
+#define ITM_IRR_ATREADYM_Pos 0 /*!< ITM IRR: ATREADYM Position */
+#define ITM_IRR_ATREADYM_Msk (1UL /*<< ITM_IRR_ATREADYM_Pos*/) /*!< ITM IRR: ATREADYM Mask */
+
+/* ITM Integration Mode Control Register Definitions */
+#define ITM_IMCR_INTEGRATION_Pos 0 /*!< ITM IMCR: INTEGRATION Position */
+#define ITM_IMCR_INTEGRATION_Msk (1UL /*<< ITM_IMCR_INTEGRATION_Pos*/) /*!< ITM IMCR: INTEGRATION Mask */
+
+/* ITM Lock Status Register Definitions */
+#define ITM_LSR_ByteAcc_Pos 2 /*!< ITM LSR: ByteAcc Position */
+#define ITM_LSR_ByteAcc_Msk (1UL << ITM_LSR_ByteAcc_Pos) /*!< ITM LSR: ByteAcc Mask */
+
+#define ITM_LSR_Access_Pos 1 /*!< ITM LSR: Access Position */
+#define ITM_LSR_Access_Msk (1UL << ITM_LSR_Access_Pos) /*!< ITM LSR: Access Mask */
+
+#define ITM_LSR_Present_Pos 0 /*!< ITM LSR: Present Position */
+#define ITM_LSR_Present_Msk (1UL /*<< ITM_LSR_Present_Pos*/) /*!< ITM LSR: Present Mask */
+
+/*@}*/ /* end of group CMSIS_ITM */
+
+
+/** \ingroup CMSIS_core_register
+ \defgroup CMSIS_DWT Data Watchpoint and Trace (DWT)
+ \brief Type definitions for the Data Watchpoint and Trace (DWT)
+ @{
+ */
+
+/** \brief Structure type to access the Data Watchpoint and Trace Register (DWT).
+ */
+typedef struct
+{
+ __IO uint32_t CTRL; /*!< Offset: 0x000 (R/W) Control Register */
+ __IO uint32_t CYCCNT; /*!< Offset: 0x004 (R/W) Cycle Count Register */
+ __IO uint32_t CPICNT; /*!< Offset: 0x008 (R/W) CPI Count Register */
+ __IO uint32_t EXCCNT; /*!< Offset: 0x00C (R/W) Exception Overhead Count Register */
+ __IO uint32_t SLEEPCNT; /*!< Offset: 0x010 (R/W) Sleep Count Register */
+ __IO uint32_t LSUCNT; /*!< Offset: 0x014 (R/W) LSU Count Register */
+ __IO uint32_t FOLDCNT; /*!< Offset: 0x018 (R/W) Folded-instruction Count Register */
+ __I uint32_t PCSR; /*!< Offset: 0x01C (R/ ) Program Counter Sample Register */
+ __IO uint32_t COMP0; /*!< Offset: 0x020 (R/W) Comparator Register 0 */
+ __IO uint32_t MASK0; /*!< Offset: 0x024 (R/W) Mask Register 0 */
+ __IO uint32_t FUNCTION0; /*!< Offset: 0x028 (R/W) Function Register 0 */
+ uint32_t RESERVED0[1]; /* padding: 0x02C (next comparator bank at 0x030) */
+ __IO uint32_t COMP1; /*!< Offset: 0x030 (R/W) Comparator Register 1 */
+ __IO uint32_t MASK1; /*!< Offset: 0x034 (R/W) Mask Register 1 */
+ __IO uint32_t FUNCTION1; /*!< Offset: 0x038 (R/W) Function Register 1 */
+ uint32_t RESERVED1[1]; /* padding: 0x03C */
+ __IO uint32_t COMP2; /*!< Offset: 0x040 (R/W) Comparator Register 2 */
+ __IO uint32_t MASK2; /*!< Offset: 0x044 (R/W) Mask Register 2 */
+ __IO uint32_t FUNCTION2; /*!< Offset: 0x048 (R/W) Function Register 2 */
+ uint32_t RESERVED2[1]; /* padding: 0x04C */
+ __IO uint32_t COMP3; /*!< Offset: 0x050 (R/W) Comparator Register 3 */
+ __IO uint32_t MASK3; /*!< Offset: 0x054 (R/W) Mask Register 3 */
+ __IO uint32_t FUNCTION3; /*!< Offset: 0x058 (R/W) Function Register 3 */
+} DWT_Type;
+
+/* DWT Control Register Definitions */
+#define DWT_CTRL_NUMCOMP_Pos 28 /*!< DWT CTRL: NUMCOMP Position */
+#define DWT_CTRL_NUMCOMP_Msk (0xFUL << DWT_CTRL_NUMCOMP_Pos) /*!< DWT CTRL: NUMCOMP Mask (4-bit field) */
+
+#define DWT_CTRL_NOTRCPKT_Pos 27 /*!< DWT CTRL: NOTRCPKT Position */
+#define DWT_CTRL_NOTRCPKT_Msk (0x1UL << DWT_CTRL_NOTRCPKT_Pos) /*!< DWT CTRL: NOTRCPKT Mask */
+
+#define DWT_CTRL_NOEXTTRIG_Pos 26 /*!< DWT CTRL: NOEXTTRIG Position */
+#define DWT_CTRL_NOEXTTRIG_Msk (0x1UL << DWT_CTRL_NOEXTTRIG_Pos) /*!< DWT CTRL: NOEXTTRIG Mask */
+
+#define DWT_CTRL_NOCYCCNT_Pos 25 /*!< DWT CTRL: NOCYCCNT Position */
+#define DWT_CTRL_NOCYCCNT_Msk (0x1UL << DWT_CTRL_NOCYCCNT_Pos) /*!< DWT CTRL: NOCYCCNT Mask */
+
+#define DWT_CTRL_NOPRFCNT_Pos 24 /*!< DWT CTRL: NOPRFCNT Position */
+#define DWT_CTRL_NOPRFCNT_Msk (0x1UL << DWT_CTRL_NOPRFCNT_Pos) /*!< DWT CTRL: NOPRFCNT Mask */
+
+#define DWT_CTRL_CYCEVTENA_Pos 22 /*!< DWT CTRL: CYCEVTENA Position */
+#define DWT_CTRL_CYCEVTENA_Msk (0x1UL << DWT_CTRL_CYCEVTENA_Pos) /*!< DWT CTRL: CYCEVTENA Mask */
+
+#define DWT_CTRL_FOLDEVTENA_Pos 21 /*!< DWT CTRL: FOLDEVTENA Position */
+#define DWT_CTRL_FOLDEVTENA_Msk (0x1UL << DWT_CTRL_FOLDEVTENA_Pos) /*!< DWT CTRL: FOLDEVTENA Mask */
+
+#define DWT_CTRL_LSUEVTENA_Pos 20 /*!< DWT CTRL: LSUEVTENA Position */
+#define DWT_CTRL_LSUEVTENA_Msk (0x1UL << DWT_CTRL_LSUEVTENA_Pos) /*!< DWT CTRL: LSUEVTENA Mask */
+
+#define DWT_CTRL_SLEEPEVTENA_Pos 19 /*!< DWT CTRL: SLEEPEVTENA Position */
+#define DWT_CTRL_SLEEPEVTENA_Msk (0x1UL << DWT_CTRL_SLEEPEVTENA_Pos) /*!< DWT CTRL: SLEEPEVTENA Mask */
+
+#define DWT_CTRL_EXCEVTENA_Pos 18 /*!< DWT CTRL: EXCEVTENA Position */
+#define DWT_CTRL_EXCEVTENA_Msk (0x1UL << DWT_CTRL_EXCEVTENA_Pos) /*!< DWT CTRL: EXCEVTENA Mask */
+
+#define DWT_CTRL_CPIEVTENA_Pos 17 /*!< DWT CTRL: CPIEVTENA Position */
+#define DWT_CTRL_CPIEVTENA_Msk (0x1UL << DWT_CTRL_CPIEVTENA_Pos) /*!< DWT CTRL: CPIEVTENA Mask */
+
+#define DWT_CTRL_EXCTRCENA_Pos 16 /*!< DWT CTRL: EXCTRCENA Position */
+#define DWT_CTRL_EXCTRCENA_Msk (0x1UL << DWT_CTRL_EXCTRCENA_Pos) /*!< DWT CTRL: EXCTRCENA Mask */
+
+#define DWT_CTRL_PCSAMPLENA_Pos 12 /*!< DWT CTRL: PCSAMPLENA Position */
+#define DWT_CTRL_PCSAMPLENA_Msk (0x1UL << DWT_CTRL_PCSAMPLENA_Pos) /*!< DWT CTRL: PCSAMPLENA Mask */
+
+#define DWT_CTRL_SYNCTAP_Pos 10 /*!< DWT CTRL: SYNCTAP Position */
+#define DWT_CTRL_SYNCTAP_Msk (0x3UL << DWT_CTRL_SYNCTAP_Pos) /*!< DWT CTRL: SYNCTAP Mask (2-bit field) */
+
+#define DWT_CTRL_CYCTAP_Pos 9 /*!< DWT CTRL: CYCTAP Position */
+#define DWT_CTRL_CYCTAP_Msk (0x1UL << DWT_CTRL_CYCTAP_Pos) /*!< DWT CTRL: CYCTAP Mask */
+
+#define DWT_CTRL_POSTINIT_Pos 5 /*!< DWT CTRL: POSTINIT Position */
+#define DWT_CTRL_POSTINIT_Msk (0xFUL << DWT_CTRL_POSTINIT_Pos) /*!< DWT CTRL: POSTINIT Mask (4-bit field) */
+
+#define DWT_CTRL_POSTPRESET_Pos 1 /*!< DWT CTRL: POSTPRESET Position */
+#define DWT_CTRL_POSTPRESET_Msk (0xFUL << DWT_CTRL_POSTPRESET_Pos) /*!< DWT CTRL: POSTPRESET Mask (4-bit field) */
+
+#define DWT_CTRL_CYCCNTENA_Pos 0 /*!< DWT CTRL: CYCCNTENA Position */
+#define DWT_CTRL_CYCCNTENA_Msk (0x1UL /*<< DWT_CTRL_CYCCNTENA_Pos*/) /*!< DWT CTRL: CYCCNTENA Mask */
+
+/* DWT CPI Count Register Definitions */
+#define DWT_CPICNT_CPICNT_Pos 0 /*!< DWT CPICNT: CPICNT Position */
+#define DWT_CPICNT_CPICNT_Msk (0xFFUL /*<< DWT_CPICNT_CPICNT_Pos*/) /*!< DWT CPICNT: CPICNT Mask (8-bit counter) */
+
+/* DWT Exception Overhead Count Register Definitions */
+#define DWT_EXCCNT_EXCCNT_Pos 0 /*!< DWT EXCCNT: EXCCNT Position */
+#define DWT_EXCCNT_EXCCNT_Msk (0xFFUL /*<< DWT_EXCCNT_EXCCNT_Pos*/) /*!< DWT EXCCNT: EXCCNT Mask (8-bit counter) */
+
+/* DWT Sleep Count Register Definitions */
+#define DWT_SLEEPCNT_SLEEPCNT_Pos 0 /*!< DWT SLEEPCNT: SLEEPCNT Position */
+#define DWT_SLEEPCNT_SLEEPCNT_Msk (0xFFUL /*<< DWT_SLEEPCNT_SLEEPCNT_Pos*/) /*!< DWT SLEEPCNT: SLEEPCNT Mask (8-bit counter) */
+
+/* DWT LSU Count Register Definitions */
+#define DWT_LSUCNT_LSUCNT_Pos 0 /*!< DWT LSUCNT: LSUCNT Position */
+#define DWT_LSUCNT_LSUCNT_Msk (0xFFUL /*<< DWT_LSUCNT_LSUCNT_Pos*/) /*!< DWT LSUCNT: LSUCNT Mask (8-bit counter) */
+
+/* DWT Folded-instruction Count Register Definitions */
+#define DWT_FOLDCNT_FOLDCNT_Pos 0 /*!< DWT FOLDCNT: FOLDCNT Position */
+#define DWT_FOLDCNT_FOLDCNT_Msk (0xFFUL /*<< DWT_FOLDCNT_FOLDCNT_Pos*/) /*!< DWT FOLDCNT: FOLDCNT Mask (8-bit counter) */
+
+/* DWT Comparator Mask Register Definitions */
+#define DWT_MASK_MASK_Pos 0 /*!< DWT MASK: MASK Position */
+#define DWT_MASK_MASK_Msk (0x1FUL /*<< DWT_MASK_MASK_Pos*/) /*!< DWT MASK: MASK Mask (5-bit field) */
+
+/* DWT Comparator Function Register Definitions */
+#define DWT_FUNCTION_MATCHED_Pos 24 /*!< DWT FUNCTION: MATCHED Position */
+#define DWT_FUNCTION_MATCHED_Msk (0x1UL << DWT_FUNCTION_MATCHED_Pos) /*!< DWT FUNCTION: MATCHED Mask */
+
+#define DWT_FUNCTION_DATAVADDR1_Pos 16 /*!< DWT FUNCTION: DATAVADDR1 Position */
+#define DWT_FUNCTION_DATAVADDR1_Msk (0xFUL << DWT_FUNCTION_DATAVADDR1_Pos) /*!< DWT FUNCTION: DATAVADDR1 Mask (4-bit field) */
+
+#define DWT_FUNCTION_DATAVADDR0_Pos 12 /*!< DWT FUNCTION: DATAVADDR0 Position */
+#define DWT_FUNCTION_DATAVADDR0_Msk (0xFUL << DWT_FUNCTION_DATAVADDR0_Pos) /*!< DWT FUNCTION: DATAVADDR0 Mask (4-bit field) */
+
+#define DWT_FUNCTION_DATAVSIZE_Pos 10 /*!< DWT FUNCTION: DATAVSIZE Position */
+#define DWT_FUNCTION_DATAVSIZE_Msk (0x3UL << DWT_FUNCTION_DATAVSIZE_Pos) /*!< DWT FUNCTION: DATAVSIZE Mask (2-bit field) */
+
+#define DWT_FUNCTION_LNK1ENA_Pos 9 /*!< DWT FUNCTION: LNK1ENA Position */
+#define DWT_FUNCTION_LNK1ENA_Msk (0x1UL << DWT_FUNCTION_LNK1ENA_Pos) /*!< DWT FUNCTION: LNK1ENA Mask */
+
+#define DWT_FUNCTION_DATAVMATCH_Pos 8 /*!< DWT FUNCTION: DATAVMATCH Position */
+#define DWT_FUNCTION_DATAVMATCH_Msk (0x1UL << DWT_FUNCTION_DATAVMATCH_Pos) /*!< DWT FUNCTION: DATAVMATCH Mask */
+
+#define DWT_FUNCTION_CYCMATCH_Pos 7 /*!< DWT FUNCTION: CYCMATCH Position */
+#define DWT_FUNCTION_CYCMATCH_Msk (0x1UL << DWT_FUNCTION_CYCMATCH_Pos) /*!< DWT FUNCTION: CYCMATCH Mask */
+
+#define DWT_FUNCTION_EMITRANGE_Pos 5 /*!< DWT FUNCTION: EMITRANGE Position */
+#define DWT_FUNCTION_EMITRANGE_Msk (0x1UL << DWT_FUNCTION_EMITRANGE_Pos) /*!< DWT FUNCTION: EMITRANGE Mask */
+
+#define DWT_FUNCTION_FUNCTION_Pos 0 /*!< DWT FUNCTION: FUNCTION Position */
+#define DWT_FUNCTION_FUNCTION_Msk (0xFUL /*<< DWT_FUNCTION_FUNCTION_Pos*/) /*!< DWT FUNCTION: FUNCTION Mask (4-bit field) */
+
+/*@}*/ /* end of group CMSIS_DWT */
+
+
+/** \ingroup CMSIS_core_register
+ \defgroup CMSIS_TPI Trace Port Interface (TPI)
+ \brief Type definitions for the Trace Port Interface (TPI)
+ @{
+ */
+
+/** \brief Structure type to access the Trace Port Interface Register (TPI).
+ */
+typedef struct
+{
+ __I uint32_t SSPSR; /*!< Offset: 0x000 (R/ ) Supported Parallel Port Size Register */
+ __IO uint32_t CSPSR; /*!< Offset: 0x004 (R/W) Current Parallel Port Size Register */
+ uint32_t RESERVED0[2]; /* padding: 0x008 .. 0x00F */
+ __IO uint32_t ACPR; /*!< Offset: 0x010 (R/W) Asynchronous Clock Prescaler Register */
+ uint32_t RESERVED1[55]; /* padding: 0x014 .. 0x0EF */
+ __IO uint32_t SPPR; /*!< Offset: 0x0F0 (R/W) Selected Pin Protocol Register */
+ uint32_t RESERVED2[131]; /* padding: 0x0F4 .. 0x2FF */
+ __I uint32_t FFSR; /*!< Offset: 0x300 (R/ ) Formatter and Flush Status Register */
+ __IO uint32_t FFCR; /*!< Offset: 0x304 (R/W) Formatter and Flush Control Register */
+ __I uint32_t FSCR; /*!< Offset: 0x308 (R/ ) Formatter Synchronization Counter Register */
+ uint32_t RESERVED3[759]; /* padding: 0x30C .. 0xEE7 */
+ __I uint32_t TRIGGER; /*!< Offset: 0xEE8 (R/ ) TRIGGER */
+ __I uint32_t FIFO0; /*!< Offset: 0xEEC (R/ ) Integration ETM Data */
+ __I uint32_t ITATBCTR2; /*!< Offset: 0xEF0 (R/ ) ITATBCTR2 */
+ uint32_t RESERVED4[1]; /* padding: 0xEF4 */
+ __I uint32_t ITATBCTR0; /*!< Offset: 0xEF8 (R/ ) ITATBCTR0 */
+ __I uint32_t FIFO1; /*!< Offset: 0xEFC (R/ ) Integration ITM Data */
+ __IO uint32_t ITCTRL; /*!< Offset: 0xF00 (R/W) Integration Mode Control */
+ uint32_t RESERVED5[39]; /* padding: 0xF04 .. 0xF9F */
+ __IO uint32_t CLAIMSET; /*!< Offset: 0xFA0 (R/W) Claim tag set */
+ __IO uint32_t CLAIMCLR; /*!< Offset: 0xFA4 (R/W) Claim tag clear */
+ uint32_t RESERVED7[8]; /* padding: 0xFA8 .. 0xFC7 (RESERVED6 intentionally unused, name kept for compatibility) */
+ __I uint32_t DEVID; /*!< Offset: 0xFC8 (R/ ) TPIU_DEVID */
+ __I uint32_t DEVTYPE; /*!< Offset: 0xFCC (R/ ) TPIU_DEVTYPE */
+} TPI_Type;
+
+/* TPI Asynchronous Clock Prescaler Register Definitions */
+#define TPI_ACPR_PRESCALER_Pos 0 /*!< TPI ACPR: PRESCALER Position */
+#define TPI_ACPR_PRESCALER_Msk (0x1FFFUL /*<< TPI_ACPR_PRESCALER_Pos*/) /*!< TPI ACPR: PRESCALER Mask (13-bit field) */
+
+/* TPI Selected Pin Protocol Register Definitions */
+#define TPI_SPPR_TXMODE_Pos 0 /*!< TPI SPPR: TXMODE Position */
+#define TPI_SPPR_TXMODE_Msk (0x3UL /*<< TPI_SPPR_TXMODE_Pos*/) /*!< TPI SPPR: TXMODE Mask (2-bit field) */
+
+/* TPI Formatter and Flush Status Register Definitions */
+#define TPI_FFSR_FtNonStop_Pos 3 /*!< TPI FFSR: FtNonStop Position */
+#define TPI_FFSR_FtNonStop_Msk (0x1UL << TPI_FFSR_FtNonStop_Pos) /*!< TPI FFSR: FtNonStop Mask */
+
+#define TPI_FFSR_TCPresent_Pos 2 /*!< TPI FFSR: TCPresent Position */
+#define TPI_FFSR_TCPresent_Msk (0x1UL << TPI_FFSR_TCPresent_Pos) /*!< TPI FFSR: TCPresent Mask */
+
+#define TPI_FFSR_FtStopped_Pos 1 /*!< TPI FFSR: FtStopped Position */
+#define TPI_FFSR_FtStopped_Msk (0x1UL << TPI_FFSR_FtStopped_Pos) /*!< TPI FFSR: FtStopped Mask */
+
+#define TPI_FFSR_FlInProg_Pos 0 /*!< TPI FFSR: FlInProg Position */
+#define TPI_FFSR_FlInProg_Msk (0x1UL /*<< TPI_FFSR_FlInProg_Pos*/) /*!< TPI FFSR: FlInProg Mask */
+
+/* TPI Formatter and Flush Control Register Definitions */
+#define TPI_FFCR_TrigIn_Pos 8 /*!< TPI FFCR: TrigIn Position */
+#define TPI_FFCR_TrigIn_Msk (0x1UL << TPI_FFCR_TrigIn_Pos) /*!< TPI FFCR: TrigIn Mask */
+
+#define TPI_FFCR_EnFCont_Pos 1 /*!< TPI FFCR: EnFCont Position */
+#define TPI_FFCR_EnFCont_Msk (0x1UL << TPI_FFCR_EnFCont_Pos) /*!< TPI FFCR: EnFCont Mask */
+
+/* TPI TRIGGER Register Definitions */
+#define TPI_TRIGGER_TRIGGER_Pos 0 /*!< TPI TRIGGER: TRIGGER Position */
+#define TPI_TRIGGER_TRIGGER_Msk (0x1UL /*<< TPI_TRIGGER_TRIGGER_Pos*/) /*!< TPI TRIGGER: TRIGGER Mask */
+
+/* TPI Integration ETM Data Register Definitions (FIFO0) */
+#define TPI_FIFO0_ITM_ATVALID_Pos 29 /*!< TPI FIFO0: ITM_ATVALID Position */
+#define TPI_FIFO0_ITM_ATVALID_Msk (0x1UL << TPI_FIFO0_ITM_ATVALID_Pos) /*!< TPI FIFO0: ITM_ATVALID Mask (single-bit flag) */
+
+#define TPI_FIFO0_ITM_bytecount_Pos 27 /*!< TPI FIFO0: ITM_bytecount Position */
+#define TPI_FIFO0_ITM_bytecount_Msk (0x3UL << TPI_FIFO0_ITM_bytecount_Pos) /*!< TPI FIFO0: ITM_bytecount Mask (2-bit field) */
+
+#define TPI_FIFO0_ETM_ATVALID_Pos 26 /*!< TPI FIFO0: ETM_ATVALID Position */
+#define TPI_FIFO0_ETM_ATVALID_Msk (0x1UL << TPI_FIFO0_ETM_ATVALID_Pos) /*!< TPI FIFO0: ETM_ATVALID Mask (single-bit flag) */
+
+#define TPI_FIFO0_ETM_bytecount_Pos 24 /*!< TPI FIFO0: ETM_bytecount Position */
+#define TPI_FIFO0_ETM_bytecount_Msk (0x3UL << TPI_FIFO0_ETM_bytecount_Pos) /*!< TPI FIFO0: ETM_bytecount Mask (2-bit field) */
+
+#define TPI_FIFO0_ETM2_Pos 16 /*!< TPI FIFO0: ETM2 Position */
+#define TPI_FIFO0_ETM2_Msk (0xFFUL << TPI_FIFO0_ETM2_Pos) /*!< TPI FIFO0: ETM2 Mask */
+
+#define TPI_FIFO0_ETM1_Pos 8 /*!< TPI FIFO0: ETM1 Position */
+#define TPI_FIFO0_ETM1_Msk (0xFFUL << TPI_FIFO0_ETM1_Pos) /*!< TPI FIFO0: ETM1 Mask */
+
+#define TPI_FIFO0_ETM0_Pos 0 /*!< TPI FIFO0: ETM0 Position */
+#define TPI_FIFO0_ETM0_Msk (0xFFUL /*<< TPI_FIFO0_ETM0_Pos*/) /*!< TPI FIFO0: ETM0 Mask */
+
+/* TPI ITATBCTR2 Register Definitions */
+#define TPI_ITATBCTR2_ATREADY_Pos 0 /*!< TPI ITATBCTR2: ATREADY Position */
+#define TPI_ITATBCTR2_ATREADY_Msk (0x1UL /*<< TPI_ITATBCTR2_ATREADY_Pos*/) /*!< TPI ITATBCTR2: ATREADY Mask */
+
+/* TPI Integration ITM Data Register Definitions (FIFO1) */
+#define TPI_FIFO1_ITM_ATVALID_Pos 29 /*!< TPI FIFO1: ITM_ATVALID Position */
+#define TPI_FIFO1_ITM_ATVALID_Msk (0x1UL << TPI_FIFO1_ITM_ATVALID_Pos) /*!< TPI FIFO1: ITM_ATVALID Mask (single-bit flag) */
+
+#define TPI_FIFO1_ITM_bytecount_Pos 27 /*!< TPI FIFO1: ITM_bytecount Position */
+#define TPI_FIFO1_ITM_bytecount_Msk (0x3UL << TPI_FIFO1_ITM_bytecount_Pos) /*!< TPI FIFO1: ITM_bytecount Mask (2-bit field) */
+
+#define TPI_FIFO1_ETM_ATVALID_Pos 26 /*!< TPI FIFO1: ETM_ATVALID Position */
+#define TPI_FIFO1_ETM_ATVALID_Msk (0x1UL << TPI_FIFO1_ETM_ATVALID_Pos) /*!< TPI FIFO1: ETM_ATVALID Mask (single-bit flag) */
+
+#define TPI_FIFO1_ETM_bytecount_Pos 24 /*!< TPI FIFO1: ETM_bytecount Position */
+#define TPI_FIFO1_ETM_bytecount_Msk (0x3UL << TPI_FIFO1_ETM_bytecount_Pos) /*!< TPI FIFO1: ETM_bytecount Mask (2-bit field) */
+
+#define TPI_FIFO1_ITM2_Pos 16 /*!< TPI FIFO1: ITM2 Position */
+#define TPI_FIFO1_ITM2_Msk (0xFFUL << TPI_FIFO1_ITM2_Pos) /*!< TPI FIFO1: ITM2 Mask */
+
+#define TPI_FIFO1_ITM1_Pos 8 /*!< TPI FIFO1: ITM1 Position */
+#define TPI_FIFO1_ITM1_Msk (0xFFUL << TPI_FIFO1_ITM1_Pos) /*!< TPI FIFO1: ITM1 Mask */
+
+#define TPI_FIFO1_ITM0_Pos 0 /*!< TPI FIFO1: ITM0 Position */
+#define TPI_FIFO1_ITM0_Msk (0xFFUL /*<< TPI_FIFO1_ITM0_Pos*/) /*!< TPI FIFO1: ITM0 Mask */
+
+/* TPI ITATBCTR0 Register Definitions */
+#define TPI_ITATBCTR0_ATREADY_Pos 0 /*!< TPI ITATBCTR0: ATREADY Position */
+#define TPI_ITATBCTR0_ATREADY_Msk (0x1UL /*<< TPI_ITATBCTR0_ATREADY_Pos*/) /*!< TPI ITATBCTR0: ATREADY Mask */
+
+/* TPI Integration Mode Control Register Definitions */
+#define TPI_ITCTRL_Mode_Pos 0 /*!< TPI ITCTRL: Mode Position */
+#define TPI_ITCTRL_Mode_Msk (0x1UL /*<< TPI_ITCTRL_Mode_Pos*/) /*!< TPI ITCTRL: Mode Mask */
+
+/* TPI DEVID Register Definitions */
+#define TPI_DEVID_NRZVALID_Pos 11 /*!< TPI DEVID: NRZVALID Position */
+#define TPI_DEVID_NRZVALID_Msk (0x1UL << TPI_DEVID_NRZVALID_Pos) /*!< TPI DEVID: NRZVALID Mask */
+
+#define TPI_DEVID_MANCVALID_Pos 10 /*!< TPI DEVID: MANCVALID Position */
+#define TPI_DEVID_MANCVALID_Msk (0x1UL << TPI_DEVID_MANCVALID_Pos) /*!< TPI DEVID: MANCVALID Mask */
+
+#define TPI_DEVID_PTINVALID_Pos 9 /*!< TPI DEVID: PTINVALID Position */
+#define TPI_DEVID_PTINVALID_Msk (0x1UL << TPI_DEVID_PTINVALID_Pos) /*!< TPI DEVID: PTINVALID Mask */
+
+#define TPI_DEVID_MinBufSz_Pos 6 /*!< TPI DEVID: MinBufSz Position */
+#define TPI_DEVID_MinBufSz_Msk (0x7UL << TPI_DEVID_MinBufSz_Pos) /*!< TPI DEVID: MinBufSz Mask (3-bit field) */
+
+#define TPI_DEVID_AsynClkIn_Pos 5 /*!< TPI DEVID: AsynClkIn Position */
+#define TPI_DEVID_AsynClkIn_Msk (0x1UL << TPI_DEVID_AsynClkIn_Pos) /*!< TPI DEVID: AsynClkIn Mask */
+
+#define TPI_DEVID_NrTraceInput_Pos 0 /*!< TPI DEVID: NrTraceInput Position */
+#define TPI_DEVID_NrTraceInput_Msk (0x1FUL /*<< TPI_DEVID_NrTraceInput_Pos*/) /*!< TPI DEVID: NrTraceInput Mask (5-bit field) */
+
+/* TPI DEVTYPE Register Definitions */
+#define TPI_DEVTYPE_MajorType_Pos 4 /*!< TPI DEVTYPE: MajorType Position */
+#define TPI_DEVTYPE_MajorType_Msk (0xFUL << TPI_DEVTYPE_MajorType_Pos) /*!< TPI DEVTYPE: MajorType Mask (4-bit field) */
+
+#define TPI_DEVTYPE_SubType_Pos 0 /*!< TPI DEVTYPE: SubType Position */
+#define TPI_DEVTYPE_SubType_Msk (0xFUL /*<< TPI_DEVTYPE_SubType_Pos*/) /*!< TPI DEVTYPE: SubType Mask (4-bit field) */
+
+/*@}*/ /* end of group CMSIS_TPI */
+
+
+#if defined (__MPU_PRESENT) && (__MPU_PRESENT == 1)
+/** \ingroup CMSIS_core_register
+ \defgroup CMSIS_MPU Memory Protection Unit (MPU)
+ \brief Type definitions for the Memory Protection Unit (MPU)
+ @{
+ */
+
+/** \brief Structure type to access the Memory Protection Unit (MPU).
+ */
+typedef struct
+{
+ __I uint32_t TYPE; /*!< Offset: 0x000 (R/ ) MPU Type Register */
+ __IO uint32_t CTRL; /*!< Offset: 0x004 (R/W) MPU Control Register */
+ __IO uint32_t RNR; /*!< Offset: 0x008 (R/W) MPU Region RNRber Register */
+ __IO uint32_t RBAR; /*!< Offset: 0x00C (R/W) MPU Region Base Address Register */
+ __IO uint32_t RASR; /*!< Offset: 0x010 (R/W) MPU Region Attribute and Size Register */
+ __IO uint32_t RBAR_A1; /*!< Offset: 0x014 (R/W) MPU Alias 1 Region Base Address Register */
+ __IO uint32_t RASR_A1; /*!< Offset: 0x018 (R/W) MPU Alias 1 Region Attribute and Size Register */
+ __IO uint32_t RBAR_A2; /*!< Offset: 0x01C (R/W) MPU Alias 2 Region Base Address Register */
+ __IO uint32_t RASR_A2; /*!< Offset: 0x020 (R/W) MPU Alias 2 Region Attribute and Size Register */
+ __IO uint32_t RBAR_A3; /*!< Offset: 0x024 (R/W) MPU Alias 3 Region Base Address Register */
+ __IO uint32_t RASR_A3; /*!< Offset: 0x028 (R/W) MPU Alias 3 Region Attribute and Size Register */
+} MPU_Type;
+
+/* MPU Type Register */
+#define MPU_TYPE_IREGION_Pos 16 /*!< MPU TYPE: IREGION Position */
+#define MPU_TYPE_IREGION_Msk (0xFFUL << MPU_TYPE_IREGION_Pos) /*!< MPU TYPE: IREGION Mask (8-bit field) */
+
+#define MPU_TYPE_DREGION_Pos 8 /*!< MPU TYPE: DREGION Position */
+#define MPU_TYPE_DREGION_Msk (0xFFUL << MPU_TYPE_DREGION_Pos) /*!< MPU TYPE: DREGION Mask (8-bit field) */
+
+#define MPU_TYPE_SEPARATE_Pos 0 /*!< MPU TYPE: SEPARATE Position */
+#define MPU_TYPE_SEPARATE_Msk (1UL /*<< MPU_TYPE_SEPARATE_Pos*/) /*!< MPU TYPE: SEPARATE Mask */
+
+/* MPU Control Register */
+#define MPU_CTRL_PRIVDEFENA_Pos 2 /*!< MPU CTRL: PRIVDEFENA Position */
+#define MPU_CTRL_PRIVDEFENA_Msk (1UL << MPU_CTRL_PRIVDEFENA_Pos) /*!< MPU CTRL: PRIVDEFENA Mask */
+
+#define MPU_CTRL_HFNMIENA_Pos 1 /*!< MPU CTRL: HFNMIENA Position */
+#define MPU_CTRL_HFNMIENA_Msk (1UL << MPU_CTRL_HFNMIENA_Pos) /*!< MPU CTRL: HFNMIENA Mask */
+
+#define MPU_CTRL_ENABLE_Pos 0 /*!< MPU CTRL: ENABLE Position */
+#define MPU_CTRL_ENABLE_Msk (1UL /*<< MPU_CTRL_ENABLE_Pos*/) /*!< MPU CTRL: ENABLE Mask */
+
+/* MPU Region Number Register */
+#define MPU_RNR_REGION_Pos 0 /*!< MPU RNR: REGION Position */
+#define MPU_RNR_REGION_Msk (0xFFUL /*<< MPU_RNR_REGION_Pos*/) /*!< MPU RNR: REGION Mask (8-bit field) */
+
+/* MPU Region Base Address Register */
+#define MPU_RBAR_ADDR_Pos 5 /*!< MPU RBAR: ADDR Position */
+#define MPU_RBAR_ADDR_Msk (0x7FFFFFFUL << MPU_RBAR_ADDR_Pos) /*!< MPU RBAR: ADDR Mask (bits [31:5]) */
+
+#define MPU_RBAR_VALID_Pos 4 /*!< MPU RBAR: VALID Position */
+#define MPU_RBAR_VALID_Msk (1UL << MPU_RBAR_VALID_Pos) /*!< MPU RBAR: VALID Mask */
+
+#define MPU_RBAR_REGION_Pos 0 /*!< MPU RBAR: REGION Position */
+#define MPU_RBAR_REGION_Msk (0xFUL /*<< MPU_RBAR_REGION_Pos*/) /*!< MPU RBAR: REGION Mask (4-bit field) */
+
+/* MPU Region Attribute and Size Register */
+#define MPU_RASR_ATTRS_Pos 16 /*!< MPU RASR: MPU Region Attribute field Position */
+#define MPU_RASR_ATTRS_Msk (0xFFFFUL << MPU_RASR_ATTRS_Pos) /*!< MPU RASR: MPU Region Attribute field Mask (bits [31:16], covers XN/AP/TEX/S/C/B) */
+
+#define MPU_RASR_XN_Pos 28 /*!< MPU RASR: ATTRS.XN Position */
+#define MPU_RASR_XN_Msk (1UL << MPU_RASR_XN_Pos) /*!< MPU RASR: ATTRS.XN Mask */
+
+#define MPU_RASR_AP_Pos 24 /*!< MPU RASR: ATTRS.AP Position */
+#define MPU_RASR_AP_Msk (0x7UL << MPU_RASR_AP_Pos) /*!< MPU RASR: ATTRS.AP Mask (3-bit field) */
+
+#define MPU_RASR_TEX_Pos 19 /*!< MPU RASR: ATTRS.TEX Position */
+#define MPU_RASR_TEX_Msk (0x7UL << MPU_RASR_TEX_Pos) /*!< MPU RASR: ATTRS.TEX Mask (3-bit field) */
+
+#define MPU_RASR_S_Pos 18 /*!< MPU RASR: ATTRS.S Position */
+#define MPU_RASR_S_Msk (1UL << MPU_RASR_S_Pos) /*!< MPU RASR: ATTRS.S Mask */
+
+#define MPU_RASR_C_Pos 17 /*!< MPU RASR: ATTRS.C Position */
+#define MPU_RASR_C_Msk (1UL << MPU_RASR_C_Pos) /*!< MPU RASR: ATTRS.C Mask */
+
+#define MPU_RASR_B_Pos 16 /*!< MPU RASR: ATTRS.B Position */
+#define MPU_RASR_B_Msk (1UL << MPU_RASR_B_Pos) /*!< MPU RASR: ATTRS.B Mask */
+
+#define MPU_RASR_SRD_Pos 8 /*!< MPU RASR: Sub-Region Disable Position */
+#define MPU_RASR_SRD_Msk (0xFFUL << MPU_RASR_SRD_Pos) /*!< MPU RASR: Sub-Region Disable Mask (8-bit field) */
+
+#define MPU_RASR_SIZE_Pos 1 /*!< MPU RASR: Region Size Field Position */
+#define MPU_RASR_SIZE_Msk (0x1FUL << MPU_RASR_SIZE_Pos) /*!< MPU RASR: Region Size Field Mask (5-bit field) */
+
+#define MPU_RASR_ENABLE_Pos 0 /*!< MPU RASR: Region enable bit Position */
+#define MPU_RASR_ENABLE_Msk (1UL /*<< MPU_RASR_ENABLE_Pos*/) /*!< MPU RASR: Region enable bit Mask */
+
+/*@} end of group CMSIS_MPU */
+#endif
+
+
+#if defined (__FPU_PRESENT) && (__FPU_PRESENT == 1)
+/** \ingroup CMSIS_core_register
+ \defgroup CMSIS_FPU Floating Point Unit (FPU)
+ \brief Type definitions for the Floating Point Unit (FPU)
+ @{
+ */
+
+/** \brief Structure type to access the Floating Point Unit (FPU).
+ */
+typedef struct
+{
+ uint32_t RESERVED0[1]; /* padding: 0x000 */
+ __IO uint32_t FPCCR; /*!< Offset: 0x004 (R/W) Floating-Point Context Control Register */
+ __IO uint32_t FPCAR; /*!< Offset: 0x008 (R/W) Floating-Point Context Address Register */
+ __IO uint32_t FPDSCR; /*!< Offset: 0x00C (R/W) Floating-Point Default Status Control Register */
+ __I uint32_t MVFR0; /*!< Offset: 0x010 (R/ ) Media and FP Feature Register 0 */
+ __I uint32_t MVFR1; /*!< Offset: 0x014 (R/ ) Media and FP Feature Register 1 */
+} FPU_Type;
+
+/* Floating-Point Context Control Register */
+#define FPU_FPCCR_ASPEN_Pos 31 /*!< FPCCR: ASPEN bit Position */
+#define FPU_FPCCR_ASPEN_Msk (1UL << FPU_FPCCR_ASPEN_Pos) /*!< FPCCR: ASPEN bit Mask */
+
+#define FPU_FPCCR_LSPEN_Pos 30 /*!< FPCCR: LSPEN Position */
+#define FPU_FPCCR_LSPEN_Msk (1UL << FPU_FPCCR_LSPEN_Pos) /*!< FPCCR: LSPEN bit Mask */
+
+#define FPU_FPCCR_MONRDY_Pos 8 /*!< FPCCR: MONRDY Position */
+#define FPU_FPCCR_MONRDY_Msk (1UL << FPU_FPCCR_MONRDY_Pos) /*!< FPCCR: MONRDY bit Mask */
+
+#define FPU_FPCCR_BFRDY_Pos 6 /*!< FPCCR: BFRDY Position */
+#define FPU_FPCCR_BFRDY_Msk (1UL << FPU_FPCCR_BFRDY_Pos) /*!< FPCCR: BFRDY bit Mask */
+
+#define FPU_FPCCR_MMRDY_Pos 5 /*!< FPCCR: MMRDY Position */
+#define FPU_FPCCR_MMRDY_Msk (1UL << FPU_FPCCR_MMRDY_Pos) /*!< FPCCR: MMRDY bit Mask */
+
+#define FPU_FPCCR_HFRDY_Pos 4 /*!< FPCCR: HFRDY Position */
+#define FPU_FPCCR_HFRDY_Msk (1UL << FPU_FPCCR_HFRDY_Pos) /*!< FPCCR: HFRDY bit Mask */
+
+#define FPU_FPCCR_THREAD_Pos 3 /*!< FPCCR: processor mode bit Position */
+#define FPU_FPCCR_THREAD_Msk (1UL << FPU_FPCCR_THREAD_Pos) /*!< FPCCR: processor mode active bit Mask */
+
+#define FPU_FPCCR_USER_Pos 1 /*!< FPCCR: privilege level bit Position */
+#define FPU_FPCCR_USER_Msk (1UL << FPU_FPCCR_USER_Pos) /*!< FPCCR: privilege level bit Mask */
+
+#define FPU_FPCCR_LSPACT_Pos 0 /*!< FPCCR: Lazy state preservation active bit Position */
+#define FPU_FPCCR_LSPACT_Msk (1UL /*<< FPU_FPCCR_LSPACT_Pos*/) /*!< FPCCR: Lazy state preservation active bit Mask */
+
+/* Floating-Point Context Address Register */
+#define FPU_FPCAR_ADDRESS_Pos 3 /*!< FPCAR: ADDRESS bit Position */
+#define FPU_FPCAR_ADDRESS_Msk (0x1FFFFFFFUL << FPU_FPCAR_ADDRESS_Pos) /*!< FPCAR: ADDRESS bit Mask (bits [31:3]) */
+
+/* Floating-Point Default Status Control Register */
+#define FPU_FPDSCR_AHP_Pos 26 /*!< FPDSCR: AHP bit Position */
+#define FPU_FPDSCR_AHP_Msk (1UL << FPU_FPDSCR_AHP_Pos) /*!< FPDSCR: AHP bit Mask */
+
+#define FPU_FPDSCR_DN_Pos 25 /*!< FPDSCR: DN bit Position */
+#define FPU_FPDSCR_DN_Msk (1UL << FPU_FPDSCR_DN_Pos) /*!< FPDSCR: DN bit Mask */
+
+#define FPU_FPDSCR_FZ_Pos 24 /*!< FPDSCR: FZ bit Position */
+#define FPU_FPDSCR_FZ_Msk (1UL << FPU_FPDSCR_FZ_Pos) /*!< FPDSCR: FZ bit Mask */
+
+#define FPU_FPDSCR_RMode_Pos 22 /*!< FPDSCR: RMode bit Position */
+#define FPU_FPDSCR_RMode_Msk (3UL << FPU_FPDSCR_RMode_Pos) /*!< FPDSCR: RMode bit Mask (2-bit field) */
+
+/* Media and FP Feature Register 0 */
+#define FPU_MVFR0_FP_rounding_modes_Pos 28 /*!< MVFR0: FP rounding modes bits Position */
+#define FPU_MVFR0_FP_rounding_modes_Msk (0xFUL << FPU_MVFR0_FP_rounding_modes_Pos) /*!< MVFR0: FP rounding modes bits Mask */
+
+#define FPU_MVFR0_Short_vectors_Pos 24 /*!< MVFR0: Short vectors bits Position */
+#define FPU_MVFR0_Short_vectors_Msk (0xFUL << FPU_MVFR0_Short_vectors_Pos) /*!< MVFR0: Short vectors bits Mask */
+
+#define FPU_MVFR0_Square_root_Pos 20 /*!< MVFR0: Square root bits Position */
+#define FPU_MVFR0_Square_root_Msk (0xFUL << FPU_MVFR0_Square_root_Pos) /*!< MVFR0: Square root bits Mask */
+
+#define FPU_MVFR0_Divide_Pos 16 /*!< MVFR0: Divide bits Position */
+#define FPU_MVFR0_Divide_Msk (0xFUL << FPU_MVFR0_Divide_Pos) /*!< MVFR0: Divide bits Mask */
+
+#define FPU_MVFR0_FP_excep_trapping_Pos 12 /*!< MVFR0: FP exception trapping bits Position */
+#define FPU_MVFR0_FP_excep_trapping_Msk (0xFUL << FPU_MVFR0_FP_excep_trapping_Pos) /*!< MVFR0: FP exception trapping bits Mask */
+
+#define FPU_MVFR0_Double_precision_Pos 8 /*!< MVFR0: Double-precision bits Position */
+#define FPU_MVFR0_Double_precision_Msk (0xFUL << FPU_MVFR0_Double_precision_Pos) /*!< MVFR0: Double-precision bits Mask */
+
+#define FPU_MVFR0_Single_precision_Pos 4 /*!< MVFR0: Single-precision bits Position */
+#define FPU_MVFR0_Single_precision_Msk (0xFUL << FPU_MVFR0_Single_precision_Pos) /*!< MVFR0: Single-precision bits Mask */
+
+#define FPU_MVFR0_A_SIMD_registers_Pos 0 /*!< MVFR0: A_SIMD registers bits Position */
+#define FPU_MVFR0_A_SIMD_registers_Msk (0xFUL /*<< FPU_MVFR0_A_SIMD_registers_Pos*/) /*!< MVFR0: A_SIMD registers bits Mask */
+
+/* Media and FP Feature Register 1 */
+#define FPU_MVFR1_FP_fused_MAC_Pos 28 /*!< MVFR1: FP fused MAC bits Position */
+#define FPU_MVFR1_FP_fused_MAC_Msk (0xFUL << FPU_MVFR1_FP_fused_MAC_Pos) /*!< MVFR1: FP fused MAC bits Mask */
+
+#define FPU_MVFR1_FP_HPFP_Pos 24 /*!< MVFR1: FP HPFP bits Position */
+#define FPU_MVFR1_FP_HPFP_Msk (0xFUL << FPU_MVFR1_FP_HPFP_Pos) /*!< MVFR1: FP HPFP bits Mask */
+
+#define FPU_MVFR1_D_NaN_mode_Pos 4 /*!< MVFR1: D_NaN mode bits Position */
+#define FPU_MVFR1_D_NaN_mode_Msk (0xFUL << FPU_MVFR1_D_NaN_mode_Pos) /*!< MVFR1: D_NaN mode bits Mask */
+
+#define FPU_MVFR1_FtZ_mode_Pos 0 /*!< MVFR1: FtZ mode bits Position */
+#define FPU_MVFR1_FtZ_mode_Msk (0xFUL /*<< FPU_MVFR1_FtZ_mode_Pos*/) /*!< MVFR1: FtZ mode bits Mask */
+
+/*@} end of group CMSIS_FPU */
+#endif
+
+
+/** \ingroup CMSIS_core_register
+ \defgroup CMSIS_CoreDebug Core Debug Registers (CoreDebug)
+ \brief Type definitions for the Core Debug Registers
+ @{
+ */
+
+/** \brief Structure type to access the Core Debug Register (CoreDebug).
+ */
+typedef struct
+{
+ __IO uint32_t DHCSR; /*!< Offset: 0x000 (R/W) Debug Halting Control and Status Register */
+ __O uint32_t DCRSR; /*!< Offset: 0x004 ( /W) Debug Core Register Selector Register */
+ __IO uint32_t DCRDR; /*!< Offset: 0x008 (R/W) Debug Core Register Data Register */
+ __IO uint32_t DEMCR; /*!< Offset: 0x00C (R/W) Debug Exception and Monitor Control Register */
+} CoreDebug_Type; /* mapped at CoreDebug_BASE (0xE000EDF0UL), see core base addresses below */
+
+/* Debug Halting Control and Status Register */
+#define CoreDebug_DHCSR_DBGKEY_Pos 16 /*!< CoreDebug DHCSR: DBGKEY Position */
+#define CoreDebug_DHCSR_DBGKEY_Msk (0xFFFFUL << CoreDebug_DHCSR_DBGKEY_Pos) /*!< CoreDebug DHCSR: DBGKEY Mask */
+
+#define CoreDebug_DHCSR_S_RESET_ST_Pos 25 /*!< CoreDebug DHCSR: S_RESET_ST Position */
+#define CoreDebug_DHCSR_S_RESET_ST_Msk (1UL << CoreDebug_DHCSR_S_RESET_ST_Pos) /*!< CoreDebug DHCSR: S_RESET_ST Mask */
+
+#define CoreDebug_DHCSR_S_RETIRE_ST_Pos 24 /*!< CoreDebug DHCSR: S_RETIRE_ST Position */
+#define CoreDebug_DHCSR_S_RETIRE_ST_Msk (1UL << CoreDebug_DHCSR_S_RETIRE_ST_Pos) /*!< CoreDebug DHCSR: S_RETIRE_ST Mask */
+
+#define CoreDebug_DHCSR_S_LOCKUP_Pos 19 /*!< CoreDebug DHCSR: S_LOCKUP Position */
+#define CoreDebug_DHCSR_S_LOCKUP_Msk (1UL << CoreDebug_DHCSR_S_LOCKUP_Pos) /*!< CoreDebug DHCSR: S_LOCKUP Mask */
+
+#define CoreDebug_DHCSR_S_SLEEP_Pos 18 /*!< CoreDebug DHCSR: S_SLEEP Position */
+#define CoreDebug_DHCSR_S_SLEEP_Msk (1UL << CoreDebug_DHCSR_S_SLEEP_Pos) /*!< CoreDebug DHCSR: S_SLEEP Mask */
+
+#define CoreDebug_DHCSR_S_HALT_Pos 17 /*!< CoreDebug DHCSR: S_HALT Position */
+#define CoreDebug_DHCSR_S_HALT_Msk (1UL << CoreDebug_DHCSR_S_HALT_Pos) /*!< CoreDebug DHCSR: S_HALT Mask */
+
+#define CoreDebug_DHCSR_S_REGRDY_Pos 16 /*!< CoreDebug DHCSR: S_REGRDY Position */
+#define CoreDebug_DHCSR_S_REGRDY_Msk (1UL << CoreDebug_DHCSR_S_REGRDY_Pos) /*!< CoreDebug DHCSR: S_REGRDY Mask */
+
+#define CoreDebug_DHCSR_C_SNAPSTALL_Pos 5 /*!< CoreDebug DHCSR: C_SNAPSTALL Position */
+#define CoreDebug_DHCSR_C_SNAPSTALL_Msk (1UL << CoreDebug_DHCSR_C_SNAPSTALL_Pos) /*!< CoreDebug DHCSR: C_SNAPSTALL Mask */
+
+#define CoreDebug_DHCSR_C_MASKINTS_Pos 3 /*!< CoreDebug DHCSR: C_MASKINTS Position */
+#define CoreDebug_DHCSR_C_MASKINTS_Msk (1UL << CoreDebug_DHCSR_C_MASKINTS_Pos) /*!< CoreDebug DHCSR: C_MASKINTS Mask */
+
+#define CoreDebug_DHCSR_C_STEP_Pos 2 /*!< CoreDebug DHCSR: C_STEP Position */
+#define CoreDebug_DHCSR_C_STEP_Msk (1UL << CoreDebug_DHCSR_C_STEP_Pos) /*!< CoreDebug DHCSR: C_STEP Mask */
+
+#define CoreDebug_DHCSR_C_HALT_Pos 1 /*!< CoreDebug DHCSR: C_HALT Position */
+#define CoreDebug_DHCSR_C_HALT_Msk (1UL << CoreDebug_DHCSR_C_HALT_Pos) /*!< CoreDebug DHCSR: C_HALT Mask */
+
+#define CoreDebug_DHCSR_C_DEBUGEN_Pos 0 /*!< CoreDebug DHCSR: C_DEBUGEN Position */
+#define CoreDebug_DHCSR_C_DEBUGEN_Msk (1UL /*<< CoreDebug_DHCSR_C_DEBUGEN_Pos*/) /*!< CoreDebug DHCSR: C_DEBUGEN Mask */
+
+/* Debug Core Register Selector Register */
+#define CoreDebug_DCRSR_REGWnR_Pos 16 /*!< CoreDebug DCRSR: REGWnR Position */
+#define CoreDebug_DCRSR_REGWnR_Msk (1UL << CoreDebug_DCRSR_REGWnR_Pos) /*!< CoreDebug DCRSR: REGWnR Mask */
+
+#define CoreDebug_DCRSR_REGSEL_Pos 0 /*!< CoreDebug DCRSR: REGSEL Position */
+#define CoreDebug_DCRSR_REGSEL_Msk (0x1FUL /*<< CoreDebug_DCRSR_REGSEL_Pos*/) /*!< CoreDebug DCRSR: REGSEL Mask */
+
+/* Debug Exception and Monitor Control Register */
+#define CoreDebug_DEMCR_TRCENA_Pos 24 /*!< CoreDebug DEMCR: TRCENA Position */
+#define CoreDebug_DEMCR_TRCENA_Msk (1UL << CoreDebug_DEMCR_TRCENA_Pos) /*!< CoreDebug DEMCR: TRCENA Mask */
+
+#define CoreDebug_DEMCR_MON_REQ_Pos 19 /*!< CoreDebug DEMCR: MON_REQ Position */
+#define CoreDebug_DEMCR_MON_REQ_Msk (1UL << CoreDebug_DEMCR_MON_REQ_Pos) /*!< CoreDebug DEMCR: MON_REQ Mask */
+
+#define CoreDebug_DEMCR_MON_STEP_Pos 18 /*!< CoreDebug DEMCR: MON_STEP Position */
+#define CoreDebug_DEMCR_MON_STEP_Msk (1UL << CoreDebug_DEMCR_MON_STEP_Pos) /*!< CoreDebug DEMCR: MON_STEP Mask */
+
+#define CoreDebug_DEMCR_MON_PEND_Pos 17 /*!< CoreDebug DEMCR: MON_PEND Position */
+#define CoreDebug_DEMCR_MON_PEND_Msk (1UL << CoreDebug_DEMCR_MON_PEND_Pos) /*!< CoreDebug DEMCR: MON_PEND Mask */
+
+#define CoreDebug_DEMCR_MON_EN_Pos 16 /*!< CoreDebug DEMCR: MON_EN Position */
+#define CoreDebug_DEMCR_MON_EN_Msk (1UL << CoreDebug_DEMCR_MON_EN_Pos) /*!< CoreDebug DEMCR: MON_EN Mask */
+
+#define CoreDebug_DEMCR_VC_HARDERR_Pos 10 /*!< CoreDebug DEMCR: VC_HARDERR Position */
+#define CoreDebug_DEMCR_VC_HARDERR_Msk (1UL << CoreDebug_DEMCR_VC_HARDERR_Pos) /*!< CoreDebug DEMCR: VC_HARDERR Mask */
+
+#define CoreDebug_DEMCR_VC_INTERR_Pos 9 /*!< CoreDebug DEMCR: VC_INTERR Position */
+#define CoreDebug_DEMCR_VC_INTERR_Msk (1UL << CoreDebug_DEMCR_VC_INTERR_Pos) /*!< CoreDebug DEMCR: VC_INTERR Mask */
+
+#define CoreDebug_DEMCR_VC_BUSERR_Pos 8 /*!< CoreDebug DEMCR: VC_BUSERR Position */
+#define CoreDebug_DEMCR_VC_BUSERR_Msk (1UL << CoreDebug_DEMCR_VC_BUSERR_Pos) /*!< CoreDebug DEMCR: VC_BUSERR Mask */
+
+#define CoreDebug_DEMCR_VC_STATERR_Pos 7 /*!< CoreDebug DEMCR: VC_STATERR Position */
+#define CoreDebug_DEMCR_VC_STATERR_Msk (1UL << CoreDebug_DEMCR_VC_STATERR_Pos) /*!< CoreDebug DEMCR: VC_STATERR Mask */
+
+#define CoreDebug_DEMCR_VC_CHKERR_Pos 6 /*!< CoreDebug DEMCR: VC_CHKERR Position */
+#define CoreDebug_DEMCR_VC_CHKERR_Msk (1UL << CoreDebug_DEMCR_VC_CHKERR_Pos) /*!< CoreDebug DEMCR: VC_CHKERR Mask */
+
+#define CoreDebug_DEMCR_VC_NOCPERR_Pos 5 /*!< CoreDebug DEMCR: VC_NOCPERR Position */
+#define CoreDebug_DEMCR_VC_NOCPERR_Msk (1UL << CoreDebug_DEMCR_VC_NOCPERR_Pos) /*!< CoreDebug DEMCR: VC_NOCPERR Mask */
+
+#define CoreDebug_DEMCR_VC_MMERR_Pos 4 /*!< CoreDebug DEMCR: VC_MMERR Position */
+#define CoreDebug_DEMCR_VC_MMERR_Msk (1UL << CoreDebug_DEMCR_VC_MMERR_Pos) /*!< CoreDebug DEMCR: VC_MMERR Mask */
+
+#define CoreDebug_DEMCR_VC_CORERESET_Pos 0 /*!< CoreDebug DEMCR: VC_CORERESET Position */
+#define CoreDebug_DEMCR_VC_CORERESET_Msk (1UL /*<< CoreDebug_DEMCR_VC_CORERESET_Pos*/) /*!< CoreDebug DEMCR: VC_CORERESET Mask */
+
+/*@} end of group CMSIS_CoreDebug */
+
+
+/** \ingroup CMSIS_core_register
+ \defgroup CMSIS_core_base Core Definitions
+ \brief Definitions for base addresses, unions, and structures.
+ @{
+ */
+
+/* Memory mapping of Cortex-M4 Hardware */
+#define SCS_BASE (0xE000E000UL) /*!< System Control Space Base Address */
+#define ITM_BASE (0xE0000000UL) /*!< ITM Base Address */
+#define DWT_BASE (0xE0001000UL) /*!< DWT Base Address */
+#define TPI_BASE (0xE0040000UL) /*!< TPI Base Address */
+#define CoreDebug_BASE (0xE000EDF0UL) /*!< Core Debug Base Address */
+#define SysTick_BASE (SCS_BASE + 0x0010UL) /*!< SysTick Base Address */
+#define NVIC_BASE (SCS_BASE + 0x0100UL) /*!< NVIC Base Address */
+#define SCB_BASE (SCS_BASE + 0x0D00UL) /*!< System Control Block Base Address */
+
+#define SCnSCB ((SCnSCB_Type *) SCS_BASE ) /*!< System control Register not in SCB */
+#define SCB ((SCB_Type *) SCB_BASE ) /*!< SCB configuration struct */
+#define SysTick ((SysTick_Type *) SysTick_BASE ) /*!< SysTick configuration struct */
+#define NVIC ((NVIC_Type *) NVIC_BASE ) /*!< NVIC configuration struct */
+#define ITM ((ITM_Type *) ITM_BASE ) /*!< ITM configuration struct */
+#define DWT ((DWT_Type *) DWT_BASE ) /*!< DWT configuration struct */
+#define TPI ((TPI_Type *) TPI_BASE ) /*!< TPI configuration struct */
+#define CoreDebug ((CoreDebug_Type *) CoreDebug_BASE) /*!< Core Debug configuration struct */
+
+#if (__MPU_PRESENT == 1)
+ #define MPU_BASE (SCS_BASE + 0x0D90UL) /*!< Memory Protection Unit */
+ #define MPU ((MPU_Type *) MPU_BASE ) /*!< Memory Protection Unit */
+#endif
+
+#if (__FPU_PRESENT == 1)
+ #define FPU_BASE (SCS_BASE + 0x0F30UL) /*!< Floating Point Unit */
+ #define FPU ((FPU_Type *) FPU_BASE ) /*!< Floating Point Unit */
+#endif
+
+/*@} */
+
+
+
+/*******************************************************************************
+ * Hardware Abstraction Layer
+ Core Function Interface contains:
+ - Core NVIC Functions
+ - Core SysTick Functions
+ - Core Debug Functions
+ - Core Register Access Functions
+ ******************************************************************************/
+/** \defgroup CMSIS_Core_FunctionInterface Functions and Instructions Reference
+*/
+
+
+
+/* ########################## NVIC functions #################################### */
+/** \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_Core_NVICFunctions NVIC Functions
+ \brief Functions that manage interrupts and exceptions via the NVIC.
+ @{
+ */
+
+/** \brief Set Priority Grouping
+
+ The function sets the priority grouping field using the required unlock sequence.
+ The parameter PriorityGroup is assigned to the SCB->AIRCR [10:8] PRIGROUP field.
+ Only values from 0..7 are used.
+ In case of a conflict between priority grouping and available
+ priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set.
+
+ \param [in] PriorityGroup Priority grouping field.
+ */
+__STATIC_INLINE void NVIC_SetPriorityGrouping(uint32_t PriorityGroup)
+{
+ uint32_t reg_value;
+ uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */
+
+ reg_value = SCB->AIRCR; /* read old register configuration */
+ reg_value &= ~((uint32_t)(SCB_AIRCR_VECTKEY_Msk | SCB_AIRCR_PRIGROUP_Msk)); /* clear bits to change */
+ reg_value = (reg_value |
+ ((uint32_t)0x5FAUL << SCB_AIRCR_VECTKEY_Pos) |
+ (PriorityGroupTmp << SCB_AIRCR_PRIGROUP_Pos) ); /* Insert write key and priority group */
+ SCB->AIRCR = reg_value;
+}
+
+
+/** \brief Get Priority Grouping
+
+ The function reads the priority grouping field from the NVIC Interrupt Controller.
+
+ \return Priority grouping field (SCB->AIRCR [10:8] PRIGROUP field).
+ */
+__STATIC_INLINE uint32_t NVIC_GetPriorityGrouping(void)
+{
+ return ((uint32_t)((SCB->AIRCR & SCB_AIRCR_PRIGROUP_Msk) >> SCB_AIRCR_PRIGROUP_Pos)); /* extract AIRCR[10:8] PRIGROUP */
+}
+
+
+/** \brief Enable External Interrupt
+
+ The function enables a device-specific interrupt in the NVIC interrupt controller.
+
+ \param [in] IRQn External interrupt number. Value cannot be negative.
+ */
+__STATIC_INLINE void NVIC_EnableIRQ(IRQn_Type IRQn)
+{
+ NVIC->ISER[(((uint32_t)(int32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)(int32_t)IRQn) & 0x1FUL)); /* ISER[IRQn/32]: write 1 to bit IRQn%32 to set-enable */
+}
+
+
+/** \brief Disable External Interrupt
+
+ The function disables a device-specific interrupt in the NVIC interrupt controller.
+
+ \param [in] IRQn External interrupt number. Value cannot be negative.
+ */
+__STATIC_INLINE void NVIC_DisableIRQ(IRQn_Type IRQn)
+{
+ NVIC->ICER[(((uint32_t)(int32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)(int32_t)IRQn) & 0x1FUL)); /* ICER[IRQn/32]: write 1 to bit IRQn%32 to clear-enable */
+}
+
+
+/** \brief Get Pending Interrupt
+
+ The function reads the pending register in the NVIC and returns the pending bit
+ for the specified interrupt.
+
+ \param [in] IRQn Interrupt number.
+
+ \return 0 Interrupt status is not pending.
+ \return 1 Interrupt status is pending.
+ */
+__STATIC_INLINE uint32_t NVIC_GetPendingIRQ(IRQn_Type IRQn)
+{
+ return((uint32_t)(((NVIC->ISPR[(((uint32_t)(int32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)(int32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL)); /* test bit IRQn%32 of ISPR[IRQn/32] */
+}
+
+
+/** \brief Set Pending Interrupt
+
+ The function sets the pending bit of an external interrupt.
+
+ \param [in] IRQn Interrupt number. Value cannot be negative.
+ */
+__STATIC_INLINE void NVIC_SetPendingIRQ(IRQn_Type IRQn)
+{
+ NVIC->ISPR[(((uint32_t)(int32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)(int32_t)IRQn) & 0x1FUL)); /* ISPR: write 1 to set the pending bit */
+}
+
+
+/** \brief Clear Pending Interrupt
+
+ The function clears the pending bit of an external interrupt.
+
+ \param [in] IRQn External interrupt number. Value cannot be negative.
+ */
+__STATIC_INLINE void NVIC_ClearPendingIRQ(IRQn_Type IRQn)
+{
+ NVIC->ICPR[(((uint32_t)(int32_t)IRQn) >> 5UL)] = (uint32_t)(1UL << (((uint32_t)(int32_t)IRQn) & 0x1FUL)); /* ICPR: write 1 to clear the pending bit */
+}
+
+
+/** \brief Get Active Interrupt
+
+ The function reads the active register in NVIC and returns the active bit.
+
+ \param [in] IRQn Interrupt number.
+
+ \return 0 Interrupt status is not active.
+ \return 1 Interrupt status is active.
+ */
+__STATIC_INLINE uint32_t NVIC_GetActive(IRQn_Type IRQn)
+{
+ return((uint32_t)(((NVIC->IABR[(((uint32_t)(int32_t)IRQn) >> 5UL)] & (1UL << (((uint32_t)(int32_t)IRQn) & 0x1FUL))) != 0UL) ? 1UL : 0UL)); /* test bit IRQn%32 of IABR[IRQn/32] */
+}
+
+
+/** \brief Set Interrupt Priority
+
+ The function sets the priority of an interrupt.
+
+ \note The priority cannot be set for every core interrupt.
+
+ \param [in] IRQn Interrupt number.
+ \param [in] priority Priority to set.
+ */
+__STATIC_INLINE void NVIC_SetPriority(IRQn_Type IRQn, uint32_t priority)
+{
+ if((int32_t)IRQn < 0) {
+ SCB->SHP[(((uint32_t)(int32_t)IRQn) & 0xFUL)-4UL] = (uint8_t)((priority << (8 - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL); /* negative IRQn: system handler, priority lives in SCB->SHP */
+ }
+ else {
+ NVIC->IP[((uint32_t)(int32_t)IRQn)] = (uint8_t)((priority << (8 - __NVIC_PRIO_BITS)) & (uint32_t)0xFFUL); /* device interrupt: shift priority into the implemented MSBs of IP[IRQn] */
+ }
+}
+
+
+/** \brief Get Interrupt Priority
+
+ The function reads the priority of an interrupt. The interrupt
+ number can be positive to specify an external (device specific)
+ interrupt, or negative to specify an internal (core) interrupt.
+
+
+ \param [in] IRQn Interrupt number.
+ \return Interrupt Priority. Value is aligned automatically to the implemented
+ priority bits of the microcontroller.
+ */
+__STATIC_INLINE uint32_t NVIC_GetPriority(IRQn_Type IRQn)
+{
+
+ if((int32_t)IRQn < 0) {
+ return(((uint32_t)SCB->SHP[(((uint32_t)(int32_t)IRQn) & 0xFUL)-4UL] >> (8 - __NVIC_PRIO_BITS))); /* negative IRQn: system handler, read from SCB->SHP */
+ }
+ else {
+ return(((uint32_t)NVIC->IP[((uint32_t)(int32_t)IRQn)] >> (8 - __NVIC_PRIO_BITS))); /* shift right to drop the unimplemented low bits */
+ }
+}
+
+
+/** \brief Encode Priority
+
+ The function encodes the priority for an interrupt with the given priority group,
+ preemptive priority value, and subpriority value.
+ In case of a conflict between priority grouping and available
+ priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set.
+
+ \param [in] PriorityGroup Used priority group.
+ \param [in] PreemptPriority Preemptive priority value (starting from 0).
+ \param [in] SubPriority Subpriority value (starting from 0).
+ \return Encoded priority. Value can be used in the function \ref NVIC_SetPriority().
+ */
+__STATIC_INLINE uint32_t NVIC_EncodePriority (uint32_t PriorityGroup, uint32_t PreemptPriority, uint32_t SubPriority)
+{
+ uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */
+ uint32_t PreemptPriorityBits;
+ uint32_t SubPriorityBits;
+
+ PreemptPriorityBits = ((7UL - PriorityGroupTmp) > (uint32_t)(__NVIC_PRIO_BITS)) ? (uint32_t)(__NVIC_PRIO_BITS) : (uint32_t)(7UL - PriorityGroupTmp); /* clamp preempt-field width to the implemented priority bits */
+ SubPriorityBits = ((PriorityGroupTmp + (uint32_t)(__NVIC_PRIO_BITS)) < (uint32_t)7UL) ? (uint32_t)0UL : (uint32_t)((PriorityGroupTmp - 7UL) + (uint32_t)(__NVIC_PRIO_BITS)); /* remaining implemented bits become the subpriority field */
+
+ return (
+ ((PreemptPriority & (uint32_t)((1UL << (PreemptPriorityBits)) - 1UL)) << SubPriorityBits) |
+ ((SubPriority & (uint32_t)((1UL << (SubPriorityBits )) - 1UL)))
+ );
+}
+
+
+/** \brief Decode Priority
+
+ The function decodes an interrupt priority value with a given priority group to
+ preemptive priority value and subpriority value.
+ In case of a conflict between priority grouping and available
+ priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set.
+
+ \param [in] Priority Priority value, which can be retrieved with the function \ref NVIC_GetPriority().
+ \param [in] PriorityGroup Used priority group.
+ \param [out] pPreemptPriority Preemptive priority value (starting from 0).
+ \param [out] pSubPriority Subpriority value (starting from 0).
+ */
+__STATIC_INLINE void NVIC_DecodePriority (uint32_t Priority, uint32_t PriorityGroup, uint32_t* pPreemptPriority, uint32_t* pSubPriority)
+{
+ uint32_t PriorityGroupTmp = (PriorityGroup & (uint32_t)0x07UL); /* only values 0..7 are used */
+ uint32_t PreemptPriorityBits;
+ uint32_t SubPriorityBits;
+
+ PreemptPriorityBits = ((7UL - PriorityGroupTmp) > (uint32_t)(__NVIC_PRIO_BITS)) ? (uint32_t)(__NVIC_PRIO_BITS) : (uint32_t)(7UL - PriorityGroupTmp); /* same field-width computation as NVIC_EncodePriority */
+ SubPriorityBits = ((PriorityGroupTmp + (uint32_t)(__NVIC_PRIO_BITS)) < (uint32_t)7UL) ? (uint32_t)0UL : (uint32_t)((PriorityGroupTmp - 7UL) + (uint32_t)(__NVIC_PRIO_BITS));
+
+ *pPreemptPriority = (Priority >> SubPriorityBits) & (uint32_t)((1UL << (PreemptPriorityBits)) - 1UL); /* upper field: preemptive priority */
+ *pSubPriority = (Priority ) & (uint32_t)((1UL << (SubPriorityBits )) - 1UL); /* lower field: subpriority */
+}
+
+
+/** \brief System Reset
+
+ The function initiates a system reset request to reset the MCU.
+ */
+__STATIC_INLINE void NVIC_SystemReset(void)
+{
+ __DSB(); /* Ensure all outstanding memory accesses, including
+ buffered writes, are completed before reset */
+ SCB->AIRCR = (uint32_t)((0x5FAUL << SCB_AIRCR_VECTKEY_Pos) |
+ (SCB->AIRCR & SCB_AIRCR_PRIGROUP_Msk) |
+ SCB_AIRCR_SYSRESETREQ_Msk ); /* Keep priority group unchanged */
+ __DSB(); /* Ensure completion of memory access */
+ while(1) { __NOP(); } /* wait until reset */
+}
+
+/*@} end of CMSIS_Core_NVICFunctions */
+
+
+
+/* ################################## SysTick function ############################################ */
+/** \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_Core_SysTickFunctions SysTick Functions
+ \brief Functions that configure the System.
+ @{
+ */
+
+#if (__Vendor_SysTickConfig == 0)
+
+/** \brief System Tick Configuration
+
+ The function initializes the System Timer and its interrupt, and starts the System Tick Timer.
+ Counter is in free running mode to generate periodic interrupts.
+
+ \param [in] ticks Number of ticks between two interrupts.
+
+ \return 0 Function succeeded.
+ \return 1 Function failed.
+
+ \note When the variable __Vendor_SysTickConfig is set to 1, then the
+ function SysTick_Config is not included. In this case, the file device.h
+ must contain a vendor-specific implementation of this function.
+
+ */
+__STATIC_INLINE uint32_t SysTick_Config(uint32_t ticks)
+{
+ if ((ticks - 1UL) > SysTick_LOAD_RELOAD_Msk) { return (1UL); } /* Reload value impossible */
+
+ SysTick->LOAD = (uint32_t)(ticks - 1UL); /* set reload register */
+ NVIC_SetPriority (SysTick_IRQn, (1UL << __NVIC_PRIO_BITS) - 1UL); /* set Priority for SysTick Interrupt (lowest implemented) */
+ SysTick->VAL = 0UL; /* Load the SysTick Counter Value */
+ SysTick->CTRL = SysTick_CTRL_CLKSOURCE_Msk |
+ SysTick_CTRL_TICKINT_Msk |
+ SysTick_CTRL_ENABLE_Msk; /* Enable SysTick IRQ and SysTick Timer */
+ return (0UL); /* Function successful */
+}
+
+#endif
+
+/*@} end of CMSIS_Core_SysTickFunctions */
+
+
+
+/* ##################################### Debug In/Output function ########################################### */
+/** \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_core_DebugFunctions ITM Functions
+ \brief Functions that access the ITM debug interface.
+ @{
+ */
+
+extern volatile int32_t ITM_RxBuffer; /*!< External variable to receive characters. */
+#define ITM_RXBUFFER_EMPTY 0x5AA55AA5 /*!< Value identifying \ref ITM_RxBuffer is ready for next character. */
+
+
+/** \brief ITM Send Character
+
+ The function transmits a character via the ITM channel 0, and
+ \li Just returns when no debugger is connected that has booked the output.
+ \li Is blocking when a debugger is connected, but the previous character sent has not been transmitted.
+
+ \param [in] ch Character to transmit.
+
+ \returns Character to transmit.
+ */
+__STATIC_INLINE uint32_t ITM_SendChar (uint32_t ch)
+{
+ if (((ITM->TCR & ITM_TCR_ITMENA_Msk) != 0UL) && /* ITM enabled */
+ ((ITM->TER & 1UL ) != 0UL) ) /* ITM Port #0 enabled */
+ {
+ while (ITM->PORT[0].u32 == 0UL) { __NOP(); } /* busy-wait until the stimulus port can accept a write */
+ ITM->PORT[0].u8 = (uint8_t)ch; /* byte write: transmit the low 8 bits */
+ }
+ return (ch);
+}
+
+
+/** \brief ITM Receive Character
+
+ The function inputs a character via the external variable \ref ITM_RxBuffer.
+
+ \return Received character.
+ \return -1 No character pending.
+ */
+__STATIC_INLINE int32_t ITM_ReceiveChar (void) {
+ int32_t ch = -1; /* no character available */
+
+ if (ITM_RxBuffer != ITM_RXBUFFER_EMPTY) { /* a character is pending in the shared buffer */
+ ch = ITM_RxBuffer;
+ ITM_RxBuffer = ITM_RXBUFFER_EMPTY; /* ready for next character */
+ }
+
+ return (ch);
+}
+
+
+/** \brief ITM Check Character
+
+ The function checks whether a character is pending for reading in the variable \ref ITM_RxBuffer.
+
+ \return 0 No character available.
+ \return 1 Character available.
+ */
+__STATIC_INLINE int32_t ITM_CheckChar (void) {
+
+ if (ITM_RxBuffer == ITM_RXBUFFER_EMPTY) { /* non-destructive check: buffer is left unchanged */
+ return (0); /* no character available */
+ } else {
+ return (1); /* character available */
+ }
+}
+
+/*@} end of CMSIS_core_DebugFunctions */
+
+
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CORE_CM4_H_DEPENDANT */
+
+#endif /* __CMSIS_GENERIC */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/core_cmFunc.h b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/core_cmFunc.h
new file mode 100644
index 0000000000..834bd17645
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/core_cmFunc.h
@@ -0,0 +1,664 @@
+/**************************************************************************//**
+ * @file core_cmFunc.h
+ * @brief CMSIS Cortex-M Core Function Access Header File
+ * @version V4.10
+ * @date 18. March 2015
+ *
+ * @note
+ *
+ ******************************************************************************/
+/* Copyright (c) 2009 - 2015 ARM LIMITED
+
+ All rights reserved.
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of ARM nor the names of its contributors may be used
+ to endorse or promote products derived from this software without
+ specific prior written permission.
+ *
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS AND CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+ ---------------------------------------------------------------------------*/
+
+
+#ifndef __CORE_CMFUNC_H
+#define __CORE_CMFUNC_H
+
+
+/* ########################### Core Function Access ########################### */
+/** \ingroup CMSIS_Core_FunctionInterface
+ \defgroup CMSIS_Core_RegAccFunctions CMSIS Core Register Access Functions
+ @{
+ */
+
+#if defined ( __CC_ARM ) /*------------------RealView Compiler -----------------*/
+/* ARM armcc specific functions */
+
+#if (__ARMCC_VERSION < 400677)
+ #error "Please use ARM Compiler Toolchain V4.0.677 or later!"
+#endif
+
+/* intrinsic void __enable_irq(); */
+/* intrinsic void __disable_irq(); */
+
+/** \brief Get Control Register
+
+ This function returns the content of the Control Register.
+
+ \return Control Register value
+ */
+__STATIC_INLINE uint32_t __get_CONTROL(void)
+{
+ register uint32_t __regControl __ASM("control"); /* armcc named-register variable */
+ return(__regControl);
+}
+
+
+/** \brief Set Control Register
+
+ This function writes the given value to the Control Register.
+
+ \param [in] control Control Register value to set
+ */
+__STATIC_INLINE void __set_CONTROL(uint32_t control)
+{
+ register uint32_t __regControl __ASM("control"); /* armcc named-register variable */
+ __regControl = control;
+}
+
+
+/** \brief Get IPSR Register
+
+ This function returns the content of the IPSR Register.
+
+ \return IPSR Register value
+ */
+__STATIC_INLINE uint32_t __get_IPSR(void)
+{
+ register uint32_t __regIPSR __ASM("ipsr"); /* armcc named-register variable */
+ return(__regIPSR);
+}
+
+
+/** \brief Get APSR Register
+
+ This function returns the content of the APSR Register.
+
+ \return APSR Register value
+ */
+__STATIC_INLINE uint32_t __get_APSR(void)
+{
+ register uint32_t __regAPSR __ASM("apsr"); /* armcc named-register variable */
+ return(__regAPSR);
+}
+
+
+/** \brief Get xPSR Register
+
+ This function returns the content of the xPSR Register.
+
+ \return xPSR Register value
+ */
+__STATIC_INLINE uint32_t __get_xPSR(void)
+{
+ register uint32_t __regXPSR __ASM("xpsr"); /* armcc named-register variable */
+ return(__regXPSR);
+}
+
+
+/** \brief Get Process Stack Pointer
+
+ This function returns the current value of the Process Stack Pointer (PSP).
+
+ \return PSP Register value
+ */
+__STATIC_INLINE uint32_t __get_PSP(void)
+{
+ register uint32_t __regProcessStackPointer __ASM("psp"); /* armcc named-register variable */
+ return(__regProcessStackPointer);
+}
+
+
+/** \brief Set Process Stack Pointer
+
+ This function assigns the given value to the Process Stack Pointer (PSP).
+
+ \param [in] topOfProcStack Process Stack Pointer value to set
+ */
+__STATIC_INLINE void __set_PSP(uint32_t topOfProcStack)
+{
+ register uint32_t __regProcessStackPointer __ASM("psp"); /* armcc named-register variable */
+ __regProcessStackPointer = topOfProcStack;
+}
+
+
+/** \brief Get Main Stack Pointer
+
+ This function returns the current value of the Main Stack Pointer (MSP).
+
+ \return MSP Register value
+ */
+__STATIC_INLINE uint32_t __get_MSP(void)
+{
+ register uint32_t __regMainStackPointer __ASM("msp"); /* armcc named-register variable */
+ return(__regMainStackPointer);
+}
+
+
+/** \brief Set Main Stack Pointer
+
+ This function assigns the given value to the Main Stack Pointer (MSP).
+
+ \param [in] topOfMainStack Main Stack Pointer value to set
+ */
+__STATIC_INLINE void __set_MSP(uint32_t topOfMainStack)
+{
+ register uint32_t __regMainStackPointer __ASM("msp"); /* armcc named-register variable */
+ __regMainStackPointer = topOfMainStack;
+}
+
+
+/** \brief Get Priority Mask
+
+ This function returns the current state of the priority mask bit from the Priority Mask Register.
+
+ \return Priority Mask value
+ */
+__STATIC_INLINE uint32_t __get_PRIMASK(void)
+{
+ register uint32_t __regPriMask __ASM("primask"); /* armcc named-register variable */
+ return(__regPriMask);
+}
+
+
+/** \brief Set Priority Mask
+
+ This function assigns the given value to the Priority Mask Register.
+
+ \param [in] priMask Priority Mask
+ */
+__STATIC_INLINE void __set_PRIMASK(uint32_t priMask)
+{
+ register uint32_t __regPriMask __ASM("primask"); /* armcc named-register variable */
+ __regPriMask = (priMask);
+}
+
+
+#if (__CORTEX_M >= 0x03) || (__CORTEX_SC >= 300)
+
+/** \brief Enable FIQ
+
+ This function enables FIQ interrupts by clearing the F-bit in the CPSR.
+ Can only be executed in Privileged modes.
+ */
+#define __enable_fault_irq __enable_fiq
+
+
+/** \brief Disable FIQ
+
+ This function disables FIQ interrupts by setting the F-bit in the CPSR.
+ Can only be executed in Privileged modes.
+ */
+#define __disable_fault_irq __disable_fiq
+
+
+/** \brief Get Base Priority
+
+ This function returns the current value of the Base Priority register.
+
+ \return Base Priority register value
+ */
+__STATIC_INLINE uint32_t __get_BASEPRI(void)
+{
+ register uint32_t __regBasePri __ASM("basepri"); /* armcc named-register variable */
+ return(__regBasePri);
+}
+
+
+/** \brief Set Base Priority
+
+ This function assigns the given value to the Base Priority register.
+
+ \param [in] basePri Base Priority value to set
+ */
+__STATIC_INLINE void __set_BASEPRI(uint32_t basePri)
+{
+ register uint32_t __regBasePri __ASM("basepri");
+ __regBasePri = (basePri & 0xff); /* only the low byte is significant */
+}
+
+
+/** \brief Set Base Priority with condition
+
+ This function assigns the given value to the Base Priority register only if BASEPRI masking is disabled,
+ or the new value increases the BASEPRI priority level.
+
+ \param [in] basePri Base Priority value to set
+ */
+__STATIC_INLINE void __set_BASEPRI_MAX(uint32_t basePri)
+{
+ register uint32_t __regBasePriMax __ASM("basepri_max"); /* basepri_max: conditional write, see doc comment above */
+ __regBasePriMax = (basePri & 0xff); /* only the low byte is significant */
+}
+
+
+/** \brief Get Fault Mask
+
+ This function returns the current value of the Fault Mask register.
+
+ \return Fault Mask register value
+ */
+__STATIC_INLINE uint32_t __get_FAULTMASK(void)
+{
+ register uint32_t __regFaultMask __ASM("faultmask"); /* armcc named-register variable */
+ return(__regFaultMask);
+}
+
+
+/** \brief Set Fault Mask
+
+ This function assigns the given value to the Fault Mask register.
+
+ \param [in] faultMask Fault Mask value to set
+ */
+__STATIC_INLINE void __set_FAULTMASK(uint32_t faultMask)
+{
+ register uint32_t __regFaultMask __ASM("faultmask");
+ __regFaultMask = (faultMask & (uint32_t)1); /* only bit 0 is used */
+}
+
+#endif /* (__CORTEX_M >= 0x03) || (__CORTEX_SC >= 300) */
+
+
+#if (__CORTEX_M == 0x04) || (__CORTEX_M == 0x07)
+
+/** \brief Get FPSCR
+
+ This function returns the current value of the Floating Point Status/Control register.
+
+ \return Floating Point Status/Control register value
+ */
+__STATIC_INLINE uint32_t __get_FPSCR(void)
+{
+#if (__FPU_PRESENT == 1) && (__FPU_USED == 1)
+ register uint32_t __regfpscr __ASM("fpscr"); /* armcc named-register variable */
+ return(__regfpscr);
+#else
+ return(0); /* no FPU configured: report zero */
+#endif
+}
+
+
+/** \brief Set FPSCR
+
+ This function assigns the given value to the Floating Point Status/Control register.
+
+ \param [in] fpscr Floating Point Status/Control value to set
+ */
+__STATIC_INLINE void __set_FPSCR(uint32_t fpscr)
+{
+#if (__FPU_PRESENT == 1) && (__FPU_USED == 1)
+ register uint32_t __regfpscr __ASM("fpscr");
+ __regfpscr = (fpscr); /* write is compiled out when no FPU is configured */
+#endif
+}
+
+#endif /* (__CORTEX_M == 0x04) || (__CORTEX_M == 0x07) */
+
+
+#elif defined ( __GNUC__ ) /*------------------ GNU Compiler ---------------------*/
+/* GNU gcc specific functions */
+
+/** \brief Enable IRQ Interrupts
+
+ This function enables IRQ interrupts by clearing the I-bit in the CPSR.
+ Can only be executed in Privileged modes.
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE void __enable_irq(void)
+{
+  __ASM volatile ("cpsie i" : : : "memory"); /* CPSIE i: clear PRIMASK; "memory" clobber orders surrounding accesses */
+}
+
+
+/** \brief Disable IRQ Interrupts
+
+ This function disables IRQ interrupts by setting the I-bit in the CPSR.
+ Can only be executed in Privileged modes.
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE void __disable_irq(void)
+{
+  __ASM volatile ("cpsid i" : : : "memory"); /* CPSID i: set PRIMASK; "memory" clobber orders surrounding accesses */
+}
+
+
+/** \brief Get Control Register
+
+ This function returns the content of the Control Register.
+
+ \return Control Register value
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __get_CONTROL(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, control" : "=r" (result) ); /* MRS: read special register into a GP register */
+  return(result);
+}
+
+
+/** \brief Set Control Register
+
+ This function writes the given value to the Control Register.
+
+ \param [in] control Control Register value to set
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE void __set_CONTROL(uint32_t control)
+{
+  __ASM volatile ("MSR control, %0" : : "r" (control) : "memory"); /* MSR: write special register */
+}
+
+
+/** \brief Get IPSR Register
+
+ This function returns the content of the IPSR Register.
+
+ \return IPSR Register value
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __get_IPSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, ipsr" : "=r" (result) ); /* MRS: read special register */
+  return(result);
+}
+
+
+/** \brief Get APSR Register
+
+ This function returns the content of the APSR Register.
+
+ \return APSR Register value
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __get_APSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, apsr" : "=r" (result) ); /* MRS: read special register */
+  return(result);
+}
+
+
+/** \brief Get xPSR Register
+
+ This function returns the content of the xPSR Register.
+
+ \return xPSR Register value
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __get_xPSR(void)
+{
+  uint32_t result;
+
+  __ASM volatile ("MRS %0, xpsr" : "=r" (result) ); /* MRS: read special register */
+  return(result);
+}
+
+
+/** \brief Get Process Stack Pointer
+
+ This function returns the current value of the Process Stack Pointer (PSP).
+
+ \return PSP Register value
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __get_PSP(void)
+{
+ register uint32_t result;
+
+ __ASM volatile ("MRS %0, psp\n" : "=r" (result) );
+ return(result);
+}
+
+
+/** \brief Set Process Stack Pointer
+
+ This function assigns the given value to the Process Stack Pointer (PSP).
+
+ \param [in] topOfProcStack Process Stack Pointer value to set
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE void __set_PSP(uint32_t topOfProcStack)
+{
+ __ASM volatile ("MSR psp, %0\n" : : "r" (topOfProcStack) : "sp");
+}
+
+
+/** \brief Get Main Stack Pointer
+
+ This function returns the current value of the Main Stack Pointer (MSP).
+
+ \return MSP Register value
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __get_MSP(void)
+{
+ register uint32_t result;
+
+ __ASM volatile ("MRS %0, msp\n" : "=r" (result) );
+ return(result);
+}
+
+
+/** \brief Set Main Stack Pointer
+
+ This function assigns the given value to the Main Stack Pointer (MSP).
+
+ \param [in] topOfMainStack Main Stack Pointer value to set
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE void __set_MSP(uint32_t topOfMainStack)
+{
+ __ASM volatile ("MSR msp, %0\n" : : "r" (topOfMainStack) : "sp");
+}
+
+
+/** \brief Get Priority Mask
+
+ This function returns the current state of the priority mask bit from the Priority Mask Register.
+
+ \return Priority Mask value
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __get_PRIMASK(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, primask" : "=r" (result) );
+ return(result);
+}
+
+
+/** \brief Set Priority Mask
+
+ This function assigns the given value to the Priority Mask Register.
+
+ \param [in] priMask Priority Mask
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE void __set_PRIMASK(uint32_t priMask)
+{
+ __ASM volatile ("MSR primask, %0" : : "r" (priMask) : "memory");
+}
+
+
+#if (__CORTEX_M >= 0x03)
+
+/** \brief Enable FIQ
+
+ This function enables FIQ interrupts by clearing the F-bit in the CPSR.
+ Can only be executed in Privileged modes.
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE void __enable_fault_irq(void)
+{
+ __ASM volatile ("cpsie f" : : : "memory");
+}
+
+
+/** \brief Disable FIQ
+
+ This function disables FIQ interrupts by setting the F-bit in the CPSR.
+ Can only be executed in Privileged modes.
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE void __disable_fault_irq(void)
+{
+ __ASM volatile ("cpsid f" : : : "memory");
+}
+
+
+/** \brief Get Base Priority
+
+ This function returns the current value of the Base Priority register.
+
+ \return Base Priority register value
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __get_BASEPRI(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, basepri" : "=r" (result) );
+ return(result);
+}
+
+
+/** \brief Set Base Priority
+
+ This function assigns the given value to the Base Priority register.
+
+ \param [in] basePri Base Priority value to set
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE void __set_BASEPRI(uint32_t value)
+{
+ __ASM volatile ("MSR basepri, %0" : : "r" (value) : "memory");
+}
+
+
+/** \brief Set Base Priority with condition
+
+ This function assigns the given value to the Base Priority register only if BASEPRI masking is disabled,
+ or the new value increases the BASEPRI priority level.
+
+ \param [in] basePri Base Priority value to set
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE void __set_BASEPRI_MAX(uint32_t value)
+{
+ __ASM volatile ("MSR basepri_max, %0" : : "r" (value) : "memory");
+}
+
+
+/** \brief Get Fault Mask
+
+ This function returns the current value of the Fault Mask register.
+
+ \return Fault Mask register value
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __get_FAULTMASK(void)
+{
+ uint32_t result;
+
+ __ASM volatile ("MRS %0, faultmask" : "=r" (result) );
+ return(result);
+}
+
+
+/** \brief Set Fault Mask
+
+ This function assigns the given value to the Fault Mask register.
+
+ \param [in] faultMask Fault Mask value to set
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE void __set_FAULTMASK(uint32_t faultMask)
+{
+ __ASM volatile ("MSR faultmask, %0" : : "r" (faultMask) : "memory");
+}
+
+#endif /* (__CORTEX_M >= 0x03) */
+
+
+#if (__CORTEX_M == 0x04) || (__CORTEX_M == 0x07)
+
+/** \brief Get FPSCR
+
+ This function returns the current value of the Floating Point Status/Control register.
+
+ \return Floating Point Status/Control register value
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __get_FPSCR(void)
+{
+#if (__FPU_PRESENT == 1) && (__FPU_USED == 1)
+ uint32_t result;
+
+ /* Empty asm statement works as a scheduling barrier */
+ __ASM volatile ("");
+ __ASM volatile ("VMRS %0, fpscr" : "=r" (result) );
+ __ASM volatile ("");
+ return(result);
+#else
+ return(0);
+#endif
+}
+
+
+/** \brief Set FPSCR
+
+ This function assigns the given value to the Floating Point Status/Control register.
+
+ \param [in] fpscr Floating Point Status/Control value to set
+ */
+__attribute__( ( always_inline ) ) __STATIC_INLINE void __set_FPSCR(uint32_t fpscr)
+{
+#if (__FPU_PRESENT == 1) && (__FPU_USED == 1)
+ /* Empty asm statement works as a scheduling barrier */
+ __ASM volatile ("");
+ __ASM volatile ("VMSR fpscr, %0" : : "r" (fpscr) : "vfpcc");
+ __ASM volatile ("");
+#endif
+}
+
+#endif /* (__CORTEX_M == 0x04) || (__CORTEX_M == 0x07) */
+
+
+#elif defined ( __ICCARM__ ) /*------------------ ICC Compiler -------------------*/
+/* IAR iccarm specific functions */
+#include <cmsis_iar.h>
+
+
+#elif defined ( __TMS470__ ) /*---------------- TI CCS Compiler ------------------*/
+/* TI CCS specific functions */
+#include <cmsis_ccs.h>
+
+
+#elif defined ( __TASKING__ ) /*------------------ TASKING Compiler --------------*/
+/* TASKING carm specific functions */
+/*
+ * The CMSIS functions have been implemented as intrinsics in the compiler.
+ * Please use "carm -?i" to get an up to date list of all intrinsics,
+ * Including the CMSIS ones.
+ */
+
+
+#elif defined ( __CSMC__ ) /*------------------ COSMIC Compiler -------------------*/
+/* Cosmic specific functions */
+#include <cmsis_csm.h>
+
+#endif
+
+/*@} end of CMSIS_Core_RegAccFunctions */
+
+#endif /* __CORE_CMFUNC_H */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/core_cmInstr.h b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/core_cmInstr.h
new file mode 100644
index 0000000000..fca425c51d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/core_cmInstr.h
@@ -0,0 +1,916 @@
+/**************************************************************************//**
+ * @file core_cmInstr.h
+ * @brief CMSIS Cortex-M Core Instruction Access Header File
+ * @version V4.10
+ * @date 18. March 2015
+ *
+ * @note
+ *
+ ******************************************************************************/
+/* Copyright (c) 2009 - 2014 ARM LIMITED
+
+ All rights reserved.
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of ARM nor the names of its contributors may be used
+ to endorse or promote products derived from this software without
+ specific prior written permission.
+ *
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS AND CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+ ---------------------------------------------------------------------------*/
+
+
+#ifndef __CORE_CMINSTR_H
+#define __CORE_CMINSTR_H
+
+
+/* ########################## Core Instruction Access ######################### */
+/** \defgroup CMSIS_Core_InstructionInterface CMSIS Core Instruction Interface
+ Access to dedicated instructions
+ @{
+*/
+
+#if defined ( __CC_ARM ) /*------------------RealView Compiler -----------------*/
+/* ARM armcc specific functions */
+
+#if (__ARMCC_VERSION < 400677)
+ #error "Please use ARM Compiler Toolchain V4.0.677 or later!"
+#endif
+
+
+/** \brief No Operation
+
+ No Operation does nothing. This instruction can be used for code alignment purposes.
+ */
+#define __NOP __nop
+
+
+/** \brief Wait For Interrupt
+
+ Wait For Interrupt is a hint instruction that suspends execution
+ until one of a number of events occurs.
+ */
+#define __WFI __wfi
+
+
+/** \brief Wait For Event
+
+ Wait For Event is a hint instruction that permits the processor to enter
+ a low-power state until one of a number of events occurs.
+ */
+#define __WFE __wfe
+
+
+/** \brief Send Event
+
+ Send Event is a hint instruction. It causes an event to be signaled to the CPU.
+ */
+#define __SEV __sev
+
+
+/** \brief Instruction Synchronization Barrier
+
+ Instruction Synchronization Barrier flushes the pipeline in the processor,
+ so that all instructions following the ISB are fetched from cache or
+ memory, after the instruction has been completed.
+ */
+#define __ISB() do {\
+ __schedule_barrier();\
+ __isb(0xF);\
+ __schedule_barrier();\
+ } while (0)
+
+/** \brief Data Synchronization Barrier
+
+ This function acts as a special kind of Data Memory Barrier.
+ It completes when all explicit memory accesses before this instruction complete.
+ */
+#define __DSB() do {\
+ __schedule_barrier();\
+ __dsb(0xF);\
+ __schedule_barrier();\
+ } while (0)
+
+/** \brief Data Memory Barrier
+
+ This function ensures the apparent order of the explicit memory operations before
+ and after the instruction, without ensuring their completion.
+ */
+#define __DMB() do {\
+ __schedule_barrier();\
+ __dmb(0xF);\
+ __schedule_barrier();\
+ } while (0)
+
+/** \brief Reverse byte order (32 bit)
+
+ This function reverses the byte order in integer value.
+
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+#define __REV __rev
+
+
+/** \brief Reverse byte order (16 bit)
+
+ This function reverses the byte order in two unsigned short values.
+
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+#ifndef __NO_EMBEDDED_ASM
+__attribute__((section(".rev16_text"))) __STATIC_INLINE __ASM uint32_t __REV16(uint32_t value)
+{
+ rev16 r0, r0
+ bx lr
+}
+#endif
+
+/** \brief Reverse byte order in signed short value
+
+ This function reverses the byte order in a signed short value with sign extension to integer.
+
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+#ifndef __NO_EMBEDDED_ASM
+__attribute__((section(".revsh_text"))) __STATIC_INLINE __ASM int32_t __REVSH(int32_t value)
+{
+ revsh r0, r0
+ bx lr
+}
+#endif
+
+
+/** \brief Rotate Right in unsigned value (32 bit)
+
+ This function Rotate Right (immediate) provides the value of the contents of a register rotated by a variable number of bits.
+
+ \param [in] value Value to rotate
+ \param [in] value Number of Bits to rotate
+ \return Rotated value
+ */
+#define __ROR __ror
+
+
+/** \brief Breakpoint
+
+ This function causes the processor to enter Debug state.
+ Debug tools can use this to investigate system state when the instruction at a particular address is reached.
+
+ \param [in] value is ignored by the processor.
+ If required, a debugger can use it to store additional information about the breakpoint.
+ */
+#define __BKPT(value) __breakpoint(value)
+
+
+/** \brief Reverse bit order of value
+
+ This function reverses the bit order of the given value.
+
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+#if (__CORTEX_M >= 0x03) || (__CORTEX_SC >= 300)
+ #define __RBIT __rbit
+#else
+__attribute__((always_inline)) __STATIC_INLINE uint32_t __RBIT(uint32_t value)
+{
+ uint32_t result;
+ int32_t s = 4 /*sizeof(v)*/ * 8 - 1; // extra shift needed at end
+
+ result = value; // r will be reversed bits of v; first get LSB of v
+ for (value >>= 1; value; value >>= 1)
+ {
+ result <<= 1;
+ result |= value & 1;
+ s--;
+ }
+ result <<= s; // shift when v's highest bits are zero
+ return(result);
+}
+#endif
+
+
+/** \brief Count leading zeros
+
+ This function counts the number of leading zeros of a data value.
+
+ \param [in] value Value to count the leading zeros
+ \return number of leading zeros in value
+ */
+#define __CLZ __clz
+
+
+#if (__CORTEX_M >= 0x03) || (__CORTEX_SC >= 300)
+
+/** \brief LDR Exclusive (8 bit)
+
+ This function executes a exclusive LDR instruction for 8 bit value.
+
+ \param [in] ptr Pointer to data
+ \return value of type uint8_t at (*ptr)
+ */
+#define __LDREXB(ptr) ((uint8_t ) __ldrex(ptr))
+
+
+/** \brief LDR Exclusive (16 bit)
+
+ This function executes a exclusive LDR instruction for 16 bit values.
+
+ \param [in] ptr Pointer to data
+ \return value of type uint16_t at (*ptr)
+ */
+#define __LDREXH(ptr) ((uint16_t) __ldrex(ptr))
+
+
+/** \brief LDR Exclusive (32 bit)
+
+ This function executes a exclusive LDR instruction for 32 bit values.
+
+ \param [in] ptr Pointer to data
+ \return value of type uint32_t at (*ptr)
+ */
+#define __LDREXW(ptr) ((uint32_t ) __ldrex(ptr))
+
+
+/** \brief STR Exclusive (8 bit)
+
+ This function executes a exclusive STR instruction for 8 bit values.
+
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ \return 0 Function succeeded
+ \return 1 Function failed
+ */
+#define __STREXB(value, ptr) __strex(value, ptr)
+
+
+/** \brief STR Exclusive (16 bit)
+
+ This function executes a exclusive STR instruction for 16 bit values.
+
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ \return 0 Function succeeded
+ \return 1 Function failed
+ */
+#define __STREXH(value, ptr) __strex(value, ptr)
+
+
+/** \brief STR Exclusive (32 bit)
+
+ This function executes a exclusive STR instruction for 32 bit values.
+
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ \return 0 Function succeeded
+ \return 1 Function failed
+ */
+#define __STREXW(value, ptr) __strex(value, ptr)
+
+
+/** \brief Remove the exclusive lock
+
+ This function removes the exclusive lock which is created by LDREX.
+
+ */
+#define __CLREX __clrex
+
+
+/** \brief Signed Saturate
+
+ This function saturates a signed value.
+
+ \param [in] value Value to be saturated
+ \param [in] sat Bit position to saturate to (1..32)
+ \return Saturated value
+ */
+#define __SSAT __ssat
+
+
+/** \brief Unsigned Saturate
+
+ This function saturates an unsigned value.
+
+ \param [in] value Value to be saturated
+ \param [in] sat Bit position to saturate to (0..31)
+ \return Saturated value
+ */
+#define __USAT __usat
+
+
+/** \brief Rotate Right with Extend (32 bit)
+
+ This function moves each bit of a bitstring right by one bit.
+ The carry input is shifted in at the left end of the bitstring.
+
+ \param [in] value Value to rotate
+ \return Rotated value
+ */
+#ifndef __NO_EMBEDDED_ASM
+__attribute__((section(".rrx_text"))) __STATIC_INLINE __ASM uint32_t __RRX(uint32_t value)
+{
+ rrx r0, r0
+ bx lr
+}
+#endif
+
+
+/** \brief LDRT Unprivileged (8 bit)
+
+ This function executes a Unprivileged LDRT instruction for 8 bit value.
+
+ \param [in] ptr Pointer to data
+ \return value of type uint8_t at (*ptr)
+ */
+#define __LDRBT(ptr) ((uint8_t ) __ldrt(ptr))
+
+
+/** \brief LDRT Unprivileged (16 bit)
+
+ This function executes a Unprivileged LDRT instruction for 16 bit values.
+
+ \param [in] ptr Pointer to data
+ \return value of type uint16_t at (*ptr)
+ */
+#define __LDRHT(ptr) ((uint16_t) __ldrt(ptr))
+
+
+/** \brief LDRT Unprivileged (32 bit)
+
+ This function executes a Unprivileged LDRT instruction for 32 bit values.
+
+ \param [in] ptr Pointer to data
+ \return value of type uint32_t at (*ptr)
+ */
+#define __LDRT(ptr) ((uint32_t ) __ldrt(ptr))
+
+
+/** \brief STRT Unprivileged (8 bit)
+
+ This function executes a Unprivileged STRT instruction for 8 bit values.
+
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ */
+#define __STRBT(value, ptr) __strt(value, ptr)
+
+
+/** \brief STRT Unprivileged (16 bit)
+
+ This function executes a Unprivileged STRT instruction for 16 bit values.
+
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ */
+#define __STRHT(value, ptr) __strt(value, ptr)
+
+
+/** \brief STRT Unprivileged (32 bit)
+
+ This function executes a Unprivileged STRT instruction for 32 bit values.
+
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ */
+#define __STRT(value, ptr) __strt(value, ptr)
+
+#endif /* (__CORTEX_M >= 0x03) || (__CORTEX_SC >= 300) */
+
+
+#elif defined ( __GNUC__ ) /*------------------ GNU Compiler ---------------------*/
+/* GNU gcc specific functions */
+
+/* Define macros for porting to both thumb1 and thumb2.
+ * For thumb1, use low register (r0-r7), specified by constrant "l"
+ * Otherwise, use general registers, specified by constrant "r" */
+#if defined (__thumb__) && !defined (__thumb2__)
+#define __CMSIS_GCC_OUT_REG(r) "=l" (r)
+#define __CMSIS_GCC_USE_REG(r) "l" (r)
+#else
+#define __CMSIS_GCC_OUT_REG(r) "=r" (r)
+#define __CMSIS_GCC_USE_REG(r) "r" (r)
+#endif
+
+/** \brief No Operation
+
+ No Operation does nothing. This instruction can be used for code alignment purposes.
+ */
+__attribute__((always_inline)) __STATIC_INLINE void __NOP(void)
+{
+ __ASM volatile ("nop");
+}
+
+
+/** \brief Wait For Interrupt
+
+ Wait For Interrupt is a hint instruction that suspends execution
+ until one of a number of events occurs.
+ */
+__attribute__((always_inline)) __STATIC_INLINE void __WFI(void)
+{
+ __ASM volatile ("wfi");
+}
+
+
+/** \brief Wait For Event
+
+ Wait For Event is a hint instruction that permits the processor to enter
+ a low-power state until one of a number of events occurs.
+ */
+__attribute__((always_inline)) __STATIC_INLINE void __WFE(void)
+{
+ __ASM volatile ("wfe");
+}
+
+
+/** \brief Send Event
+
+ Send Event is a hint instruction. It causes an event to be signaled to the CPU.
+ */
+__attribute__((always_inline)) __STATIC_INLINE void __SEV(void)
+{
+ __ASM volatile ("sev");
+}
+
+
+/** \brief Instruction Synchronization Barrier
+
+ Instruction Synchronization Barrier flushes the pipeline in the processor,
+ so that all instructions following the ISB are fetched from cache or
+ memory, after the instruction has been completed.
+ */
+__attribute__((always_inline)) __STATIC_INLINE void __ISB(void)
+{
+ __ASM volatile ("isb 0xF":::"memory");
+}
+
+
+/** \brief Data Synchronization Barrier
+
+ This function acts as a special kind of Data Memory Barrier.
+ It completes when all explicit memory accesses before this instruction complete.
+ */
+__attribute__((always_inline)) __STATIC_INLINE void __DSB(void)
+{
+ __ASM volatile ("dsb 0xF":::"memory");
+}
+
+
+/** \brief Data Memory Barrier
+
+ This function ensures the apparent order of the explicit memory operations before
+ and after the instruction, without ensuring their completion.
+ */
+__attribute__((always_inline)) __STATIC_INLINE void __DMB(void)
+{
+ __ASM volatile ("dmb 0xF":::"memory");
+}
+
+
+/** \brief Reverse byte order (32 bit)
+
+ This function reverses the byte order in integer value.
+
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint32_t __REV(uint32_t value)
+{
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5)
+ return __builtin_bswap32(value);
+#else
+ uint32_t result;
+
+ __ASM volatile ("rev %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );
+ return(result);
+#endif
+}
+
+
+/** \brief Reverse byte order (16 bit)
+
+ This function reverses the byte order in two unsigned short values.
+
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint32_t __REV16(uint32_t value)
+{
+ uint32_t result;
+
+ __ASM volatile ("rev16 %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );
+ return(result);
+}
+
+
+/** \brief Reverse byte order in signed short value
+
+ This function reverses the byte order in a signed short value with sign extension to integer.
+
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+__attribute__((always_inline)) __STATIC_INLINE int32_t __REVSH(int32_t value)
+{
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+ return (short)__builtin_bswap16(value);
+#else
+ uint32_t result;
+
+ __ASM volatile ("revsh %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );
+ return(result);
+#endif
+}
+
+
+/** \brief Rotate Right in unsigned value (32 bit)
+
+ This function Rotate Right (immediate) provides the value of the contents of a register rotated by a variable number of bits.
+
+ \param [in] value Value to rotate
+ \param [in] value Number of Bits to rotate
+ \return Rotated value
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint32_t __ROR(uint32_t op1, uint32_t op2)
+{
+ return (op1 >> op2) | (op1 << (32 - op2));
+}
+
+
+/** \brief Breakpoint
+
+ This function causes the processor to enter Debug state.
+ Debug tools can use this to investigate system state when the instruction at a particular address is reached.
+
+ \param [in] value is ignored by the processor.
+ If required, a debugger can use it to store additional information about the breakpoint.
+ */
+#define __BKPT(value) __ASM volatile ("bkpt "#value)
+
+
+/** \brief Reverse bit order of value
+
+ This function reverses the bit order of the given value.
+
+ \param [in] value Value to reverse
+ \return Reversed value
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint32_t __RBIT(uint32_t value)
+{
+ uint32_t result;
+
+#if (__CORTEX_M >= 0x03) || (__CORTEX_SC >= 300)
+ __ASM volatile ("rbit %0, %1" : "=r" (result) : "r" (value) );
+#else
+ int32_t s = 4 /*sizeof(v)*/ * 8 - 1; // extra shift needed at end
+
+ result = value; // r will be reversed bits of v; first get LSB of v
+ for (value >>= 1; value; value >>= 1)
+ {
+ result <<= 1;
+ result |= value & 1;
+ s--;
+ }
+ result <<= s; // shift when v's highest bits are zero
+#endif
+ return(result);
+}
+
+
+/** \brief Count leading zeros
+
+ This function counts the number of leading zeros of a data value.
+
+ \param [in] value Value to count the leading zeros
+ \return number of leading zeros in value
+ */
+#define __CLZ __builtin_clz
+
+
+#if (__CORTEX_M >= 0x03) || (__CORTEX_SC >= 300)
+
+/** \brief LDR Exclusive (8 bit)
+
+ This function executes a exclusive LDR instruction for 8 bit value.
+
+ \param [in] ptr Pointer to data
+ \return value of type uint8_t at (*ptr)
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint8_t __LDREXB(volatile uint8_t *addr)
+{
+ uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+ __ASM volatile ("ldrexb %0, %1" : "=r" (result) : "Q" (*addr) );
+#else
+ /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+ accepted by assembler. So has to use following less efficient pattern.
+ */
+ __ASM volatile ("ldrexb %0, [%1]" : "=r" (result) : "r" (addr) : "memory" );
+#endif
+ return ((uint8_t) result); /* Add explicit type cast here */
+}
+
+
+/** \brief LDR Exclusive (16 bit)
+
+ This function executes a exclusive LDR instruction for 16 bit values.
+
+ \param [in] ptr Pointer to data
+ \return value of type uint16_t at (*ptr)
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint16_t __LDREXH(volatile uint16_t *addr)
+{
+ uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+ __ASM volatile ("ldrexh %0, %1" : "=r" (result) : "Q" (*addr) );
+#else
+ /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+ accepted by assembler. So has to use following less efficient pattern.
+ */
+ __ASM volatile ("ldrexh %0, [%1]" : "=r" (result) : "r" (addr) : "memory" );
+#endif
+ return ((uint16_t) result); /* Add explicit type cast here */
+}
+
+
+/** \brief LDR Exclusive (32 bit)
+
+ This function executes a exclusive LDR instruction for 32 bit values.
+
+ \param [in] ptr Pointer to data
+ \return value of type uint32_t at (*ptr)
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint32_t __LDREXW(volatile uint32_t *addr)
+{
+ uint32_t result;
+
+ __ASM volatile ("ldrex %0, %1" : "=r" (result) : "Q" (*addr) );
+ return(result);
+}
+
+
+/** \brief STR Exclusive (8 bit)
+
+ This function executes a exclusive STR instruction for 8 bit values.
+
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ \return 0 Function succeeded
+ \return 1 Function failed
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint32_t __STREXB(uint8_t value, volatile uint8_t *addr)
+{
+ uint32_t result;
+
+ __ASM volatile ("strexb %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" ((uint32_t)value) );
+ return(result);
+}
+
+
+/** \brief STR Exclusive (16 bit)
+
+ This function executes a exclusive STR instruction for 16 bit values.
+
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ \return 0 Function succeeded
+ \return 1 Function failed
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint32_t __STREXH(uint16_t value, volatile uint16_t *addr)
+{
+ uint32_t result;
+
+ __ASM volatile ("strexh %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" ((uint32_t)value) );
+ return(result);
+}
+
+
+/** \brief STR Exclusive (32 bit)
+
+ This function executes a exclusive STR instruction for 32 bit values.
+
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ \return 0 Function succeeded
+ \return 1 Function failed
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint32_t __STREXW(uint32_t value, volatile uint32_t *addr)
+{
+ uint32_t result;
+
+ __ASM volatile ("strex %0, %2, %1" : "=&r" (result), "=Q" (*addr) : "r" (value) );
+ return(result);
+}
+
+
+/** \brief Remove the exclusive lock
+
+ This function removes the exclusive lock which is created by LDREX.
+
+ */
+__attribute__((always_inline)) __STATIC_INLINE void __CLREX(void)
+{
+ __ASM volatile ("clrex" ::: "memory");
+}
+
+
+/** \brief Signed Saturate
+
+ This function saturates a signed value.
+
+ \param [in] value Value to be saturated
+ \param [in] sat Bit position to saturate to (1..32)
+ \return Saturated value
+ */
+#define __SSAT(ARG1,ARG2) \
+({ \
+ uint32_t __RES, __ARG1 = (ARG1); \
+ __ASM ("ssat %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) ); \
+ __RES; \
+ })
+
+
+/** \brief Unsigned Saturate
+
+ This function saturates an unsigned value.
+
+ \param [in] value Value to be saturated
+ \param [in] sat Bit position to saturate to (0..31)
+ \return Saturated value
+ */
+#define __USAT(ARG1,ARG2) \
+({ \
+ uint32_t __RES, __ARG1 = (ARG1); \
+ __ASM ("usat %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) ); \
+ __RES; \
+ })
+
+
+/** \brief Rotate Right with Extend (32 bit)
+
+ This function moves each bit of a bitstring right by one bit.
+ The carry input is shifted in at the left end of the bitstring.
+
+ \param [in] value Value to rotate
+ \return Rotated value
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint32_t __RRX(uint32_t value)
+{
+ uint32_t result;
+
+ __ASM volatile ("rrx %0, %1" : __CMSIS_GCC_OUT_REG (result) : __CMSIS_GCC_USE_REG (value) );
+ return(result);
+}
+
+
+/** \brief LDRT Unprivileged (8 bit)
+
+ This function executes a Unprivileged LDRT instruction for 8 bit value.
+
+ \param [in] ptr Pointer to data
+ \return value of type uint8_t at (*ptr)
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint8_t __LDRBT(volatile uint8_t *addr)
+{
+ uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+ __ASM volatile ("ldrbt %0, %1" : "=r" (result) : "Q" (*addr) );
+#else
+ /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+ accepted by assembler. So has to use following less efficient pattern.
+ */
+ __ASM volatile ("ldrbt %0, [%1]" : "=r" (result) : "r" (addr) : "memory" );
+#endif
+ return ((uint8_t) result); /* Add explicit type cast here */
+}
+
+
+/** \brief LDRT Unprivileged (16 bit)
+
+ This function executes a Unprivileged LDRT instruction for 16 bit values.
+
+ \param [in] ptr Pointer to data
+ \return value of type uint16_t at (*ptr)
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint16_t __LDRHT(volatile uint16_t *addr)
+{
+ uint32_t result;
+
+#if (__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)
+ __ASM volatile ("ldrht %0, %1" : "=r" (result) : "Q" (*addr) );
+#else
+ /* Prior to GCC 4.8, "Q" will be expanded to [rx, #0] which is not
+ accepted by assembler. So has to use following less efficient pattern.
+ */
+ __ASM volatile ("ldrht %0, [%1]" : "=r" (result) : "r" (addr) : "memory" );
+#endif
+ return ((uint16_t) result); /* Add explicit type cast here */
+}
+
+
+/** \brief LDRT Unprivileged (32 bit)
+
+ This function executes a Unprivileged LDRT instruction for 32 bit values.
+
+ \param [in] ptr Pointer to data
+ \return value of type uint32_t at (*ptr)
+ */
+__attribute__((always_inline)) __STATIC_INLINE uint32_t __LDRT(volatile uint32_t *addr)
+{
+ uint32_t result;
+
+ __ASM volatile ("ldrt %0, %1" : "=r" (result) : "Q" (*addr) );
+ return(result);
+}
+
+
+/** \brief STRT Unprivileged (8 bit)
+
+ This function executes a Unprivileged STRT instruction for 8 bit values.
+
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ */
+__attribute__((always_inline)) __STATIC_INLINE void __STRBT(uint8_t value, volatile uint8_t *addr)
+{
+ __ASM volatile ("strbt %1, %0" : "=Q" (*addr) : "r" ((uint32_t)value) );
+}
+
+
+/** \brief STRT Unprivileged (16 bit)
+
+ This function executes a Unprivileged STRT instruction for 16 bit values.
+
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ */
+__attribute__((always_inline)) __STATIC_INLINE void __STRHT(uint16_t value, volatile uint16_t *addr)
+{
+ __ASM volatile ("strht %1, %0" : "=Q" (*addr) : "r" ((uint32_t)value) );
+}
+
+
+/** \brief STRT Unprivileged (32 bit)
+
+ This function executes a Unprivileged STRT instruction for 32 bit values.
+
+ \param [in] value Value to store
+ \param [in] ptr Pointer to location
+ */
+__attribute__((always_inline)) __STATIC_INLINE void __STRT(uint32_t value, volatile uint32_t *addr)
+{
+ __ASM volatile ("strt %1, %0" : "=Q" (*addr) : "r" (value) );
+}
+
+#endif /* (__CORTEX_M >= 0x03) || (__CORTEX_SC >= 300) */
+
+
+#elif defined ( __ICCARM__ ) /*------------------ ICC Compiler -------------------*/
+/* IAR iccarm specific functions */
+#include <cmsis_iar.h>
+
+
+#elif defined ( __TMS470__ ) /*---------------- TI CCS Compiler ------------------*/
+/* TI CCS specific functions */
+#include <cmsis_ccs.h>
+
+
+#elif defined ( __TASKING__ ) /*------------------ TASKING Compiler --------------*/
+/* TASKING carm specific functions */
+/*
+ * The CMSIS functions have been implemented as intrinsics in the compiler.
+ * Please use "carm -?i" to get an up to date list of all intrinsics,
+ * Including the CMSIS ones.
+ */
+
+
+#elif defined ( __CSMC__ ) /*------------------ COSMIC Compiler -------------------*/
+/* Cosmic specific functions */
+#include <cmsis_csm.h>
+
+#endif
+
+/*@}*/ /* end of group CMSIS_Core_InstructionInterface */
+
+#endif /* __CORE_CMINSTR_H */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/core_cmSimd.h b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/core_cmSimd.h
new file mode 100644
index 0000000000..7b8e37fff6
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/Include/core_cmSimd.h
@@ -0,0 +1,697 @@
+/**************************************************************************//**
+ * @file core_cmSimd.h
+ * @brief CMSIS Cortex-M SIMD Header File
+ * @version V4.10
+ * @date 18. March 2015
+ *
+ * @note
+ *
+ ******************************************************************************/
+/* Copyright (c) 2009 - 2014 ARM LIMITED
+
+ All rights reserved.
+ Redistribution and use in source and binary forms, with or without
+ modification, are permitted provided that the following conditions are met:
+ - Redistributions of source code must retain the above copyright
+ notice, this list of conditions and the following disclaimer.
+ - Redistributions in binary form must reproduce the above copyright
+ notice, this list of conditions and the following disclaimer in the
+ documentation and/or other materials provided with the distribution.
+ - Neither the name of ARM nor the names of its contributors may be used
+ to endorse or promote products derived from this software without
+ specific prior written permission.
+ *
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ ARE DISCLAIMED. IN NO EVENT SHALL COPYRIGHT HOLDERS AND CONTRIBUTORS BE
+ LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ POSSIBILITY OF SUCH DAMAGE.
+ ---------------------------------------------------------------------------*/
+
+
+#if defined ( __ICCARM__ )
+ #pragma system_include /* treat file as system include file for MISRA check */
+#endif
+
+#ifndef __CORE_CMSIMD_H
+#define __CORE_CMSIMD_H
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+
+/*******************************************************************************
+ * Hardware Abstraction Layer
+ ******************************************************************************/
+
+
+/* ################### Compiler specific Intrinsics ########################### */
+/** \defgroup CMSIS_SIMD_intrinsics CMSIS SIMD Intrinsics
+ Access to dedicated SIMD instructions
+ @{
+*/
+
+#if defined ( __CC_ARM ) /*------------------RealView Compiler -----------------*/
+/* ARM armcc specific functions */
+#define __SADD8 __sadd8
+#define __QADD8 __qadd8
+#define __SHADD8 __shadd8
+#define __UADD8 __uadd8
+#define __UQADD8 __uqadd8
+#define __UHADD8 __uhadd8
+#define __SSUB8 __ssub8
+#define __QSUB8 __qsub8
+#define __SHSUB8 __shsub8
+#define __USUB8 __usub8
+#define __UQSUB8 __uqsub8
+#define __UHSUB8 __uhsub8
+#define __SADD16 __sadd16
+#define __QADD16 __qadd16
+#define __SHADD16 __shadd16
+#define __UADD16 __uadd16
+#define __UQADD16 __uqadd16
+#define __UHADD16 __uhadd16
+#define __SSUB16 __ssub16
+#define __QSUB16 __qsub16
+#define __SHSUB16 __shsub16
+#define __USUB16 __usub16
+#define __UQSUB16 __uqsub16
+#define __UHSUB16 __uhsub16
+#define __SASX __sasx
+#define __QASX __qasx
+#define __SHASX __shasx
+#define __UASX __uasx
+#define __UQASX __uqasx
+#define __UHASX __uhasx
+#define __SSAX __ssax
+#define __QSAX __qsax
+#define __SHSAX __shsax
+#define __USAX __usax
+#define __UQSAX __uqsax
+#define __UHSAX __uhsax
+#define __USAD8 __usad8
+#define __USADA8 __usada8
+#define __SSAT16 __ssat16
+#define __USAT16 __usat16
+#define __UXTB16 __uxtb16
+#define __UXTAB16 __uxtab16
+#define __SXTB16 __sxtb16
+#define __SXTAB16 __sxtab16
+#define __SMUAD __smuad
+#define __SMUADX __smuadx
+#define __SMLAD __smlad
+#define __SMLADX __smladx
+#define __SMLALD __smlald
+#define __SMLALDX __smlaldx
+#define __SMUSD __smusd
+#define __SMUSDX __smusdx
+#define __SMLSD __smlsd
+#define __SMLSDX __smlsdx
+#define __SMLSLD __smlsld
+#define __SMLSLDX __smlsldx
+#define __SEL __sel
+#define __QADD __qadd
+#define __QSUB __qsub
+
+#define __PKHBT(ARG1,ARG2,ARG3) ( ((((uint32_t)(ARG1)) ) & 0x0000FFFFUL) | \
+ ((((uint32_t)(ARG2)) << (ARG3)) & 0xFFFF0000UL) )
+
+#define __PKHTB(ARG1,ARG2,ARG3) ( ((((uint32_t)(ARG1)) ) & 0xFFFF0000UL) | \
+ ((((uint32_t)(ARG2)) >> (ARG3)) & 0x0000FFFFUL) )
+
+#define __SMMLA(ARG1,ARG2,ARG3) ( (int32_t)((((int64_t)(ARG1) * (ARG2)) + \
+ ((int64_t)(ARG3) << 32) ) >> 32))
+
+
+#elif defined ( __GNUC__ ) /*------------------ GNU Compiler ---------------------*/
+/* GNU gcc specific functions */
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SADD8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("sadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __QADD8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("qadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SHADD8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("shadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UADD8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UQADD8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uqadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UHADD8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uhadd8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SSUB8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("ssub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __QSUB8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("qsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SHSUB8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("shsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __USUB8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("usub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UQSUB8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uqsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UHSUB8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uhsub8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SADD16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("sadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __QADD16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("qadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SHADD16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("shadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UADD16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UQADD16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uqadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UHADD16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uhadd16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SSUB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("ssub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __QSUB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("qsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SHSUB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("shsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __USUB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("usub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UQSUB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uqsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UHSUB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uhsub16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SASX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("sasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __QASX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("qasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SHASX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("shasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UASX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UQASX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uqasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UHASX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uhasx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SSAX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("ssax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __QSAX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("qsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SHSAX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("shsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __USAX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("usax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UQSAX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uqsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UHSAX(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uhsax %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __USAD8(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("usad8 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __USADA8(uint32_t op1, uint32_t op2, uint32_t op3)
+{
+ uint32_t result;
+
+ __ASM volatile ("usada8 %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+ return(result);
+}
+
+#define __SSAT16(ARG1,ARG2) \
+({ \
+ uint32_t __RES, __ARG1 = (ARG1); \
+ __ASM ("ssat16 %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) ); \
+ __RES; \
+ })
+
+#define __USAT16(ARG1,ARG2) \
+({ \
+ uint32_t __RES, __ARG1 = (ARG1); \
+ __ASM ("usat16 %0, %1, %2" : "=r" (__RES) : "I" (ARG2), "r" (__ARG1) ); \
+ __RES; \
+ })
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UXTB16(uint32_t op1)
+{
+ uint32_t result;
+
+ __ASM volatile ("uxtb16 %0, %1" : "=r" (result) : "r" (op1));
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __UXTAB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("uxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SXTB16(uint32_t op1)
+{
+ uint32_t result;
+
+ __ASM volatile ("sxtb16 %0, %1" : "=r" (result) : "r" (op1));
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SXTAB16(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("sxtab16 %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SMUAD (uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("smuad %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SMUADX (uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("smuadx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SMLAD (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+ uint32_t result;
+
+ __ASM volatile ("smlad %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SMLADX (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+ uint32_t result;
+
+ __ASM volatile ("smladx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint64_t __SMLALD (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+ union llreg_u{
+ uint32_t w32[2];
+ uint64_t w64;
+ } llr;
+ llr.w64 = acc;
+
+#ifndef __ARMEB__ // Little endian
+ __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else // Big endian
+ __ASM volatile ("smlald %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+ return(llr.w64);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint64_t __SMLALDX (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+ union llreg_u{
+ uint32_t w32[2];
+ uint64_t w64;
+ } llr;
+ llr.w64 = acc;
+
+#ifndef __ARMEB__ // Little endian
+ __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else // Big endian
+ __ASM volatile ("smlaldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+ return(llr.w64);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SMUSD (uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("smusd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SMUSDX (uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("smusdx %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SMLSD (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+ uint32_t result;
+
+ __ASM volatile ("smlsd %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SMLSDX (uint32_t op1, uint32_t op2, uint32_t op3)
+{
+ uint32_t result;
+
+ __ASM volatile ("smlsdx %0, %1, %2, %3" : "=r" (result) : "r" (op1), "r" (op2), "r" (op3) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint64_t __SMLSLD (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+ union llreg_u{
+ uint32_t w32[2];
+ uint64_t w64;
+ } llr;
+ llr.w64 = acc;
+
+#ifndef __ARMEB__ // Little endian
+ __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else // Big endian
+ __ASM volatile ("smlsld %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+ return(llr.w64);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint64_t __SMLSLDX (uint32_t op1, uint32_t op2, uint64_t acc)
+{
+ union llreg_u{
+ uint32_t w32[2];
+ uint64_t w64;
+ } llr;
+ llr.w64 = acc;
+
+#ifndef __ARMEB__ // Little endian
+ __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[0]), "=r" (llr.w32[1]): "r" (op1), "r" (op2) , "0" (llr.w32[0]), "1" (llr.w32[1]) );
+#else // Big endian
+ __ASM volatile ("smlsldx %0, %1, %2, %3" : "=r" (llr.w32[1]), "=r" (llr.w32[0]): "r" (op1), "r" (op2) , "0" (llr.w32[1]), "1" (llr.w32[0]) );
+#endif
+
+ return(llr.w64);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SEL (uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("sel %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __QADD(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("qadd %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __QSUB(uint32_t op1, uint32_t op2)
+{
+ uint32_t result;
+
+ __ASM volatile ("qsub %0, %1, %2" : "=r" (result) : "r" (op1), "r" (op2) );
+ return(result);
+}
+
+#define __PKHBT(ARG1,ARG2,ARG3) \
+({ \
+ uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
+ __ASM ("pkhbt %0, %1, %2, lsl %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \
+ __RES; \
+ })
+
+#define __PKHTB(ARG1,ARG2,ARG3) \
+({ \
+ uint32_t __RES, __ARG1 = (ARG1), __ARG2 = (ARG2); \
+ if (ARG3 == 0) \
+ __ASM ("pkhtb %0, %1, %2" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2) ); \
+ else \
+ __ASM ("pkhtb %0, %1, %2, asr %3" : "=r" (__RES) : "r" (__ARG1), "r" (__ARG2), "I" (ARG3) ); \
+ __RES; \
+ })
+
+__attribute__( ( always_inline ) ) __STATIC_INLINE uint32_t __SMMLA (int32_t op1, int32_t op2, int32_t op3)
+{
+ int32_t result;
+
+ __ASM volatile ("smmla %0, %1, %2, %3" : "=r" (result): "r" (op1), "r" (op2), "r" (op3) );
+ return(result);
+}
+
+
+#elif defined ( __ICCARM__ ) /*------------------ ICC Compiler -------------------*/
+/* IAR iccarm specific functions */
+#include <cmsis_iar.h>
+
+
+#elif defined ( __TMS470__ ) /*---------------- TI CCS Compiler ------------------*/
+/* TI CCS specific functions */
+#include <cmsis_ccs.h>
+
+
+#elif defined ( __TASKING__ ) /*------------------ TASKING Compiler --------------*/
+/* TASKING carm specific functions */
+/* not yet supported */
+
+
+#elif defined ( __CSMC__ ) /*------------------ COSMIC Compiler -------------------*/
+/* Cosmic specific functions */
+#include <cmsis_csm.h>
+
+#endif
+
+/*@} end of group CMSIS_SIMD_intrinsics */
+
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __CORE_CMSIMD_H */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/LICENSE.txt b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/LICENSE.txt
new file mode 100644
index 0000000000..f0cd2d9ccf
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/CMSIS/LICENSE.txt
@@ -0,0 +1,176 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
\ No newline at end of file
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Include/apm32f4xx.h b/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Include/apm32f4xx.h
new file mode 100644
index 0000000000..c859ea2fb5
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Include/apm32f4xx.h
@@ -0,0 +1,7492 @@
+/*!
+ * @file apm32f4xx.h
+ *
+ * @brief CMSIS Cortex-M4 Device Peripheral Access Layer Header File.
+ *
+ * @details This file contains all the peripheral register's definitions, bits definitions and memory mapping
+ *
+ * @version V1.0.2
+ *
+ * @date 2022-06-23
+ *
+ * @attention
+ *
+ * Copyright (C) 2021-2022 Geehy Semiconductor
+ *
+ * You may not use this file except in compliance with the
+ * GEEHY COPYRIGHT NOTICE (GEEHY SOFTWARE PACKAGE LICENSE).
+ *
+ * The program is only for reference, which is distributed in the hope
+ * that it will be usefull and instructional for customers to develop
+ * their software. Unless required by applicable law or agreed to in
+ * writing, the program is distributed on an "AS IS" BASIS, WITHOUT
+ * ANY WARRANTY OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the GEEHY SOFTWARE PACKAGE LICENSE for the governing permissions
+ * and limitations under the License.
+ */
+
+#ifndef __APM32F4XX_H
+#define __APM32F4XX_H
+
+#ifdef __cplusplus
+ extern "C" {
+#endif /* __cplusplus */
+
+/** @addtogroup CMSIS
+ @{
+*/
+
+/** @defgroup APM32F4xx
+ * @brief Peripheral Access Layer
+ @{
+*/
+
+/** @defgroup HSE_Macros
+ @{
+*/
+
+/**
+ * @brief Define Value of the External oscillator in Hz
+ */
+#ifndef HSE_VALUE
+#define HSE_VALUE ((uint32_t)8000000)
+#endif
+
+/** Time out for HSE start up */
+#define HSE_STARTUP_TIMEOUT ((uint16_t)0x05000)
+
+/** Value of the Internal oscillator in Hz */
+#define HSI_VALUE ((uint32_t)16000000)
+
+/**
+ * @}
+ */
+
+/** @defgroup APM32F4xx_StdPeripheral_Library_Version
+ @{
+*/
+
+/**
+ * @brief Library_Version_Number_Macros
+ */
+#define __APM32F4XX_STDPERIPH_VERSION_MAIN (0x01) /*!< [31:24] main version */
+#define __APM32F4XX_STDPERIPH_VERSION_SUB1 (0x00) /*!< [23:16] sub1 version */
+#define __APM32F4XX_STDPERIPH_VERSION_SUB2 (0x02) /*!< [15:8] sub2 version */
+#define __APM32F4XX_STDPERIPH_VERSION_RC (0x00) /*!< [7:0] release candidate */
+#define __APM32F4XX_STDPERIPH_VERSION ((__APM32F4XX_STDPERIPH_VERSION_MAIN << 24)\
+ |(__APM32F4XX_STDPERIPH_VERSION_SUB1 << 16)\
+ |(__APM32F4XX_STDPERIPH_VERSION_SUB2 << 8)\
+ |(__APM32F4XX_STDPERIPH_VERSION_RC))
+
+/**
+ * @}
+ */
+
+/** @defgroup Configuraion_for_CMSIS
+ @{
+*/
+
+/** Core revision r0p1 */
+#define __CM4_REV 0x0001
+/** APM32 devices provides an MPU */
+#define __MPU_PRESENT 1
+/** APM32 uses 4 Bits for the Priority Levels */
+#define __NVIC_PRIO_BITS 4
+/** Set to 1 if different SysTick Config is used */
+#define __Vendor_SysTickConfig 0
+/** APM32 devices provides an FPU */
+#define __FPU_PRESENT 1
+
+/**
+ * @brief APM32F4xx Interrupt Number Definition, according to the selected device
+ * in @ref Library_configuration_section
+ */
+typedef enum IRQn
+{
+ /****** Cortex-M4 Processor Exceptions Numbers ****************************************************************/
+ NonMaskableInt_IRQn = -14, /*!< 2 Non Maskable Interrupt */
+ MemoryManagement_IRQn = -12, /*!< 4 Cortex-M4 Memory Management Interrupt */
+ BusFault_IRQn = -11, /*!< 5 Cortex-M4 Bus Fault Interrupt */
+ UsageFault_IRQn = -10, /*!< 6 Cortex-M4 Usage Fault Interrupt */
+ SVCall_IRQn = -5, /*!< 11 Cortex-M4 SV Call Interrupt */
+ DebugMonitor_IRQn = -4, /*!< 12 Cortex-M4 Debug Monitor Interrupt */
+ PendSV_IRQn = -2, /*!< 14 Cortex-M4 Pend SV Interrupt */
+ SysTick_IRQn = -1, /*!< 15 Cortex-M4 System Tick Interrupt */
+
+ /****** APM32 specific Interrupt Numbers **********************************************************************/
+ WWDT_IRQn = 0, /*!< Window WatchDog Interrupt */
+ PVD_IRQn = 1, /*!< PVD through EINT Line detection Interrupt */
+ TAMP_STAMP_IRQn = 2, /*!< Tamper and TimeStamp interrupts through the EINT line */
+ RTC_WKUP_IRQn = 3, /*!< RTC Wakeup interrupt through the EINT line */
+ FLASH_IRQn = 4, /*!< FLASH global Interrupt */
+ RCM_IRQn = 5, /*!< RCM global Interrupt */
+ EINT0_IRQn = 6, /*!< EINT Line0 Interrupt */
+ EINT1_IRQn = 7, /*!< EINT Line1 Interrupt */
+ EINT2_IRQn = 8, /*!< EINT Line2 Interrupt */
+ EINT3_IRQn = 9, /*!< EINT Line3 Interrupt */
+ EINT4_IRQn = 10, /*!< EINT Line4 Interrupt */
+ DMA1_STR0_IRQn = 11, /*!< DMA1 Stream 0 global Interrupt */
+ DMA1_STR1_IRQn = 12, /*!< DMA1 Stream 1 global Interrupt */
+ DMA1_STR2_IRQn = 13, /*!< DMA1 Stream 2 global Interrupt */
+ DMA1_STR3_IRQn = 14, /*!< DMA1 Stream 3 global Interrupt */
+ DMA1_STR4_IRQn = 15, /*!< DMA1 Stream 4 global Interrupt */
+ DMA1_STR5_IRQn = 16, /*!< DMA1 Stream 5 global Interrupt */
+ DMA1_STR6_IRQn = 17, /*!< DMA1 Stream 6 global Interrupt */
+
+#if defined(APM32F40X)
+ ADC_IRQn = 18, /*!< ADC Interrupt */
+ CAN1_TX_IRQn = 19, /*!< CAN1 TX Interrupt */
+ CAN1_RX0_IRQn = 20, /*!< CAN1 RX0 Interrupt */
+ CAN1_RX1_IRQn = 21, /*!< CAN1 RX1 Interrupt */
+ CAN1_SCE_IRQn = 22, /*!< CAN1 SCE Interrupt */
+ EINT9_5_IRQn = 23, /*!< External Line[9:5] Interrupts */
+ TMR1_BRK_TMR9_IRQn = 24, /*!< TMR1 Break interrupt and TMR9 global interrupt */
+ TMR1_UP_TMR10_IRQn = 25, /*!< TMR1 Update Interrupt and TMR10 global interrupt */
+ TMR1_TRG_COM_TMR11_IRQn = 26, /*!< TMR1 Trigger and Commutation Interrupt and TMR11 global interrupt */
+ TMR1_CC_IRQn = 27, /*!< TMR1 Capture Compare Interrupt */
+ TMR2_IRQn = 28, /*!< TMR2 global Interrupt */
+ TMR3_IRQn = 29, /*!< TMR3 global Interrupt */
+ TMR4_IRQn = 30, /*!< TMR4 global Interrupt */
+ I2C1_EV_IRQn = 31, /*!< I2C1 Event Interrupt */
+ I2C1_ER_IRQn = 32, /*!< I2C1 Error Interrupt */
+ I2C2_EV_IRQn = 33, /*!< I2C2 Event Interrupt */
+ I2C2_ER_IRQn = 34, /*!< I2C2 Error Interrupt */
+ SPI1_IRQn = 35, /*!< SPI1 global Interrupt */
+ SPI2_IRQn = 36, /*!< SPI2 global Interrupt */
+ USART1_IRQn = 37, /*!< USART1 global Interrupt */
+ USART2_IRQn = 38, /*!< USART2 global Interrupt */
+ USART3_IRQn = 39, /*!< USART3 global Interrupt */
+ EINT15_10_IRQn = 40, /*!< External Line[15:10] Interrupts */
+ RTC_Alarm_IRQn = 41, /*!< RTC Alarm (A and B) through EINT Line Interrupt */
+ OTG_FS_WKUP_IRQn = 42, /*!< OTG_FS Wakeup through EINT line interrupt */
+ TMR8_BRK_TMR12_IRQn = 43, /*!< TMR8 Break Interrupt and TMR12 global interrupt */
+ TMR8_UP_TMR13_IRQn = 44, /*!< TMR8 Update Interrupt and TMR13 global interrupt */
+ TMR8_TRG_COM_TMR14_IRQn = 45, /*!< TMR8 Trigger and Commutation Interrupt and TMR14 global interrupt */
+ TMR8_CC_IRQn = 46, /*!< TMR8 Capture Compare Interrupt */
+ DMA1_STR7_IRQn = 47, /*!< DMA1 Stream 7 Interrupt */
+ EMMC_IRQn = 48, /*!< EMMC (FSMC) global Interrupt */
+ SDIO_IRQn = 49, /*!< SDIO global Interrupt */
+ TMR5_IRQn = 50, /*!< TMR5 global Interrupt */
+ SPI3_IRQn = 51, /*!< SPI3 global Interrupt */
+ UART4_IRQn = 52, /*!< UART4 global Interrupt */
+ UART5_IRQn = 53, /*!< UART5 global Interrupt */
+ TMR6_DAC_IRQn = 54, /*!< TMR6 global and DAC1&2 underrun error interrupts */
+ TMR7_IRQn = 55, /*!< TMR7 global interrupt */
+ DMA2_STR0_IRQn = 56, /*!< DMA2 Stream 0 global Interrupt */
+ DMA2_STR1_IRQn = 57, /*!< DMA2 Stream 1 global Interrupt */
+ DMA2_STR2_IRQn = 58, /*!< DMA2 Stream 2 global Interrupt */
+ DMA2_STR3_IRQn = 59, /*!< DMA2 Stream 3 global Interrupt */
+ DMA2_STR4_IRQn = 60, /*!< DMA2 Stream 4 global Interrupt */
+ ETH_IRQn = 61, /*!< Ethernet global Interrupt */
+ ETH_WKUP_IRQn = 62, /*!< Ethernet Wakeup through EINT line Interrupt */
+ CAN2_TX_IRQn = 63, /*!< CAN2 TX Interrupt */
+ CAN2_RX0_IRQn = 64, /*!< CAN2 RX0 Interrupt */
+ CAN2_RX1_IRQn = 65, /*!< CAN2 RX1 Interrupt */
+ CAN2_SCE_IRQn = 66, /*!< CAN2 SCE Interrupt */
+ OTG_FS_IRQn = 67, /*!< OTG_FS global Interrupt */
+ DMA2_STR5_IRQn = 68, /*!< DMA2 Stream 5 global interrupt */
+ DMA2_STR6_IRQn = 69, /*!< DMA2 Stream 6 global interrupt */
+ DMA2_STR7_IRQn = 70, /*!< DMA2 Stream 7 global interrupt */
+ USART6_IRQn = 71, /*!< USART6 global interrupt */
+ I2C3_EV_IRQn = 72, /*!< I2C3 event interrupt */
+ I2C3_ER_IRQn = 73, /*!< I2C3 error interrupt */
+ OTG_HS1_EP1_OUT_IRQn = 74, /*!< OTG_HS1 End Point 1 Out global interrupt */
+ OTG_HS1_EP1_IN_IRQn = 75, /*!< OTG_HS1 End Point 1 In global interrupt */
+ OTG_HS1_WKUP_IRQn = 76, /*!< OTG_HS1 Wakeup through EINT interrupt */
+ OTG_HS1_IRQn = 77, /*!< OTG_HS1 global interrupt */
+ DCI_IRQn = 78, /*!< DCMI global interrupt */
+ FPU_IRQn = 81, /*!< FPU global interrupt */
+ SM3_IRQn = 82, /*!< SM3 global interrupt */
+ SM4_IRQn = 83, /*!< SM4 global interrupt */
+ BN_IRQn = 84 /*!< BN global interrupt */
+#endif /* APM32F40x */
+
+#if defined(APM32F41X)
+ ADC_IRQn = 18, /*!< ADC Interrupt */
+ CAN1_TX_IRQn = 19, /*!< CAN1 TX Interrupt */
+ CAN1_RX0_IRQn = 20, /*!< CAN1 RX0 Interrupt */
+ CAN1_RX1_IRQn = 21, /*!< CAN1 RX1 Interrupt */
+ CAN1_SCE_IRQn = 22, /*!< CAN1 SCE Interrupt */
+ EINT9_5_IRQn = 23, /*!< External Line[9:5] Interrupts */
+ TMR1_BRK_TMR9_IRQn = 24, /*!< TMR1 Break interrupt and TMR9 global interrupt */
+ TMR1_UP_TMR10_IRQn = 25, /*!< TMR1 Update Interrupt and TMR10 global interrupt */
+ TMR1_TRG_COM_TMR11_IRQn = 26, /*!< TMR1 Trigger and Commutation Interrupt and TMR11 global interrupt */
+ TMR1_CC_IRQn = 27, /*!< TMR1 Capture Compare Interrupt */
+ TMR2_IRQn = 28, /*!< TMR2 global Interrupt */
+ TMR3_IRQn = 29, /*!< TMR3 global Interrupt */
+ TMR4_IRQn = 30, /*!< TMR4 global Interrupt */
+ I2C1_EV_IRQn = 31, /*!< I2C1 Event Interrupt */
+ I2C1_ER_IRQn = 32, /*!< I2C1 Error Interrupt */
+ I2C2_EV_IRQn = 33, /*!< I2C2 Event Interrupt */
+ I2C2_ER_IRQn = 34, /*!< I2C2 Error Interrupt */
+ SPI1_IRQn = 35, /*!< SPI1 global Interrupt */
+ SPI2_IRQn = 36, /*!< SPI2 global Interrupt */
+ USART1_IRQn = 37, /*!< USART1 global Interrupt */
+ USART2_IRQn = 38, /*!< USART2 global Interrupt */
+ USART3_IRQn = 39, /*!< USART3 global Interrupt */
+ EINT15_10_IRQn = 40, /*!< External Line[15:10] Interrupts */
+ RTC_Alarm_IRQn = 41, /*!< RTC Alarm (A and B) through EINT Line Interrupt */
+ OTG_FS_WKUP_IRQn = 42, /*!< OTG_FS Wakeup through EINT line interrupt */
+ TMR8_BRK_TMR12_IRQn = 43, /*!< TMR8 Break Interrupt and TMR12 global interrupt */
+ TMR8_UP_TMR13_IRQn = 44, /*!< TMR8 Update Interrupt and TMR13 global interrupt */
+ TMR8_TRG_COM_TMR14_IRQn = 45, /*!< TMR8 Trigger and Commutation Interrupt and TMR14 global interrupt */
+ TMR8_CC_IRQn = 46, /*!< TMR8 Capture Compare Interrupt */
+ DMA1_STR7_IRQn = 47, /*!< DMA1 Stream 7 Interrupt */
+ EMMC_IRQn = 48, /*!< EMMC (FSMC) global Interrupt */
+ SDIO_IRQn = 49, /*!< SDIO global Interrupt */
+ TMR5_IRQn = 50, /*!< TMR5 global Interrupt */
+ SPI3_IRQn = 51, /*!< SPI3 global Interrupt */
+ UART4_IRQn = 52, /*!< UART4 global Interrupt */
+ UART5_IRQn = 53, /*!< UART5 global Interrupt */
+ TMR6_DAC_IRQn = 54, /*!< TMR6 global and DAC1&2 underrun error interrupts */
+ TMR7_IRQn = 55, /*!< TMR7 global interrupt */
+ DMA2_STR0_IRQn = 56, /*!< DMA2 Stream 0 global Interrupt */
+ DMA2_STR1_IRQn = 57, /*!< DMA2 Stream 1 global Interrupt */
+ DMA2_STR2_IRQn = 58, /*!< DMA2 Stream 2 global Interrupt */
+ DMA2_STR3_IRQn = 59, /*!< DMA2 Stream 3 global Interrupt */
+ DMA2_STR4_IRQn = 60, /*!< DMA2 Stream 4 global Interrupt */
+ ETH_IRQn = 61, /*!< Ethernet global Interrupt */
+ ETH_WKUP_IRQn = 62, /*!< Ethernet Wakeup through EINT line Interrupt */
+ CAN2_TX_IRQn = 63, /*!< CAN2 TX Interrupt */
+ CAN2_RX0_IRQn = 64, /*!< CAN2 RX0 Interrupt */
+ CAN2_RX1_IRQn = 65, /*!< CAN2 RX1 Interrupt */
+ CAN2_SCE_IRQn = 66, /*!< CAN2 SCE Interrupt */
+ OTG_FS_IRQn = 67, /*!< OTG_FS global Interrupt */
+ DMA2_STR5_IRQn = 68, /*!< DMA2 Stream 5 global interrupt */
+ DMA2_STR6_IRQn = 69, /*!< DMA2 Stream 6 global interrupt */
+ DMA2_STR7_IRQn = 70, /*!< DMA2 Stream 7 global interrupt */
+ USART6_IRQn = 71, /*!< USART6 global interrupt */
+ I2C3_EV_IRQn = 72, /*!< I2C3 event interrupt */
+ I2C3_ER_IRQn = 73, /*!< I2C3 error interrupt */
+ OTG_HS1_EP1_OUT_IRQn = 74, /*!< OTG_HS1 End Point 1 Out global interrupt */
+ OTG_HS1_EP1_IN_IRQn = 75, /*!< OTG_HS1 End Point 1 In global interrupt */
+ OTG_HS1_WKUP_IRQn = 76, /*!< OTG_HS1 Wakeup through EINT interrupt */
+ OTG_HS1_IRQn = 77, /*!< OTG_HS1 global interrupt */
+ DCI_IRQn = 78, /*!< DCMI global interrupt */
+ CRYP_IRQn = 79, /*!< CRYP crypto global interrupt */
+ HASH_RNG_IRQn = 80, /*!< Hash and Rng global interrupt */
+ FPU_IRQn = 81, /*!< FPU global interrupt */
+ SM3_IRQn = 82, /*!< SM3 global interrupt */
+ SM4_IRQn = 83, /*!< SM4 global interrupt */
+ BN_IRQn = 84 /*!< BN global interrupt */
+#endif /* APM32F41x */
+
+} IRQn_Type;
+
+/**
+ * @}
+ */
+
+#include "core_cm4.h"
+#include "system_apm32f4xx.h"
+#include <stdint.h>
+
+/** @defgroup Exported_Types
+ @{
+*/
+
+typedef int32_t s32;
+typedef int16_t s16;
+typedef int8_t s8;
+
+typedef const int32_t sc32;
+typedef const int16_t sc16;
+typedef const int8_t sc8;
+
+typedef __IO int32_t vs32;
+typedef __IO int16_t vs16;
+typedef __IO int8_t vs8;
+
+typedef __I int32_t vsc32;
+typedef __I int16_t vsc16;
+typedef __I int8_t vsc8;
+
+typedef uint32_t u32;
+typedef uint16_t u16;
+typedef uint8_t u8;
+
+typedef const uint32_t uc32;
+typedef const uint16_t uc16;
+typedef const uint8_t uc8;
+
+typedef __IO uint32_t vu32;
+typedef __IO uint16_t vu16;
+typedef __IO uint8_t vu8;
+
+typedef __I uint32_t vuc32;
+typedef __I uint16_t vuc16;
+typedef __I uint8_t vuc8;
+
+#ifndef __IM
+#define __IM __I
+#endif
+#ifndef __OM
+#define __OM __O
+#endif
+#ifndef __IOM
+#define __IOM __IO
+#endif
+
+enum {BIT_RESET, BIT_SET};
+enum {RESET, SET};
+enum {DISABLE, ENABLE};
+enum {ERROR, SUCCESS};
+
+#ifndef NULL
+#define NULL ((void *)0)
+#endif
+
+#if defined (__CC_ARM )
+#pragma anon_unions
+#endif
+
+/**
+ * @}
+ */
+
+/** @defgroup Peripheral_registers_structures
+ @{
+*/
+
+/**
+* @brief General purpose I/O (GPIO)
+*/
+typedef struct
+{
+ /** Port Mode configure register */
+ union
+ {
+ __IOM uint32_t MODE;
+
+ struct
+ {
+ __IOM uint32_t MODE0 : 2;
+ __IOM uint32_t MODE1 : 2;
+ __IOM uint32_t MODE2 : 2;
+ __IOM uint32_t MODE3 : 2;
+ __IOM uint32_t MODE4 : 2;
+ __IOM uint32_t MODE5 : 2;
+ __IOM uint32_t MODE6 : 2;
+ __IOM uint32_t MODE7 : 2;
+ __IOM uint32_t MODE8 : 2;
+ __IOM uint32_t MODE9 : 2;
+ __IOM uint32_t MODE10 : 2;
+ __IOM uint32_t MODE11 : 2;
+ __IOM uint32_t MODE12 : 2;
+ __IOM uint32_t MODE13 : 2;
+ __IOM uint32_t MODE14 : 2;
+ __IOM uint32_t MODE15 : 2;
+ } MODE_B;
+ } ;
+
+ /** Port Output Type configure register */
+ union
+ {
+ __IOM uint32_t OMODE;
+
+ struct
+ {
+ __IOM uint32_t OMODE0 : 1;
+ __IOM uint32_t OMODE1 : 1;
+ __IOM uint32_t OMODE2 : 1;
+ __IOM uint32_t OMODE3 : 1;
+ __IOM uint32_t OMODE4 : 1;
+ __IOM uint32_t OMODE5 : 1;
+ __IOM uint32_t OMODE6 : 1;
+ __IOM uint32_t OMODE7 : 1;
+ __IOM uint32_t OMODE8 : 1;
+ __IOM uint32_t OMODE9 : 1;
+ __IOM uint32_t OMODE10 : 1;
+ __IOM uint32_t OMODE11 : 1;
+ __IOM uint32_t OMODE12 : 1;
+ __IOM uint32_t OMODE13 : 1;
+ __IOM uint32_t OMODE14 : 1;
+ __IOM uint32_t OMODE15 : 1;
+ __IOM uint32_t RESERVED : 16;
+ } OMODE_B;
+ } ;
+
+ /** Port Output Speed configure register */
+ union
+ {
+ __IOM uint32_t OSSEL;
+
+ struct
+ {
+ __IOM uint32_t OSSEL0 : 2;
+ __IOM uint32_t OSSEL1 : 2;
+ __IOM uint32_t OSSEL2 : 2;
+ __IOM uint32_t OSSEL3 : 2;
+ __IOM uint32_t OSSEL4 : 2;
+ __IOM uint32_t OSSEL5 : 2;
+ __IOM uint32_t OSSEL6 : 2;
+ __IOM uint32_t OSSEL7 : 2;
+ __IOM uint32_t OSSEL8 : 2;
+ __IOM uint32_t OSSEL9 : 2;
+ __IOM uint32_t OSSEL10 : 2;
+ __IOM uint32_t OSSEL11 : 2;
+ __IOM uint32_t OSSEL12 : 2;
+ __IOM uint32_t OSSEL13 : 2;
+ __IOM uint32_t OSSEL14 : 2;
+ __IOM uint32_t OSSEL15 : 2;
+ } OSSEL_B;
+ } ;
+
+ /** port pull-up/pull-down register */
+ union
+ {
+ __IOM uint32_t PUPD;
+
+ struct
+ {
+ __IOM uint32_t PUPD0 : 2;
+ __IOM uint32_t PUPD1 : 2;
+ __IOM uint32_t PUPD2 : 2;
+ __IOM uint32_t PUPD3 : 2;
+ __IOM uint32_t PUPD4 : 2;
+ __IOM uint32_t PUPD5 : 2;
+ __IOM uint32_t PUPD6 : 2;
+ __IOM uint32_t PUPD7 : 2;
+ __IOM uint32_t PUPD8 : 2;
+ __IOM uint32_t PUPD9 : 2;
+ __IOM uint32_t PUPD10 : 2;
+ __IOM uint32_t PUPD11 : 2;
+ __IOM uint32_t PUPD12 : 2;
+ __IOM uint32_t PUPD13 : 2;
+ __IOM uint32_t PUPD14 : 2;
+ __IOM uint32_t PUPD15 : 2;
+ } PUPD_B;
+ } ;
+
+ /** Port data in register */
+ union
+ {
+ __IM uint32_t IDATA;
+
+ struct
+ {
+ __IM uint32_t IDATA0 : 1;
+ __IM uint32_t IDATA1 : 1;
+ __IM uint32_t IDATA2 : 1;
+ __IM uint32_t IDATA3 : 1;
+ __IM uint32_t IDATA4 : 1;
+ __IM uint32_t IDATA5 : 1;
+ __IM uint32_t IDATA6 : 1;
+ __IM uint32_t IDATA7 : 1;
+ __IM uint32_t IDATA8 : 1;
+ __IM uint32_t IDATA9 : 1;
+ __IM uint32_t IDATA10 : 1;
+ __IM uint32_t IDATA11 : 1;
+ __IM uint32_t IDATA12 : 1;
+ __IM uint32_t IDATA13 : 1;
+ __IM uint32_t IDATA14 : 1;
+ __IM uint32_t IDATA15 : 1;
+ __IM uint32_t RESERVED : 16;
+ } IDATA_B;
+ } ;
+
+ /** Port data output register */
+ union
+ {
+ __IOM uint32_t ODATA;
+
+ struct
+ {
+ __IOM uint32_t ODATA0 : 1;
+ __IOM uint32_t ODATA1 : 1;
+ __IOM uint32_t ODATA2 : 1;
+ __IOM uint32_t ODATA3 : 1;
+ __IOM uint32_t ODATA4 : 1;
+ __IOM uint32_t ODATA5 : 1;
+ __IOM uint32_t ODATA6 : 1;
+ __IOM uint32_t ODATA7 : 1;
+ __IOM uint32_t ODATA8 : 1;
+ __IOM uint32_t ODATA9 : 1;
+ __IOM uint32_t ODATA10 : 1;
+ __IOM uint32_t ODATA11 : 1;
+ __IOM uint32_t ODATA12 : 1;
+ __IOM uint32_t ODATA13 : 1;
+ __IOM uint32_t ODATA14 : 1;
+ __IOM uint32_t ODATA15 : 1;
+ __IM uint32_t RESERVED : 16;
+ } ODATA_B;
+ } ;
+
+ /**GPIO port bit set/clear register*/
+ union
+ {
+ __OM uint16_t BSCL;
+
+ struct
+ {
+ __OM uint16_t BS0 : 1;
+ __OM uint16_t BS1 : 1;
+ __OM uint16_t BS2 : 1;
+ __OM uint16_t BS3 : 1;
+ __OM uint16_t BS4 : 1;
+ __OM uint16_t BS5 : 1;
+ __OM uint16_t BS6 : 1;
+ __OM uint16_t BS7 : 1;
+ __OM uint16_t BS8 : 1;
+ __OM uint16_t BS9 : 1;
+ __OM uint16_t BS10 : 1;
+ __OM uint16_t BS11 : 1;
+ __OM uint16_t BS12 : 1;
+ __OM uint16_t BS13 : 1;
+ __OM uint16_t BS14 : 1;
+ __OM uint16_t BS15 : 1;
+ } BSCL_B;
+ } ;
+
+ union
+ {
+ __OM uint16_t BSCH;
+
+ struct
+ {
+ __OM uint16_t BC0 : 1;
+ __OM uint16_t BC1 : 1;
+ __OM uint16_t BC2 : 1;
+ __OM uint16_t BC3 : 1;
+ __OM uint16_t BC4 : 1;
+ __OM uint16_t BC5 : 1;
+ __OM uint16_t BC6 : 1;
+ __OM uint16_t BC7 : 1;
+ __OM uint16_t BC8 : 1;
+ __OM uint16_t BC9 : 1;
+ __OM uint16_t BC10 : 1;
+ __OM uint16_t BC11 : 1;
+ __OM uint16_t BC12 : 1;
+ __OM uint16_t BC13 : 1;
+ __OM uint16_t BC14 : 1;
+ __OM uint16_t BC15 : 1;
+ } BSCH_B;
+ };
+
+ /** Port configuration lock register */
+ union
+ {
+ __IOM uint32_t LOCK;
+
+ struct
+ {
+ __IOM uint32_t LOCK0 : 1;
+ __IOM uint32_t LOCK1 : 1;
+ __IOM uint32_t LOCK2 : 1;
+ __IOM uint32_t LOCK3 : 1;
+ __IOM uint32_t LOCK4 : 1;
+ __IOM uint32_t LOCK5 : 1;
+ __IOM uint32_t LOCK6 : 1;
+ __IOM uint32_t LOCK7 : 1;
+ __IOM uint32_t LOCK8 : 1;
+ __IOM uint32_t LOCK9 : 1;
+ __IOM uint32_t LOCK10 : 1;
+ __IOM uint32_t LOCK11 : 1;
+ __IOM uint32_t LOCK12 : 1;
+ __IOM uint32_t LOCK13 : 1;
+ __IOM uint32_t LOCK14 : 1;
+ __IOM uint32_t LOCK15 : 1;
+ __IOM uint32_t LOCKKEY : 1;
+ __IM uint32_t RESERVED : 15;
+ } LOCK_B;
+ };
+
+ /** Port Alternate Function Low register */
+ union
+ {
+ __IOM uint32_t ALFL;
+
+ struct
+ {
+ __IOM uint32_t ALFSEL0 : 4;
+ __IOM uint32_t ALFSEL1 : 4;
+ __IOM uint32_t ALFSEL2 : 4;
+ __IOM uint32_t ALFSEL3 : 4;
+ __IOM uint32_t ALFSEL4 : 4;
+ __IOM uint32_t ALFSEL5 : 4;
+ __IOM uint32_t ALFSEL6 : 4;
+ __IOM uint32_t ALFSEL7 : 4;
+ } ALFL_B;
+ };
+
+ /** Port alternate function High register */
+ union
+ {
+ __IOM uint32_t ALFH;
+
+ struct
+ {
+ __IOM uint32_t ALFSEL8 : 4;
+ __IOM uint32_t ALFSEL9 : 4;
+ __IOM uint32_t ALFSEL10 : 4;
+ __IOM uint32_t ALFSEL11 : 4;
+ __IOM uint32_t ALFSEL12 : 4;
+ __IOM uint32_t ALFSEL13 : 4;
+ __IOM uint32_t ALFSEL14 : 4;
+ __IOM uint32_t ALFSEL15 : 4;
+ } ALFH_B;
+ };
+} GPIO_T;
+
+/**
+ * @brief Reset and clock management unit (RCM)
+ */
+typedef struct
+{
+ /** Clock control register */
+ union
+ {
+ __IOM uint32_t CTRL;
+
+ struct
+ {
+ __IOM uint32_t HSIEN : 1;
+ __IM uint32_t HSIRDYFLG : 1;
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t HSITRIM : 5;
+ __IM uint32_t HSICAL : 8;
+ __IOM uint32_t HSEEN : 1;
+ __IM uint32_t HSERDYFLG : 1;
+ __IOM uint32_t HSEBCFG : 1;
+ __IOM uint32_t CSSEN : 1;
+ __IM uint32_t RESERVED2 : 4;
+ __IOM uint32_t PLL1EN : 1;
+ __IM uint32_t PLL1RDYFLG : 1;
+ __IOM uint32_t PLL2EN : 1;
+ __IM uint32_t PLL2RDYFLG : 1;
+ __IM uint32_t RESERVED3 : 4;
+ } CTRL_B;
+ };
+
+ /** PLL1 configuration register */
+ union
+ {
+ __IOM uint32_t PLL1CFG;
+
+ struct
+ {
+ __IOM uint32_t PLLB : 6;
+ __IOM uint32_t PLL1A : 9;
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t PLL1C : 2;
+ __IM uint32_t RESERVED2 : 4;
+ __IOM uint32_t PLL1CLKS : 1;
+ __IM uint32_t RESERVED3 : 1;
+ __IOM uint32_t PLLD : 4;
+ __IM uint32_t RESERVED4 : 4;
+ } PLL1CFG_B;
+ } ;
+
+ /** Clock configuration register */
+ union
+ {
+ __IOM uint32_t CFG;
+
+ struct
+ {
+ __IOM uint32_t SCLKSEL : 2;
+ __IM uint32_t SCLKSWSTS : 2;
+ __IOM uint32_t AHBPSC : 4;
+ __IOM uint32_t SDRAMPSC : 2;
+ __IOM uint32_t APB1PSC : 3;
+ __IOM uint32_t APB2PSC : 3;
+ __IOM uint32_t RTCPSC : 5;
+ __IOM uint32_t MCO1SEL : 2;
+ __IOM uint32_t I2SSEL : 1;
+ __IOM uint32_t MCO1PRE : 3;
+ __IOM uint32_t MCO2PRE : 3;
+ __IOM uint32_t MCO2SEL : 2;
+ } CFG_B;
+ } ;
+
+ /** Clock interrupt control register */
+ union
+ {
+ __IOM uint32_t INT;
+
+ struct
+ {
+ __IM uint32_t LSIRDYFLG : 1;
+ __IM uint32_t LSERDYFLG : 1;
+ __IM uint32_t HSIRDYFLG : 1;
+ __IM uint32_t HSERDYFLG : 1;
+ __IM uint32_t PLL1RDYFLG : 1;
+ __IM uint32_t PLL2RDYFLG : 1;
+ __IM uint32_t RESERVED1 : 1;
+ __IM uint32_t CSSFLG : 1;
+ __IOM uint32_t LSIRDYEN : 1;
+ __IOM uint32_t LSERDYEN : 1;
+ __IOM uint32_t HSIRDYEN : 1;
+ __IOM uint32_t HSERDYEN : 1;
+ __IOM uint32_t PLL1RDYEN : 1;
+ __IOM uint32_t PLL2RDYEN : 1;
+ __IM uint32_t RESERVED2 : 2;
+ __OM uint32_t LSIRDYCLR : 1;
+ __OM uint32_t LSERDYCLR : 1;
+ __OM uint32_t HSIRDYCLR : 1;
+ __OM uint32_t HSERDYCLR : 1;
+ __OM uint32_t PLL1RDYCLR : 1;
+ __OM uint32_t PLL2RDYCLR : 1;
+ __IM uint32_t RESERVED3 : 1;
+ __OM uint32_t CSSCLR : 1;
+ __IM uint32_t RESERVED4 : 8;
+ } INT_B;
+ } ;
+
+ /** AHB1 peripheral reset register */
+ union
+ {
+ __IOM uint32_t AHB1RST;
+
+ struct
+ {
+ __IOM uint32_t PARST : 1;
+ __IOM uint32_t PBRST : 1;
+ __IOM uint32_t PCRST : 1;
+ __IOM uint32_t PDRST : 1;
+ __IOM uint32_t PERST : 1;
+ __IOM uint32_t PFRST : 1;
+ __IOM uint32_t PGRST : 1;
+ __IOM uint32_t PHRST : 1;
+ __IOM uint32_t PIRST : 1;
+ __IM uint32_t RESERVED1 : 3;
+ __IOM uint32_t CRCRST : 1;
+ __IM uint32_t RESERVED2 : 8;
+ __IOM uint32_t DMA1RST : 1;
+ __IOM uint32_t DMA2RST : 1;
+ __IM uint32_t RESERVED3 : 2;
+ __IOM uint32_t ETHRST : 1;
+ __IM uint32_t RESERVED4 : 3;
+ __IOM uint32_t OTG_HS1RST : 1;
+ __IM uint32_t RESERVED5 : 2;
+ } AHB1RST_B;
+ } ;
+
+ /** AHB2 peripheral reset register */
+ union
+ {
+ __IOM uint32_t AHB2RST;
+
+ struct
+ {
+ __IOM uint32_t DCIRST : 1;
+ __IM uint32_t RESERVED1 : 3;
+ __IOM uint32_t CRYPRST : 1;
+ __IOM uint32_t HASHP : 1;
+ __IOM uint32_t RNGRST : 1;
+ __IOM uint32_t OTG_FSRST : 1;
+ __IM uint32_t RESERVED2 : 24;
+ } AHB2RST_B;
+ } ;
+
+ /** AHB3 peripheral reset register */
+ union
+ {
+ __IOM uint32_t AHB3RST;
+
+ struct
+ {
+ __IOM uint32_t EMMCRST : 1;
+ __IM uint32_t RESERVED : 31;
+ } AHB3RST_B;
+ } ;
+
+ __IM uint32_t RESERVED;
+
+ /** APB1 peripheral reset register */
+ union
+ {
+ __IOM uint32_t APB1RST;
+
+ struct
+ {
+ __IOM uint32_t TMR2RST : 1;
+ __IOM uint32_t TMR3RST : 1;
+ __IOM uint32_t TMR4RST : 1;
+ __IOM uint32_t TMR5RST : 1;
+ __IOM uint32_t TMR6RST : 1;
+ __IOM uint32_t TMR7RST : 1;
+ __IOM uint32_t TMR12RST : 1;
+ __IOM uint32_t TMR13RST : 1;
+ __IOM uint32_t TMR14RST : 1;
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t WWDTRST : 1;
+ __IM uint32_t RESERVED2 : 2;
+ __IOM uint32_t SPI2RST : 1;
+ __IOM uint32_t SPI3RST : 1;
+ __IM uint32_t RESERVED3 : 1;
+ __IOM uint32_t USART2RST : 1;
+ __IOM uint32_t USART3RST : 1;
+ __IOM uint32_t UART4RST : 1;
+ __IOM uint32_t UART5RST : 1;
+ __IOM uint32_t I2C1RST : 1;
+ __IOM uint32_t I2C2RST : 1;
+ __IOM uint32_t I2C3RST : 1;
+ __IM uint32_t RESERVED4 : 1;
+ __IOM uint32_t CAN1RST : 1;
+ __IOM uint32_t CAN2RST : 1;
+ __IM uint32_t RESERVED5 : 1;
+ __IOM uint32_t PMURST : 1;
+ __IOM uint32_t DACRST : 1;
+ __IM uint32_t RESERVED6 : 2;
+ } APB1RST_B;
+ } ;
+
+ /** APB2 peripheral reset register */
+ union
+ {
+ __IOM uint32_t APB2RST;
+
+ struct
+ {
+ __IOM uint32_t TMR1RST : 1;
+ __IOM uint32_t TMR8RST : 1;
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t USART1RST : 1;
+ __IOM uint32_t USART6RST : 1;
+ __IM uint32_t RESERVED2 : 2;
+ __IOM uint32_t ADCRST : 1;
+ __IM uint32_t RESERVED3 : 2;
+ __IOM uint32_t SDIORST : 1;
+ __IOM uint32_t SPI1RST : 1;
+ __IM uint32_t RESERVED4 : 1;
+ __IOM uint32_t SYSCFGRST : 1;
+ __IM uint32_t RESERVED5 : 1;
+ __IOM uint32_t TMR9RST : 1;
+ __IOM uint32_t TMR10RST : 1;
+ __IOM uint32_t TMR11RST : 1;
+ __IM uint32_t RESERVED6 : 13;
+ } APB2RST_B;
+ } ;
+
+ __IM uint32_t RESERVED1[2];
+
+ /** AHB1 clock enable register */
+ union
+ {
+ __IOM uint32_t AHB1CLKEN;
+
+ struct
+ {
+ __IOM uint32_t PAEN : 1;
+ __IOM uint32_t PBEN : 1;
+ __IOM uint32_t PCEN : 1;
+ __IOM uint32_t PDEN : 1;
+ __IOM uint32_t PEEN : 1;
+ __IOM uint32_t PFEN : 1;
+ __IOM uint32_t PGEN : 1;
+ __IOM uint32_t PHEN : 1;
+ __IOM uint32_t PIEN : 1;
+ __IM uint32_t RESERVED1 : 3;
+ __IOM uint32_t CRCEN : 1;
+ __IM uint32_t RESERVED2 : 5;
+ __IOM uint32_t BAKPSRAMEN : 1;
+ __IM uint32_t RESERVED3 : 1;
+ __IOM uint32_t DRAMEN : 1;
+ __IOM uint32_t DMA1EN : 1;
+ __IOM uint32_t DMA2EN : 1;
+ __IM uint32_t RESERVED4 : 2;
+ __IOM uint32_t ETHEN : 1;
+ __IOM uint32_t ETHTXEN : 1;
+ __IOM uint32_t ETHRXEN : 1;
+ __IOM uint32_t ETHPTPEN : 1;
+ __IOM uint32_t OTG_HS1EN : 1;
+ __IOM uint32_t HSULPIEN : 1;
+ __IM uint32_t RESERVED5 : 1;
+ } AHB1CLKEN_B;
+ } ;
+
+ /** AHB2 clock enable register */
+ union
+ {
+ __IOM uint32_t AHB2CLKEN;
+
+ struct
+ {
+ __IOM uint32_t DCIEN : 1;
+ __IM uint32_t RESERVED1 : 3;
+ __IOM uint32_t CRYPEN : 1;
+ __IOM uint32_t HASHP : 1;
+ __IOM uint32_t RNGEN : 1;
+ __IOM uint32_t OTG_FSEN : 1;
+ __IM uint32_t RESERVED2 : 24;
+ } AHB2CLKEN_B;
+ };
+
+ /** AHB3 peripheral enable register */
+ union
+ {
+ __IOM uint32_t AHB3CLKEN;
+
+ struct
+ {
+ __IOM uint32_t EMMCEN : 1;
+ __IM uint32_t RESERVED : 31;
+ } AHB3CLKEN_B;
+ } ;
+
+ __IM uint32_t RESERVED2;
+
+ /** APB1 peripheral enable register */
+ union
+ {
+ __IOM uint32_t APB1CLKEN;
+
+ struct
+ {
+ __IOM uint32_t TMR2EN : 1;
+ __IOM uint32_t TMR3EN : 1;
+ __IOM uint32_t TMR4EN : 1;
+ __IOM uint32_t TMR5EN : 1;
+ __IOM uint32_t TMR6EN : 1;
+ __IOM uint32_t TMR7EN : 1;
+ __IOM uint32_t TMR12EN : 1;
+ __IOM uint32_t TMR13EN : 1;
+ __IOM uint32_t TMR14EN : 1;
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t WWDTEN : 1;
+ __IM uint32_t RESERVED2 : 2;
+ __IOM uint32_t SPI2EN : 1;
+ __IOM uint32_t SPI3EN : 1;
+ __IM uint32_t RESERVED3 : 1;
+ __IOM uint32_t USART2EN : 1;
+ __IOM uint32_t USART3EN : 1;
+ __IOM uint32_t UART4EN : 1;
+ __IOM uint32_t UART5EN : 1;
+ __IOM uint32_t I2C1EN : 1;
+ __IOM uint32_t I2C2EN : 1;
+ __IOM uint32_t I2C3EN : 1;
+ __IM uint32_t RESERVED4 : 1;
+ __IOM uint32_t CAN1EN : 1;
+ __IOM uint32_t CAN2EN : 1;
+ __IM uint32_t RESERVED5 : 1;
+ __IOM uint32_t PMUEN : 1;
+ __IOM uint32_t DACEN : 1;
+ __IM uint32_t RESERVED6 : 2;
+ } APB1CLKEN_B;
+ } ;
+
+ /** APB2 peripheral enable register */
+ union
+ {
+ __IOM uint32_t APB2CLKEN;
+
+ struct
+ {
+ __IOM uint32_t TMR1EN : 1;
+ __IOM uint32_t TMR8EN : 1;
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t USART1EN : 1;
+ __IOM uint32_t USART6EN : 1;
+ __IM uint32_t RESERVED2 : 2;
+ __IOM uint32_t ADC1EN : 1;
+ __IOM uint32_t ADC2EN : 1;
+ __IOM uint32_t ADC3EN : 1;
+ __IOM uint32_t SDIOEN : 1;
+ __IOM uint32_t SPI1EN : 1;
+ __IM uint32_t RESERVED3 : 1;
+ __IOM uint32_t SYSCFGEN : 1;
+ __IM uint32_t RESERVED4 : 1;
+ __IOM uint32_t TMR9EN : 1;
+ __IOM uint32_t TMR10EN : 1;
+ __IOM uint32_t TMR11EN : 1;
+ __IM uint32_t RESERVED5 : 13;
+ } APB2CLKEN_B;
+ } ;
+
+ __IM uint32_t RESERVED3[2];
+
+ /** AHB1 clock enable register during lowpower mode */
+ union
+ {
+ __IOM uint32_t LPAHB1CLKEN;
+
+ struct
+ {
+ __IOM uint32_t PAEN : 1;
+ __IOM uint32_t PBEN : 1;
+ __IOM uint32_t PCEN : 1;
+ __IOM uint32_t PDEN : 1;
+ __IOM uint32_t PEEN : 1;
+ __IOM uint32_t PFEN : 1;
+ __IOM uint32_t PGEN : 1;
+ __IOM uint32_t PHEN : 1;
+ __IOM uint32_t PIEN : 1;
+ __IM uint32_t RESERVED1 : 3;
+ __IOM uint32_t CRCEN : 1;
+ __IM uint32_t RESERVED2 : 2;
+ __IOM uint32_t FMCEN : 1;
+ __IOM uint32_t SRAM1EN : 1;
+ __IOM uint32_t SRAM2EN : 1;
+ __IOM uint32_t BAKPSRAMEN : 1;
+ __IM uint32_t RESERVED3 : 2;
+ __IOM uint32_t DMA1EN : 1;
+ __IOM uint32_t DMA2EN : 1;
+ __IM uint32_t RESERVED4 : 2;
+ __IOM uint32_t ETHEN : 1;
+ __IOM uint32_t ETHTXEN : 1;
+ __IOM uint32_t ETHRXEN : 1;
+ __IOM uint32_t ETHPTPEN : 1;
+ __IOM uint32_t OTG_HS1EN : 1;
+ __IOM uint32_t HSULPIEN : 1;
+ __IM uint32_t RESERVED5 : 1;
+ } LPAHB1CLKEN_B;
+ } ;
+
+ /** AHB2 clock enable register during lowpower mode */
+ union
+ {
+ __IOM uint32_t LPAHB2CLKEN;
+
+ struct
+ {
+ __IOM uint32_t DCIEN : 1;
+ __IM uint32_t RESERVED1 : 3;
+ __IOM uint32_t CRYPEN : 1;
+ __IOM uint32_t HASHPEN : 1;
+ __IOM uint32_t RNGEN : 1;
+ __IOM uint32_t OTG_FSEN : 1;
+ __IM uint32_t RESERVED2 : 24;
+ } LPAHB2CLKEN_B;
+ };
+
+ /** AHB3 peripheral enable register during lowpower mode */
+ union
+ {
+ __IOM uint32_t LPAHB3CLKEN;
+
+ struct
+ {
+ __IOM uint32_t EMMCEN : 1;
+ __IM uint32_t RESERVED : 31;
+ } LPAHB3CLKEN_B;
+ } ;
+
+ __IM uint32_t RESERVED4;
+
+ /** APB1 peripheral enable register during lowpower mode */
+ union
+ {
+ __IOM uint32_t LPAPB1CLKEN;
+
+ struct
+ {
+ __IOM uint32_t TMR2EN : 1;
+ __IOM uint32_t TMR3EN : 1;
+ __IOM uint32_t TMR4EN : 1;
+ __IOM uint32_t TMR5EN : 1;
+ __IOM uint32_t TMR6EN : 1;
+ __IOM uint32_t TMR7EN : 1;
+ __IOM uint32_t TMR12EN : 1;
+ __IOM uint32_t TMR13EN : 1;
+ __IOM uint32_t TMR14EN : 1;
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t WWDTEN : 1;
+ __IM uint32_t RESERVED2 : 2;
+ __IOM uint32_t SPI2EN : 1;
+ __IOM uint32_t SPI3EN : 1;
+ __IM uint32_t RESERVED3 : 1;
+ __IOM uint32_t USART2EN : 1;
+ __IOM uint32_t USART3EN : 1;
+ __IOM uint32_t UART4EN : 1;
+ __IOM uint32_t UART5EN : 1;
+ __IOM uint32_t I2C1EN : 1;
+ __IOM uint32_t I2C2EN : 1;
+ __IOM uint32_t I2C3EN : 1;
+ __IM uint32_t RESERVED4 : 1;
+ __IOM uint32_t CAN1EN : 1;
+ __IOM uint32_t CAN2EN : 1;
+ __IM uint32_t RESERVED5 : 1;
+ __IOM uint32_t PMUEN : 1;
+ __IOM uint32_t DACEN : 1;
+ __IM uint32_t RESERVED6 : 2;
+ } LPAPB1CLKEN_B;
+ } ;
+
+ /** APB2 peripheral enable register during lowpower mode */
+ union
+ {
+ __IOM uint32_t LPAPB2CLKEN;
+
+ struct
+ {
+ __IOM uint32_t TMR1EN : 1;
+ __IOM uint32_t TMR8EN : 1;
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t USART1EN : 1;
+ __IOM uint32_t USART6EN : 1;
+ __IM uint32_t RESERVED2 : 2;
+ __IOM uint32_t ADC1EN : 1;
+ __IOM uint32_t ADC2EN : 1;
+ __IOM uint32_t ADC3EN : 1;
+ __IOM uint32_t SDIOEN : 1;
+ __IOM uint32_t SPI1EN : 1;
+ __IM uint32_t RESERVED3 : 1;
+ __IOM uint32_t SYSCFGEN : 1;
+ __IM uint32_t RESERVED4 : 1;
+ __IOM uint32_t TMR9EN : 1;
+ __IOM uint32_t TMR10EN : 1;
+ __IOM uint32_t TMR11EN : 1;
+ __IM uint32_t RESERVED5 : 13;
+ } LPAPB2CLKEN_B;
+ } ;
+
+ __IM uint32_t RESERVED5[2];
+
+ /** Backup domain control register */
+ union
+ {
+ __IOM uint32_t BDCTRL;
+
+ struct
+ {
+ __IOM uint32_t LSEEN : 1;
+ __IM uint32_t LSERDYFLG : 1;
+ __IOM uint32_t LSEBCFG : 1;
+ __IM uint32_t RESERVED1 : 5;
+ __IOM uint32_t RTCSRCSEL : 2;
+ __IM uint32_t RESERVED2 : 5;
+ __IOM uint32_t RTCCLKEN : 1;
+ __IOM uint32_t BDRST : 1;
+ __IM uint32_t RESERVED3 : 15;
+ } BDCTRL_B;
+ } ;
+
+ /** Control/status register */
+ union
+ {
+ __IOM uint32_t CSTS;
+
+ struct
+ {
+ __IOM uint32_t LSIEN : 1;
+ __IM uint32_t LSIRDYFLG : 1;
+ __IM uint32_t RESERVED : 22;
+ __IOM uint32_t RSTFLGCLR : 1;
+ __IM uint32_t BORRSTFLG : 1;
+ __IM uint32_t PINRSTFLG : 1;
+ __IM uint32_t PODRSTFLG : 1;
+ __IM uint32_t SWRSTFLG : 1;
+ __IM uint32_t IWDTRSTFLG : 1;
+ __IM uint32_t WWDTRSTFLG : 1;
+ __IM uint32_t LPWRRSTFLG : 1;
+ } CSTS_B;
+ } ;
+
+ __IM uint32_t RESERVED6[2];
+
+ /** spread spectrum clock generation register */
+ union
+ {
+ __IOM uint32_t SSCCFG;
+
+ struct
+ {
+ __IOM uint32_t MODPCFG : 13;
+ __IOM uint32_t STEP : 15;
+ __IM uint32_t RESERVED : 2;
+ __IOM uint32_t SSSEL : 1;
+ __IOM uint32_t SSEN : 1;
+ } SSCCFG_B;
+ } ;
+
+ /** PLL2 configuration register */
+ union
+ {
+ __IOM uint32_t PLL2CFG;
+
+ struct
+ {
+ __IM uint32_t RESERVED1 : 6;
+ __IOM uint32_t PLL2A : 9;
+ __IM uint32_t RESERVED2 : 13;
+ __IOM uint32_t PLL2C : 3;
+ __IM uint32_t RESERVED3 : 1;
+ } PLL2CFG_B;
+ } ;
+} RCM_T;
+
+/**
+ * @brief Universal synchronous asynchronous receiver transmitter (USART)
+ *
+ * Register overlay for a USART peripheral: each register is a union of the
+ * whole 32-bit word and a named bit-field view (the *_B struct).
+ * NOTE(review): the bit-field views assume the compiler allocates bit-fields
+ * LSB-first within a little-endian 32-bit unit — true for the usual ARM
+ * toolchains, but implementation-defined per the C standard; confirm for the
+ * target toolchain.
+ */
+typedef struct
+{
+  /** Status register */
+  union
+  {
+  __IOM uint32_t STS;
+
+  struct
+  {
+  __IM uint32_t PEFLG : 1;
+  __IM uint32_t FEFLG : 1;
+  __IM uint32_t NEFLG : 1;
+  __IM uint32_t OVREFLG : 1;
+  __IM uint32_t IDLEFLG : 1;
+  __IOM uint32_t RXBNEFLG : 1;
+  __IOM uint32_t TXCFLG : 1;
+  __IM uint32_t TXBEFLG : 1;
+  __IOM uint32_t LBDFLG : 1;
+  __IOM uint32_t CTSFLG : 1;
+  __IM uint32_t RESERVED : 22;
+  } STS_B;
+  } ;
+
+  /** Data register (presumably shared for TX and RX — a single 9-bit DATA
+   *  field is visible; verify against the reference manual) */
+  union
+  {
+  __IOM uint32_t DATA;
+
+  struct
+  {
+  __IOM uint32_t DATA : 9;
+  __IM uint32_t RESERVED : 23;
+  } DATA_B;
+  } ;
+
+  /** Baud rate register */
+  union
+  {
+  __IOM uint32_t BR;
+
+  struct
+  {
+  __IOM uint32_t FBR : 4;
+  __IOM uint32_t IBR : 12;
+  __IM uint32_t RESERVED : 16;
+  } BR_B;
+  } ;
+
+  /** Control register 1 */
+  union
+  {
+  __IOM uint32_t CTRL1;
+
+  struct
+  {
+  __IOM uint32_t TXBF : 1;
+  __IOM uint32_t RXMUTEEN : 1;
+  __IOM uint32_t RXEN : 1;
+  __IOM uint32_t TXEN : 1;
+  __IOM uint32_t IDLEIEN : 1;
+  __IOM uint32_t RXBNEIEN : 1;
+  __IOM uint32_t TXCIEN : 1;
+  __IOM uint32_t TXBEIEN : 1;
+  __IOM uint32_t PEIEN : 1;
+  __IOM uint32_t PCFG : 1;
+  __IOM uint32_t PCEN : 1;
+  __IOM uint32_t WUPMCFG : 1;
+  __IOM uint32_t DBLCFG : 1;
+  __IOM uint32_t UEN : 1;
+  __IM uint32_t RESERVED1 : 1;
+  __IOM uint32_t OSMCFG : 1;
+  __IM uint32_t RESERVED2 : 16;
+  } CTRL1_B;
+  } ;
+
+  /** Control register 2 */
+  union
+  {
+  __IOM uint32_t CTRL2;
+
+  struct
+  {
+  __IOM uint32_t ADDR : 4;
+  __IM uint32_t RESERVED1 : 1;
+  __IOM uint32_t LBDLCFG : 1;
+  __IOM uint32_t LBDIEN : 1;
+  __IM uint32_t RESERVED2 : 1;
+  __IOM uint32_t LBCPOEN : 1;
+  __IOM uint32_t CPHA : 1;
+  __IOM uint32_t CPOL : 1;
+  __IOM uint32_t CLKEN : 1;
+  __IOM uint32_t STOPCFG : 2;
+  __IOM uint32_t LINMEN : 1;
+  __IM uint32_t RESERVED3 : 17;
+  } CTRL2_B;
+  } ;
+
+  /** Control register 3 */
+  union
+  {
+  __IOM uint32_t CTRL3;
+
+  struct
+  {
+  __IOM uint32_t ERRIEN : 1;
+  __IOM uint32_t IREN : 1;
+  __IOM uint32_t IRLPEN : 1;
+  __IOM uint32_t HDEN : 1;
+  __IOM uint32_t SCNACKEN : 1;
+  __IOM uint32_t SCEN : 1;
+  __IOM uint32_t DMARXEN : 1;
+  __IOM uint32_t DMATXEN : 1;
+  __IOM uint32_t RTSEN : 1;
+  __IOM uint32_t CTSEN : 1;
+  __IOM uint32_t CTSIEN : 1;
+  __IOM uint32_t SAMCFG : 1;
+  __IM uint32_t RESERVED : 20;
+  } CTRL3_B;
+  } ;
+
+  /** Guard time and prescaler register */
+  union
+  {
+  __IOM uint32_t GTPSC;
+
+  struct
+  {
+  __IOM uint32_t PSC : 8;
+  __IOM uint32_t GRDT : 8;
+  __IM uint32_t RESERVED : 16;
+  } GTPSC_B;
+  } ;
+} USART_T;
+
+/**
+ * @brief FLASH Registers (FMC)
+ *
+ * Flash memory controller register overlay.  Each register is a union of the
+ * full 32-bit word and a bit-field view; all bit widths sum to 32 per word.
+ */
+
+typedef struct
+{
+
+  /** Flash access control register */
+  union
+  {
+  __IOM uint32_t ACCTRL;
+
+  struct
+  {
+  __IOM uint32_t WAITP : 3;
+  __IM uint32_t RESERVED1 : 5;
+  __IOM uint32_t PREFEN : 1;
+  __IOM uint32_t ICACHEEN : 1;
+  __IOM uint32_t DCACHEEN : 1;
+  __OM uint32_t ICACHERST : 1;
+  __IOM uint32_t DCACHERST : 1;
+  __IM uint32_t RESERVED2 : 19;
+  } ACCTRL_B;
+  } ;
+
+  /** Flash key register (write-only unlock sequence) */
+  union
+  {
+  __OM uint32_t KEY;
+
+  struct
+  {
+  __OM uint32_t KEY : 32;
+  } KEY_B;
+  } ;
+
+  /** Flash option key register (write-only unlock for option bytes) */
+  union
+  {
+  __OM uint32_t OPTKEY;
+
+  struct
+  {
+  __OM uint32_t OPTKEY : 32;
+  } OPTKEY_B;
+  } ;
+
+  /** Flash status register */
+  union
+  {
+  __IOM uint32_t STS;
+
+  struct
+  {
+  __IOM uint32_t OPRCMP : 1;
+  __IOM uint32_t OPRERR : 1;
+  __IM uint32_t RESERVED1 : 2;
+  __IOM uint32_t WPROTERR : 1;
+  __IOM uint32_t PGALGERR : 1;
+  __IOM uint32_t PGPRLERR : 1;
+  __IOM uint32_t PGSEQERR : 1;
+  __IM uint32_t RESERVED2 : 8;
+  __IM uint32_t BUSY : 1;
+  __IM uint32_t RESERVED3 : 15;
+  } STS_B;
+  } ;
+
+  /** Flash control register */
+  union
+  {
+  __IOM uint32_t CTRL;
+
+  struct
+  {
+  __IOM uint32_t PG : 1;
+  __IOM uint32_t SERS : 1;
+  __IOM uint32_t MERS : 1;
+  __IOM uint32_t SNUM : 4;
+  __IM uint32_t RESERVED1 : 1;
+  __IOM uint32_t PGSIZE : 2;
+  __IM uint32_t RESERVED2 : 6;
+  __IOM uint32_t START : 1;
+  __IM uint32_t RESERVED3 : 7;
+  __IOM uint32_t OPCINTEN : 1;
+  __IOM uint32_t ERRINTEN : 1;
+  __IM uint32_t RESERVED4 : 5;
+  __IOM uint32_t LOCK : 1;
+  } CTRL_B;
+  } ;
+
+  /** Option byte register */
+  union
+  {
+  __IOM uint32_t OPTCTRL;
+
+  struct
+  {
+  __IOM uint32_t OPTLOCK : 1;
+  __IOM uint32_t OPTSTART : 1;
+  __IOM uint32_t BORLVL : 2;
+  __IM uint32_t RESERVED1 : 1;
+  __IOM uint32_t WDTSEL : 1;
+  __IOM uint32_t RSTSTOP : 1;
+  __IOM uint32_t RSTSTDB : 1;
+  __IOM uint32_t RPROT : 8;
+  __IOM uint32_t NWPROT : 12;
+  __IM uint32_t RESERVED2 : 4;
+  } OPTCTRL_B;
+  } ;
+} FMC_T;
+
+/**
+ * @brief CRC calculation unit (CRC)
+ *
+ * Register overlay for the CRC peripheral: data in/out word, an 8-bit
+ * independent data register, and a control register with a reset bit.
+ */
+typedef struct
+{
+  /** @brief DATA register */
+  union
+  {
+  __IOM uint32_t DATA;
+
+  struct
+  {
+  __IOM uint32_t DATA : 32;
+  } DATA_B;
+  } ;
+
+  /** @brief independent DATA register */
+  union
+  {
+  __IOM uint32_t INDATA;
+
+  struct
+  {
+  __IOM uint32_t INDATA : 8;
+  __IM uint32_t RESERVED : 24;
+  } INDATA_B;
+  };
+
+  /** @brief Control register */
+  union
+  {
+  __OM uint32_t CTRL;
+
+  struct
+  {
+  __OM uint32_t RST : 1;
+  __IM uint32_t RESERVED : 31;
+  } CTRL_B;
+  };
+} CRC_T;
+
+/**
+ * @brief Real-time clock (RTC)
+ *
+ * Register overlay for the RTC peripheral.  Time/date values are BCD-split
+ * into units (*U) and tens (*T) fields, as the field naming suggests —
+ * verify against the reference manual.  All bit widths sum to 32 per word.
+ */
+typedef struct
+{
+
+  /** time register */
+  union
+  {
+  __IOM uint32_t TIME;
+
+  struct
+  {
+  __IOM uint32_t SECU : 4;
+  __IOM uint32_t SECT : 3;
+  __IM uint32_t RESERVED1 : 1;
+  __IOM uint32_t MINU : 4;
+  __IOM uint32_t MINT : 3;
+  __IM uint32_t RESERVED2 : 1;
+  __IOM uint32_t HRU : 4;
+  __IOM uint32_t HRT : 2;
+  __IOM uint32_t TIMEFCFG : 1;
+  __IM uint32_t RESERVED3 : 9;
+  } TIME_B;
+  } ;
+
+  /** date register */
+  union
+  {
+  __IOM uint32_t DATE;
+
+  struct
+  {
+  __IOM uint32_t DAYU : 4;
+  __IOM uint32_t DAYT : 2;
+  __IM uint32_t RESERVED1 : 2;
+  __IOM uint32_t MONU : 4;
+  __IOM uint32_t MONT : 1;
+  __IOM uint32_t WEEKSEL : 3;
+  __IOM uint32_t YRU : 4;
+  __IOM uint32_t YRT : 4;
+  __IM uint32_t RESERVED2 : 8;
+  } DATE_B;
+  } ;
+
+  /** control register */
+  union
+  {
+  __IOM uint32_t CTRL;
+
+  struct
+  {
+  __IOM uint32_t WUCLKSEL : 3;
+  __IOM uint32_t TSETECFG : 1;
+  __IOM uint32_t RCLKDEN : 1;
+  __IOM uint32_t RCMCFG : 1;
+  __IOM uint32_t TIMEFCFG : 1;
+  __IOM uint32_t DCALEN : 1;
+  __IOM uint32_t ALRAEN : 1;
+  __IOM uint32_t ALRBEN : 1;
+  __IOM uint32_t WUTEN : 1;
+  __IOM uint32_t TSEN : 1;
+  __IOM uint32_t ALRAIEN : 1;
+  __IOM uint32_t ALRBIEN : 1;
+  __IOM uint32_t WUTIEN : 1;
+  __IOM uint32_t TSIEN : 1;
+  __IOM uint32_t STCCFG : 1;
+  __IOM uint32_t WTCCFG : 1;
+  __IOM uint32_t BAKP : 1;
+  __IOM uint32_t CALOSEL : 1;
+  __IOM uint32_t POLCFG : 1;
+  __IOM uint32_t OUTSEL : 2;
+  __IOM uint32_t CALOEN : 1;
+  __IM uint32_t RESERVED1 : 8;
+  } CTRL_B;
+  } ;
+
+  /** initialization and status register */
+  union
+  {
+  __IOM uint32_t STS;
+
+  struct
+  {
+  __IM uint32_t ALRAWFLG : 1;
+  __IM uint32_t ALRBWFLG : 1;
+  __IOM uint32_t WUTWFLG : 1;
+  __IOM uint32_t SOPFLG : 1;
+  __IM uint32_t INITSFLG : 1;
+  __IOM uint32_t RSFLG : 1;
+  __IM uint32_t RINITFLG : 1;
+  __IOM uint32_t INITEN : 1;
+  __IOM uint32_t ALRAFLG : 1;
+  __IOM uint32_t ALRBFLG : 1;
+  __IOM uint32_t WUTFLG : 1;
+  __IOM uint32_t TSFLG : 1;
+  __IOM uint32_t TSOVRFLG : 1;
+  __IOM uint32_t TP1FLG : 1;
+  __IOM uint32_t TP2FLG : 1;
+  __IM uint32_t RESERVED1 : 1;
+  __IM uint32_t RCALPFLG : 1;
+  __IM uint32_t RESERVED2 : 15;
+  } STS_B;
+  } ;
+
+  /** prescaler register */
+  union
+  {
+  __IOM uint32_t PSC;
+
+  struct
+  {
+  __IOM uint32_t SPSC : 15;
+  __IM uint32_t RESERVED1 : 1;
+  __IOM uint32_t APSC : 7;
+  __IM uint32_t RESERVED2 : 9;
+  } PSC_B;
+  } ;
+
+  /** auto-reload register (wakeup timer) */
+  union
+  {
+  __IOM uint32_t AUTORLD;
+
+  struct
+  {
+  __IOM uint32_t WUAUTORE : 16;
+  __IM uint32_t RESERVED : 16;
+  } AUTORLD_B;
+  } ;
+
+  /** calibration register (coarse digital calibration) */
+  union
+  {
+  __IOM uint32_t DCAL;
+
+  struct
+  {
+  __IOM uint32_t DCAL : 5;
+  __IM uint32_t RESERVED1 : 2;
+  __IOM uint32_t DCALCFG : 1;
+  __IM uint32_t RESERVED2 : 24;
+  } DCAL_B;
+  } ;
+
+  /** alarm A register */
+  union
+  {
+  __IOM uint32_t ALRMA;
+
+  struct
+  {
+  __IOM uint32_t SECU : 4;
+  __IOM uint32_t SECT : 3;
+  __IOM uint32_t SECMEN : 1;
+  __IOM uint32_t MINU : 4;
+  __IOM uint32_t MINT : 3;
+  __IOM uint32_t MINMEN : 1;
+  __IOM uint32_t HRU : 4;
+  __IOM uint32_t HRT : 2;
+  __IOM uint32_t TIMEFCFG : 1;
+  __IOM uint32_t HRMEN : 1;
+  __IOM uint32_t DAYU : 4;
+  __IOM uint32_t DAYT : 2;
+  __IOM uint32_t WEEKSEL : 1;
+  __IOM uint32_t DATEMEN : 1;
+  } ALRMA_B;
+  } ;
+
+  /** alarm B register (same layout as alarm A) */
+  union
+  {
+  __IOM uint32_t ALRMB;
+
+  struct
+  {
+  __IOM uint32_t SECU : 4;
+  __IOM uint32_t SECT : 3;
+  __IOM uint32_t SECMEN : 1;
+  __IOM uint32_t MINU : 4;
+  __IOM uint32_t MINT : 3;
+  __IOM uint32_t MINMEN : 1;
+  __IOM uint32_t HRU : 4;
+  __IOM uint32_t HRT : 2;
+  __IOM uint32_t TIMEFCFG : 1;
+  __IOM uint32_t HRMEN : 1;
+  __IOM uint32_t DAYU : 4;
+  __IOM uint32_t DAYT : 2;
+  __IOM uint32_t WEEKSEL : 1;
+  __IOM uint32_t DATEMEN : 1;
+  } ALRMB_B;
+  } ;
+
+  /** write protection register (write-only key) */
+  union
+  {
+  __OM uint32_t WRPROT;
+
+  struct
+  {
+  __OM uint32_t KEY : 16;
+  __IM uint32_t RESERVED : 16;
+  } WRPROT_B;
+  } ;
+
+  /** sub second register */
+  union
+  {
+  __IM uint32_t SUBSEC;
+
+  struct
+  {
+  __IM uint32_t SUBSEC : 16;
+  __IM uint32_t RESERVED : 16;
+  } SUBSEC_B;
+  } ;
+
+  /** shift control register */
+  union
+  {
+  __OM uint32_t SHIFT;
+
+  struct
+  {
+  __OM uint32_t SFSEC : 15;
+  __IM uint32_t RESERVED : 16;
+  __OM uint32_t ADD1SECEN : 1;
+  } SHIFT_B;
+  } ;
+
+  /** timestamp time register */
+  union
+  {
+  __IM uint32_t TSTIME;
+
+  struct
+  {
+  __IM uint32_t SECU : 4;
+  __IM uint32_t SECT : 3;
+  __IM uint32_t RESERVED1 : 1;
+  __IM uint32_t MINU : 4;
+  __IM uint32_t MINT : 3;
+  __IM uint32_t RESERVED2 : 1;
+  __IM uint32_t HRU : 4;
+  __IM uint32_t HRT : 2;
+  __IM uint32_t TIMEFCFG : 1;
+  __IM uint32_t RESERVED3 : 9;
+  } TSTIME_B;
+  } ;
+
+  /** timestamp date register */
+  union
+  {
+  __IM uint32_t TSDATE;
+
+  struct
+  {
+  __IM uint32_t DAYU : 4;
+  __IM uint32_t DAYT : 2;
+  __IM uint32_t RESERVED1 : 2;
+  __IM uint32_t MONU : 4;
+  __IM uint32_t MONT : 1;
+  __IM uint32_t WEEKSEL : 3;
+  __IM uint32_t RESERVED2 : 16;
+  } TSDATE_B;
+  } ;
+
+  /** time-stamp sub second register */
+  union
+  {
+  __IM uint32_t TSSUBSEC;
+
+  struct
+  {
+  __IM uint32_t SUBSEC : 16;
+  __IM uint32_t RESERVED1 : 16;
+  } TSSUBSEC_B;
+  } ;
+
+  /** calibration register (smooth digital calibration) */
+  union
+  {
+  __IOM uint32_t CAL;
+
+  struct
+  {
+  __IOM uint32_t RECALF : 9;
+  __IM uint32_t RESERVED1 : 4;
+  __IOM uint32_t CAL16CFG : 1;
+  __IOM uint32_t CAL8CFG : 1;
+  __IOM uint32_t ICALFEN : 1;
+  __IM uint32_t RESERVED2 : 16;
+  } CAL_B;
+  } ;
+
+  /** tamper and alternate function configuration register */
+  union
+  {
+  __IOM uint32_t TACFG;
+
+  struct
+  {
+  __IOM uint32_t TP1EN : 1;
+  __IOM uint32_t TP1ALCFG : 1;
+  __IOM uint32_t TPIEN : 1;
+  __IOM uint32_t TP2EN : 1;
+  __IOM uint32_t TP2ALCFG : 1;
+  __IM uint32_t RESERVED1 : 2;
+  __IOM uint32_t TPTSEN : 1;
+  __IOM uint32_t TPSFSEL : 3;
+  __IOM uint32_t TPFCSEL : 2;
+  __IOM uint32_t TPPRDUSEL : 2;
+  __IOM uint32_t TPPUDIS : 1;
+  __IOM uint32_t TP1MSEL : 1;
+  __IOM uint32_t TSMSEL : 1;
+  __IOM uint32_t ALRMOT : 1;
+  __IM uint32_t RESERVED2 : 13;
+  } TACFG_B;
+  } ;
+
+  /** alarm A sub second register */
+  union
+  {
+  __IOM uint32_t ALRMASS;
+
+  struct
+  {
+  __IOM uint32_t SUBSEC : 15;
+  __IM uint32_t RESERVED1 : 9;
+  __IOM uint32_t MASKSEL : 4;
+  __IM uint32_t RESERVED2 : 4;
+  } ALRMASS_B;
+  } ;
+
+  /** alarm B sub second register (same layout as alarm A sub second) */
+  union
+  {
+  __IOM uint32_t ALRMBSS;
+
+  struct
+  {
+  __IOM uint32_t SUBSEC : 15;
+  __IM uint32_t RESERVED1 : 9;
+  __IOM uint32_t MASKSEL : 4;
+  __IM uint32_t RESERVED2 : 4;
+  } ALRMBSS_B;
+  } ;
+
+  __IM uint32_t RESERVED;
+
+  /** backup register */
+  __IOM uint32_t BAKP[20];
+
+} RTC_T;
+
+/**
+ * @brief Power Management Unit(PMU)
+ *
+ * Two-register overlay: a control register (low-power/PVD configuration)
+ * and a control/status register (wakeup and standby flags).
+ */
+typedef struct
+{
+  /** @brief Control register */
+  union
+  {
+  __IOM uint32_t CTRL;
+
+  struct
+  {
+  __IOM uint32_t LPDSCFG : 1;
+  __IOM uint32_t PDDSCFG : 1;
+  __IOM uint32_t WUFLGCLR : 1;
+  __IOM uint32_t SBFLGCLR : 1;
+  __IOM uint32_t PVDEN : 1;
+  __IOM uint32_t PLSEL : 3;
+  __IOM uint32_t BPWEN : 1;
+  __IOM uint32_t FPDSM : 1;
+  __IM uint32_t RESERVED1 : 4;
+  __IOM uint32_t VOSSEL : 1;
+  __IM uint32_t RESERVED2 : 17;
+  } CTRL_B;
+  };
+
+  /** @brief PMU Status register */
+  union
+  {
+  __IOM uint32_t CSTS;
+
+  struct
+  {
+  __IM uint32_t WUEFLG : 1;
+  __IM uint32_t SBFLG : 1;
+  __IM uint32_t PVDOFLG : 1;
+  __IM uint32_t BKPRFLG : 1;
+  __IM uint32_t RESERVED1 : 4;
+  __IOM uint32_t WKUPCFG : 1;
+  __IOM uint32_t BKPREN : 1;
+  __IM uint32_t RESERVED2 : 4;
+  __IOM uint32_t VOSRFLG : 1;
+  __IM uint32_t RESERVED3 : 17;
+  } CSTS_B;
+  };
+} PMU_T;
+
+/**
+ * @brief Timer register(TMR)
+ *
+ * Register overlay for a general/advanced timer.  CCM1/CCM2 each expose two
+ * alternative bit-field views of the same word (output-compare vs
+ * input-capture), selected at runtime by the CCxSEL channel direction bits.
+ */
+typedef struct
+{
+  /** control register 1 */
+  union
+  {
+  __IOM uint32_t CTRL1;
+
+  struct
+  {
+  __IOM uint32_t CNTEN : 1;
+  __IOM uint32_t UD : 1;
+  __IOM uint32_t URSSEL : 1;
+  __IOM uint32_t SPMEN : 1;
+  __IOM uint32_t CNTDIR : 1;
+  __IOM uint32_t CAMSEL : 2;
+  __IOM uint32_t ARPEN : 1;
+  __IOM uint32_t CLKDIV : 2;
+  __IM uint32_t RESERVED : 22;
+  } CTRL1_B;
+  } ;
+
+  /** control register 2 */
+  union
+  {
+  __IOM uint32_t CTRL2;
+
+  struct
+  {
+  __IOM uint32_t CCPEN : 1;
+  __IM uint32_t RESERVED1 : 1;
+  __IOM uint32_t CCUSEL : 1;
+  __IOM uint32_t CCDSEL : 1;
+  __IOM uint32_t MMSEL : 3;
+  __IOM uint32_t TI1SEL : 1;
+  __IOM uint32_t OC1OIS : 1;
+  __IOM uint32_t OC1NOIS : 1;
+  __IOM uint32_t OC2OIS : 1;
+  __IOM uint32_t OC2NOIS : 1;
+  __IOM uint32_t OC3OIS : 1;
+  __IOM uint32_t OC3NOIS : 1;
+  __IOM uint32_t OC4OIS : 1;
+  __IM uint32_t RESERVED2 : 17;
+  } CTRL2_B;
+  } ;
+
+  /** slave mode control register */
+  union
+  {
+  __IOM uint32_t SMCTRL;
+
+  struct
+  {
+  __IOM uint32_t SMFSEL : 3;
+  __IOM uint32_t OCCSEL : 1;
+  __IOM uint32_t TRGSEL : 3;
+  __IOM uint32_t MSMEN : 1;
+  __IOM uint32_t ETFCFG : 4;
+  __IOM uint32_t ETPCFG : 2;
+  __IOM uint32_t ECEN : 1;
+  __IOM uint32_t ETPOL : 1;
+  __IM uint32_t RESERVED : 16;
+  } SMCTRL_B;
+  } ;
+
+  /** DMA/Interrupt enable register */
+  union
+  {
+  __IOM uint32_t DIEN;
+
+  struct
+  {
+  __IOM uint32_t UIEN : 1;
+  __IOM uint32_t CC1IEN : 1;
+  __IOM uint32_t CC2IEN : 1;
+  __IOM uint32_t CC3IEN : 1;
+  __IOM uint32_t CC4IEN : 1;
+  __IOM uint32_t COMIEN : 1;
+  __IOM uint32_t TRGIEN : 1;
+  __IOM uint32_t BRKIEN : 1;
+  __IOM uint32_t UDIEN : 1;
+  __IOM uint32_t CC1DEN : 1;
+  __IOM uint32_t CC2DEN : 1;
+  __IOM uint32_t CC3DEN : 1;
+  __IOM uint32_t CC4DEN : 1;
+  __IOM uint32_t COMDEN : 1;
+  __IOM uint32_t TRGDEN : 1;
+  __IM uint32_t RESERVED : 17;
+  } DIEN_B;
+  } ;
+
+  /** status register */
+  union
+  {
+  __IOM uint32_t STS;
+
+  struct
+  {
+  __IOM uint32_t UIFLG : 1;
+  __IOM uint32_t CC1IFLG : 1;
+  __IOM uint32_t CC2IFLG : 1;
+  __IOM uint32_t CC3IFLG : 1;
+  __IOM uint32_t CC4IFLG : 1;
+  __IOM uint32_t COMIFLG : 1;
+  __IOM uint32_t TRGIFLG : 1;
+  __IOM uint32_t BRKIFLG : 1;
+  __IM uint32_t RESERVED1 : 1;
+  __IOM uint32_t CC1RCFLG : 1;
+  __IOM uint32_t CC2RCFLG : 1;
+  __IOM uint32_t CC3RCFLG : 1;
+  __IOM uint32_t CC4RCFLG : 1;
+  __IM uint32_t RESERVED2 : 19;
+  } STS_B;
+  } ;
+
+  /** event generation register (write-only event triggers) */
+  union
+  {
+  __OM uint32_t CEG;
+
+  struct
+  {
+  __OM uint32_t UEG : 1;
+  __OM uint32_t CC1EG : 1;
+  __OM uint32_t CC2EG : 1;
+  __OM uint32_t CC3EG : 1;
+  __OM uint32_t CC4EG : 1;
+  __OM uint32_t COMG : 1;
+  __OM uint32_t TEG : 1;
+  __OM uint32_t BEG : 1;
+  __IM uint32_t RESERVED : 24;
+  } CEG_B;
+  } ;
+
+  /** @brief Capture the compare mode register 1 (channels 1-2; two views of
+   *  one word, valid view depends on CCxSEL) */
+  union
+  {
+  __IOM uint32_t CCM1;
+
+  /** @brief Compare mode */
+  struct
+  {
+  __IOM uint32_t CC1SEL : 2;
+  __IOM uint32_t OC1FEN : 1;
+  __IOM uint32_t OC1PEN : 1;
+  __IOM uint32_t OC1MOD : 3;
+  __IOM uint32_t OC1CEN : 1;
+  __IOM uint32_t CC2SEL : 2;
+  __IOM uint32_t OC2FEN : 1;
+  __IOM uint32_t OC2PEN : 1;
+  __IOM uint32_t OC2MOD : 3;
+  __IOM uint32_t OC2CEN : 1;
+  __IM uint32_t RESERVED : 16;
+  } CCM1_COMPARE_B;
+
+  /** @brief Capture mode */
+  struct
+  {
+  __IOM uint32_t CC1SEL : 2;
+  __IOM uint32_t IC1PSC : 2;
+  __IOM uint32_t IC1F : 4;
+  __IOM uint32_t CC2SEL : 2;
+  __IOM uint32_t IC2PSC : 2;
+  __IOM uint32_t IC2F : 4;
+  __IM uint32_t RESERVED : 16;
+  } CCM1_CAPTURE_B;
+  };
+
+  /** @brief Capture the compare mode register 2 (channels 3-4; two views of
+   *  one word, valid view depends on CCxSEL) */
+  union
+  {
+  __IOM uint32_t CCM2;
+
+  /** @brief Compare mode */
+  struct
+  {
+  __IOM uint32_t CC3SEL : 2;
+  __IOM uint32_t OC3FEN : 1;
+  __IOM uint32_t OC3PEN : 1;
+  __IOM uint32_t OC3MOD : 3;
+  __IOM uint32_t OC3CEN : 1;
+  __IOM uint32_t CC4SEL : 2;
+  __IOM uint32_t OC4FEN : 1;
+  __IOM uint32_t OC4PEN : 1;
+  __IOM uint32_t OC4MOD : 3;
+  __IOM uint32_t OC4CEN : 1;
+  __IM uint32_t RESERVED : 16;
+  } CCM2_COMPARE_B;
+
+  /** @brief Capture mode */
+  struct
+  {
+  __IOM uint32_t CC3SEL : 2;
+  __IOM uint32_t IC3PSC : 2;
+  __IOM uint32_t IC3F : 4;
+  __IOM uint32_t CC4SEL : 2;
+  __IOM uint32_t IC4PSC : 2;
+  __IOM uint32_t IC4F : 4;
+  __IM uint32_t RESERVED : 16;
+  } CCM2_CAPTURE_B;
+  };
+
+  /** capture/compare enable register */
+  union
+  {
+  __IOM uint32_t CCEN;
+
+  struct
+  {
+  __IOM uint32_t CC1EN : 1;
+  __IOM uint32_t CC1POL : 1;
+  __IOM uint32_t CC1NEN : 1;
+  __IOM uint32_t CC1NPOL : 1;
+  __IOM uint32_t CC2EN : 1;
+  __IOM uint32_t CC2POL : 1;
+  __IOM uint32_t CC2NEN : 1;
+  __IOM uint32_t CC2NPOL : 1;
+  __IOM uint32_t CC3EN : 1;
+  __IOM uint32_t CC3POL : 1;
+  __IOM uint32_t CC3NEN : 1;
+  __IOM uint32_t CC3NPOL : 1;
+  __IOM uint32_t CC4EN : 1;
+  __IOM uint32_t CC4POL : 1;
+  __IM uint32_t RESERVED : 18;
+  } CCEN_B;
+  } ;
+
+  /** counter */
+  union
+  {
+  __IOM uint32_t CNT;
+
+  struct
+  {
+  __IOM uint32_t CNT : 32;
+  } CNT_B;
+  } ;
+
+  /** prescaler */
+  union
+  {
+  __IOM uint32_t PSC;
+
+  struct
+  {
+  __IOM uint32_t PSC : 16;
+  __IM uint32_t RESERVED : 16;
+  } PSC_B;
+  } ;
+
+  /** auto-reload register */
+  union
+  {
+  __IOM uint32_t AUTORLD;
+
+  struct
+  {
+  __IOM uint32_t AUTORLD : 32;
+  } AUTORLD_B;
+  } ;
+
+  /** repetition counter register */
+  union
+  {
+  __IOM uint32_t REPCNT;
+
+  struct
+  {
+  __IOM uint32_t REPCNT : 8;
+  __IM uint32_t RESERVED : 24;
+  } REPCNT_B;
+  } ;
+
+  /** capture/compare register 1 */
+  union
+  {
+  __IOM uint32_t CC1;
+
+  struct
+  {
+  __IOM uint32_t CC1 : 32;
+  } CC1_B;
+  } ;
+
+  /** capture/compare register 2 */
+  union
+  {
+  __IOM uint32_t CC2;
+
+  struct
+  {
+  __IOM uint32_t CC2 : 32;
+  } CC2_B;
+  } ;
+
+  /** capture/compare register 3 */
+  union
+  {
+  __IOM uint32_t CC3;
+
+  struct
+  {
+  __IOM uint32_t CC3 : 32;
+  } CC3_B;
+  } ;
+
+  /** capture/compare register 4 */
+  union
+  {
+  __IOM uint32_t CC4;
+
+  struct
+  {
+  __IOM uint32_t CC4 : 32;
+  } CC4_B;
+  } ;
+
+  /** break and dead-time register */
+  union
+  {
+  __IOM uint32_t BDT;
+
+  struct
+  {
+  __IOM uint32_t DTS : 8;
+  __IOM uint32_t LOCKCFG : 2;
+  __IOM uint32_t IMOS : 1;
+  __IOM uint32_t RMOS : 1;
+  __IOM uint32_t BRKEN : 1;
+  __IOM uint32_t BRKPOL : 1;
+  __IOM uint32_t AOEN : 1;
+  __IOM uint32_t MOEN : 1;
+  __IM uint32_t RESERVED : 16;
+  } BDT_B;
+  } ;
+
+  /** DMA control register */
+  union
+  {
+  __IOM uint32_t DCTRL;
+
+  struct
+  {
+  __IOM uint32_t DBADDR : 5;
+  __IM uint32_t RESERVED1 : 3;
+  __IOM uint32_t DBLEN : 5;
+  __IM uint32_t RESERVED2 : 19;
+  } DCTRL_B;
+  } ;
+
+  /** DMA address for full transfer */
+  union
+  {
+  __IOM uint32_t DMADDR;
+
+  struct
+  {
+  __IOM uint32_t DMADDR : 32;
+  } DMADDR_B;
+  } ;
+
+  /** Remap */
+  union
+  {
+  __IOM uint32_t OPT;
+
+  struct
+  {
+  __IOM uint32_t RMPSEL : 32;
+  } OPT_B;
+  } ;
+} TMR_T;
+
+/**
+ * @brief Direct Memory Access register(DMA)
+ *
+ * Controller-level DMA registers: interrupt status for streams 0-3 (low) and
+ * 4-7 (high), plus the matching write-only flag-clear registers.  Per-stream
+ * configuration lives in DMA_Stream_T.
+ */
+typedef struct
+{
+  /** Low interrupt status register (streams 0-3, read-only flags) */
+  union
+  {
+  __IM uint32_t LINTSTS;
+
+  struct
+  {
+  __IM uint32_t FEIFLG0 : 1;
+  __IM uint32_t RESERVED1 : 1;
+  __IM uint32_t DMEIFLG0 : 1;
+  __IM uint32_t TXEIFLG0 : 1;
+  __IM uint32_t HTXIFLG0 : 1;
+  __IM uint32_t TXCIFLG0 : 1;
+  __IM uint32_t FEIFLG1 : 1;
+  __IM uint32_t RESERVED2 : 1;
+  __IM uint32_t DMEIFLG1 : 1;
+  __IM uint32_t TXEIFLG1 : 1;
+  __IM uint32_t HTXIFLG1 : 1;
+  __IM uint32_t TXCIFLG1 : 1;
+  __IM uint32_t RESERVED3 : 4;
+  __IM uint32_t FEIFLG2 : 1;
+  __IM uint32_t RESERVED4 : 1;
+  __IM uint32_t DMEIFLG2 : 1;
+  __IM uint32_t TXEIFLG2 : 1;
+  __IM uint32_t HTXIFLG2 : 1;
+  __IM uint32_t TXCIFLG2 : 1;
+  __IM uint32_t FEIFLG3 : 1;
+  __IM uint32_t RESERVED5 : 1;
+  __IM uint32_t DMEIFLG3 : 1;
+  __IM uint32_t TXEIFLG3 : 1;
+  __IM uint32_t HTXIFLG3 : 1;
+  __IM uint32_t TXCIFLG3 : 1;
+  __IM uint32_t RESERVED6 : 4;
+  } LINTSTS_B;
+  };
+
+  /** High interrupt status register (streams 4-7, read-only flags) */
+  union
+  {
+  __IM uint32_t HINTSTS;
+
+  struct
+  {
+  __IM uint32_t FEIFLG4 : 1;
+  __IM uint32_t RESERVED1 : 1;
+  __IM uint32_t DMEIFLG4 : 1;
+  __IM uint32_t TXEIFLG4 : 1;
+  __IM uint32_t HTXIFLG4 : 1;
+  __IM uint32_t TXCIFLG4 : 1;
+  __IM uint32_t FEIFLG5 : 1;
+  __IM uint32_t RESERVED2 : 1;
+  __IM uint32_t DMEIFLG5 : 1;
+  __IM uint32_t TXEIFLG5 : 1;
+  __IM uint32_t HTXIFLG5 : 1;
+  __IM uint32_t TXCIFLG5 : 1;
+  __IM uint32_t RESERVED3 : 4;
+  __IM uint32_t FEIFLG6 : 1;
+  __IM uint32_t RESERVED4 : 1;
+  __IM uint32_t DMEIFLG6 : 1;
+  __IM uint32_t TXEIFLG6 : 1;
+  __IM uint32_t HTXIFLG6 : 1;
+  __IM uint32_t TXCIFLG6 : 1;
+  __IM uint32_t FEIFLG7 : 1;
+  __IM uint32_t RESERVED5 : 1;
+  __IM uint32_t DMEIFLG7 : 1;
+  __IM uint32_t TXEIFLG7 : 1;
+  __IM uint32_t HTXIFLG7 : 1;
+  __IM uint32_t TXCIFLG7 : 1;
+  __IM uint32_t RESERVED6 : 4;
+  } HINTSTS_B;
+  };
+
+  /** Low interrupt flag clear register (streams 0-3, write-only) */
+  union
+  {
+  __OM uint32_t LIFCLR;
+
+  struct
+  {
+  __OM uint32_t CFEIFLG0 : 1;
+  __IM uint32_t RESERVED1 : 1;
+  __OM uint32_t CDMEIFLG0 : 1;
+  __OM uint32_t CTXEIFLG0 : 1;
+  __OM uint32_t CHTXIFLG0 : 1;
+  __OM uint32_t CTXCIFLG0 : 1;
+  __OM uint32_t CFEIFLG1 : 1;
+  __IM uint32_t RESERVED2 : 1;
+  __OM uint32_t CDMEIFLG1 : 1;
+  __OM uint32_t CTXEIFLG1 : 1;
+  __OM uint32_t CHTXIFLG1 : 1;
+  __OM uint32_t CTXCIFLG1 : 1;
+  __IM uint32_t RESERVED3 : 4;
+  __OM uint32_t CFEIFLG2 : 1;
+  __IM uint32_t RESERVED4 : 1;
+  __OM uint32_t CDMEIFLG2 : 1;
+  __OM uint32_t CTXEIFLG2 : 1;
+  __OM uint32_t CHTXIFLG2 : 1;
+  __OM uint32_t CTXCIFLG2 : 1;
+  __OM uint32_t CFEIFLG3 : 1;
+  __IM uint32_t RESERVED5 : 1;
+  __OM uint32_t CDMEIFLG3 : 1;
+  __OM uint32_t CTXEIFLG3 : 1;
+  __OM uint32_t CHTXIFLG3 : 1;
+  __OM uint32_t CTXCIFLG3 : 1;
+  __IM uint32_t RESERVED6 : 4;
+  } LIFCLR_B;
+  };
+
+  /** High interrupt flag clear register (streams 4-7, write-only) */
+  union
+  {
+  __OM uint32_t HIFCLR;
+
+  struct
+  {
+  __OM uint32_t CFEIFLG4 : 1;
+  __IM uint32_t RESERVED1 : 1;
+  __OM uint32_t CDMEIFLG4 : 1;
+  __OM uint32_t CTXEIFLG4 : 1;
+  __OM uint32_t CHTXIFLG4 : 1;
+  __OM uint32_t CTXCIFLG4 : 1;
+  __OM uint32_t CFEIFLG5 : 1;
+  __IM uint32_t RESERVED2 : 1;
+  __OM uint32_t CDMEIFLG5 : 1;
+  __OM uint32_t CTXEIFLG5 : 1;
+  __OM uint32_t CHTXIFLG5 : 1;
+  __OM uint32_t CTXCIFLG5 : 1;
+  __IM uint32_t RESERVED3 : 4;
+  __OM uint32_t CFEIFLG6 : 1;
+  __IM uint32_t RESERVED4 : 1;
+  __OM uint32_t CDMEIFLG6 : 1;
+  __OM uint32_t CTXEIFLG6 : 1;
+  __OM uint32_t CHTXIFLG6 : 1;
+  __OM uint32_t CTXCIFLG6 : 1;
+  __OM uint32_t CFEIFLG7 : 1;
+  __IM uint32_t RESERVED5 : 1;
+  __OM uint32_t CDMEIFLG7 : 1;
+  __OM uint32_t CTXEIFLG7 : 1;
+  __OM uint32_t CHTXIFLG7 : 1;
+  __OM uint32_t CTXCIFLG7 : 1;
+  __IM uint32_t RESERVED6 : 4;
+  } HIFCLR_B;
+  };
+} DMA_T;
+
+/**
+ * @brief DMA Stream register
+ *
+ * Per-stream DMA register overlay: configuration, transfer count, peripheral
+ * address, double-buffer memory addresses and FIFO control.
+ */
+typedef struct
+{
+  /** Stream configuration register */
+  union
+  {
+  __IOM uint32_t SCFG;
+
+  struct
+  {
+  __IOM uint32_t EN : 1;
+  __IOM uint32_t DMEIEN : 1;
+  __IOM uint32_t TXEIEN : 1;
+  __IOM uint32_t HTXIEN : 1;
+  __IOM uint32_t TXCIEN : 1;
+  __IOM uint32_t PERFC : 1;
+  __IOM uint32_t DIRCFG : 2;
+  __IOM uint32_t CIRCMEN : 1;
+  __IOM uint32_t PERIM : 1;
+  __IOM uint32_t MEMIM : 1;
+  __IOM uint32_t PERSIZECFG : 2;
+  __IOM uint32_t MEMSIZECFG : 2;
+  __IOM uint32_t PERIOSIZE : 1;
+  __IOM uint32_t PRILCFG : 2;
+  __IOM uint32_t DBM : 1;
+  __IOM uint32_t CTARG : 1;
+  __IM uint32_t RESERVED1 : 1;
+  __IOM uint32_t PBCFG : 2;
+  __IOM uint32_t MBCFG : 2;
+  __IOM uint32_t CHSEL : 3;
+  __IM uint32_t RESERVED2 : 4;
+  } SCFG_B;
+  };
+
+  /** Number of data items to transfer */
+  union
+  {
+  __IOM uint32_t NDATA;
+
+  struct
+  {
+  __IOM uint32_t NDATA : 16;
+  __IM uint32_t RESERVED : 16;
+  } NDATA_B;
+  };
+
+  /** Peripheral address register */
+  union
+  {
+  __IOM uint32_t PADDR;
+
+  struct
+  {
+  __IOM uint32_t PADDR : 32;
+  } PADDR_B;
+
+  };
+
+  /** Memory 0 address register */
+  union
+  {
+  __IOM uint32_t M0ADDR;
+
+  struct
+  {
+  __IOM uint32_t M0ADDR : 32;
+  } M0ADDR_B;
+
+  };
+
+  /** Memory 1 address register (double-buffer mode) */
+  union
+  {
+  __IOM uint32_t M1ADDR;
+
+  struct
+  {
+  __IOM uint32_t M1ADDR : 32;
+  } M1ADDR_B;
+
+  };
+
+  /** FIFO control register */
+  union
+  {
+  __IOM uint32_t FCTRL;
+
+  struct
+  {
+  __IOM uint32_t FTHSEL : 2;
+  __IOM uint32_t DMDEN : 1;
+  __IM uint32_t FSTS : 3;
+  __IM uint32_t RESERVED1 : 1;
+  __IOM uint32_t FEIEN : 1;
+  __IM uint32_t RESERVED2 : 24;
+  } FCTRL_B;
+  };
+} DMA_Stream_T;
+
+/**
+ * @brief CAN sending mailbox (CAN)
+ *
+ * One transmit mailbox: identifier/request word, data length code, and the
+ * eight data bytes split across low/high words.
+ */
+typedef struct
+{
+  /** @brief CAN Each mailbox contains the sending mailbox identifier register */
+  union
+  {
+  __IOM uint32_t TXMID;
+
+  struct
+  {
+  __IOM uint32_t TXMREQ : 1;
+  __IOM uint32_t TXRFREQ : 1;
+  __IOM uint32_t IDTYPESEL : 1;
+  __IOM uint32_t EXTID : 18;
+  __IOM uint32_t STDID : 11;
+  } TXMID_B;
+  };
+
+  /** @brief CAN Send the mailbox data length and timestamp register */
+  union
+  {
+  __IOM uint32_t TXDLEN;
+
+  struct
+  {
+  __IOM uint32_t DLCODE : 4;
+  __IM uint32_t RESERVED : 28;
+  } TXDLEN_B;
+  };
+
+  /** @brief CAN Send mailbox low byte data register (data bytes 0-3) */
+  union
+  {
+  __IOM uint32_t TXMDL;
+
+  struct
+  {
+  __IOM uint32_t DATABYTE0 : 8;
+  __IOM uint32_t DATABYTE1 : 8;
+  __IOM uint32_t DATABYTE2 : 8;
+  __IOM uint32_t DATABYTE3 : 8;
+  } TXMDL_B;
+  };
+
+  /** @brief CAN Send mailbox High byte data register (data bytes 4-7) */
+  union
+  {
+  __IOM uint32_t TXMDH;
+
+  struct
+  {
+  __IOM uint32_t DATABYTE4 : 8;
+  __IOM uint32_t DATABYTE5 : 8;
+  __IOM uint32_t DATABYTE6 : 8;
+  __IOM uint32_t DATABYTE7 : 8;
+  } TXMDH_B;
+  };
+} CAN_TxMailBox_T;
+
+/**
+ * @brief CAN receive mailbox
+ *
+ * One receive FIFO mailbox: identifier word, data length / filter-match
+ * index, and the eight received data bytes split across low/high words.
+ */
+typedef struct
+{
+  /** @brief CAN Each mailbox contains the receive mailbox identifier register */
+  union
+  {
+  __IM uint32_t RXMID;
+
+  struct
+  {
+  __IM uint32_t RESERVED : 1;
+  __IM uint32_t RFTXREQ : 1;
+  __IM uint32_t IDTYPESEL : 1;
+  __IM uint32_t EXTID : 18;
+  __IM uint32_t STDID : 11;
+  } RXMID_B;
+  };
+
+  /** @brief CAN receive the mailbox data length and timestamp register */
+  union
+  {
+  __IM uint32_t RXDLEN;
+
+  struct
+  {
+  __IM uint32_t DLCODE : 4;
+  __IM uint32_t RESERVED1 : 4;
+  __IM uint32_t FMIDX : 8;
+  __IM uint32_t RESERVED2 : 16;
+  } RXDLEN_B;
+  };
+
+  /** @brief CAN receive mailbox low byte data register (data bytes 0-3) */
+  union
+  {
+  __IM uint32_t RXMDL;
+
+  struct
+  {
+  __IM uint32_t DATABYTE0 : 8;
+  __IM uint32_t DATABYTE1 : 8;
+  __IM uint32_t DATABYTE2 : 8;
+  __IM uint32_t DATABYTE3 : 8;
+  } RXMDL_B;
+  };
+
+  /** @brief CAN receive mailbox High byte data register (data bytes 4-7).
+   *  NOTE(review): the word is declared __IOM while its bit-fields and every
+   *  sibling RX register are __IM — looks inconsistent; confirm against the
+   *  reference manual whether this register is truly read/write. */
+  union
+  {
+  __IOM uint32_t RXMDH;
+
+  struct
+  {
+  __IM uint32_t DATABYTE4 : 8;
+  __IM uint32_t DATABYTE5 : 8;
+  __IM uint32_t DATABYTE6 : 8;
+  __IM uint32_t DATABYTE7 : 8;
+  } RXMDH_B;
+  };
+} CAN_RxMailBox_T;
+
+/**
+ * @brief CAN Filter bank register
+ *
+ * One filter bank: two 32-bit filter words, each also exposed as 32
+ * individual filter bits.
+ */
+typedef struct
+{
+  /** @brief CAN Filter bank register 1 */
+  union
+  {
+  __IOM uint32_t FBANK1;
+
+  struct
+  {
+  __IOM uint32_t FBIT0 : 1;
+  __IOM uint32_t FBIT1 : 1;
+  __IOM uint32_t FBIT2 : 1;
+  __IOM uint32_t FBIT3 : 1;
+  __IOM uint32_t FBIT4 : 1;
+  __IOM uint32_t FBIT5 : 1;
+  __IOM uint32_t FBIT6 : 1;
+  __IOM uint32_t FBIT7 : 1;
+  __IOM uint32_t FBIT8 : 1;
+  __IOM uint32_t FBIT9 : 1;
+  __IOM uint32_t FBIT10 : 1;
+  __IOM uint32_t FBIT11 : 1;
+  __IOM uint32_t FBIT12 : 1;
+  __IOM uint32_t FBIT13 : 1;
+  __IOM uint32_t FBIT14 : 1;
+  __IOM uint32_t FBIT15 : 1;
+  __IOM uint32_t FBIT16 : 1;
+  __IOM uint32_t FBIT17 : 1;
+  __IOM uint32_t FBIT18 : 1;
+  __IOM uint32_t FBIT19 : 1;
+  __IOM uint32_t FBIT20 : 1;
+  __IOM uint32_t FBIT21 : 1;
+  __IOM uint32_t FBIT22 : 1;
+  __IOM uint32_t FBIT23 : 1;
+  __IOM uint32_t FBIT24 : 1;
+  __IOM uint32_t FBIT25 : 1;
+  __IOM uint32_t FBIT26 : 1;
+  __IOM uint32_t FBIT27 : 1;
+  __IOM uint32_t FBIT28 : 1;
+  __IOM uint32_t FBIT29 : 1;
+  __IOM uint32_t FBIT30 : 1;
+  __IOM uint32_t FBIT31 : 1;
+  } FBANK1_B;
+  };
+
+  /** @brief CAN Filter bank register 2 */
+  union
+  {
+  __IOM uint32_t FBANK2;
+
+  struct
+  {
+  __IOM uint32_t FBIT0 : 1;
+  __IOM uint32_t FBIT1 : 1;
+  __IOM uint32_t FBIT2 : 1;
+  __IOM uint32_t FBIT3 : 1;
+  __IOM uint32_t FBIT4 : 1;
+  __IOM uint32_t FBIT5 : 1;
+  __IOM uint32_t FBIT6 : 1;
+  __IOM uint32_t FBIT7 : 1;
+  __IOM uint32_t FBIT8 : 1;
+  __IOM uint32_t FBIT9 : 1;
+  __IOM uint32_t FBIT10 : 1;
+  __IOM uint32_t FBIT11 : 1;
+  __IOM uint32_t FBIT12 : 1;
+  __IOM uint32_t FBIT13 : 1;
+  __IOM uint32_t FBIT14 : 1;
+  __IOM uint32_t FBIT15 : 1;
+  __IOM uint32_t FBIT16 : 1;
+  __IOM uint32_t FBIT17 : 1;
+  __IOM uint32_t FBIT18 : 1;
+  __IOM uint32_t FBIT19 : 1;
+  __IOM uint32_t FBIT20 : 1;
+  __IOM uint32_t FBIT21 : 1;
+  __IOM uint32_t FBIT22 : 1;
+  __IOM uint32_t FBIT23 : 1;
+  __IOM uint32_t FBIT24 : 1;
+  __IOM uint32_t FBIT25 : 1;
+  __IOM uint32_t FBIT26 : 1;
+  __IOM uint32_t FBIT27 : 1;
+  __IOM uint32_t FBIT28 : 1;
+  __IOM uint32_t FBIT29 : 1;
+  __IOM uint32_t FBIT30 : 1;
+  __IOM uint32_t FBIT31 : 1;
+  } FBANK2_B;
+  };
+} CAN_FilterRegister_T;
+
+/**
+ * @brief Controller Area Network(CAN)
+ */
+typedef struct
+{
+ /** @brief CAN Master control register */
+ union
+ {
+ __IOM uint32_t MCTRL;
+
+ struct
+ {
+ __IOM uint32_t INITREQ : 1;
+ __IOM uint32_t SLEEPREQ : 1;
+ __IOM uint32_t TXFPCFG : 1;
+ __IOM uint32_t RXFLOCK : 1;
+ __IOM uint32_t ARTXMD : 1;
+ __IOM uint32_t AWUPCFG : 1;
+ __IOM uint32_t ALBOFFM : 1;
+ __IM uint32_t RESERVED1 : 8;
+ __IOM uint32_t SWRST : 1;
+ __IOM uint32_t DBGFRZE : 1;
+ __IM uint32_t RESERVED2 : 15;
+ } MCTRL_B;
+ };
+
+ /** @brief CAN Master States register */
+ union
+ {
+ __IOM uint32_t MSTS;
+
+ struct
+ {
+ __IM uint32_t INITFLG : 1;
+ __IM uint32_t SLEEPFLG : 1;
+ __IOM uint32_t ERRIFLG : 1;
+ __IOM uint32_t WUPIFLG : 1;
+ __IOM uint32_t SLEEPIFLG : 1;
+ __IM uint32_t RESERVED1 : 3;
+ __IM uint32_t TXMFLG : 1;
+ __IM uint32_t RXMFLG : 1;
+ __IM uint32_t LSAMVALUE : 1;
+ __IM uint32_t RXSIGL : 1;
+ __IM uint32_t RESERVED2 : 20;
+ } MSTS_B;
+ };
+
+ /** @brief CAN Send States register */
+ union
+ {
+ __IOM uint32_t TXSTS;
+
+ struct
+ {
+ __IOM uint32_t REQCFLG0 : 1;
+ __IOM uint32_t TXSUSFLG0 : 1;
+ __IOM uint32_t ARBLSTFLG0 : 1;
+ __IOM uint32_t TXERRFLG0 : 1;
+ __IM uint32_t RESERVED1 : 3;
+ __IOM uint32_t ABREQFLG0 : 1;
+ __IOM uint32_t REQCFLG1 : 1;
+ __IOM uint32_t TXSUSFLG1 : 1;
+ __IOM uint32_t ARBLSTFLG1 : 1;
+ __IOM uint32_t TXERRFLG1 : 1;
+ __IM uint32_t RESERVED2 : 3;
+ __IOM uint32_t ABREQFLG1 : 1;
+ __IOM uint32_t REQCFLG2 : 1;
+ __IOM uint32_t TXSUSFLG2 : 1;
+ __IOM uint32_t ARBLSTFLG2 : 1;
+ __IOM uint32_t TXERRFLG2 : 1;
+ __IM uint32_t RESERVED3 : 3;
+ __IOM uint32_t ABREQFLG2 : 1;
+ __IM uint32_t EMNUM : 2;
+ __IM uint32_t TXMEFLG0 : 1;
+ __IM uint32_t TXMEFLG1 : 1;
+ __IM uint32_t TXMEFLG2 : 1;
+ __IM uint32_t LOWESTP0 : 1;
+ __IM uint32_t LOWESTP1 : 1;
+ __IM uint32_t LOWESTP2 : 1;
+ } TXSTS_B;
+ };
+
+ /** @brief CAN Receive FIFO 0 register */
+ union
+ {
+ __IOM uint32_t RXF0;
+
+ struct
+ {
+ __IM uint32_t FMNUM0 : 2;
+ __IM uint32_t RESERVED : 1;
+ __IOM uint32_t FFULLFLG0 : 1;
+ __IOM uint32_t FOVRFLG0 : 1;
+ __IOM uint32_t RFOM0 : 1;
+ __IM uint32_t RESERVED1 : 26;
+ } RXF0_B;
+ };
+
+ /** @brief CAN Receive FIFO 1 register */
+ union
+ {
+ __IOM uint32_t RXF1;
+
+ struct
+ {
+ __IM uint32_t FMNUM1 : 2;
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t FFULLFLG1 : 1;
+ __IOM uint32_t FOVRFLG1 : 1;
+ __IOM uint32_t RFOM1 : 1;
+ __IM uint32_t RESERVED2 : 26;
+ } RXF1_B;
+ };
+
+ /** @brief CAN Interrupts register */
+ union
+ {
+ __IOM uint32_t INTEN;
+
+ struct
+ {
+ __IOM uint32_t TXMEIEN : 1;
+ __IOM uint32_t FMIEN0 : 1;
+ __IOM uint32_t FFULLIEN0 : 1;
+ __IOM uint32_t FOVRIEN0 : 1;
+ __IOM uint32_t FMIEN1 : 1;
+ __IOM uint32_t FFULLIEN1 : 1;
+ __IOM uint32_t FOVRIEN1 : 1;
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t ERRWIEN : 1;
+ __IOM uint32_t ERRPIEN : 1;
+ __IOM uint32_t BOFFIEN : 1;
+ __IOM uint32_t LECIEN : 1;
+ __IM uint32_t RESERVED2 : 3;
+ __IOM uint32_t ERRIEN : 1;
+ __IOM uint32_t WUPIEN : 1;
+ __IOM uint32_t SLEEPIEN : 1;
+ __IM uint32_t RESERVED3 : 14;
+ } INTEN_B;
+ };
+
+ /** @brief CAN Error States register */
+ union
+ {
+ __IOM uint32_t ERRSTS;
+
+ struct
+ {
+ __IM uint32_t ERRWFLG : 1;
+ __IM uint32_t ERRPFLG : 1;
+ __IM uint32_t BOFLG : 1;
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t LERRC : 3;
+ __IM uint32_t RESERVED2 : 9;
+ __IM uint32_t TXERRCNT : 8;
+ __IM uint32_t RXERRCNT : 8;
+ } ERRSTS_B;
+ };
+
+ /** @brief CAN Bit Time register */
+ union
+ {
+ __IOM uint32_t BITTIM;
+
+ struct
+ {
+ __IOM uint32_t BRPSC : 10;
+ __IM uint32_t RESERVED1 : 6;
+ __IOM uint32_t TIMSEG1 : 4;
+ __IOM uint32_t TIMSEG2 : 3;
+ __IM uint32_t RESERVED2 : 1;
+ __IOM uint32_t RSYNJW : 2;
+ __IM uint32_t RESERVED3 : 4;
+ __IOM uint32_t LBKMEN : 1;
+ __IOM uint32_t SILMEN : 1;
+ } BITTIM_B;
+ };
+
+ __IM uint32_t RESERVED0[88];
+
+ CAN_TxMailBox_T sTxMailBox[3];
+ CAN_RxMailBox_T sRxMailBox[2];
+
+ __IM uint32_t RESERVED1[12];
+
+ /** @brief CAN Filter the master control register */
+ union
+ {
+ __IOM uint32_t FCTRL;
+
+ struct
+ {
+ __IOM uint32_t FINITEN : 1;
+ __IM uint32_t RESERVED : 7;
+ __IOM uint32_t CAN2BN : 6;
+ __IM uint32_t RESERVED1 : 18;
+ } FCTRL_B;
+ };
+
+ /** @brief CAN Filter register */
+ union
+ {
+ __IOM uint32_t FMCFG;
+
+ struct
+ {
+ __IOM uint32_t FMCFG0 : 1;
+ __IOM uint32_t FMCFG1 : 1;
+ __IOM uint32_t FMCFG2 : 1;
+ __IOM uint32_t FMCFG3 : 1;
+ __IOM uint32_t FMCFG4 : 1;
+ __IOM uint32_t FMCFG5 : 1;
+ __IOM uint32_t FMCFG6 : 1;
+ __IOM uint32_t FMCFG7 : 1;
+ __IOM uint32_t FMCFG8 : 1;
+ __IOM uint32_t FMCFG9 : 1;
+ __IOM uint32_t FMCFG10 : 1;
+ __IOM uint32_t FMCFG11 : 1;
+ __IOM uint32_t FMCFG12 : 1;
+ __IOM uint32_t FMCFG13 : 1;
+ __IOM uint32_t FMCFG14 : 1;
+ __IOM uint32_t FMCFG15 : 1;
+ __IOM uint32_t FMCFG16 : 1;
+ __IOM uint32_t FMCFG17 : 1;
+ __IOM uint32_t FMCFG18 : 1;
+ __IOM uint32_t FMCFG19 : 1;
+ __IOM uint32_t FMCFG20 : 1;
+ __IOM uint32_t FMCFG21 : 1;
+ __IOM uint32_t FMCFG22 : 1;
+ __IOM uint32_t FMCFG23 : 1;
+ __IOM uint32_t FMCFG24 : 1;
+ __IOM uint32_t FMCFG25 : 1;
+ __IOM uint32_t FMCFG26 : 1;
+ __IOM uint32_t FMCFG27 : 1;
+ __IM uint32_t RESERVED : 4;
+ } FMCFG_B;
+ };
+
+ __IM uint32_t RESERVED2;
+
+ /** @brief CAN Filter bit scale register */
+ union
+ {
+ __IOM uint32_t FSCFG;
+
+ struct
+ {
+ __IOM uint32_t FSCFG0 : 1;
+ __IOM uint32_t FSCFG1 : 1;
+ __IOM uint32_t FSCFG2 : 1;
+ __IOM uint32_t FSCFG3 : 1;
+ __IOM uint32_t FSCFG4 : 1;
+ __IOM uint32_t FSCFG5 : 1;
+ __IOM uint32_t FSCFG6 : 1;
+ __IOM uint32_t FSCFG7 : 1;
+ __IOM uint32_t FSCFG8 : 1;
+ __IOM uint32_t FSCFG9 : 1;
+ __IOM uint32_t FSCFG10 : 1;
+ __IOM uint32_t FSCFG11 : 1;
+ __IOM uint32_t FSCFG12 : 1;
+ __IOM uint32_t FSCFG13 : 1;
+ __IOM uint32_t FSCFG14 : 1;
+ __IOM uint32_t FSCFG15 : 1;
+ __IOM uint32_t FSCFG16 : 1;
+ __IOM uint32_t FSCFG17 : 1;
+ __IOM uint32_t FSCFG18 : 1;
+ __IOM uint32_t FSCFG19 : 1;
+ __IOM uint32_t FSCFG20 : 1;
+ __IOM uint32_t FSCFG21 : 1;
+ __IOM uint32_t FSCFG22 : 1;
+ __IOM uint32_t FSCFG23 : 1;
+ __IOM uint32_t FSCFG24 : 1;
+ __IOM uint32_t FSCFG25 : 1;
+ __IOM uint32_t FSCFG26 : 1;
+ __IOM uint32_t FSCFG27 : 1;
+ __IM uint32_t RESERVED : 4;
+ } FSCFG_B;
+ };
+
+ __IM uint32_t RESERVED3;
+
+ /** @brief CAN Filter FIFO associated registers */
+ union
+ {
+ __IOM uint32_t FFASS;
+
+ struct
+ {
+ __IOM uint32_t FFASS0 : 1;
+ __IOM uint32_t FFASS1 : 1;
+ __IOM uint32_t FFASS2 : 1;
+ __IOM uint32_t FFASS3 : 1;
+ __IOM uint32_t FFASS4 : 1;
+ __IOM uint32_t FFASS5 : 1;
+ __IOM uint32_t FFASS6 : 1;
+ __IOM uint32_t FFASS7 : 1;
+ __IOM uint32_t FFASS8 : 1;
+ __IOM uint32_t FFASS9 : 1;
+ __IOM uint32_t FFASS10 : 1;
+ __IOM uint32_t FFASS11 : 1;
+ __IOM uint32_t FFASS12 : 1;
+ __IOM uint32_t FFASS13 : 1;
+ __IOM uint32_t FFASS14 : 1;
+ __IOM uint32_t FFASS15 : 1;
+ __IOM uint32_t FFASS16 : 1;
+ __IOM uint32_t FFASS17 : 1;
+ __IOM uint32_t FFASS18 : 1;
+ __IOM uint32_t FFASS19 : 1;
+ __IOM uint32_t FFASS20 : 1;
+ __IOM uint32_t FFASS21 : 1;
+ __IOM uint32_t FFASS22 : 1;
+ __IOM uint32_t FFASS23 : 1;
+ __IOM uint32_t FFASS24 : 1;
+ __IOM uint32_t FFASS25 : 1;
+ __IOM uint32_t FFASS26 : 1;
+ __IOM uint32_t FFASS27 : 1;
+ __IM uint32_t RESERVED : 4;
+ } FFASS_B;
+ };
+
+ __IM uint32_t RESERVED4;
+
+ /** @brief CAN Filter activation register */
+ union
+ {
+ __IOM uint32_t FACT;
+
+ struct
+ {
+ __IOM uint32_t FACT0 : 1;
+ __IOM uint32_t FACT1 : 1;
+ __IOM uint32_t FACT2 : 1;
+ __IOM uint32_t FACT3 : 1;
+ __IOM uint32_t FACT4 : 1;
+ __IOM uint32_t FACT5 : 1;
+ __IOM uint32_t FACT6 : 1;
+ __IOM uint32_t FACT7 : 1;
+ __IOM uint32_t FACT8 : 1;
+ __IOM uint32_t FACT9 : 1;
+ __IOM uint32_t FACT10 : 1;
+ __IOM uint32_t FACT11 : 1;
+ __IOM uint32_t FACT12 : 1;
+ __IOM uint32_t FACT13 : 1;
+ __IOM uint32_t FACT14 : 1;
+ __IOM uint32_t FACT15 : 1;
+ __IOM uint32_t FACT16 : 1;
+ __IOM uint32_t FACT17 : 1;
+ __IOM uint32_t FACT18 : 1;
+ __IOM uint32_t FACT19 : 1;
+ __IOM uint32_t FACT20 : 1;
+ __IOM uint32_t FACT21 : 1;
+ __IOM uint32_t FACT22 : 1;
+ __IOM uint32_t FACT23 : 1;
+ __IOM uint32_t FACT24 : 1;
+ __IOM uint32_t FACT25 : 1;
+ __IOM uint32_t FACT26 : 1;
+ __IOM uint32_t FACT27 : 1;
+ __IM uint32_t RESERVED : 4;
+ } FACT_B;
+ };
+
+ __IM uint32_t RESERVED5[8];
+
+ CAN_FilterRegister_T sFilterRegister[28];
+} CAN_T;
+
+/**
+ * @brief I2C register (I2C)
+ */
+typedef struct
+{
+    /** @brief Control register 1 (CTRL1) — peripheral enable, bus mode and transfer control bits */
+    union
+    {
+        __IOM uint32_t CTRL1;               /* whole-word access */
+
+        struct
+        {
+            __IOM uint32_t I2CEN : 1;       /* peripheral enable */
+            __IOM uint32_t SMBEN : 1;       /* SMBus mode enable */
+            __IM uint32_t RESERVED1 : 1;
+            __IOM uint32_t SMBTCFG : 1;
+            __IOM uint32_t ARPEN : 1;
+            __IOM uint32_t PECEN : 1;
+            __IOM uint32_t SRBEN : 1;
+            __IOM uint32_t CLKSTRETCHD : 1;
+            __IOM uint32_t START : 1;       /* generate START condition */
+            __IOM uint32_t STOP : 1;        /* generate STOP condition */
+            __IOM uint32_t ACKEN : 1;
+            __IOM uint32_t ACKPOS : 1;
+            __IOM uint32_t PEC : 1;
+            __IOM uint32_t ALERTEN : 1;
+            __IM uint32_t RESERVED2 : 1;
+            __IOM uint32_t SWRST : 1;       /* software reset of the peripheral */
+            __IM uint32_t RESERVED3 : 16;
+        } CTRL1_B;
+    } ;
+
+    /** @brief Control register 2 (CTRL2) — clock-frequency field, interrupt and DMA enables */
+    union
+    {
+        __IOM uint32_t CTRL2;
+
+        struct
+        {
+            __IOM uint32_t CLKFCFG : 6;     /* clock frequency config (6-bit field) */
+            __IM uint32_t RESERVED1 : 2;
+            __IOM uint32_t ERRIEN : 1;      /* error interrupt enable */
+            __IOM uint32_t EVIEN : 1;       /* event interrupt enable */
+            __IOM uint32_t BUFIEN : 1;      /* buffer interrupt enable */
+            __IOM uint32_t DMAEN : 1;
+            __IOM uint32_t LTCFG : 1;
+            __IM uint32_t RESERVED2 : 19;
+        } CTRL2_B;
+    } ;
+
+    /** @brief Slave machine address register 1 (SADDR1) — own address, 7- or 10-bit per ADDRLEN */
+    union
+    {
+        __IOM uint32_t SADDR1;
+
+        struct
+        {
+            __IOM uint32_t ADDR0 : 1;       /* address bit 0 (10-bit mode only) */
+            __IOM uint32_t ADDR1_7 : 7;     /* address bits 7:1 */
+            __IOM uint32_t ADDR8_9 : 2;     /* address bits 9:8 (10-bit mode only) */
+            __IM uint32_t RESERVED1 : 5;
+            __IOM uint32_t ADDRLEN : 1;     /* addressing mode: 7-bit vs 10-bit */
+            __IM uint32_t RESERVED2 : 16;
+        } SADDR1_B;
+    };
+
+    /** @brief Slave machine address register 2 (SADDR2) — dual-address mode second address */
+    union
+    {
+        __IOM uint32_t SADDR2;
+
+        struct
+        {
+            __IOM uint32_t ADDRNUM : 1;     /* dual addressing mode enable */
+            __IOM uint32_t ADDR2 : 7;       /* second 7-bit own address */
+            __IM uint32_t RESERVED : 24;
+        } SADDR2_B;
+    };
+
+    /** @brief Data register (DATA) — 8-bit transmit/receive data buffer */
+    union
+    {
+        __IOM uint32_t DATA;
+
+        struct
+        {
+            __IOM uint32_t DATA : 8;
+            __IM uint32_t RESERVED : 24;
+        } DATA_B;
+    };
+
+    /** @brief Status register 1 (STS1) — event flags (read-only) and error flags (write-to-clear) */
+    union
+    {
+        __IOM uint32_t STS1;
+
+        struct
+        {
+            __IM uint32_t STARTFLG : 1;     /* START condition generated */
+            __IM uint32_t ADDRFLG : 1;      /* address sent/matched */
+            __IM uint32_t BTCFLG : 1;       /* byte transfer complete */
+            __IM uint32_t ADDR10FLG : 1;
+            __IM uint32_t STOPFLG : 1;
+            __IM uint32_t RESERVED1 : 1;
+            __IM uint32_t RXBNEFLG : 1;     /* receive buffer not empty */
+            __IM uint32_t TXBEFLG : 1;      /* transmit buffer empty */
+            __IOM uint32_t BERRFLG : 1;     /* bus error */
+            __IOM uint32_t ALFLG : 1;       /* arbitration lost */
+            __IOM uint32_t AEFLG : 1;       /* acknowledge failure */
+            __IOM uint32_t OVRURFLG : 1;    /* overrun/underrun */
+            __IOM uint32_t PECEFLG : 1;
+            __IM uint32_t RESERVED2 : 1;
+            __IOM uint32_t TTEFLG : 1;
+            __IOM uint32_t SMBALTFLG : 1;
+            __IM uint32_t RESERVED3 : 16;
+        } STS1_B;
+    };
+
+    /** @brief Status register 2 (STS2) — bus/mode status and received PEC value (read-only) */
+    union
+    {
+        __IM uint32_t STS2;
+
+        struct
+        {
+            __IM uint32_t MSFLG : 1;        /* master/slave mode flag */
+            __IM uint32_t BUSBSYFLG : 1;    /* bus busy */
+            __IM uint32_t TRFLG : 1;        /* transmitter/receiver flag */
+            __IM uint32_t RESERVED1 : 1;
+            __IM uint32_t GENCALLFLG : 1;
+            __IM uint32_t SMBDADDRFLG : 1;
+            __IM uint32_t SMMHADDR : 1;
+            __IM uint32_t DUALADDRFLG : 1;
+            __IM uint32_t PECVALUE : 8;     /* packet error checking value */
+            __IM uint32_t RESERVED2 : 16;
+        } STS2_B;
+    };
+
+    /** @brief Clock control register (CLKCTRL) — SCL clock divider, duty and speed mode */
+    union
+    {
+        __IOM uint32_t CLKCTRL;
+
+        struct
+        {
+            __IOM uint32_t CLKS : 12;       /* clock divider value */
+            __IM uint32_t RESERVED1 : 2;
+            __IOM uint32_t FDUTYCFG : 1;    /* fast-mode duty cycle config */
+            __IOM uint32_t SPEEDCFG : 1;    /* standard/fast speed select */
+            __IM uint32_t RESERVED2 : 16;
+        } CLKCTRL_B;
+    };
+
+    /** @brief Maximum rise time register (RISETMAX) */
+    union
+    {
+        __IOM uint32_t RISETMAX;
+
+        struct
+        {
+            __IOM uint32_t RISETMAX : 6;
+            __IM uint32_t RESERVED : 26;
+        } RISETMAX_B;
+    };
+} I2C_T;
+
+/**
+ * @brief Analog to Digital Converter (ADC)
+ */
+typedef struct
+{
+    /** Interrupt and status register (STS) — conversion/watchdog/overrun flags */
+    union
+    {
+        __IOM uint32_t STS;
+
+        struct
+        {
+            __IOM uint32_t AWDFLG : 1;      /* analog watchdog flag */
+            __IOM uint32_t EOCFLG : 1;      /* regular end of conversion */
+            __IOM uint32_t INJEOCFLG : 1;   /* injected end of conversion */
+            __IOM uint32_t INJCSFLG : 1;    /* injected conversion start */
+            __IOM uint32_t REGCSFLG : 1;    /* regular conversion start */
+            __IOM uint32_t OVREFLG : 1;     /* overrun flag */
+            __IM uint32_t RESERVED1 : 26;
+        } STS_B;
+    } ;
+
+    /** Control register 1 (CTRL1) — watchdog channel/enable, interrupt enables, scan and discontinuous modes, resolution */
+
+    union
+    {
+        __IOM uint32_t CTRL1;
+
+        struct
+        {
+            __IOM uint32_t AWDCHSEL : 5;    /* watchdog channel select */
+            __IOM uint32_t EOCIEN : 1;
+            __IOM uint32_t AWDIEN : 1;
+            __IOM uint32_t INJEOCIEN : 1;
+            __IOM uint32_t SCANEN : 1;      /* scan mode enable */
+            __IOM uint32_t AWDSGLEN : 1;
+            __IOM uint32_t INJGACEN : 1;
+            __IOM uint32_t REGDISCEN : 1;
+            __IOM uint32_t INJDISCEN : 1;
+            __IOM uint32_t DISCNUMCFG : 3;  /* discontinuous mode channel count */
+            __IM uint32_t RESERVED1 : 6;
+            __IOM uint32_t INJAWDEN : 1;
+            __IOM uint32_t REGAWDEN : 1;
+            __IOM uint32_t RESSEL : 2;      /* conversion resolution select */
+            __IOM uint32_t OVRIEN : 1;
+            __IM uint32_t RESERVED2 : 5;
+        } CTRL1_B;
+    } ;
+    /** Control register 2 (CTRL2) — ADC enable, continuous mode, DMA, alignment and external triggers */
+    union
+    {
+        __IOM uint32_t CTRL2;
+
+        struct
+        {
+            __IOM uint32_t ADCEN : 1;       /* ADC on/off */
+            __IOM uint32_t CONTCEN : 1;     /* continuous conversion enable */
+            __IM uint32_t RESERVED1 : 6;
+            __IOM uint32_t DMAEN : 1;
+            __IOM uint32_t DMADISSEL : 1;
+            __IOM uint32_t EOCSEL : 1;
+            __IOM uint32_t DALIGNCFG : 1;   /* data alignment (left/right) */
+            __IM uint32_t RESERVED2 : 4;
+            __IOM uint32_t INJGEXTTRGSEL : 4;  /* injected group external trigger select */
+            __IOM uint32_t INJEXTTRGEN : 2;
+            __IOM uint32_t INJCHSC : 1;     /* injected channel software start */
+            __IM uint32_t RESERVED3 : 1;
+            __IOM uint32_t REGEXTTRGSEL : 4;   /* regular group external trigger select */
+            __IOM uint32_t REGEXTTRGEN : 2;
+            __IOM uint32_t REGCHSC : 1;     /* regular channel software start */
+            __IM uint32_t RESERVED4 : 1;
+        } CTRL2_B;
+    } ;
+
+    /** Sample time register 1 (SMPTIM1) — 3-bit sample-time fields for channels 10..18 */
+    union
+    {
+        __IOM uint32_t SMPTIM1;
+
+        struct
+        {
+            __IOM uint32_t SMPCYCCFG10 : 3;
+            __IOM uint32_t SMPCYCCFG11 : 3;
+            __IOM uint32_t SMPCYCCFG12 : 3;
+            __IOM uint32_t SMPCYCCFG13 : 3;
+            __IOM uint32_t SMPCYCCFG14 : 3;
+            __IOM uint32_t SMPCYCCFG15 : 3;
+            __IOM uint32_t SMPCYCCFG16 : 3;
+            __IOM uint32_t SMPCYCCFG17 : 3;
+            __IOM uint32_t SMPCYCCFG18 : 3;
+            __IM uint32_t RESERVED : 5;
+        } SMPTIM1_B;
+    };
+
+    /** Sample time register 2 (SMPTIM2) — 3-bit sample-time fields for channels 0..9 */
+    union
+    {
+        __IOM uint32_t SMPTIM2;
+
+        struct
+        {
+            __IOM uint32_t SMPCYCCFG0 : 3;
+            __IOM uint32_t SMPCYCCFG1 : 3;
+            __IOM uint32_t SMPCYCCFG2 : 3;
+            __IOM uint32_t SMPCYCCFG3 : 3;
+            __IOM uint32_t SMPCYCCFG4 : 3;
+            __IOM uint32_t SMPCYCCFG5 : 3;
+            __IOM uint32_t SMPCYCCFG6 : 3;
+            __IOM uint32_t SMPCYCCFG7 : 3;
+            __IOM uint32_t SMPCYCCFG8 : 3;
+            __IOM uint32_t SMPCYCCFG9 : 3;
+            __IM uint32_t RESERVED : 2;
+        } SMPTIM2_B;
+    };
+
+    /** Injected channel data offset register 1 (INJDOF1) — 12-bit offset subtracted from injected data */
+    union
+    {
+        __IOM uint32_t INJDOF1;
+
+        struct
+        {
+            __IOM uint32_t INJDOF1 : 12;
+            __IM uint32_t RESERVED : 20;
+        } INJDOF1_B;
+    };
+
+    /** Injected channel data offset register 2 (INJDOF2) */
+    union
+    {
+        __IOM uint32_t INJDOF2;
+
+        struct
+        {
+            __IOM uint32_t INJDOF2 : 12;
+            __IM uint32_t RESERVED : 20;
+        } INJDOF2_B;
+    };
+
+    /** Injected channel data offset register 3 (INJDOF3) */
+    union
+    {
+        __IOM uint32_t INJDOF3;
+
+        struct
+        {
+            __IOM uint32_t INJDOF3 : 12;
+            __IM uint32_t RESERVED : 20;
+        } INJDOF3_B;
+    };
+
+    /** Injected channel data offset register 4 (INJDOF4) */
+    union
+    {
+        __IOM uint32_t INJDOF4;
+
+        struct
+        {
+            __IOM uint32_t INJDOF4 : 12;
+            __IM uint32_t RESERVED : 20;
+        } INJDOF4_B;
+    };
+
+    /** Analog watchdog high threshold register (AWDHT) — 12-bit threshold */
+    union
+    {
+        __IOM uint32_t AWDHT;
+
+        struct
+        {
+            __IOM uint32_t AWDHT : 12;
+            __IM uint32_t RESERVED : 20;
+        } AWDHT_B;
+    };
+
+    /** Analog watchdog low threshold register (AWDLT) — 12-bit threshold */
+    union
+    {
+        __IOM uint32_t AWDLT;
+
+        struct
+        {
+            __IOM uint32_t AWDLT : 12;
+            __IM uint32_t RESERVED : 20;
+        } AWDLT_B;
+    };
+
+    /** Regular channel sequence register 1 (REGSEQ1) — sequence slots 13..16 and sequence length */
+    union
+    {
+        __IOM uint32_t REGSEQ1;
+
+        struct
+        {
+            __IOM uint32_t REGSEQC13 : 5;
+            __IOM uint32_t REGSEQC14 : 5;
+            __IOM uint32_t REGSEQC15 : 5;
+            __IOM uint32_t REGSEQC16 : 5;
+            __IOM uint32_t REGSEQLEN : 4;   /* regular sequence length */
+            __IM uint32_t RESERVED : 8;
+        } REGSEQ1_B;
+    };
+
+    /** Regular channel sequence register 2 (REGSEQ2) — sequence slots 7..12 */
+    union
+    {
+        __IOM uint32_t REGSEQ2;
+
+        struct
+        {
+            __IOM uint32_t REGSEQC7 : 5;
+            __IOM uint32_t REGSEQC8 : 5;
+            __IOM uint32_t REGSEQC9 : 5;
+            __IOM uint32_t REGSEQC10 : 5;
+            __IOM uint32_t REGSEQC11 : 5;
+            __IOM uint32_t REGSEQC12 : 5;
+            __IM uint32_t RESERVED : 2;
+        } REGSEQ2_B;
+    };
+
+    /** Regular channel sequence register 3 (REGSEQ3) — sequence slots 1..6 */
+    union
+    {
+        __IOM uint32_t REGSEQ3;
+
+        struct
+        {
+            __IOM uint32_t REGSEQC1 : 5;
+            __IOM uint32_t REGSEQC2 : 5;
+            __IOM uint32_t REGSEQC3 : 5;
+            __IOM uint32_t REGSEQC4 : 5;
+            __IOM uint32_t REGSEQC5 : 5;
+            __IOM uint32_t REGSEQC6 : 5;
+            __IM uint32_t RESERVED : 2;
+        } REGSEQ3_B;
+    };
+
+    /** Injected sequence register (INJSEQ) — 4 injected slots and injected sequence length */
+    union
+    {
+        __IOM uint32_t INJSEQ;
+
+        struct
+        {
+            __IOM uint32_t INJSEQC1 : 5;
+            __IOM uint32_t INJSEQC2 : 5;
+            __IOM uint32_t INJSEQC3 : 5;
+            __IOM uint32_t INJSEQC4 : 5;
+            __IOM uint32_t INJSEQLEN : 2;
+            __IM uint32_t RESERVED : 10;
+        } INJSEQ_B;
+    };
+
+    /** Injected data register 1 (INJDATA1) — 16-bit conversion result, read-only */
+    union
+    {
+        __IM uint32_t INJDATA1;
+
+        struct
+        {
+            __IM uint32_t INJDATA : 16;
+            __IM uint32_t RESERVED : 16;
+        } INJDATA1_B;
+    };
+
+    /** Injected data register 2 (INJDATA2) */
+    union
+    {
+        __IM uint32_t INJDATA2;
+
+        struct
+        {
+            __IM uint32_t INJDATA : 16;
+            __IM uint32_t RESERVED : 16;
+        } INJDATA2_B;
+    };
+
+    /** Injected data register 3 (INJDATA3) */
+    union
+    {
+        __IM uint32_t INJDATA3;
+
+        struct
+        {
+            __IM uint32_t INJDATA : 16;
+            __IM uint32_t RESERVED : 16;
+        } INJDATA3_B;
+    };
+
+    /** Injected data register 4 (INJDATA4) */
+    union
+    {
+        __IM uint32_t INJDATA4;
+
+        struct
+        {
+            __IM uint32_t INJDATA : 16;
+            __IM uint32_t RESERVED : 16;
+        } INJDATA4_B;
+    };
+
+    /** Regular data register (REGDATA) — 16-bit regular conversion result */
+    union
+    {
+        __IOM uint32_t REGDATA;
+
+        struct
+        {
+            __IM uint32_t REGDATA : 16;
+            __IM uint32_t RESERVED : 16;
+        } REGDATA_B;
+    };
+} ADC_T;
+
+/**
+ * @brief ADC common registers (shared by ADC1..ADC3)
+ */
+typedef struct
+{
+    /** Common status register (CSTS) — read-only mirror of each ADC's STS flags */
+    union
+    {
+        __IOM uint32_t CSTS;
+
+        struct
+        {
+            __IM uint32_t AWDFLG1 : 1;      /* ADC1 flags, bits [5:0] */
+            __IM uint32_t EOCFLG1 : 1;
+            __IM uint32_t INJEOCFLG1 : 1;
+            __IM uint32_t INJCSFLG1 : 1;
+            __IM uint32_t REGCSFLG1 : 1;
+            __IM uint32_t OVRFLG1 : 1;
+            __IM uint32_t RESERVED1 : 2;
+            __IM uint32_t AWDFLG2 : 1;      /* ADC2 flags, bits [13:8] */
+            __IM uint32_t EOCFLG2 : 1;
+            __IM uint32_t INJEOCFLG2 : 1;
+            __IM uint32_t INJCSFLG2 : 1;
+            __IM uint32_t REGCSFLG2 : 1;
+            __IM uint32_t OVRFLG2 : 1;
+            __IM uint32_t RESERVED2 : 2;
+            __IM uint32_t AWDFLG3 : 1;      /* ADC3 flags, bits [21:16] */
+            __IM uint32_t EOCFLG3 : 1;
+            __IM uint32_t INJEOCFLG3 : 1;
+            __IM uint32_t INJCSFLG3 : 1;
+            __IM uint32_t REGCSFLG3 : 1;
+            __IM uint32_t OVRFLG3 : 1;
+            /* Fix: was ": 2", which left the bit-field view at only 24 bits.
+             * Bits [31:22] are reserved, so pad to the full 32-bit word,
+             * consistent with every other register view in this file. */
+            __IM uint32_t RESERVED3 : 10;
+        } CSTS_B;
+    } ;
+
+    /** Common control register (CCTRL) — multi-ADC mode, sampling delay, DMA mode, prescaler, VBAT/TS enables */
+    union
+    {
+        __IOM uint32_t CCTRL;
+
+        struct
+        {
+            __IOM uint32_t ADCMSEL : 5;     /* multi-ADC mode select */
+            __IM uint32_t RESERVED1 : 3;
+            __IOM uint32_t SMPDEL2 : 4;     /* delay between two sampling phases */
+            __IM uint32_t RESERVED2 : 1;
+            __IOM uint32_t DMADISSEL : 1;
+            __IOM uint32_t DMAMODE : 2;
+            __IOM uint32_t ADCPRE : 2;      /* ADC clock prescaler */
+            __IM uint32_t RESERVED3 : 4;
+            __IOM uint32_t VBATEN : 1;      /* VBAT channel enable */
+            __IOM uint32_t TSVREFEN : 1;    /* temperature sensor / VREFINT enable */
+            __IM uint32_t RESERVED4 : 8;
+        } CCTRL_B;
+    } ;
+
+    /** Common regular data register (CDATA) for dual and triple modes — packs two 16-bit results */
+    union
+    {
+        __IOM uint32_t CDATA;
+
+        struct
+        {
+            __IM uint32_t DATA1 : 16;
+            __IM uint32_t DATA2 : 16;
+        } CDATA_B;
+    } ;
+} ADC_Common_T;
+
+/**
+ * @brief External Interrupt/Event Controller (EINT)
+ */
+typedef struct
+{
+    /** Interrupt mask register (IMASK) — one enable bit per EINT line 0..22 */
+    union
+    {
+        __IOM uint32_t IMASK;
+
+        struct
+        {
+            __IOM uint32_t IMASK0 : 1;
+            __IOM uint32_t IMASK1 : 1;
+            __IOM uint32_t IMASK2 : 1;
+            __IOM uint32_t IMASK3 : 1;
+            __IOM uint32_t IMASK4 : 1;
+            __IOM uint32_t IMASK5 : 1;
+            __IOM uint32_t IMASK6 : 1;
+            __IOM uint32_t IMASK7 : 1;
+            __IOM uint32_t IMASK8 : 1;
+            __IOM uint32_t IMASK9 : 1;
+            __IOM uint32_t IMASK10 : 1;
+            __IOM uint32_t IMASK11 : 1;
+            __IOM uint32_t IMASK12 : 1;
+            __IOM uint32_t IMASK13 : 1;
+            __IOM uint32_t IMASK14 : 1;
+            __IOM uint32_t IMASK15 : 1;
+            __IOM uint32_t IMASK16 : 1;
+            __IOM uint32_t IMASK17 : 1;
+            __IOM uint32_t IMASK18 : 1;
+            __IOM uint32_t IMASK19 : 1;
+            __IOM uint32_t IMASK20 : 1;
+            __IOM uint32_t IMASK21 : 1;
+            __IOM uint32_t IMASK22 : 1;
+            __IM uint32_t RESERVED : 9;
+        } IMASK_B;
+    };
+
+    /** Event mask register (EMASK) — one event-enable bit per EINT line 0..22 */
+    union
+    {
+        __IOM uint32_t EMASK;
+
+        struct
+        {
+            __IOM uint32_t EMASK0 : 1;
+            __IOM uint32_t EMASK1 : 1;
+            __IOM uint32_t EMASK2 : 1;
+            __IOM uint32_t EMASK3 : 1;
+            __IOM uint32_t EMASK4 : 1;
+            __IOM uint32_t EMASK5 : 1;
+            __IOM uint32_t EMASK6 : 1;
+            __IOM uint32_t EMASK7 : 1;
+            __IOM uint32_t EMASK8 : 1;
+            __IOM uint32_t EMASK9 : 1;
+            __IOM uint32_t EMASK10 : 1;
+            __IOM uint32_t EMASK11 : 1;
+            __IOM uint32_t EMASK12 : 1;
+            __IOM uint32_t EMASK13 : 1;
+            __IOM uint32_t EMASK14 : 1;
+            __IOM uint32_t EMASK15 : 1;
+            __IOM uint32_t EMASK16 : 1;
+            __IOM uint32_t EMASK17 : 1;
+            __IOM uint32_t EMASK18 : 1;
+            __IOM uint32_t EMASK19 : 1;
+            __IOM uint32_t EMASK20 : 1;
+            __IOM uint32_t EMASK21 : 1;
+            __IOM uint32_t EMASK22 : 1;
+            __IM uint32_t RESERVED : 9;
+        } EMASK_B;
+    };
+
+    /** Rising-edge trigger enable register (RTEN), lines 0..22 */
+    union
+    {
+        __IOM uint32_t RTEN;
+
+        struct
+        {
+            __IOM uint32_t RTEN0 : 1;
+            __IOM uint32_t RTEN1 : 1;
+            __IOM uint32_t RTEN2 : 1;
+            __IOM uint32_t RTEN3 : 1;
+            __IOM uint32_t RTEN4 : 1;
+            __IOM uint32_t RTEN5 : 1;
+            __IOM uint32_t RTEN6 : 1;
+            __IOM uint32_t RTEN7 : 1;
+            __IOM uint32_t RTEN8 : 1;
+            __IOM uint32_t RTEN9 : 1;
+            __IOM uint32_t RTEN10 : 1;
+            __IOM uint32_t RTEN11 : 1;
+            __IOM uint32_t RTEN12 : 1;
+            __IOM uint32_t RTEN13 : 1;
+            __IOM uint32_t RTEN14 : 1;
+            __IOM uint32_t RTEN15 : 1;
+            __IOM uint32_t RTEN16 : 1;
+            __IOM uint32_t RTEN17 : 1;
+            __IOM uint32_t RTEN18 : 1;
+            __IOM uint32_t RTEN19 : 1;
+            __IOM uint32_t RTEN20 : 1;
+            __IOM uint32_t RTEN21 : 1;
+            __IOM uint32_t RTEN22 : 1;
+            __IM uint32_t RESERVED : 9;
+        } RTEN_B;
+    };
+
+    /** Falling-edge trigger enable register (FTEN), lines 0..22 */
+    union
+    {
+        __IOM uint32_t FTEN;
+
+        struct
+        {
+            __IOM uint32_t FTEN0 : 1;
+            __IOM uint32_t FTEN1 : 1;
+            __IOM uint32_t FTEN2 : 1;
+            __IOM uint32_t FTEN3 : 1;
+            __IOM uint32_t FTEN4 : 1;
+            __IOM uint32_t FTEN5 : 1;
+            __IOM uint32_t FTEN6 : 1;
+            __IOM uint32_t FTEN7 : 1;
+            __IOM uint32_t FTEN8 : 1;
+            __IOM uint32_t FTEN9 : 1;
+            __IOM uint32_t FTEN10 : 1;
+            __IOM uint32_t FTEN11 : 1;
+            __IOM uint32_t FTEN12 : 1;
+            __IOM uint32_t FTEN13 : 1;
+            __IOM uint32_t FTEN14 : 1;
+            __IOM uint32_t FTEN15 : 1;
+            __IOM uint32_t FTEN16 : 1;
+            __IOM uint32_t FTEN17 : 1;
+            __IOM uint32_t FTEN18 : 1;
+            __IOM uint32_t FTEN19 : 1;
+            __IOM uint32_t FTEN20 : 1;
+            __IOM uint32_t FTEN21 : 1;
+            __IOM uint32_t FTEN22 : 1;
+            __IM uint32_t RESERVED : 9;
+        } FTEN_B;
+    };
+
+    /** Software interrupt event register (SWINTE), lines 0..22 */
+    union
+    {
+        __IOM uint32_t SWINTE;
+
+        struct
+        {
+            __IOM uint32_t SWINTE0 : 1;
+            __IOM uint32_t SWINTE1 : 1;
+            __IOM uint32_t SWINTE2 : 1;
+            __IOM uint32_t SWINTE3 : 1;
+            __IOM uint32_t SWINTE4 : 1;
+            __IOM uint32_t SWINTE5 : 1;
+            __IOM uint32_t SWINTE6 : 1;
+            __IOM uint32_t SWINTE7 : 1;
+            __IOM uint32_t SWINTE8 : 1;
+            __IOM uint32_t SWINTE9 : 1;
+            __IOM uint32_t SWINTE10 : 1;
+            __IOM uint32_t SWINTE11 : 1;
+            __IOM uint32_t SWINTE12 : 1;
+            __IOM uint32_t SWINTE13 : 1;
+            __IOM uint32_t SWINTE14 : 1;
+            __IOM uint32_t SWINTE15 : 1;
+            __IOM uint32_t SWINTE16 : 1;
+            __IOM uint32_t SWINTE17 : 1;
+            __IOM uint32_t SWINTE18 : 1;
+            __IOM uint32_t SWINTE19 : 1;
+            __IOM uint32_t SWINTE20 : 1;
+            __IOM uint32_t SWINTE21 : 1;
+            __IOM uint32_t SWINTE22 : 1;
+            __IM uint32_t RESERVED : 9;
+        } SWINTE_B;
+    };
+
+    /** Interrupt pending register (IPEND), lines 0..22 */
+    union
+    {
+        __IOM uint32_t IPEND;
+
+        struct
+        {
+            __IOM uint32_t IPEND0 : 1;
+            __IOM uint32_t IPEND1 : 1;
+            __IOM uint32_t IPEND2 : 1;
+            __IOM uint32_t IPEND3 : 1;
+            __IOM uint32_t IPEND4 : 1;
+            __IOM uint32_t IPEND5 : 1;
+            __IOM uint32_t IPEND6 : 1;
+            __IOM uint32_t IPEND7 : 1;
+            __IOM uint32_t IPEND8 : 1;
+            __IOM uint32_t IPEND9 : 1;
+            __IOM uint32_t IPEND10 : 1;
+            __IOM uint32_t IPEND11 : 1;
+            __IOM uint32_t IPEND12 : 1;
+            __IOM uint32_t IPEND13 : 1;
+            __IOM uint32_t IPEND14 : 1;
+            __IOM uint32_t IPEND15 : 1;
+            __IOM uint32_t IPEND16 : 1;
+            __IOM uint32_t IPEND17 : 1;
+            __IOM uint32_t IPEND18 : 1;
+            __IOM uint32_t IPEND19 : 1;
+            __IOM uint32_t IPEND20 : 1;
+            __IOM uint32_t IPEND21 : 1;
+            __IOM uint32_t IPEND22 : 1;
+            __IM uint32_t RESERVED : 9;
+        } IPEND_B;
+    };
+} EINT_T;
+
+/**
+ * @brief Independent WATCHDOG (IWDT)
+ */
+typedef struct
+{
+    /** Key register (KEY) — write-only 16-bit key field */
+    union
+    {
+        __IOM uint32_t KEY;
+
+        struct
+        {
+            __OM uint32_t KEY : 16;         /* write-only key value */
+            __IM uint32_t RESERVED : 16;
+        } KEY_B;
+    };
+
+    /** Prescaler register (PSC) — 3-bit prescaler select */
+    union
+    {
+        __IOM uint32_t PSC;
+
+        struct
+        {
+            __IOM uint32_t PSC : 3;
+            __IM uint32_t RESERVED : 29;
+        } PSC_B;
+    };
+
+    /** Counter reload register (CNTRLD) — 12-bit reload value */
+    union
+    {
+        __IOM uint32_t CNTRLD;
+
+        struct
+        {
+            __IOM uint32_t CNTRLD : 12;
+            __IM uint32_t RESERVED : 20;
+        } CNTRLD_B;
+    };
+
+    /** Status register (STS) — prescaler/counter update flags, read-only */
+    union
+    {
+        __IM uint32_t STS;
+
+        struct
+        {
+            __IM uint32_t PSCUFLG : 1;      /* prescaler update in progress */
+            __IM uint32_t CNTUFLG : 1;      /* counter reload update in progress */
+            __IM uint32_t RESERVED : 30;
+        } STS_B;
+    };
+} IWDT_T;
+
+/**
+ * @brief Window WATCHDOG (WWDT)
+ */
+typedef struct
+{
+    /** Control register (CTRL) — 7-bit down-counter and watchdog enable */
+    union
+    {
+        __IOM uint32_t CTRL;
+
+        struct
+        {
+            __IOM uint32_t CNT : 7;         /* 7-bit counter value */
+            __IOM uint32_t WWDTEN : 1;      /* window watchdog enable */
+            __IM uint32_t RESERVED : 24;
+        } CTRL_B;
+    };
+
+    /** Configuration register (CFG) — window value, time-base prescaler, early-wakeup interrupt */
+    union
+    {
+        __IOM uint32_t CFG;
+
+        struct
+        {
+            __IOM uint32_t WIN : 7;         /* 7-bit window value */
+            __IOM uint32_t TBPSC : 2;       /* time-base prescaler */
+            __IOM uint32_t EWIEN : 1;       /* early wakeup interrupt enable */
+            __IM uint32_t RESERVED : 22;
+        } CFG_B;
+    };
+
+    /** Status register (STS) — early wakeup interrupt flag */
+    union
+    {
+        __IOM uint32_t STS;
+
+        struct
+        {
+            __IOM uint32_t EWIFLG : 1;
+            __IM uint32_t RESERVED : 31;
+        } STS_B;
+    };
+} WWDT_T;
+
+/**
+ * @brief Serial Peripheral Interface (SPI)
+ */
+typedef struct
+{
+    /** Control register 1 (CTRL1) — clock phase/polarity, master select, baud rate, data frame and CRC control */
+    union
+    {
+        __IOM uint32_t CTRL1;
+
+        struct
+        {
+            __IOM uint32_t CPHA : 1;        /* clock phase */
+            __IOM uint32_t CPOL : 1;        /* clock polarity */
+            __IOM uint32_t MSMCFG : 1;      /* master/slave mode config */
+            __IOM uint32_t BRSEL : 3;       /* baud rate select */
+            __IOM uint32_t SPIEN : 1;       /* SPI enable */
+            __IOM uint32_t LSBSEL : 1;      /* LSB-first select */
+            __IOM uint32_t ISSEL : 1;
+            __IOM uint32_t SSEN : 1;
+            __IOM uint32_t RXOMEN : 1;
+            __IOM uint32_t DFLSEL : 1;      /* data frame length select */
+            __IOM uint32_t CRCNXT : 1;
+            __IOM uint32_t CRCEN : 1;
+            __IOM uint32_t BMOEN : 1;
+            __IOM uint32_t BMEN : 1;
+            __IM uint32_t RESERVED : 16;
+        } CTRL1_B;
+    };
+
+    /** Control register 2 (CTRL2) — DMA requests, SS output and interrupt enables */
+    union
+    {
+        __IOM uint32_t CTRL2;
+
+        struct
+        {
+            __IOM uint32_t RXDEN : 1;       /* RX DMA enable */
+            __IOM uint32_t TXDEN : 1;       /* TX DMA enable */
+            __IOM uint32_t SSOEN : 1;       /* SS output enable */
+            __IM uint32_t RESERVED1 : 1;
+            __IOM uint32_t FRFCFG : 1;
+            __IOM uint32_t ERRIEN : 1;
+            __IOM uint32_t RXBNEIEN : 1;
+            __IOM uint32_t TXBEIEN : 1;
+            __IM uint32_t RESERVED2 : 24;
+        } CTRL2_B;
+    };
+
+    /** Status register (STS) — buffer, error and busy flags */
+    union
+    {
+        __IOM uint32_t STS;
+
+        struct
+        {
+            __IM uint32_t RXBNEFLG : 1;     /* receive buffer not empty */
+            __IM uint32_t TXBEFLG : 1;      /* transmit buffer empty */
+            __IM uint32_t SCHDIR : 1;
+            __IM uint32_t UDRFLG : 1;       /* underrun */
+            __IOM uint32_t CRCEFLG : 1;     /* CRC error (write-to-clear) */
+            __IM uint32_t MEFLG : 1;        /* mode fault */
+            __IM uint32_t OVRFLG : 1;       /* overrun */
+            __IM uint32_t BSYFLG : 1;       /* busy */
+            __IM uint32_t FFERR : 1;        /* frame format error */
+            __IM uint32_t RESERVED : 23;
+        } STS_B;
+    };
+
+    /** Data register (DATA) — 16-bit transmit/receive data */
+    union
+    {
+        __IOM uint32_t DATA;
+
+        struct
+        {
+            __IOM uint32_t DATA : 16;
+            __IM uint32_t RESERVED : 16;
+        } DATA_B;
+    };
+
+    /** CRC polynomial register (CRCPOLY) */
+    union
+    {
+        __IOM uint32_t CRCPOLY;
+
+        struct
+        {
+            __IOM uint32_t CRCPOLY : 16;
+            __IM uint32_t RESERVED : 16;
+        } CRCPOLY_B;
+    };
+
+    /** Receive CRC register (RXCRC), read-only */
+    union
+    {
+        __IM uint32_t RXCRC;
+
+        struct
+        {
+            __IM uint32_t RXCRC : 16;
+            __IM uint32_t RESERVED : 16;
+        } RXCRC_B;
+    };
+
+    /** Transmit CRC register (TXCRC), read-only */
+    union
+    {
+        __IM uint32_t TXCRC;
+
+        struct
+        {
+            __IM uint32_t TXCRC : 16;
+            __IM uint32_t RESERVED : 16;
+        } TXCRC_B;
+    };
+
+    /** I2S configuration register (I2SCFG) — channel/data length, standard select, mode */
+    union
+    {
+        __IOM uint32_t I2SCFG;
+
+        struct
+        {
+            __IOM uint32_t CHLEN : 1;       /* channel length */
+            __IOM uint32_t DATLEN : 2;      /* data length */
+            __IOM uint32_t CPOL : 1;
+            __IOM uint32_t I2SSSEL : 2;     /* I2S standard select */
+            __IM uint32_t RESERVED1 : 1;
+            __IOM uint32_t PFSSEL : 1;
+            __IOM uint32_t I2SMOD : 2;      /* I2S configuration mode */
+            __IOM uint32_t I2SEN : 1;       /* I2S enable */
+            __IOM uint32_t MODESEL : 1;     /* SPI/I2S mode select */
+            __IM uint32_t RESERVED2 : 20;
+        } I2SCFG_B;
+    };
+
+    /** I2S prescaler register (I2SPSC) — linear prescaler, odd factor, MCLK output enable */
+    union
+    {
+        __IOM uint32_t I2SPSC;
+
+        struct
+        {
+            __IOM uint32_t I2SPSC : 8;
+            __IOM uint32_t ODDPSC : 1;
+            __IOM uint32_t MCOEN : 1;       /* master clock output enable */
+            __IM uint32_t RESERVED : 22;
+        } I2SPSC_B;
+    };
+} SPI_T;
+
+/**
+ * @brief Secure digital input/output interface (SDIO)
+ */
+typedef struct
+{
+    /** Power control register (PWRCTRL) */
+    union
+    {
+        __IOM uint32_t PWRCTRL;
+
+        struct
+        {
+            __IOM uint32_t PWRCTRL : 2;     /* power state control */
+            __IM uint32_t RESERVED : 30;
+        } PWRCTRL_B;
+    };
+
+    /** Clock control register (CLKCTRL) — divider, enable, power saving, bus width */
+    union
+    {
+        __IOM uint32_t CLKCTRL;
+
+        struct
+        {
+            __IOM uint32_t CLKDIV : 8;      /* clock divide factor */
+            __IOM uint32_t CLKEN : 1;       /* clock enable */
+            __IOM uint32_t PWRSAV : 1;      /* power saving mode */
+            __IOM uint32_t BYPASSEN : 1;    /* clock divider bypass */
+            __IOM uint32_t WBSEL : 2;       /* wide bus select */
+            __IOM uint32_t DEPSEL : 1;
+            __IOM uint32_t HFCEN : 1;       /* hardware flow control enable */
+            __IM uint32_t RESERVED : 17;
+        } CLKCTRL_B;
+    };
+
+    /** Argument register (ARG) — 32-bit command argument */
+    union
+    {
+        __IOM uint32_t ARG;
+
+        struct
+        {
+            __IOM uint32_t CMDARG : 32;
+        } ARG_B;
+    };
+
+    /** Command register (CMD) — command index, response type and CPSM control */
+    union
+    {
+        __IOM uint32_t CMD;
+
+        struct
+        {
+            __IOM uint32_t CMDINDEX : 6;    /* command index */
+            __IOM uint32_t WAITRES : 2;     /* wait-for-response type */
+            __IOM uint32_t WAITINT : 1;
+            __IOM uint32_t WENDDATA : 1;
+            __IOM uint32_t CPSMEN : 1;      /* command path state machine enable */
+            __IOM uint32_t SDIOSC : 1;
+            __IOM uint32_t CMDCPEN : 1;
+            __IOM uint32_t INTEN : 1;
+            __IOM uint32_t ATACMD : 1;
+            __IM uint32_t RESERVED : 17;
+        } CMD_B;
+    };
+
+    /** Command response register (CMDRES) — index of last responded command, read-only */
+    union
+    {
+        __IM uint32_t CMDRES;
+
+        struct
+        {
+            __IM uint32_t CMDRES : 6;
+            __IM uint32_t RESERVED : 26;
+        } CMDRES_B;
+    };
+
+    /** Response register 1 (RES1) — card status word, read-only */
+    union
+    {
+        __IM uint32_t RES1;
+
+        struct
+        {
+            __IM uint32_t CARDSTS1 : 32;
+        } RES1_B;
+    };
+
+    /** Response register 2 (RES2) */
+    union
+    {
+        __IM uint32_t RES2;
+
+        struct
+        {
+            __IM uint32_t CARDSTS2 : 32;
+        } RES2_B;
+    };
+
+    /** Response register 3 (RES3) */
+    union
+    {
+        __IM uint32_t RES3;
+
+        struct
+        {
+            __IM uint32_t CARDSTS3 : 32;
+        } RES3_B;
+    };
+
+    /** Response register 4 (RES4) */
+    union
+    {
+        __IM uint32_t RES4;
+
+        struct
+        {
+            __IM uint32_t CARDSTS4 : 32;
+        } RES4_B;
+    };
+
+    /** Data timer register (DATATIME) — data timeout period */
+    union
+    {
+        __IOM uint32_t DATATIME;
+
+        struct
+        {
+            __IOM uint32_t DATATIME : 32;
+        } DTMR_B;                           /* NOTE: tag name differs from register name (DTMR vs DATATIME) */
+    };
+
+    /** Data length register (DATALEN) — 25-bit transfer byte count */
+    union
+    {
+        __IOM uint32_t DATALEN;
+
+        struct
+        {
+            __IOM uint32_t DATALEN : 25;
+            __IM uint32_t RESERVED : 7;
+        } DLEN_B;
+    };
+
+    /** Data control register (DCTRL) — transfer enable, direction, mode, DMA, block size */
+    union
+    {
+        __IOM uint32_t DCTRL;
+
+        struct
+        {
+            __IOM uint32_t DTEN : 1;        /* data transfer enable */
+            __IOM uint32_t DTDRCFG : 1;     /* data transfer direction */
+            __IOM uint32_t DTSEL : 1;       /* block vs stream transfer */
+            __IOM uint32_t DMAEN : 1;
+            __IOM uint32_t DBSIZE : 4;      /* data block size (log2 bytes) */
+            __IOM uint32_t RWSTR : 1;
+            __IOM uint32_t RWSTOP : 1;
+            __IOM uint32_t RDWAIT : 1;
+            __IOM uint32_t SDIOF : 1;
+            __IM uint32_t RESERVED : 20;
+        } DCTRL_B;
+    };
+
+    /** Data count register (DCNT) — remaining bytes, read-only */
+    union
+    {
+        __IM uint32_t DCNT;
+
+        struct
+        {
+            __IM uint32_t DATACNT : 25;
+            __IM uint32_t RESERVED : 7;
+        } DCNT_B;
+    };
+
+    /** Status register (STS) — static command/data path flags, read-only */
+    union
+    {
+        __IM uint32_t STS;
+
+        struct
+        {
+            __IM uint32_t COMRESP : 1;      /* command response CRC failed */
+            __IM uint32_t DBDR : 1;         /* data block CRC failed */
+            __IM uint32_t CMDRESTO : 1;     /* command response timeout */
+            __IM uint32_t DATATO : 1;       /* data timeout */
+            __IM uint32_t TXUDRER : 1;      /* TX FIFO underrun */
+            __IM uint32_t RXOVRER : 1;      /* RX FIFO overrun */
+            __IM uint32_t CMDRES : 1;       /* command response received */
+            __IM uint32_t CMDSENT : 1;      /* command sent */
+            __IM uint32_t DATAEND : 1;
+            __IM uint32_t SBE : 1;          /* start bit error */
+            __IM uint32_t DBCP : 1;         /* data block sent/received */
+            __IM uint32_t CMDACT : 1;
+            __IM uint32_t TXACT : 1;
+            __IM uint32_t RXACT : 1;
+            __IM uint32_t TXFHF : 1;        /* TX FIFO half empty */
+            __IM uint32_t RXFHF : 1;        /* RX FIFO half full */
+            __IM uint32_t TXFF : 1;         /* TX FIFO full */
+            __IM uint32_t RXFF : 1;         /* RX FIFO full */
+            __IM uint32_t TXFE : 1;         /* TX FIFO empty */
+            __IM uint32_t RXFE : 1;         /* RX FIFO empty */
+            __IM uint32_t TXDA : 1;
+            __IM uint32_t RXDA : 1;
+            __IM uint32_t SDIOINT : 1;
+            __IM uint32_t ATAEND : 1;
+            __IM uint32_t RESERVED : 8;
+        } STS_B;
+    };
+
+    /** Interrupt clear register (ICF) — write 1 to clear the corresponding STS flag */
+    union
+    {
+        __IOM uint32_t ICF;
+
+        struct
+        {
+            __IOM uint32_t DBCE : 1;
+            __IOM uint32_t CRCE : 1;
+            __IOM uint32_t CRTO : 1;
+            __IOM uint32_t DTO : 1;
+            __IOM uint32_t TXFUE : 1;
+            __IOM uint32_t RXFOE : 1;
+            __IOM uint32_t CMDRES : 1;
+            __IOM uint32_t CMDSENT : 1;
+            __IOM uint32_t DATAEND : 1;
+            __IOM uint32_t SBE : 1;
+            __IOM uint32_t DBCP : 1;
+            __IM uint32_t RESERVED1 : 11;
+            __IOM uint32_t SDIOIT : 1;
+            __IOM uint32_t ATAEND : 1;
+            __IM uint32_t RESERVED2 : 8;
+        } ICF_B;
+    };
+
+    /** Interrupt mask register (MASK) — interrupt enable per STS flag */
+    union
+    {
+        __IOM uint32_t MASK;
+
+        struct
+        {
+            __IOM uint32_t CCRCFAIL : 1;
+            __IOM uint32_t DCRCFAIL : 1;
+            __IOM uint32_t CMDTO : 1;
+            __IOM uint32_t DATATO : 1;
+            __IOM uint32_t TXURER : 1;
+            __IOM uint32_t RXORER : 1;
+            __IOM uint32_t CMDRESRC : 1;
+            __IOM uint32_t CMDSENT : 1;
+            __IOM uint32_t DATAEND : 1;
+            __IOM uint32_t STRTER : 1;
+            __IOM uint32_t DBEND : 1;
+            __IOM uint32_t CMDACT : 1;
+            __IOM uint32_t TXACT : 1;
+            __IOM uint32_t RXACT : 1;
+            __IOM uint32_t TXHFERT : 1;
+            __IOM uint32_t RXHFFUL : 1;
+            __IOM uint32_t TXFUL : 1;
+            __IOM uint32_t RXFUL : 1;
+            __IOM uint32_t TXEPT : 1;
+            __IOM uint32_t RXFEIE : 1;
+            __IOM uint32_t TXDAVB : 1;
+            __IOM uint32_t RXDAVB : 1;
+            __IOM uint32_t SDIOINTREC : 1;
+            __IOM uint32_t ATACLPREC : 1;
+            __IM uint32_t RESERVED : 8;
+        } MASK_B;
+    };
+
+    __IM uint32_t RESERVED[2];              /* address gap in the register map */
+
+    /** FIFO counter register (FIFOCNT) — words remaining, read-only */
+    union
+    {
+        __IM uint32_t FIFOCNT;
+
+        struct
+        {
+            __IM uint32_t FIFOCNT : 24;
+            __IM uint32_t RESERVED : 8;
+        } FIFOCNT_B;
+    };
+
+    __IM uint32_t RESERVED1[13];            /* address gap before the FIFO window */
+
+    /** Data FIFO register (FIFODATA) — 32-bit FIFO access window */
+    union
+    {
+        __IOM uint32_t FIFODATA;
+
+        struct
+        {
+            __IOM uint32_t FIFODATA : 32;
+        } FIFODATA_B;
+    };
+} SDIO_T;
+
+/**
+ * @brief Digital to Analog Converter (DAC)
+ */
+typedef struct
+{
+    /** Control register (CTRL) — symmetric fields for channel 1 (bits 15:0) and channel 2 (bits 31:16) */
+    union
+    {
+        __IOM uint32_t CTRL;
+
+        struct
+        {
+            __IOM uint32_t ENCH1 : 1;       /* channel 1 enable */
+            __IOM uint32_t BUFFDCH1 : 1;    /* channel 1 output buffer disable */
+            __IOM uint32_t TRGENCH1 : 1;    /* channel 1 trigger enable */
+            __IOM uint32_t TRGSELCH1 : 3;   /* channel 1 trigger select */
+            __IOM uint32_t WAVENCH1 : 2;    /* channel 1 wave generation enable */
+            __IOM uint32_t MAMPSELCH1 : 4;  /* channel 1 mask/amplitude select */
+            __IOM uint32_t DMAENCH1 : 1;
+            __IOM uint32_t DMAUDIEN1 : 1;   /* channel 1 DMA underrun interrupt enable */
+            __IM uint32_t RESERVED1 : 2;
+            __IOM uint32_t ENCH2 : 1;       /* channel 2 enable */
+            __IOM uint32_t BUFFDCH2 : 1;
+            __IOM uint32_t TRGENCH2 : 1;
+            __IOM uint32_t TRGSELCH2 : 3;
+            __IOM uint32_t WAVENCH2 : 2;
+            __IOM uint32_t MAMPSELCH2 : 4;
+            __IOM uint32_t DMAENCH2 : 1;
+            __IOM uint32_t DMAUDIEN2 : 1;
+            __IM uint32_t RESERVED2 : 2;
+        } CTRL_B;
+    };
+
+    /** Software trigger register (SWTRG) — write-only trigger bits */
+    union
+    {
+        __OM uint32_t SWTRG;
+
+        struct
+        {
+            __OM uint32_t SWTRG1 : 1;       /* channel 1 software trigger */
+            __OM uint32_t SWTRG2 : 1;       /* channel 2 software trigger */
+            __IM uint32_t RESERVED : 30;
+        } SWTRG_B;
+    };
+
+    /** Channel 1 12-bit right-aligned data holding register (DH12R1) */
+    union
+    {
+        __IOM uint32_t DH12R1;
+
+        struct
+        {
+            __IOM uint32_t DATA : 12;
+            __IM uint32_t RESERVED : 20;
+        } DH12R1_B;
+    };
+
+    /** Channel 1 12-bit left-aligned data holding register (DH12L1) — data in bits 15:4 */
+    union
+    {
+        __IOM uint32_t DH12L1;
+
+        struct
+        {
+            __IM uint32_t RESERVED1 : 4;
+            __IOM uint32_t DATA : 12;
+            __IM uint32_t RESERVED2 : 16;
+        } DH12L1_B;
+    };
+
+    /** Channel 1 8-bit right-aligned data holding register (DH8R1) */
+    union
+    {
+        __IOM uint32_t DH8R1;
+
+        struct
+        {
+            __IOM uint32_t DATA : 8;
+            __IM uint32_t RESERVED : 24;
+        } DH8R1_B;
+    };
+
+    /** Channel 2 12-bit right-aligned data holding register (DH12R2) */
+    union
+    {
+        __IOM uint32_t DH12R2;
+
+        struct
+        {
+            __IOM uint32_t DATA : 12;
+            __IM uint32_t RESERVED : 20;
+        } DH12R2_B;
+    };
+
+    /** Channel 2 12-bit left-aligned data holding register (DH12L2) */
+    union
+    {
+        __IOM uint32_t DH12L2;
+
+        struct
+        {
+            __IM uint32_t RESERVED1 : 4;
+            __IOM uint32_t DATA : 12;
+            __IM uint32_t RESERVED2 : 16;
+        } DH12L2_B;
+    };
+
+    /** Channel 2 8-bit right-aligned data holding register (DH8R2) */
+    union
+    {
+        __IOM uint32_t DH8R2;
+
+        struct
+        {
+            __IOM uint32_t DATA : 8;
+            __IM uint32_t RESERVED : 24;
+        } DH8R2_B;
+    };
+
+    /** Dual-channel 12-bit right-aligned data holding register (DH12RDUAL) */
+    union
+    {
+        __IOM uint32_t DH12RDUAL;
+
+        struct
+        {
+            __IOM uint32_t DATACH1 : 12;
+            __IM uint32_t RESERVED1 : 4;
+            __IOM uint32_t DATACH2 : 12;
+            __IM uint32_t RESERVED2 : 4;
+        } DH12RDUAL_B;
+    };
+
+    /** Dual-channel 12-bit left-aligned data holding register (DH12LDUAL) */
+    union
+    {
+        __IOM uint32_t DH12LDUAL;
+
+        struct
+        {
+            __IM uint32_t RESERVED1 : 4;
+            __IOM uint32_t DATACH1 : 12;
+            __IM uint32_t RESERVED2 : 4;
+            __IOM uint32_t DATACH2 : 12;
+        } DH12LDUAL_B;
+    };
+
+    /** Dual-channel 8-bit right-aligned data holding register (DH8RDUAL) */
+    union
+    {
+        __IOM uint32_t DH8RDUAL;
+
+        struct
+        {
+            __IOM uint32_t DATACH1 : 8;
+            __IOM uint32_t DATACH2 : 8;
+            __IM uint32_t RESERVED : 16;
+        } DH8RDUAL_B;
+    };
+
+    /** Channel 1 data output register (DATAOCH1) — currently output value, read-only */
+    union
+    {
+        __IM uint32_t DATAOCH1;
+
+        struct
+        {
+            __IM uint32_t DATA : 12;
+            __IM uint32_t RESERVED : 20;
+        } DATAOCH1_B;
+    };
+
+    /** Channel 2 data output register (DATAOCH2), read-only */
+    union
+    {
+        __IM uint32_t DATAOCH2;
+
+        struct
+        {
+            __IM uint32_t DATA : 12;
+            __IM uint32_t RESERVED : 20;
+        } DATAOCH2_B;
+    };
+
+    /** Status register (STS) — DMA underrun flags at bits 13 (ch1) and 29 (ch2) */
+    union
+    {
+        __IOM uint32_t STS;
+
+        struct
+        {
+            __IM uint32_t RESERVED1 : 13;
+            __IOM uint32_t DMAUDFLG1 : 1;   /* channel 1 DMA underrun */
+            __IM uint32_t RESERVED2 : 15;
+            __IOM uint32_t DMAUDFLG2 : 1;   /* channel 2 DMA underrun */
+            __IM uint32_t RESERVED3 : 2;
+        } STS_B;
+    };
+} DAC_T;
+
+/**
+ * @brief Static Memory Controller (SMC) Bank1
+ */
+typedef struct
+{
+ /* NOTE(review): four identical CSCTRLx/CSTIMx register pairs follow,
+ * one per SRAM/NOR-Flash chip select (x = 1..4). Every bit-field
+ * struct below sums to exactly 32 bits, so each union overlays a
+ * single 32-bit register word. Field semantics are inferred from the
+ * names only -- confirm against the device reference manual. */
+
+ /** SRAM/NOR-Flash chip-select control register 1 */
+ union
+ {
+ __IOM uint32_t CSCTRL1;
+
+ struct
+ {
+ __IOM uint32_t MBKEN : 1;
+ __IOM uint32_t ADMUXEN : 1;
+ __IOM uint32_t MTYPECFG : 2;
+ __IOM uint32_t MDBWIDCFG : 2;
+ __IOM uint32_t NORFMACCEN : 1;
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t BURSTEN : 1;
+ __IOM uint32_t WSPOLCFG : 1;
+ __IOM uint32_t WRAPBEN : 1;
+ __IOM uint32_t WTIMCFG : 1;
+ __IOM uint32_t WREN : 1;
+ __IOM uint32_t WAITEN : 1;
+ __IOM uint32_t EXTMODEEN : 1;
+ __IOM uint32_t WSASYNCEN : 1;
+ __IOM uint32_t CRAMPSIZECFG : 3;
+ __IOM uint32_t WRBURSTEN : 1;
+ __IM uint32_t RESERVED2 : 12; /* pads the word to 32 bits */
+ } CSCTRL1_B;
+ };
+
+ /** SRAM/NOR-Flash chip-select timing register 1 */
+ union
+ {
+ __IOM uint32_t CSTIM1;
+
+ struct
+ {
+ __IOM uint32_t ADDRSETCFG : 4;
+ __IOM uint32_t ADDRHLDCFG : 4;
+ __IOM uint32_t DATASETCFG : 8;
+ __IOM uint32_t BUSTURNCFG : 4;
+ __IOM uint32_t CLKDIVCFG : 4;
+ __IOM uint32_t DATALATCFG : 4;
+ __IOM uint32_t ASYNCACCCFG : 2;
+ __IM uint32_t RESERVED : 2; /* pads the word to 32 bits */
+ } CSTIM1_B;
+ };
+
+ /* Chip selects 2..4 repeat the exact layout of CSCTRL1/CSTIM1. */
+
+ /** SRAM/NOR-Flash chip-select control register 2 */
+ union
+ {
+ __IOM uint32_t CSCTRL2;
+
+ struct
+ {
+ __IOM uint32_t MBKEN : 1;
+ __IOM uint32_t ADMUXEN : 1;
+ __IOM uint32_t MTYPECFG : 2;
+ __IOM uint32_t MDBWIDCFG : 2;
+ __IOM uint32_t NORFMACCEN : 1;
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t BURSTEN : 1;
+ __IOM uint32_t WSPOLCFG : 1;
+ __IOM uint32_t WRAPBEN : 1;
+ __IOM uint32_t WTIMCFG : 1;
+ __IOM uint32_t WREN : 1;
+ __IOM uint32_t WAITEN : 1;
+ __IOM uint32_t EXTMODEEN : 1;
+ __IOM uint32_t WSASYNCEN : 1;
+ __IOM uint32_t CRAMPSIZECFG : 3;
+ __IOM uint32_t WRBURSTEN : 1;
+ __IM uint32_t RESERVED2 : 12;
+ } CSCTRL2_B;
+ };
+
+ /** SRAM/NOR-Flash chip-select timing register 2 */
+ union
+ {
+ __IOM uint32_t CSTIM2;
+
+ struct
+ {
+ __IOM uint32_t ADDRSETCFG : 4;
+ __IOM uint32_t ADDRHLDCFG : 4;
+ __IOM uint32_t DATASETCFG : 8;
+ __IOM uint32_t BUSTURNCFG : 4;
+ __IOM uint32_t CLKDIVCFG : 4;
+ __IOM uint32_t DATALATCFG : 4;
+ __IOM uint32_t ASYNCACCCFG : 2;
+ __IM uint32_t RESERVED : 2;
+ } CSTIM2_B;
+ };
+
+ /** SRAM/NOR-Flash chip-select control register 3 */
+ union
+ {
+ __IOM uint32_t CSCTRL3;
+
+ struct
+ {
+ __IOM uint32_t MBKEN : 1;
+ __IOM uint32_t ADMUXEN : 1;
+ __IOM uint32_t MTYPECFG : 2;
+ __IOM uint32_t MDBWIDCFG : 2;
+ __IOM uint32_t NORFMACCEN : 1;
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t BURSTEN : 1;
+ __IOM uint32_t WSPOLCFG : 1;
+ __IOM uint32_t WRAPBEN : 1;
+ __IOM uint32_t WTIMCFG : 1;
+ __IOM uint32_t WREN : 1;
+ __IOM uint32_t WAITEN : 1;
+ __IOM uint32_t EXTMODEEN : 1;
+ __IOM uint32_t WSASYNCEN : 1;
+ __IOM uint32_t CRAMPSIZECFG : 3;
+ __IOM uint32_t WRBURSTEN : 1;
+ __IM uint32_t RESERVED2 : 12;
+ } CSCTRL3_B;
+ };
+
+ /** SRAM/NOR-Flash chip-select timing register 3 */
+ union
+ {
+ __IOM uint32_t CSTIM3;
+
+ struct
+ {
+ __IOM uint32_t ADDRSETCFG : 4;
+ __IOM uint32_t ADDRHLDCFG : 4;
+ __IOM uint32_t DATASETCFG : 8;
+ __IOM uint32_t BUSTURNCFG : 4;
+ __IOM uint32_t CLKDIVCFG : 4;
+ __IOM uint32_t DATALATCFG : 4;
+ __IOM uint32_t ASYNCACCCFG : 2;
+ __IM uint32_t RESERVED : 2;
+ } CSTIM3_B;
+ };
+
+ /** SRAM/NOR-Flash chip-select control register 4 */
+ union
+ {
+ __IOM uint32_t CSCTRL4;
+
+ struct
+ {
+ __IOM uint32_t MBKEN : 1;
+ __IOM uint32_t ADMUXEN : 1;
+ __IOM uint32_t MTYPECFG : 2;
+ __IOM uint32_t MDBWIDCFG : 2;
+ __IOM uint32_t NORFMACCEN : 1;
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t BURSTEN : 1;
+ __IOM uint32_t WSPOLCFG : 1;
+ __IOM uint32_t WRAPBEN : 1;
+ __IOM uint32_t WTIMCFG : 1;
+ __IOM uint32_t WREN : 1;
+ __IOM uint32_t WAITEN : 1;
+ __IOM uint32_t EXTMODEEN : 1;
+ __IOM uint32_t WSASYNCEN : 1;
+ __IOM uint32_t CRAMPSIZECFG : 3;
+ __IOM uint32_t WRBURSTEN : 1;
+ __IM uint32_t RESERVED2 : 12;
+ } CSCTRL4_B;
+ };
+
+ /** SRAM/NOR-Flash chip-select timing register 4 */
+ union
+ {
+ __IOM uint32_t CSTIM4;
+
+ struct
+ {
+ __IOM uint32_t ADDRSETCFG : 4;
+ __IOM uint32_t ADDRHLDCFG : 4;
+ __IOM uint32_t DATASETCFG : 8;
+ __IOM uint32_t BUSTURNCFG : 4;
+ __IOM uint32_t CLKDIVCFG : 4;
+ __IOM uint32_t DATALATCFG : 4;
+ __IOM uint32_t ASYNCACCCFG : 2;
+ __IM uint32_t RESERVED : 2;
+ } CSTIM4_B;
+ };
+} SMC_Bank1_T;
+
+/**
+ * @brief Static Memory Controller (SMC) Bank1E
+ */
+typedef struct
+{
+ /* NOTE(review): four identical write-timing registers (WRTTIMx,
+ * x = 1..4), each separated from the next by one reserved 32-bit
+ * gap word. Each bit-field struct sums to exactly 32 bits. */
+
+ /** SRAM/NOR-Flash write timing registers 1 */
+ union
+ {
+ __IOM uint32_t WRTTIM1;
+
+ struct
+ {
+ __IOM uint32_t ADDRSETCFG : 4;
+ __IOM uint32_t ADDRHLDCFG : 4;
+ __IOM uint32_t DATASETCFG : 8;
+ __IOM uint32_t BUSTURNCFG : 4;
+ __IOM uint32_t CLKDIVCFG : 4;
+ __IOM uint32_t DATALATCFG : 4;
+ __IOM uint32_t ASYNCACCCFG : 2;
+ __IM uint32_t RESERVED : 2; /* pads the word to 32 bits */
+ } WRTTIM1_B;
+ };
+
+ __IM uint32_t RESERVED;
+
+ /** SRAM/NOR-Flash write timing registers 2 */
+ union
+ {
+ __IOM uint32_t WRTTIM2;
+
+ struct
+ {
+ __IOM uint32_t ADDRSETCFG : 4;
+ __IOM uint32_t ADDRHLDCFG : 4;
+ __IOM uint32_t DATASETCFG : 8;
+ __IOM uint32_t BUSTURNCFG : 4;
+ __IOM uint32_t CLKDIVCFG : 4;
+ __IOM uint32_t DATALATCFG : 4;
+ __IOM uint32_t ASYNCACCCFG : 2;
+ __IM uint32_t RESERVED : 2;
+ } WRTTIM2_B;
+ };
+
+ __IM uint32_t RESERVED1;
+
+ /** SRAM/NOR-Flash write timing registers 3 */
+ union
+ {
+ __IOM uint32_t WRTTIM3;
+
+ struct
+ {
+ __IOM uint32_t ADDRSETCFG : 4;
+ __IOM uint32_t ADDRHLDCFG : 4;
+ __IOM uint32_t DATASETCFG : 8;
+ __IOM uint32_t BUSTURNCFG : 4;
+ __IOM uint32_t CLKDIVCFG : 4;
+ __IOM uint32_t DATALATCFG : 4;
+ __IOM uint32_t ASYNCACCCFG : 2;
+ __IM uint32_t RESERVED : 2;
+ } WRTTIM3_B;
+ };
+
+ /* Reserved gap word: was __IOM, made __IM (read-only) for consistency
+ * with the RESERVED and RESERVED1 gap words above -- software must not
+ * write reserved locations. */
+ __IM uint32_t RESERVED2;
+
+ /** SRAM/NOR-Flash write timing registers 4 */
+ union
+ {
+ __IOM uint32_t WRTTIM4;
+
+ struct
+ {
+ __IOM uint32_t ADDRSETCFG : 4;
+ __IOM uint32_t ADDRHLDCFG : 4;
+ __IOM uint32_t DATASETCFG : 8;
+ __IOM uint32_t BUSTURNCFG : 4;
+ __IOM uint32_t CLKDIVCFG : 4;
+ __IOM uint32_t DATALATCFG : 4;
+ __IOM uint32_t ASYNCACCCFG : 2;
+ __IM uint32_t RESERVED : 2;
+ } WRTTIM4_B;
+ };
+} SMC_Bank1E_T;
+
+/**
+ * @brief Static Memory Controller (SMC) Bank 2
+ */
+typedef struct
+{
+ /* NOTE(review): PC Card/NAND bank 2 register block. Every bit-field
+ * struct sums to exactly 32 bits so each union overlays one 32-bit
+ * register word. */
+
+ /** PC Card/NAND Flash control register 2 */
+ union
+ {
+ __IOM uint32_t CTRL2;
+
+ struct
+ {
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t WAITFEN : 1;
+ __IOM uint32_t MBKEN : 1;
+ __IOM uint32_t MTYPECFG : 1;
+ __IOM uint32_t DBWIDCFG : 2;
+ __IOM uint32_t ECCEN : 1;
+ __IM uint32_t RESERVED2 : 2;
+ __IOM uint32_t C2RDCFG : 4;
+ __IOM uint32_t A2RDCFG : 4;
+ __IOM uint32_t ECCPSCFG : 3;
+ __IM uint32_t RESERVED3 : 12; /* pads the word to 32 bits */
+ } CTRL2_B;
+ };
+
+ /** FIFO status and interrupt register 2 */
+ union
+ {
+ __IOM uint32_t STSINT2;
+
+ struct
+ {
+ __IOM uint32_t IREFLG : 1;
+ __IOM uint32_t IHLFLG : 1;
+ __IOM uint32_t IFEFLG : 1;
+ __IOM uint32_t IREDEN : 1;
+ __IOM uint32_t IHLDEN : 1;
+ __IOM uint32_t IFEDEN : 1;
+ __IM uint32_t FEFLG : 1;
+ /* Fixed: was 16, which left the fields summing to only 23 bits;
+ * 25 pads the register to a full 32-bit word like every other
+ * register in this file. */
+ __IM uint32_t RESERVED : 25;
+ } STSINT2_B;
+ };
+
+ /** Common memory space timing register 2 */
+ union
+ {
+ __IOM uint32_t CMSTIM2;
+
+ struct
+ {
+ __IOM uint32_t SET2 : 8;
+ __IOM uint32_t WAIT2 : 8;
+ __IOM uint32_t HLD2 : 8;
+ __IOM uint32_t HIZ2 : 8;
+ } CMSTIM2_B;
+ };
+
+ /** Attribute memory space timing register 2 */
+ union
+ {
+ __IOM uint32_t AMSTIM2;
+
+ struct
+ {
+ __IOM uint32_t SET2 : 8;
+ __IOM uint32_t WAIT2 : 8;
+ __IOM uint32_t HLD2 : 8;
+ __IOM uint32_t HIZ2 : 8;
+ } AMSTIM2_B;
+ };
+
+ __IM uint32_t RESERVED; /* gap word before the ECC result register */
+
+ /** ECC result register 2 */
+ union
+ {
+ __IM uint32_t ECCRS2;
+
+ struct
+ {
+ __IM uint32_t ECCRS2 : 32; /* full 32-bit ECC computation result */
+ } ECCRS2_B;
+ };
+} SMC_Bank2_T;
+
+/**
+ * @brief Flexible Static Memory Controller (SMC) Bank 3
+ */
+typedef struct
+{
+ /* NOTE(review): PC Card/NAND bank 3 register block; mirrors the
+ * SMC_Bank2_T layout. Every bit-field struct sums to 32 bits. */
+
+ /** PC Card/NAND Flash control register 3 */
+ union
+ {
+ __IOM uint32_t CTRL3;
+
+ struct
+ {
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t WAITFEN : 1;
+ __IOM uint32_t MBKEN : 1;
+ __IOM uint32_t MTYPECFG : 1;
+ __IOM uint32_t DBWIDCFG : 2;
+ __IOM uint32_t ECCEN : 1;
+ __IM uint32_t RESERVED2 : 2;
+ __IOM uint32_t C2RDCFG : 4;
+ __IOM uint32_t A2RDCFG : 4;
+ __IOM uint32_t ECCPSCFG : 3;
+ __IM uint32_t RESERVED3 : 12; /* pads the word to 32 bits */
+ } CTRL3_B;
+ };
+
+ /** FIFO status and interrupt register 3 */
+ union
+ {
+ __IOM uint32_t STSINT3;
+
+ struct
+ {
+ __IOM uint32_t IREFLG : 1;
+ __IOM uint32_t IHLFLG : 1;
+ __IOM uint32_t IFEFLG : 1;
+ __IOM uint32_t IREDEN : 1;
+ __IOM uint32_t IHLDEN : 1;
+ __IOM uint32_t IFEDEN : 1;
+ __IM uint32_t FEFLG : 1;
+ /* Fixed: was 16, which left the fields summing to only 23 bits;
+ * 25 pads the register to a full 32-bit word like every other
+ * register in this file. */
+ __IM uint32_t RESERVED : 25;
+ } STSINT3_B;
+ };
+
+ /** Common memory space timing register 3 */
+ union
+ {
+ __IOM uint32_t CMSTIM3;
+
+ struct
+ {
+ __IOM uint32_t SET3 : 8;
+ __IOM uint32_t WAIT3 : 8;
+ __IOM uint32_t HLD3 : 8;
+ __IOM uint32_t HIZ3 : 8;
+ } CMSTIM3_B;
+ };
+
+ /** Attribute memory space timing register 3 */
+ union
+ {
+ __IOM uint32_t AMSTIM3;
+
+ struct
+ {
+ __IOM uint32_t SET3 : 8;
+ __IOM uint32_t WAIT3 : 8;
+ __IOM uint32_t HLD3 : 8;
+ __IOM uint32_t HIZ3 : 8;
+ } AMSTIM3_B;
+ };
+
+ __IM uint32_t RESERVED; /* gap word before the ECC result register */
+
+ /** ECC result register 3 */
+ union
+ {
+ __IM uint32_t ECCRS3;
+
+ struct
+ {
+ __IM uint32_t ECCRS3 : 32; /* full 32-bit ECC computation result */
+ } ECCRS3_B;
+ };
+} SMC_Bank3_T;
+
+/**
+ * @brief Flexible Static Memory Controller (SMC) Bank 4
+ */
+typedef struct
+{
+ /* NOTE(review): PC Card/NAND bank 4 register block; mirrors banks 2/3
+ * and adds the I/O-space timing register. Every bit-field struct sums
+ * to 32 bits. */
+
+ /** PC Card/NAND Flash control register 4 */
+ union
+ {
+ __IOM uint32_t CTRL4;
+
+ struct
+ {
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t WAITFEN : 1;
+ __IOM uint32_t MBKEN : 1;
+ __IOM uint32_t MTYPECFG : 1;
+ __IOM uint32_t DBWIDCFG : 2;
+ __IOM uint32_t ECCEN : 1;
+ __IM uint32_t RESERVED2 : 2;
+ __IOM uint32_t C2RDCFG : 4;
+ __IOM uint32_t A2RDCFG : 4;
+ __IOM uint32_t ECCPSCFG : 3;
+ __IM uint32_t RESERVED3 : 12; /* pads the word to 32 bits */
+ } CTRL4_B;
+ };
+
+ /** FIFO status and interrupt register 4 */
+ union
+ {
+ __IOM uint32_t STSINT4;
+
+ struct
+ {
+ __IOM uint32_t IREFLG : 1;
+ __IOM uint32_t IHLFLG : 1;
+ __IOM uint32_t IFEFLG : 1;
+ __IOM uint32_t IREDEN : 1;
+ __IOM uint32_t IHLDEN : 1;
+ __IOM uint32_t IFEDEN : 1;
+ __IM uint32_t FEFLG : 1;
+ /* Fixed: was 16, which left the fields summing to only 23 bits;
+ * 25 pads the register to a full 32-bit word like every other
+ * register in this file. */
+ __IM uint32_t RESERVED : 25;
+ } STSINT4_B;
+ };
+
+ /** Common memory space timing register 4 */
+ union
+ {
+ __IOM uint32_t CMSTIM4;
+
+ struct
+ {
+ __IOM uint32_t SET4 : 8;
+ __IOM uint32_t WAIT4 : 8;
+ __IOM uint32_t HLD4 : 8;
+ __IOM uint32_t HIZ4 : 8;
+ } CMSTIM4_B;
+ };
+
+ /** Attribute memory space timing register 4 */
+ union
+ {
+ __IOM uint32_t AMSTIM4;
+
+ struct
+ {
+ __IOM uint32_t SET4 : 8;
+ __IOM uint32_t WAIT4 : 8;
+ __IOM uint32_t HLD4 : 8;
+ __IOM uint32_t HIZ4 : 8;
+ } AMSTIM4_B;
+ };
+
+ /** I/O space timing register 4 */
+ union
+ {
+ __IOM uint32_t IOSTIM4;
+
+ struct
+ {
+ __IOM uint32_t SET : 8;
+ __IOM uint32_t WAIT : 8;
+ __IOM uint32_t HLD : 8;
+ __IOM uint32_t HIZ : 8;
+ } IOSTIM4_B;
+ };
+} SMC_Bank4_T;
+
+/**
+ * @brief Dynamic memory controler (DMC)
+ */
+typedef struct
+{
+ /* NOTE(review): every bit-field struct below sums to exactly 32 bits,
+ * so each union overlays one 32-bit register word. Field semantics
+ * are inferred from the names only -- confirm against the device
+ * reference manual. */
+
+ /** @brief Configuration register */
+ union
+ {
+ __IOM uint32_t CFG;
+ struct
+ {
+ __IM uint32_t RESERVED1 : 3;
+ __IOM uint32_t BAWCFG : 2;
+ __IOM uint32_t RAWCFG : 4;
+ __IOM uint32_t CAWCFG : 4;
+ __IOM uint32_t DWCFG : 2;
+ __IM uint32_t RESERVED2 : 17; /* pads the word to 32 bits */
+ } CFG_B;
+ };
+
+ /** @brief Timing register 0 */
+ union
+ {
+ __IOM uint32_t TIM0;
+ struct
+ {
+ __IOM uint32_t CASLSEL0 : 2;
+ __IOM uint32_t RASMINTSEL : 4;
+ __IOM uint32_t DTIMSEL : 3;
+ __IOM uint32_t PCPSEL : 3;
+ __IOM uint32_t WRTIMSEL : 2;
+ __IOM uint32_t ARPSEL : 4;
+ __IOM uint32_t XSR0 : 4;
+ __IOM uint32_t ATACP : 4;
+ __IOM uint32_t ECASLSEL1 : 1;
+ __IOM uint32_t EXSR1 : 5;
+ } TIM0_B;
+ };
+
+ /** @brief Timing register 1 */
+ union
+ {
+ __IOM uint32_t TIM1;
+ struct
+ {
+ __IOM uint32_t STBTIM : 16;
+ __IOM uint32_t ARNUMCFG : 4;
+ __IM uint32_t RESERVED : 12;
+ } TIM1_B;
+ };
+
+ /** @brief Control register 1 */
+ union
+ {
+ __IOM uint32_t CTRL1;
+ struct
+ {
+ __IOM uint32_t INIT : 1;
+ __IOM uint32_t SRMEN : 1;
+ __IOM uint32_t PDMEN : 1;
+ __IOM uint32_t PCACFG : 1;
+ __IOM uint32_t FRBSREN : 1;
+ __IOM uint32_t FRASREN : 1;
+ __IOM uint32_t RDNUMMCFG : 3;
+ __IOM uint32_t MODESET : 1;
+ __IM uint32_t RESERVED1 : 1;
+ __IM uint32_t SRMFLG : 1; /* read-only status bit within a r/w register */
+ __IOM uint32_t BANKNUMCFG : 4;
+ __IM uint32_t RESERVED2 : 16;
+ } CTRL1_B;
+ };
+
+ /** @brief Refresh register */
+ union
+ {
+ __IOM uint32_t REF;
+ struct
+ {
+ __IOM uint32_t RCYCCFG : 16;
+ __IM uint32_t RESERVED : 16;
+ } REF_B;
+ };
+
+ /* 251-word address gap between the REF register and the SW register. */
+ __IM uint32_t RESERVED[251];
+
+ /** @brief Switch register */
+ union
+ {
+ __IOM uint32_t SW;
+ struct
+ {
+ __IOM uint32_t MCSW : 1;
+ __IM uint32_t RESERVED : 31;
+ } SW_B;
+ };
+
+ /** @brief Control register 2 */
+ union
+ {
+ __IOM uint32_t CTRL2;
+ struct
+ {
+ __IOM uint32_t CPHACFG : 1;
+ __IOM uint32_t RDDEN : 1;
+ __IOM uint32_t RDDCFG : 3;
+ __IOM uint32_t WPEN : 1;
+ __IOM uint32_t BUFFEN : 1;
+ __IOM uint32_t WRPBSEL : 1;
+ __IM uint32_t RESERVED : 24;
+ } CTRL2_B;
+ };
+} DMC_T;
+
+/**
+ * @brief Debug MCU(DBGMCU)
+ */
+typedef struct
+{
+ /* NOTE(review): every bit-field struct below must sum to exactly
+ * 32 bits so each union overlays a single 32-bit register word. */
+
+ /** @brief ID register */
+ union
+ {
+ __IM uint32_t IDCODE;
+ struct
+ {
+ __IM uint32_t EQR : 12;
+ __IM uint32_t RESERVED : 4;
+ __IM uint32_t WVR : 16;
+ } IDCODE_B;
+ };
+
+ /** @brief Control register */
+ union
+ {
+ __IOM uint32_t CFG;
+ struct
+ {
+ __IOM uint32_t SLEEP_CLK_STS : 1;
+ __IOM uint32_t STOP_CLK_STS : 1;
+ __IOM uint32_t STANDBY_CLK_STS : 1;
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t TRACE_IOEN : 1;
+ __IOM uint32_t TRACE_MODE : 2;
+ __IM uint32_t RESERVED2 : 24;
+ } CFG_B;
+ };
+
+ /** @brief APB1F register */
+ union
+ {
+ __IOM uint32_t APB1F;
+ struct
+ {
+ __IOM uint32_t TMR2_STS : 1;
+ __IOM uint32_t TMR3_STS : 1;
+ __IOM uint32_t TMR4_STS : 1;
+ __IOM uint32_t TMR5_STS : 1;
+ __IOM uint32_t TMR6_STS : 1;
+ __IOM uint32_t TMR7_STS : 1;
+ __IOM uint32_t TMR12_STS : 1;
+ __IOM uint32_t TMR13_STS : 1;
+ __IOM uint32_t TMR14_STS : 1;
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t RTC_STS : 1;
+ __IOM uint32_t WWDT_STS : 1;
+ __IOM uint32_t IWDT_STS : 1;
+ __IM uint32_t RESERVED2 : 8;
+ __IOM uint32_t I2C1_SMBUS_TIMEOUT_STS : 1;
+ __IOM uint32_t I2C2_SMBUS_TIMEOUT_STS : 1;
+ __IOM uint32_t I2C3_SMBUS_TIMEOUT_STS : 1;
+ __IM uint32_t RESERVED3 : 1;
+ __IOM uint32_t CAN1_STS : 1;
+ __IOM uint32_t CAN2_STS : 1;
+ __IM uint32_t RESERVED4 : 5;
+ } APB1F_B;
+ };
+
+ /** @brief APB2F register */
+ union
+ {
+ __IOM uint32_t APB2F;
+ struct
+ {
+ __IOM uint32_t TMR1_STS : 1;
+ __IOM uint32_t TMR8_STS : 1;
+ __IM uint32_t RESERVED1 : 14;
+ __IOM uint32_t TMR9_STS : 1;
+ __IOM uint32_t TMR10_STS : 1;
+ __IOM uint32_t TMR11_STS : 1;
+ /* BUG FIX: was 23, making the bit-fields total 42 bits; the
+ * fields then spill into a second 32-bit allocation unit,
+ * inflating the union to 8 bytes and corrupting the register
+ * map. 13 pads bits [31:19] and restores a 32-bit word. */
+ __IM uint32_t RESERVED2 : 13;
+ } APB2F_B;
+ };
+} DBGMCU_T;
+
+/**
+ * @brief Digital camera interface (DCI)
+ */
+typedef struct
+{
+ /* NOTE(review): every bit-field struct below sums to exactly 32 bits
+ * so each union overlays a single 32-bit register word. */
+
+ /** @brief DCI control register */
+ union
+ {
+ __IOM uint32_t CTRL;
+
+ struct
+ {
+ __IOM uint32_t CEN : 1;
+ __IOM uint32_t CMODE : 1;
+ __IOM uint32_t CROPF : 1;
+ __IOM uint32_t JPGFM : 1;
+ __IOM uint32_t ESYNCSEL : 1;
+ __IOM uint32_t PXCLKPOL : 1;
+ __IOM uint32_t HSYNCPOL : 1;
+ __IOM uint32_t VSYNCPOL : 1;
+ __IOM uint32_t FCRCFG : 2;
+ __IOM uint32_t EXDMOD : 2;
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t DCIEN : 1;
+ __IM uint32_t RESERVED2 : 17; /* pads the word to 32 bits */
+ } CTRL_B;
+ };
+
+ /** @brief DCI status register */
+ union
+ {
+ __IM uint32_t STS;
+
+ struct
+ {
+ __IM uint32_t HSYNCSTS : 1;
+ __IM uint32_t VSYNCSTS : 1;
+ __IM uint32_t FIFONEMP : 1;
+ __IM uint32_t RESERVED : 29;
+ } STS_B;
+ };
+
+ /** @brief DCI raw interrupt status register */
+ union
+ {
+ __IM uint32_t RINTSTS;
+
+ struct
+ {
+ __IM uint32_t CC_RINT : 1;
+ __IM uint32_t OVR_RINT : 1;
+ __IM uint32_t SYNCERR_RINT : 1;
+ __IM uint32_t VSYNC_RINT : 1;
+ __IM uint32_t LINE_RINT : 1;
+ __IM uint32_t RESERVED : 27;
+ } RINTSTS_B;
+ };
+
+ /** @brief DCI interrupt enable register */
+ union
+ {
+ __IOM uint32_t INTEN;
+
+ struct
+ {
+ __IOM uint32_t CCINTEN : 1;
+ __IOM uint32_t OVRINTEN : 1;
+ __IOM uint32_t SYNCERRINTEN : 1;
+ __IOM uint32_t VSYNCINTEN : 1;
+ __IOM uint32_t LINEINTEN : 1;
+ /* Fixed: was 22 (fields summed to only 27 bits); 27 pads the
+ * register to a full 32-bit word, matching RINTSTS/MINTSTS. */
+ __IM uint32_t RESERVED : 27;
+ } INTEN_B;
+ };
+
+ /** @brief DCI masked interrupt status register */
+ union
+ {
+ __IM uint32_t MINTSTS;
+
+ struct
+ {
+ __IM uint32_t CC_MINT : 1;
+ __IM uint32_t OVR_MINT : 1;
+ __IM uint32_t SYNCERR_MINT : 1;
+ __IM uint32_t VSYNC_MINT : 1;
+ __IM uint32_t LINE_MINT : 1;
+ __IM uint32_t RESERVED : 27;
+ } MINTSTS_B;
+ };
+
+ /** @brief DCI interrupt clear register */
+ union
+ {
+ __IOM uint32_t INTCLR;
+
+ struct
+ {
+ __OM uint32_t CC_INTCLR : 1;
+ __OM uint32_t OVR_INTCLR : 1;
+ __OM uint32_t SYNCERR_INTCLR : 1;
+ __OM uint32_t VSYNC_INTCLR : 1;
+ __OM uint32_t LINE_INTCLR : 1;
+ /* Fixed: was 22 (fields summed to only 27 bits); 27 pads the
+ * register to a full 32-bit word, matching RINTSTS/MINTSTS. */
+ __IM uint32_t RESERVED : 27;
+ } INTCLR_B;
+ };
+
+ /** @brief DCI embedded synchronization code register */
+ union
+ {
+ __IOM uint32_t ESYNCC;
+
+ struct
+ {
+ __IOM uint32_t FSDC : 8;
+ __IOM uint32_t LSDC : 8;
+ __IOM uint32_t LEDC : 8;
+ __IOM uint32_t FEDC : 8;
+ } ESYNCC_B;
+ };
+
+ /** @brief DCI embedded synchronization unmask register */
+ union
+ {
+ __IOM uint32_t ESYNCUM;
+
+ struct
+ {
+ __IOM uint32_t FSDUM : 8;
+ __IOM uint32_t LSDUM : 8;
+ __IOM uint32_t LEDUM : 8;
+ __IOM uint32_t FEDUM : 8;
+ } ESYNCUM_B;
+ };
+
+ /** @brief DCI crop window start register */
+ union
+ {
+ __IOM uint32_t CROPWSTAT;
+
+ struct
+ {
+ __IOM uint32_t HOFSCNT : 14;
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t VSLINECNT : 13;
+ __IM uint32_t RESERVED2 : 3;
+ } CROPWSTAT_B;
+ };
+
+ /** @brief DCI crop window size register */
+ union
+ {
+ __IOM uint32_t CROPWSIZE;
+
+ struct
+ {
+ __IOM uint32_t CCNT : 14;
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t VLINECNT : 14;
+ __IM uint32_t RESERVED2 : 2;
+ } CROPWSIZE_B;
+ };
+
+ /** @brief DCI data register */
+ union
+ {
+ /* NOTE(review): the word view is __IOM while the byte fields are
+ * __IM -- presumably the data register is read-only; confirm
+ * against the reference manual before changing. */
+ __IOM uint32_t DATA;
+
+ struct
+ {
+ __IM uint32_t DATA0 : 8;
+ __IM uint32_t DATA1 : 8;
+ __IM uint32_t DATA2 : 8;
+ __IM uint32_t DATA3 : 8;
+ } DATA_B;
+ };
+} DCI_T;
+
+/**
+ * @brief System configuration controller (SYSCFG)
+ */
+typedef struct
+{
+ /* NOTE(review): every bit-field struct below sums to exactly 32 bits
+ * so each union overlays a single 32-bit register word. */
+
+ /** @brief Memory remap selection register (presumably -- confirm in RM) */
+ union
+ {
+ __IOM uint32_t MMSEL;
+
+ struct
+ {
+ __IOM uint32_t MMSEL : 2;
+ __IM uint32_t RESERVED : 30;
+ } MMSEL_B;
+ };
+ /** @brief Peripheral Mode Configuration register */
+ union
+ {
+ __IOM uint32_t PMC;
+
+ struct
+ {
+ __IM uint32_t RESERVED1 : 23;
+ __IOM uint32_t ENETSEL : 1; /* bit 23 -- Ethernet interface select */
+ __IM uint32_t RESERVED2 : 8;
+ } PMC_B;
+ };
+
+ /** @brief External Interrupt Configuration register1 */
+ union
+ {
+ __IOM uint32_t EINTCFG1;
+
+ struct
+ {
+ __IOM uint32_t EINT0 : 4;
+ __IOM uint32_t EINT1 : 4;
+ __IOM uint32_t EINT2 : 4;
+ __IOM uint32_t EINT3 : 4;
+ __IM uint32_t RESERVED : 16;
+ } EINTCFG1_B;
+ };
+
+ /** @brief External Interrupt Configuration register2 */
+ union
+ {
+ __IOM uint32_t EINTCFG2;
+
+ struct
+ {
+ __IOM uint32_t EINT4 : 4;
+ __IOM uint32_t EINT5 : 4;
+ __IOM uint32_t EINT6 : 4;
+ __IOM uint32_t EINT7 : 4;
+ __IM uint32_t RESERVED : 16;
+ } EINTCFG2_B;
+ };
+
+ /** @brief External Interrupt Configuration register3 */
+ union
+ {
+ __IOM uint32_t EINTCFG3;
+
+ struct
+ {
+ __IOM uint32_t EINT8 : 4;
+ __IOM uint32_t EINT9 : 4;
+ __IOM uint32_t EINT10 : 4;
+ __IOM uint32_t EINT11 : 4;
+ __IM uint32_t RESERVED : 16;
+ } EINTCFG3_B;
+ };
+
+ /** @brief External Interrupt Configuration register4 */
+ union
+ {
+ __IOM uint32_t EINTCFG4;
+
+ struct
+ {
+ __IOM uint32_t EINT12 : 4;
+ __IOM uint32_t EINT13 : 4;
+ __IOM uint32_t EINT14 : 4;
+ __IOM uint32_t EINT15 : 4;
+ __IM uint32_t RESERVED : 16;
+ } EINTCFG4_B;
+ };
+
+ /* Two-word address gap before the compensation-cell register. */
+ __IM uint32_t RESERVED[2];
+
+ /** @brief Compensation cell control register */
+ union
+ {
+ __IOM uint32_t CCCTRL;
+
+ struct
+ {
+ __IOM uint32_t CCPD : 1;
+ __IM uint32_t RESERVED1 : 6;
+ __IOM uint32_t RDYFLG : 1; /* NOTE(review): placed at bit 7 here -- confirm position against the RM */
+ __IM uint32_t RESERVED : 24;
+ } CCCTRL_B;
+ };
+} SYSCFG_T;
+
+/**
+ * @brief Ethernet (ETH)
+ */
+typedef struct
+{
+ /**
+ * @brief ETH_MAC
+ */
+
+ /** @brief CFG register */
+ union
+ {
+ __IOM uint32_t CFG;
+ struct
+ {
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t RXEN : 1;
+ __IOM uint32_t TXEN : 1;
+ __IOM uint32_t DC : 1;
+ __IOM uint32_t BL : 2;
+ __IOM uint32_t ACS : 1;
+ __IM uint32_t RESERVED2 : 1;
+ __IOM uint32_t DISR : 1;
+ __IOM uint32_t IPC : 1;
+ __IOM uint32_t DM : 1;
+ __IOM uint32_t LBM : 1;
+ __IOM uint32_t DISRXO : 1;
+ __IOM uint32_t SSEL : 1;
+ __IM uint32_t RESERVED3 : 1;
+ __IOM uint32_t DISCRS : 1;
+ __IOM uint32_t IFG : 3;
+ __IM uint32_t RESERVED4 : 2;
+ __IOM uint32_t JDIS : 1;
+ __IOM uint32_t WDTDIS : 1;
+ __IM uint32_t RESERVED5 : 1;
+ __IOM uint32_t CST : 1;
+ __IM uint32_t RESERVED6 : 6;
+ } CFG_B;
+ };
+
+ /** @brief FRAF register */
+ union
+ {
+ __IOM uint32_t FRAF;
+ struct
+ {
+ __IOM uint32_t PR : 1;
+ __IOM uint32_t HUC : 1;
+ __IOM uint32_t HMC : 1;
+ __IOM uint32_t DAIF : 1;
+ __IOM uint32_t PM : 1;
+ __IOM uint32_t DISBF : 1;
+ __IOM uint32_t PCTRLF : 2;
+ __IOM uint32_t SAIF : 1;
+ __IOM uint32_t SAFEN : 1;
+ __IOM uint32_t HPF : 1;
+ __IM uint32_t RESERVED1 : 20;
+ __IOM uint32_t RXA : 1;
+ } FRAF_B;
+ };
+
+ /** @brief HTH register */
+ union
+ {
+ __IOM uint32_t HTH;
+ struct
+ {
+ __IOM uint32_t HTH : 32;
+ } HTH_B;
+ };
+
+ /** @brief HTL register */
+ union
+ {
+ __IOM uint32_t HTL;
+ struct
+ {
+ __IOM uint32_t HTL : 32;
+ } HTL_B;
+ };
+
+ /** @brief ADDR register */
+ union
+ {
+ __IOM uint32_t ADDR;
+ struct
+ {
+ __IOM uint32_t MB : 1;
+ __IOM uint32_t MW : 1;
+ __IOM uint32_t CR : 3;
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t MR : 5;
+ __IOM uint32_t PA : 5;
+ __IM uint32_t RESERVED2 : 16;
+ } ADDR_B;
+ };
+
+ /** @brief DATA register */
+ union
+ {
+ __IOM uint32_t DATA;
+ struct
+ {
+ __IOM uint32_t MD : 16;
+ __IM uint32_t RESERVED1 : 16;
+ } DATA_B;
+ };
+
+ /** @brief FCTRL register */
+ union
+ {
+ __IOM uint32_t FCTRL;
+ struct
+ {
+ __IOM uint32_t FCTRLB : 1;
+ __IOM uint32_t TXFCTRLEN : 1;
+ __IOM uint32_t RXFCTRLEN : 1;
+ __IOM uint32_t UNPFDETE : 1;
+ __IOM uint32_t PTSEL : 2;
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t ZQPDIS : 1;
+ __IM uint32_t RESERVED2 : 8;
+ __IOM uint32_t PT : 16;
+ } FCTRL_B;
+ };
+
+ /** @brief VLANT register */
+ union
+ {
+ __IOM uint32_t VLANT;
+ struct
+ {
+ __IOM uint32_t VLANTID : 16;
+ __IOM uint32_t VLANTCOMP : 1;
+ __IM uint32_t RESERVED : 15;
+ } VLANT_B;
+ };
+
+ __IM uint32_t RESERVED0[2];
+
+ /** @brief WKUPFFL register */
+ union
+ {
+ __IOM uint32_t WKUPFFL;
+ struct
+ {
+ __IOM uint32_t FLXBMASK : 32;
+ } WKUPFFL_B;
+ };
+
+ union
+ {
+ __IOM uint32_t PMTCTRLSTS;
+ struct
+ {
+ __IOM uint32_t PD : 1;
+ __IOM uint32_t MPEN : 1;
+ __IOM uint32_t WKUPFEN : 1;
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t MPRX : 1;
+ __IOM uint32_t WKUPFRX : 1;
+ __IM uint32_t RESERVED2 : 2;
+ __IOM uint32_t GUN : 1;
+ __IM uint32_t RESERVED3 : 21;
+ __IOM uint32_t WKUPFRST : 1;
+ } PMTCTRLSTS_B;
+ } ;
+
+ __IM uint32_t RESERVED1;
+
+ /** @brief DBG register */
+ union
+ {
+ __IM uint32_t DBG;
+ struct
+ {
+ __IM uint32_t RPESTS : 1;
+ __IM uint32_t RFCFCSTS : 2;
+ __IM uint32_t RESERVED1 : 1;
+ __IM uint32_t RWCSTS : 1;
+ __IM uint32_t RRCSTS : 2;
+ __IM uint32_t RESERVED2 : 1;
+ __IM uint32_t RXFSTS : 2;
+ __IM uint32_t RESERVED3 : 6;
+ __IM uint32_t TPESTS : 1;
+ __IM uint32_t TFCSTS : 2;
+ __IM uint32_t TXPAUSED : 1;
+ __IM uint32_t TRCSTS : 2;
+ __IM uint32_t TWCSTS : 1;
+ __IM uint32_t RESERVED4 : 1;
+ __IM uint32_t TXFSTS : 1;
+ __IM uint32_t TXSTSFSTS : 1;
+ __IM uint32_t RESERVED5 : 6;
+ } DBG_B;
+ };
+
+ /** @brief ISTS register */
+ union
+ {
+ __IM uint32_t ISTS;
+ struct
+ {
+ __IM uint32_t RESERVED1 : 3;
+ __IM uint32_t PMTIS : 1;
+ __IM uint32_t MMCIS : 1;
+ __IM uint32_t MMCRXIS : 1;
+ __IM uint32_t MMCTXIS : 1;
+ __IM uint32_t RESERVED2 : 2;
+ __IM uint32_t TSIS : 1;
+ __IM uint32_t RESERVED3 : 22;
+ } ISTS_B;
+ };
+
+ /** @brief IMASK register */
+ union
+ {
+ __IOM uint32_t IMASK;
+ struct
+ {
+ __IM uint32_t RESERVED1 : 3;
+ __IOM uint32_t PMTIM : 1;
+ __IM uint32_t RESERVED2 : 5;
+ __IOM uint32_t TSTIM : 1;
+ __IM uint32_t RESERVED3 : 6;
+ } IMASK_B;
+ };
+
+ /** @brief ADDR0H register */
+ union
+ {
+ __IOM uint32_t ADDR0H;
+ struct
+ {
+ __IOM uint32_t ADDR0H : 16;
+ __IM uint32_t RESERVED : 15;
+ __IM uint32_t AL1 : 1;
+ } ADDR0H_B;
+ };
+
+ /** @brief ADDR0L register */
+ union
+ {
+ __IOM uint32_t ADDR0L;
+ struct
+ {
+ __IOM uint32_t ADDR0L : 32;
+ } ADDR0L_B;
+ };
+
+ /** @brief ADDR1H register */
+ union
+ {
+ __IOM uint32_t ADDR1H;
+ struct
+ {
+ __IOM uint32_t ADDR1H : 16;
+ __IM uint32_t RESERVED : 8;
+ __IOM uint32_t MASKBCTRL : 6;
+ __IOM uint32_t ADDRSEL : 1;
+ __IOM uint32_t ADDREN : 1;
+ } ADDR1H_B;
+ };
+
+ /** @brief ADDR1L register */
+ union
+ {
+ __IOM uint32_t ADDR1L;
+ struct
+ {
+ __IOM uint32_t ADDR1L : 32;
+ } ADDR1L_B;
+ };
+
+ /** @brief ADDR2H register */
+ union
+ {
+ __IOM uint32_t ADDR2H;
+ struct
+ {
+ __IOM uint32_t ADDR2H : 16;
+ __IM uint32_t RESERVED : 8;
+ __IOM uint32_t MASKBCTRL : 6;
+ __IOM uint32_t ADDRSEL : 1;
+ __IOM uint32_t ADDREN : 1;
+ } ADDR2H_B;
+ };
+
+ /** @brief ADDR2L register */
+ union
+ {
+ __IOM uint32_t ADDR2L;
+ struct
+ {
+ __IOM uint32_t ADDR2L : 32;
+ } ADDR2L_B;
+ };
+
+ /** @brief ADDR3H register */
+ union
+ {
+ __IOM uint32_t ADDR3H;
+ struct
+ {
+ __IOM uint32_t ADDR3H : 16;
+ __IM uint32_t RESERVED : 8;
+ __IOM uint32_t MASKBCTRL : 6;
+ __IOM uint32_t ADDRSEL : 1;
+ __IOM uint32_t ADDREN : 1;
+ } ADDR3H_B;
+ };
+
+ /** @brief ADDR3L register */
+ union
+ {
+ __IOM uint32_t ADDR3L;
+ struct
+ {
+ __IOM uint32_t ADDR3L : 32;
+ } ADDR3L_B;
+ };
+
+ __IM uint32_t RESERVED2[40];
+ /**
+ * @brief MAC management counters (MMC)
+ */
+ /** Control register */
+ union
+ {
+ __IOM uint32_t CTRL;
+
+ struct
+ {
+ __IOM uint32_t CNTRST : 1;
+ __IOM uint32_t CNTSTOPRO : 1;
+ __IOM uint32_t RSTOR : 1;
+ __IOM uint32_t MCNTF : 1;
+ __IOM uint32_t MCNTP : 1;
+ __IOM uint32_t MCNTVALP : 1;
+ __IM uint32_t RESERVED : 26;
+ } CTRL_B;
+ } ;
+
+ /** RX Interrupt Register */
+ union
+ {
+ __IOM uint32_t RXINT;
+
+ struct
+ {
+ __IM uint32_t RESERVED1 : 5;
+ __IM uint32_t RXFCE : 1;
+ __IM uint32_t RXFAE : 1;
+ __IM uint32_t RESERVED2 : 10;
+ __IM uint32_t RXGUNF : 1;
+ __IM uint32_t RESERVED3 : 14;
+ } RXINT_B;
+ } ;
+
+ /** TX Interrupt Register */
+ union
+ {
+ __IOM uint32_t TXINT;
+
+ struct
+ {
+ __IM uint32_t RESERVED1 : 14;
+ __IM uint32_t TXGFSCOL : 1;
+ __IM uint32_t TXGFMCOL : 1;
+ __IM uint32_t RESERVED2 : 5;
+ __IM uint32_t TXGF : 1;
+ __IM uint32_t RESERVED3 : 10;
+ } TXINT_B;
+ } ;
+
+ /** RX Interrupt Mask Register */
+ union
+ {
+ __IOM uint32_t RXINTMASK;
+
+ struct
+ {
+ __IM uint32_t RESERVED1 : 5;
+ __IOM uint32_t RXFCEM : 1;
+ __IOM uint32_t RXFAEM : 1;
+ __IM uint32_t RESERVED2 : 10;
+ __IOM uint32_t RXGUNFM : 1;
+ __IM uint32_t RESERVED3 : 14;
+ } RXINTMASK_B;
+ } ;
+
+ /** TX Interrupt Mask Register */
+ union
+ {
+ __IOM uint32_t TXINTMASK;
+
+ struct
+ {
+ __IM uint32_t RESERVED1 : 14;
+ __IOM uint32_t TXGFSCOLM : 1;
+ __IOM uint32_t TXGFMCOLM : 1;
+ __IM uint32_t RESERVED2 : 5;
+ __IOM uint32_t TXGFM : 1;
+ __IM uint32_t RESERVED3 : 10;
+ } TXINTMASK_B;
+ } ;
+
+ __IM uint32_t RESERVED3[14];
+
+ /** TX Good Frames After a Single Collision Counter Register */
+ union
+ {
+ __IM uint32_t TXGFSCCNT;
+
+ struct
+ {
+ __IM uint32_t TXGFSCCNT : 32;
+ } TXGFSCCNT_B;
+ } ;
+
+ /** TX Good Frames After More Than a Single Collision Counter Register */
+ union
+ {
+ __IM uint32_t TXGFMCCNT;
+
+ struct
+ {
+ __IM uint32_t TXGFMCCNT : 32;
+ } TXGFMCCNT_B;
+ } ;
+
+ __IM uint32_t RESERVED4[5];
+
+ /** TX Good Frames Counter Register */
+ union
+ {
+ __IM uint32_t TXGFCNT;
+
+ struct
+ {
+ __IM uint32_t TXGFCNT : 32;
+ } TXGFCNT_B;
+ } ;
+
+ __IM uint32_t RESERVED5[10];
+
+ /** RX Frames With CRC Error Counter Register */
+ union
+ {
+ __IM uint32_t RXFCECNT;
+
+ struct
+ {
+ __IM uint32_t RXFCECNT : 32;
+ } RXFCECNT_B;
+ } ;
+
+ /** RX Frames With Alignment Error Counter Register */
+ union
+ {
+ __IM uint32_t RXFAECNT;
+
+ struct
+ {
+ __IM uint32_t RXFAECNT : 32;
+ } RXFAECNT_B;
+ } ;
+
+ __IM uint32_t RESERVED6[10];
+
+ /** RX Good Unicast Frames Counter Register */
+ union
+ {
+ __IM uint32_t RXGUNCNT;
+
+ struct
+ {
+ __IM uint32_t RXGUNCNT : 32;
+ } RXGUNCNT_B;
+ } ;
+
+ __IM uint32_t RESERVED7[334];
+
+ /**
+ * @brief Ethernet: Precision time protocol (Ethernet_PTP)
+ */
+ union
+ {
+ __IOM uint32_t TSCTRL; /*!< (@ 0x00000000) Ethernet PTP time stamp control register */
+
+ struct
+ {
+ __IOM uint32_t TSEN : 1; /*!< [0..0] Time Stamp Enable */
+ __IOM uint32_t TSUDSEL : 1; /*!< [1..1] Time Stamp Update Mode Select */
+ __IOM uint32_t TSSTINIT : 1; /*!< [2..2] Time Stamp System Time Initialize */
+ __IOM uint32_t TSSTUD : 1; /*!< [3..3] Time Stamp System Time Update */
+ __IOM uint32_t TSTRGIEN : 1; /*!< [4..4] Time Stamp Trigger Interrupt Enable */
+ __IOM uint32_t TSADDUD : 1; /*!< [5..5] Time Stamp Addend Register Update */
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t TSSNEN : 1; /*!< [8..8] Time Stamp Snapshot for Received Frames Enable */
+ __IOM uint32_t TSSUBRO : 1; /*!< [9..9] Time Stamp Subsecond Rollover */
+ __IOM uint32_t LISTVSEL : 1; /*!< [10..10] Listening Version Select */
+ __IOM uint32_t TSSPTPEN : 1; /*!< [11..11] Time Stamp Snapshot for PTP Ethernet Frames Enable */
+ __IOM uint32_t TSS6EN : 1; /*!< [12..12] Time stamp snapshot for IPv6 frames Enable */
+ __IOM uint32_t TSS4EN : 1; /*!< [13..13] Time Stamp Snapshot for IPv4 Frames Enable */
+ __IOM uint32_t TSSMESEL : 1; /*!< [14..14] Time Stamp Snapshot for Message Select */
+ __IOM uint32_t TSSMNSEL : 1; /*!< [15..15] Time Stamp Snapshot for Master Node Select */
+ __IOM uint32_t TSCLKNSEL : 2; /*!< [17..16] Time stamp Clock Node Select */
+ __IOM uint32_t TSSPTPFMACEN : 1; /*!< [18..18] Time Stamp PTP Frame Filtering MAC Address Enable */
+ __IM uint32_t RESERVED12 : 13;
+ } TSCTRL_B;
+ } ;
+
+ union
+ {
+ __IOM uint32_t SUBSECI; /*!< (@ 0x00000004) Ethernet PTP subsecond increment register */
+
+ struct
+ {
+ __IOM uint32_t STSUBSECI : 8; /*!< [7..0] System Time Subseconds Incremen */
+ __IM uint32_t RESERVED : 24;
+ } SUBSECI_B;
+ } ;
+
+ union
+ {
+ __IM uint32_t TSH; /*!< (@ 0x00000008) Ethernet PTP time stamp high register */
+
+ struct
+ {
+ __IM uint32_t STSEC : 32; /*!< [31..0] System Time Second Value */
+ } TSH_B;
+ } ;
+
+ union
+ {
+ __IM uint32_t TSL; /*!< (@ 0x0000000C) Ethernet PTP time stamp low register */
+
+ struct
+ {
+ __IM uint32_t STSUBSEC : 31; /*!< [30..0] System Time Subseconds Value */
+ __IM uint32_t STSEL : 1; /*!< [31..31] System Time Select */
+ } TSL_B;
+ } ;
+
+ union
+ {
+ __IOM uint32_t TSHUD; /*!< (@ 0x00000010) Ethernet PTP time stamp high update register */
+
+ struct
+ {
+ __IOM uint32_t TSUDSEC : 32; /*!< [31..0] Time Stamp Update Second Value */
+ } TSHUD_B;
+ } ;
+
+ union
+ {
+ __IOM uint32_t TSLUD; /*!< (@ 0x00000014) Ethernet PTP time stamp low update register */
+
+ struct
+ {
+ __IOM uint32_t TSUDSUBSEC : 31; /*!< [30..0] Time Stamp Update Subseconds Value */
+ __IOM uint32_t TSUDSEL : 1; /*!< [31..31] Time Stamp Update Select */
+ } TSLUD_B;
+ } ;
+
+ union
+ {
+ __IOM uint32_t TSA; /*!< (@ 0x00000018) Ethernet PTP time stamp addend register */
+
+ struct
+ {
+ __IOM uint32_t TSA : 32; /*!< [31..0] Time Stamp Addend Value */
+ } TSA_B;
+ } ;
+
+ union
+ {
+ __IOM uint32_t TTSH; /*!< (@ 0x0000001C) Ethernet PTP target time high register */
+
+ struct
+ {
+ __IOM uint32_t TTSH : 32; /*!< [31..0] Target Time Stamp High Value */
+ } TTSH_B;
+ } ;
+
+ union
+ {
+ __IOM uint32_t TTSL; /*!< (@ 0x00000020) Ethernet PTP target time low register */
+
+ struct
+ {
+ __IOM uint32_t TTSL : 32; /*!< [31..0] Target Time Stamp Low Value */
+ } TTSL_B;
+ } ;
+ __IM uint32_t RESERVED8;
+
+ union
+ {
+ __IM uint32_t TSSTS; /*!< (@ 0x00000028) Ethernet PTP time stamp status register */
+
+ struct
+ {
+ __IM uint32_t TSSECOVR : 1; /*!< [0..0] Time Stamp Second Value Overflow */
+ __IM uint32_t TTSRD : 1; /*!< [1..1] Target Time Stamp Value Reached */
+ __IM uint32_t RESERVED : 30;
+ } TSSTS_B;
+ } ;
+
+ union
+ {
+ __IM uint32_t PPSCTRL; /*!< (@ 0x0000002C) Ethernet PTP PPS control register */
+
+ struct
+ {
+ __IM uint32_t PPSFSEL : 4; /*!< [3..0] PPS Frequency Selection */
+ __IM uint32_t RESERVED : 28;
+ } PPSCTRL_B;
+ } ;
+
+ __IM uint32_t RESERVED9[564];
+
+ /**
+ * @brief Ethernet: DMA controller operation (Ethernet_DMA)
+ */
+ union
+ {
+ __IOM uint32_t DMABMOD; /*!< (@ 0x00000000) Ethernet DMA bus mode register */
+
+ struct
+ {
+ __IOM uint32_t SWR : 1; /*!< [0..0] Software Reset */
+ __IOM uint32_t DAS : 1; /*!< [1..1] DMA Arbitration Scheme */
+ __IOM uint32_t DSL : 5; /*!< [6..2] Descriptor Skip Length */
+ __IOM uint32_t EDFEN : 1; /*!< [7..7] Enhanced Descriptor Format Enable */
+ __IOM uint32_t PBL : 6; /*!< [13..8] Programmable Burst Length */
+ __IOM uint32_t PR : 2; /*!< [15..14] Priority Ratio */
+ __IOM uint32_t FB : 1; /*!< [16..16] Fixed Burst */
+ __IOM uint32_t RPBL : 6; /*!< [22..17] Rx DMA PBL */
+ __IOM uint32_t USP : 1; /*!< [23..23] Use Separate PBL */
+ __IOM uint32_t PBLx4 : 1; /*!< [24..24] PBLx4 Mode */
+ __IOM uint32_t AAL : 1; /*!< [25..25] Address-Aligned Beats */
+ __IOM uint32_t MB : 1; /*!< [26..26] Mixed Burst */
+ __IM uint32_t RESERVED : 5;
+ } DMABMOD_B;
+ } ;
+
+ union
+ {
+ __IOM uint32_t DMATXPD; /*!< (@ 0x00000004) Ethernet DMA transmit poll demand register */
+
+ struct
+ {
+ __IOM uint32_t TXPD : 32; /*!< [31..0] Transmit Poll Demand */
+ } DMATXPD_B;
+ } ;
+
+ union
+ {
+ __IOM uint32_t DMARXPD; /*!< (@ 0x00000008) Ethernet DMA receive poll demand register */
+
+ struct
+ {
+ __IOM uint32_t RXPD : 32; /*!< [31..0] Receive Poll Demand */
+ } DMARXPD_B;
+ } ;
+
+ union
+ {
+ __IOM uint32_t DMARXDLADDR; /*!< (@ 0x0000000C) Ethernet DMA receive descriptor list address
+ register */
+
+ struct
+ {
+ __IOM uint32_t RXSTA : 32; /*!< [31..0] Start of Receive List */
+ } DMARXDLADDR_B;
+ } ;
+
+ union
+ {
+ __IOM uint32_t DMATXDLADDR; /*!< (@ 0x00000010) Ethernet DMA transmit descriptor list address
+ register */
+
+ struct
+ {
+ __IOM uint32_t TXSTA : 32; /*!< [31..0] Start of Transmit List */
+ } DMATXDLADDR_B;
+ } ;
+
+ union
+ {
+ __IOM uint32_t DMASTS; /*!< (@ 0x00000014) Ethernet DMA status register */
+
+ struct
+ {
+ __IOM uint32_t TXFLG : 1; /*!< [0..0] Transmit Flag */
+ __IOM uint32_t TXSFLG : 1; /*!< [1..1] Transmit Stopped Flag */
+ __IOM uint32_t TXBU : 1; /*!< [2..2] Transmit Buffer Unavailable */
+ __IOM uint32_t TXJTO : 1; /*!< [3..3] Transmit Jabber Timeout */
+ __IOM uint32_t RXOVF : 1; /*!< [4..4] Receive Overflow */
+ __IOM uint32_t TXUNF : 1; /*!< [5..5] Transmit Underflow */
+ __IOM uint32_t RXFLG : 1; /*!< [6..6] Receive Flag */
+ __IOM uint32_t RXBU : 1; /*!< [7..7] Receive Buffer Unavailable */
+ __IOM uint32_t RXSFLG : 1; /*!< [8..8] Receive Stopped Flag */
+ __IOM uint32_t RXWTOFLG : 1; /*!< [9..9] Receive Watchdog Timeout Flag */
+ __IOM uint32_t ETXFLG : 1; /*!< [10..10] Early Transmit Flag */
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t FBERRFLG : 1; /*!< [13..13] Fatal Bus Error Flag */
+ __IOM uint32_t ERXFLG : 1; /*!< [14..14] Early Receive Flag */
+ __IOM uint32_t AINTS : 1; /*!< [15..15] Abnormal Interrupt Summary */
+ __IOM uint32_t NINTS : 1; /*!< [16..16] Normal Interrupt Summary */
+ __IM uint32_t RXSTS : 3; /*!< [19..17] Receive Process State */
+ __IM uint32_t TXSTS : 3; /*!< [22..20] Transmit Process State */
+ __IM uint32_t ERRB : 3; /*!< [25..23] Error Bits */
+ __IM uint32_t RESERVED2 : 1;
+ __IM uint32_t MMCFLG : 1; /*!< [27..27] MMC Flag */
+ __IM uint32_t PMTFLG : 1; /*!< [28..28] PMT Flag */
+ __IM uint32_t TSTFLG : 1; /*!< [29..29] Timestamp Trigger Flag */
+ __IM uint32_t RESERVED3 : 2;
+ } DMASTS_B;
+ } ;
+
+ union
+ {
+ __IOM uint32_t DMAOPMOD; /*!< (@ 0x00000018) Ethernet DMA operation mode register */
+
+ struct
+ {
+ __IM uint32_t : 1;
+ __IOM uint32_t STRX : 1; /*!< [1..1] Start or Stop Receive */
+ __IOM uint32_t OSECF : 1; /*!< [2..2] Operate on Second Frame */
+ __IOM uint32_t RXTHCTRL : 2; /*!< [4..3] Receive Threshold Control */
+ __IM uint32_t RESERVED1 : 1;
+ __IOM uint32_t FUF : 1; /*!< [6..6] Forward Undersized Good Frames */
+ __IOM uint32_t FERRF : 1; /*!< [7..7] Forward Error Frames */
+ __IM uint32_t RESERVED2 : 5;
+ __IOM uint32_t STTX : 1; /*!< [13..13] Start or Stop Transmission Command */
+ __IOM uint32_t TXTHCTRL : 3; /*!< [16..14] Transmit Threshold Control */
+ __IM uint32_t RESERVED3 : 3;
+ __IOM uint32_t FTXF : 1; /*!< [20..20] Flush Transmit FIFO */
+ __IOM uint32_t TXSF : 1; /*!< [21..21] Transmit Store and Forward */
+ __IM uint32_t RESERVED4 : 2;
+ __IOM uint32_t DISFRXF : 1; /*!< [24..24] Disable Flushing of Received Frames */
+ __IOM uint32_t RXSF : 1; /*!< [25..25] Receive Store and Forward */
+ __IOM uint32_t DISDT : 1; /*!< [26..26] Disable Dropping of TCP/IP Checksum Error Frames */
+ __IM uint32_t RESERVED5 : 5;
+ } DMAOPMOD_B;
+ } ;
+
+ union
+ {
+ __IOM uint32_t DMAINTEN; /*!< (@ 0x0000001C) Ethernet DMA interrupt enable register */
+
+ struct
+ {
+ __IOM uint32_t TXIEN : 1; /*!< [0..0] Transmit Interrupt Enable */
+ __IOM uint32_t TXSEN : 1; /*!< [1..1] Transmit Stopped Enable */
+ __IOM uint32_t TXBUEN : 1; /*!< [2..2] Transmit Buffer Unavailable Enable */
+ __IOM uint32_t TXJTOEN : 1; /*!< [3..3] Transmit Jabber Timeout Enable */
+ __IOM uint32_t RXOVFEN : 1; /*!< [4..4] Receive Overflow Interrupt Enable */
+ __IOM uint32_t TXUNFEN : 1; /*!< [5..5] Transmit Underflow Interrupt Enable */
+ __IOM uint32_t RXIEN : 1; /*!< [6..6] Receive Interrupt Enable */
+ __IOM uint32_t RXBUEN : 1; /*!< [7..7] Receive Buffer Unavailable Enable */
+ __IOM uint32_t RXSEN : 1; /*!< [8..8] Receive Stopped Enable */
+ __IOM uint32_t RXWTOEN : 1; /*!< [9..9] Receive Watchdog Timeout Enable */
+ __IOM uint32_t ETXIEN : 1; /*!< [10..10] Early Transmit Interrupt Enable */
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t FBERREN : 1; /*!< [13..13] Fatal Bus Error Enable */
+ __IOM uint32_t ERXIEN : 1; /*!< [14..14] Early Receive Interrupt Enable */
+ __IOM uint32_t AINTSEN : 1; /*!< [15..15] Abnormal Interrupt Summary Enable */
+ __IOM uint32_t NINTSEN : 1; /*!< [16..16] Normal Interrupt Summary Enable */
+ __IM uint32_t RESERVED2 : 15;
+ } DMAINTEN_B;
+ } ;
+
+ union
+ {
+ __IOM uint32_t DMAMFABOCNT; /*!< (@ 0x00000020) Ethernet DMA missed frame and buffer overflow
+ counter register */
+
+ struct
+ {
+ __IOM uint32_t MISFCNT : 16; /*!< [15..0] Missed Frame Counter */
+ __IOM uint32_t MISFCNTOVF : 1; /*!< [16..16] Overflow Bit for Missed Frame Counter */
+ __IOM uint32_t AMISFCNT : 11; /*!< [27..17] Application Missed Frame Counter (name-derived; the
+ original comment duplicated MISFCNT's text -- confirm
+ against the reference manual) */
+ __IOM uint32_t OVFCNTOVF : 1; /*!< [28..28] Overflow Bit for FIFO Overflow Counter */
+ __IM uint32_t RESERVED : 3;
+ } DMAMFABOCNT_B;
+ } ;
+
+ union
+ {
+ __IOM uint32_t DMARXFLGWDT; /*!< (@ 0x00000024) Ethernet DMA receive status watchdog timer register */
+
+ struct
+ {
+ __IOM uint32_t RXWDTCNT : 8; /*!< [7..0] RXFLG Watchdog Timer Count */
+ __IM uint32_t RESERVED : 24;
+ } DMARXFLGWDT_B;
+ } ;
+ __IM uint32_t RESERVED10[8];
+
+ union
+ {
+ __IM uint32_t DMAHTXD; /*!< (@ 0x00000048) Ethernet DMA current host transmit descriptor
+ register */
+ struct
+ {
+ __IM uint32_t HTXDADDRP : 32; /*!< [31..0] Host Transmit Descriptor Address Pointer */
+ } DMAHTXD_B;
+ } ;
+
+ union
+ {
+ __IM uint32_t DMAHRXD; /*!< (@ 0x0000004C) Ethernet DMA current host receive descriptor
+ register */
+ struct
+ {
+ __IM uint32_t HRXDADDRP : 32; /*!< [31..0] Host Receive Descriptor Address Pointer */
+ } DMAHRXD_B;
+ } ;
+
+ union
+ {
+ __IM uint32_t DMAHTXBADDR; /*!< (@ 0x00000050) Ethernet DMA current host transmit buffer address
+ register */
+
+ struct
+ {
+ __IM uint32_t HTXBADDRP : 32; /*!< [31..0] Host Transmit Buffer Address Pointer */
+ } DMAHTXBADDR_B;
+ } ;
+
+ union
+ {
+ __IM uint32_t DMAHRXBADDR; /*!< (@ 0x00000054) Ethernet DMA current host receive buffer address
+ register */
+
+ struct
+ {
+ __IM uint32_t HRXBADDRP : 32; /*!< [31..0] Host Receive Buffer Address Pointer */
+ } DMAHRXBADDR_B;
+ } ;
+} ETH_T;
+
+/**
+ * @brief CRYP register (CRYP)
+ */
+typedef struct
+{
+ /* NOTE(review): bit positions below are derived from the declared field widths;
+ semantic descriptions expand the vendor mnemonics -- confirm against the
+ APM32F4xx reference manual. */
+
+ /** @brief Control register */
+ union
+ {
+ __IOM uint32_t CTRL;
+
+ struct
+ {
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t ALGODIRSEL : 1; /*!< [2..2] Algorithm direction select */
+ __IOM uint32_t ALGOMSEL : 3; /*!< [5..3] Algorithm mode select */
+ __IOM uint32_t DTSEL : 2; /*!< [7..6] Data type select */
+ __IOM uint32_t KSIZESEL : 2; /*!< [9..8] Key size select */
+ __IM uint32_t RESERVED2 : 4;
+ __IOM uint32_t FFLUSH : 1; /*!< [14..14] FIFO flush */
+ __IOM uint32_t CRYPEN : 1; /*!< [15..15] Cryptographic processor enable */
+ __IM uint32_t RESERVED3 : 16;
+ } CTRL_B;
+ } ;
+
+ /** @brief Status register (read-only) */
+ union
+ {
+ __IM uint32_t STS;
+
+ struct
+ {
+ __IM uint32_t IFEMPT : 1; /*!< [0..0] Input FIFO empty flag */
+ __IM uint32_t IFFULL : 1; /*!< [1..1] Input FIFO full flag */
+ __IM uint32_t OFEMPT : 1; /*!< [2..2] Output FIFO empty flag */
+ __IM uint32_t OFFULL : 1; /*!< [3..3] Output FIFO full flag */
+ __IM uint32_t BUSY : 1; /*!< [4..4] Processor busy flag */
+ __IM uint32_t RESERVED : 27;
+ } STS_B;
+ };
+
+ /** @brief Input data register */
+ union
+ {
+ __IOM uint32_t DATAIN;
+
+ struct
+ {
+ __IOM uint32_t DATAIN : 32; /*!< [31..0] Data input */
+ } DATAIN_B;
+ };
+
+ /** @brief Output data register (read-only) */
+ union
+ {
+ __IM uint32_t DATAOUT;
+
+ struct
+ {
+ __IM uint32_t DATAOUT : 32; /*!< [31..0] Data output */
+ } DATAOUT_B;
+ };
+
+ /** @brief Control DMA register */
+ union
+ {
+ __IOM uint32_t DMACTRL;
+
+ struct
+ {
+ __IOM uint32_t INEN : 1; /*!< [0..0] DMA input enable */
+ __IOM uint32_t OUTEN : 1; /*!< [1..1] DMA output enable */
+ __IM uint32_t RESERVED : 30;
+ } DMACTRL_B;
+ };
+
+ /** @brief Service Interrupt Mask register */
+ union
+ {
+ __IOM uint32_t INTMASK;
+
+ struct
+ {
+ __IOM uint32_t INIMASK : 1; /*!< [0..0] Input interrupt mask */
+ __IOM uint32_t OUTIMASK : 1; /*!< [1..1] Output interrupt mask */
+ __IM uint32_t RESERVED : 30;
+ } INTMASK_B;
+ };
+
+ /** @brief Service Raw Interrupt Status register (read-only) */
+ union
+ {
+ __IM uint32_t INTSTS;
+
+ struct
+ {
+ __IM uint32_t INISTS : 1; /*!< [0..0] Input raw interrupt status */
+ __IM uint32_t OUTISTS : 1; /*!< [1..1] Output raw interrupt status */
+ __IM uint32_t RESERVED : 30;
+ } INTSTS_B;
+ };
+
+ /** @brief Service Masked Interrupt Status register */
+ union
+ {
+ __IOM uint32_t MINTSTS;
+
+ struct
+ {
+ __IOM uint32_t INMISTS : 1; /*!< [0..0] Input masked interrupt status */
+ __IOM uint32_t OUTMISTS : 1; /*!< [1..1] Output masked interrupt status */
+ __IM uint32_t RESERVED : 30;
+ } MINTSTS_B;
+ };
+
+ /** @brief Key 0 left-half register (write-only) */
+ union
+ {
+ __OM uint32_t K0L;
+
+ struct
+ {
+ __OM uint32_t Bx : 32; /*!< [31..0] Key bits */
+ } K0L_B;
+ };
+
+ /** @brief Key 0 right-half register (write-only) */
+ union
+ {
+ __OM uint32_t K0R;
+
+ struct
+ {
+ __OM uint32_t Bx : 32; /*!< [31..0] Key bits */
+ } K0R_B;
+ };
+
+ /** @brief Key 1 left-half register (write-only) */
+ union
+ {
+ __OM uint32_t K1L;
+
+ struct
+ {
+ __OM uint32_t KyBx : 32; /*!< [31..0] Key bits */
+ } K1L_B;
+ };
+
+ /** @brief Key 1 right-half register (write-only) */
+ union
+ {
+ __OM uint32_t K1R;
+
+ struct
+ {
+ __OM uint32_t KyBx : 32; /*!< [31..0] Key bits */
+ } K1R_B;
+ };
+
+ /** @brief Key 2 left-half register (write-only) */
+ union
+ {
+ __OM uint32_t K2L;
+
+ struct
+ {
+ __OM uint32_t KyBx : 32; /*!< [31..0] Key bits */
+ } K2L_B;
+ };
+
+ /** @brief Key 2 right-half register (write-only) */
+ union
+ {
+ __OM uint32_t K2R;
+
+ struct
+ {
+ __OM uint32_t KyBx : 32; /*!< [31..0] Key bits */
+ } K2R_B;
+ };
+
+ /** @brief Key 3 left-half register (write-only) */
+ union
+ {
+ __OM uint32_t K3L;
+
+ struct
+ {
+ __OM uint32_t KyBx : 32; /*!< [31..0] Key bits */
+ } K3L_B;
+ };
+
+ /** @brief Key 3 right-half register (write-only) */
+ union
+ {
+ __OM uint32_t K3R;
+
+ struct
+ {
+ __OM uint32_t KyBx : 32; /*!< [31..0] Key bits */
+ } K3R_B;
+ };
+
+ /** @brief Initialization vector 0, left-half register */
+ union
+ {
+ __IOM uint32_t IV0L;
+
+ struct
+ {
+ __IOM uint32_t IVx : 32; /*!< [31..0] Initialization vector bits */
+ } IV0L_B;
+ };
+
+ /** @brief Initialization vector 0, right-half register */
+ union
+ {
+ __IOM uint32_t IV0R;
+
+ struct
+ {
+ __IOM uint32_t IVx : 32; /*!< [31..0] Initialization vector bits */
+ } IV0R_B;
+ };
+
+ /** @brief Initialization vector 1, left-half register */
+ union
+ {
+ __IOM uint32_t IV1L;
+
+ struct
+ {
+ __IOM uint32_t IVx : 32; /*!< [31..0] Initialization vector bits */
+ } IV1L_B;
+ };
+
+ /** @brief Initialization vector 1, right-half register */
+ union
+ {
+ __IOM uint32_t IV1R;
+
+ struct
+ {
+ __IOM uint32_t IVx : 32; /*!< [31..0] Initialization vector bits */
+ } IV1R_B;
+ };
+
+} CRYP_T;
+
+/**
+* @brief The registers of HASH
+*/
+typedef struct
+{
+ /* NOTE(review): bit positions below are derived from the declared field widths;
+ semantic descriptions expand the vendor mnemonics -- confirm against the
+ APM32F4xx reference manual. */
+ /** @brief HASH Control register */
+ union
+ {
+ __IOM uint32_t CTRL;
+ struct
+ {
+ __IM uint32_t RESERVED : 2;
+ __IOM uint32_t INITCAL : 1; /*!< [2..2] Initialize digest calculation */
+ __IOM uint32_t DMAEN : 1; /*!< [3..3] DMA enable */
+ __IOM uint32_t DTYPE : 2; /*!< [5..4] Data type selection */
+ __IOM uint32_t MODESEL : 1; /*!< [6..6] Mode selection */
+ __IOM uint32_t ALGSEL : 1; /*!< [7..7] Algorithm selection */
+ __IOM uint32_t WNUM : 4; /*!< [11..8] Number of words */
+ __IOM uint32_t DINNEMPT : 1; /*!< [12..12] Data input not empty */
+ __IM uint32_t RESERVED1 : 3;
+ __IOM uint32_t LKEYSEL : 1; /*!< [16..16] Long key selection */
+ __IM uint32_t RESERVED2 : 15;
+ } CTRL_B;
+ };
+
+ /** @brief HASH Input Data register */
+ union
+ {
+ __IOM uint32_t INDATA;
+ struct
+ {
+ __IOM uint32_t INDATA : 32; /*!< [31..0] Data input */
+ } INDATA_B;
+ };
+
+ /** @brief HASH Start register */
+ union
+ {
+ __IOM uint32_t START;
+ struct
+ {
+ __IOM uint32_t LWNUM : 5; /*!< [4..0] Last word number of valid bits */
+ __IM uint32_t RESERVED1 : 3;
+ __OM uint32_t DIGCAL : 1; /*!< [8..8] Start digest calculation (write-only) */
+ __IM uint32_t RESERVED2 : 23;
+ } START_B;
+ };
+
+ /** @brief HASH message digest registers (offsets 0x0C..0x1C per this layout) */
+ __IO uint32_t DIG[5];
+
+ /** @brief HASH interrupt enable register */
+ union
+ {
+ __IOM uint32_t INT;
+ struct
+ {
+ __IOM uint32_t INDATA : 1; /*!< [0..0] Data input interrupt enable */
+ __IOM uint32_t DCALC : 1; /*!< [1..1] Digest calculation interrupt enable */
+ __IM uint32_t RESERVED1 : 30;
+ } INT_B;
+ };
+
+ /** @brief HASH status register */
+ union
+ {
+ __IOM uint32_t STS;
+ struct
+ {
+ __IOM uint32_t INDATAINT : 1; /*!< [0..0] Data input interrupt status */
+ __IOM uint32_t DCALCINT : 1; /*!< [1..1] Digest calculation interrupt status */
+ __IM uint32_t DMA : 1; /*!< [2..2] DMA status */
+ __IM uint32_t BUSY : 1; /*!< [3..3] Busy flag */
+ __IM uint32_t RESERVED : 28;
+ } STS_B;
+ };
+
+ __IM uint32_t RESERVED[52];
+
+ /** @brief HASH context swap register */
+ __IOM uint32_t CTSWAP[51];
+
+} HASH_T;
+
+/**
+ * @brief Random Number Generator (RNG)
+ */
+typedef struct
+{
+ /* NOTE(review): bit positions below are derived from the declared field widths;
+ semantic descriptions expand the vendor mnemonics -- confirm against the
+ APM32F4xx reference manual. */
+ union
+ {
+ __IOM uint32_t CTRL;
+
+ struct
+ {
+ __IM uint32_t RESERVED : 2;
+ __IOM uint32_t RNGEN : 1; /*!< [2..2] RNG enable */
+ __IOM uint32_t INTEN : 1; /*!< [3..3] Interrupt enable */
+ __IM uint32_t RESERVED1 : 28;
+ } CTRL_B;
+ };
+
+ union
+ {
+ __IOM uint32_t STS;
+
+ struct
+ {
+ __IM uint32_t DATARDY : 1; /*!< [0..0] Data ready flag */
+ __IM uint32_t CLKERCSTS : 1; /*!< [1..1] Clock error current status */
+ __IM uint32_t FSCSTS : 1; /*!< [2..2] Faulty sequence current status */
+ __IM uint32_t RESERVED1 : 2;
+ __IOM uint32_t CLKERINT : 1; /*!< [5..5] Clock error interrupt status */
+ __IOM uint32_t FSINT : 1; /*!< [6..6] Faulty sequence interrupt status */
+ __IM uint32_t RESERVED2 : 25;
+ } STS_B;
+ };
+
+ union
+ {
+ __IM uint32_t DATA;
+
+ struct
+ {
+ __IM uint32_t DATA : 32; /*!< [31..0] Random data (read-only) */
+ } DATA_B;
+ };
+} RNG_T;
+
+/**
+ * @brief Big number module (BN)
+ */
+typedef struct
+{
+ /* NOTE(review): bit positions below are derived from the declared field widths.
+ Mnemonics without an obvious expansion (SP, SELECT, CB, TR, INF) are left
+ undescribed -- confirm against the APM32F4xx reference manual. */
+ /** Revision register */
+ union
+ {
+ __IM uint32_t REV;
+
+ struct
+ {
+ __IM uint32_t MIN : 8; /*!< [7..0] Minor revision */
+ __IM uint32_t MID : 8; /*!< [15..8] Middle revision */
+ __IM uint32_t MAJ : 8; /*!< [23..16] Major revision */
+ __IM uint32_t RESERVED : 8;
+ } REV_B;
+
+ };
+
+ /** Control register */
+ union
+ {
+ __IOM uint32_t CTRL;
+
+ struct
+ {
+ __IOM uint32_t START : 1; /*!< [0..0] Start operation */
+ __IOM uint32_t SP : 3; /*!< [3..1] */
+ __IOM uint32_t SELECT : 4; /*!< [7..4] Operation select */
+ __IOM uint32_t IE : 1; /*!< [8..8] Interrupt enable */
+ __IM uint32_t RESERVED : 23;
+ } CTRL_B;
+
+ };
+
+ /** Status register */
+ union
+ {
+ __IOM uint32_t STS;
+
+ struct
+ {
+ __IOM uint32_t DONE : 1; /*!< [0..0] Operation done */
+ __IOM uint32_t BUSY : 1; /*!< [1..1] Busy flag */
+ __IOM uint32_t CB : 1; /*!< [2..2] */
+ __IOM uint32_t TR : 1; /*!< [3..3] */
+ __IOM uint32_t INF : 1; /*!< [4..4] */
+ __IOM uint32_t EVEN : 1; /*!< [5..5] Result is even */
+ __IOM uint32_t ZERO : 1; /*!< [6..6] Result is zero */
+ __IOM uint32_t ATTACKED : 1; /*!< [7..7] Attack detected */
+ __IOM uint32_t BLEN : 16; /*!< [23..8] Bit length */
+ __IM uint32_t RESERVED : 8;
+ } STS_B;
+ };
+
+ /** Operand Width register */
+ union
+ {
+ __IOM uint32_t WID;
+
+ struct
+ {
+ __IOM uint32_t PWID : 7; /*!< [6..0] Operand P width */
+ __IM uint32_t RESERVED : 7;
+ __IOM uint32_t KWID : 4; /*!< [17..14] Key width */
+ __IM uint32_t RESERVED1 : 14;
+ } WID_B;
+ };
+
+ /** Reserved gap (offsets 0x10..0x3C per this layout) */
+ __IM uint32_t RESERVED[12];
+
+ /** P/M register */
+ union
+ {
+ __IOM uint32_t P;
+
+ struct
+ {
+ __IOM uint32_t P_REG : 32; /*!< [31..0] Operand P / modulus M */
+ } P_B;
+ };
+
+ /** BN Operand/EC Parameter A register */
+ union
+ {
+ __IOM uint32_t A;
+
+ struct
+ {
+ __IOM uint32_t A_REG : 32; /*!< [31..0] Operand / parameter A */
+ } A_B;
+ };
+
+ /** BN Operand/EC Parameter B register */
+ union
+ {
+ __IOM uint32_t B;
+
+ struct
+ {
+ __IOM uint32_t B_REG : 32; /*!< [31..0] Operand / parameter B */
+ } B_B;
+ };
+
+ /** ECC Key register */
+ union
+ {
+ __IOM uint32_t K;
+
+ struct
+ {
+ __IOM uint32_t K_REG : 32; /*!< [31..0] ECC key */
+ } K_B;
+ };
+
+ /** ECC PX register */
+ union
+ {
+ __IOM uint32_t PX;
+
+ struct
+ {
+ __IOM uint32_t PX_REG : 32; /*!< [31..0] Point P, X coordinate */
+ } PX_B;
+ };
+
+ /** ECC PY register */
+ union
+ {
+ __IOM uint32_t PY;
+
+ struct
+ {
+ __IOM uint32_t PY_REG : 32; /*!< [31..0] Point P, Y coordinate */
+ } PY_B;
+ };
+
+ /** ECC SX register */
+ union
+ {
+ __IOM uint32_t SX;
+
+ struct
+ {
+ __IOM uint32_t SX_REG : 32; /*!< [31..0] Point S, X coordinate */
+ } SX_B;
+ };
+
+ /** ECC SY register */
+ union
+ {
+ __IOM uint32_t SY;
+
+ struct
+ {
+ __IOM uint32_t SY_REG : 32; /*!< [31..0] Point S, Y coordinate */
+ } SY_B;
+ };
+
+ /** ECC Result RX register (read-only) */
+ union
+ {
+ __IM uint32_t RX;
+
+ struct
+ {
+ __IM uint32_t RX_REG : 32; /*!< [31..0] Result point, X coordinate */
+ } RX_B;
+ };
+
+ /** ECC Result RY register (read-only) */
+ union
+ {
+ __IM uint32_t RY;
+
+ struct
+ {
+ __IM uint32_t RY_REG : 32; /*!< [31..0] Result point, Y coordinate */
+ } RY_B;
+ };
+
+ /** Random Number register (write-only) */
+ union
+ {
+ __OM uint32_t RND;
+
+ struct
+ {
+ __OM uint32_t RND_REG : 32; /*!< [31..0] Random number input */
+ } RND_B;
+ };
+} BN_T;
+
+/**
+ * @brief SM3
+ */
+typedef struct
+{
+ /* NOTE(review): bit positions below are derived from the declared field widths;
+ semantic descriptions expand the vendor mnemonics -- confirm against the
+ APM32F4xx reference manual. */
+ /** Revision Register */
+ union
+ {
+ __IM uint32_t REV;
+
+ struct
+ {
+ __IM uint32_t MIN : 8; /*!< [7..0] Minor revision */
+ __IM uint32_t MID : 8; /*!< [15..8] Middle revision */
+ __IM uint32_t MAJ : 8; /*!< [23..16] Major revision */
+ __IM uint32_t RESERVED : 8;
+ } REV_B;
+ };
+
+ /** Control Register */
+ union
+ {
+ __IOM uint32_t CTRL;
+ struct
+ {
+ __IOM uint32_t START : 1; /*!< [0..0] Start calculation */
+ __IOM uint32_t IE : 1; /*!< [1..1] Interrupt enable */
+ __IOM uint32_t INIMODE : 1; /*!< [2..2] Initial value mode */
+ __IOM uint32_t PADMODE : 1; /*!< [3..3] Padding mode */
+ __IOM uint32_t ENDIAN : 1; /*!< [4..4] Endianness selection */
+ __IM uint32_t RESERVED : 27;
+ } CTRL_B;
+ };
+
+ /** Status register */
+ union
+ {
+ __IOM uint32_t STS;
+
+ struct
+ {
+ __IOM uint32_t DONE : 1; /*!< [0..0] Calculation done */
+ __IOM uint32_t BUSY : 1; /*!< [1..1] Busy flag */
+ __IM uint32_t RESERVED : 30;
+ } STS_B;
+ };
+
+ /** Data input length higher register (write-only) */
+ union
+ {
+ __OM uint32_t DILH;
+
+ struct
+ {
+ __OM uint32_t DATA : 32; /*!< [31..0] Input length, high word */
+ } DILH_B;
+ };
+
+ /** Data input length lower register (write-only) */
+ union
+ {
+ __OM uint32_t DILL;
+
+ struct
+ {
+ __OM uint32_t DATA : 32; /*!< [31..0] Input length, low word */
+ } DILL_B;
+ };
+
+ /** Input Message Register[16](0x14-0x50) */
+ __IOM uint32_t DIN[16];
+ /** Input Initial Value[8](0x54-0x70) */
+ __IOM uint32_t IV[8];
+ /** Output Message Digest Register[8](0x74-0x90) */
+ __IM uint32_t DOUT[8];
+
+} SM3_T;
+
+/**
+ * @brief SM4
+ */
+typedef struct
+{
+ /* NOTE(review): bit positions below are derived from the declared field widths;
+ semantic descriptions expand the vendor mnemonics -- confirm against the
+ APM32F4xx reference manual. */
+ /** Revision Register */
+ union
+ {
+ __IM uint32_t REV;
+
+ struct
+ {
+ __IM uint32_t REC0 : 1; /*!< [0..0] */
+ __IM uint32_t REC1 : 1; /*!< [1..1] */
+ __IM uint32_t RESERVED : 30;
+ } REV_B;
+ };
+
+ /** Control Register */
+ union
+ {
+ __IOM uint32_t CTRL;
+ struct
+ {
+ __IOM uint32_t START : 1; /*!< [0..0] Start operation */
+ __IOM uint32_t IE : 1; /*!< [1..1] Interrupt enable */
+ __IOM uint32_t ENC : 1; /*!< [2..2] Encrypt/decrypt selection */
+ __IOM uint32_t MODE : 1; /*!< [3..3] Mode selection */
+ __IM uint32_t RESERVED : 28;
+ } CTRL_B;
+ };
+
+ /** Status register */
+ union
+ {
+ __IOM uint32_t STS;
+
+ struct
+ {
+ __IOM uint32_t DONE : 1; /*!< [0..0] Operation done */
+ __IOM uint32_t BUSY : 1; /*!< [1..1] Busy flag */
+ __IM uint32_t RESERVED : 30;
+ } STS_B;
+ };
+
+ /** SM4 key register[4](0x0C..0x18) */
+ __IOM uint32_t KEY[4];
+ /** SM4 data input register[4](0x1C..0x28) -- original comment said 0x20..0x28,
+ inconsistent with this layout (KEY ends at 0x18 and DOUT starts at 0x2C) */
+ __IOM uint32_t DIN[4];
+ /** SM4 data output register[4](0x2C..0x38) */
+ __IM uint32_t DOUT[4];
+ /** SM4 initial vector register[4](0x3C..0x48) */
+ __IOM uint32_t IV[4];
+} SM4_T;
+
+/**
+ * @}
+ */
+
+/** @defgroup Peripheral_memory_map
+ @{
+*/
+
+/** FMC base address in the alias region */
+#define FMC_BASE ((uint32_t)0x08000000)
+/** CCM(core coupled memory) data RAM(64 KB) base address in the alias region */
+#define CCMDATARAM_BASE ((uint32_t)0x10000000)
+/** SRAM1 base address in the alias region */
+#define SRAM1_BASE ((uint32_t)0x20000000)
+/** SRAM2 base address in the alias region */
+#define SRAM2_BASE ((uint32_t)0x2001C000)
+/** SRAM3 base address in the alias region */
+#define SRAM3_BASE ((uint32_t)0x20020000)
+/** Peripheral base address in the alias region */
+#define PERIPH_BASE ((uint32_t)0x40000000)
+/** Backup SRAM(4 KB) base address in the alias region */
+#define BKPSRAM_BASE ((uint32_t)0x40024000)
+
+/** CCM(core coupled memory) data RAM(64 KB) base address in the bit-band region */
+#define CCMDATARAM_BB_BASE ((uint32_t)0x12000000)
+/** SRAM1 base address in the bit-band region */
+#define SRAM1_BB_BASE ((uint32_t)0x22000000)
+/** SRAM2 base address in the bit-band region */
+#define SRAM2_BB_BASE ((uint32_t)0x22380000)
+/** SRAM3 base address in the bit-band region */
+#define SRAM3_BB_BASE ((uint32_t)0x22400000)
+/** Peripheral base address in the bit-band region */
+#define PERIPH_BB_BASE ((uint32_t)0x42000000)
+/** Backup SRAM(4 KB) base address in the bit-band region */
+#define BKPSRAM_BB_BASE ((uint32_t)0x42480000)
+
+/** Legacy defines */
+#define SRAM_BASE SRAM1_BASE
+#define SRAM_BB_BASE SRAM1_BB_BASE
+
+/** EMMC registers base address */
+#define SMC_R_BASE ((uint32_t)0xA0000000)
+/** QSPI registers base address */
+#define QSPI_BASE ((uint32_t)0xA0000000)
+/** DMC registers base address */
+#define DMC_BASE ((uint32_t)0xA0000000)
+
+/** Peripheral memory map */
+#define APB1PERIPH_BASE PERIPH_BASE
+#define APB2PERIPH_BASE (PERIPH_BASE + 0x00010000)
+#define AHB1PERIPH_BASE (PERIPH_BASE + 0x00020000)
+#define AHB2PERIPH_BASE (PERIPH_BASE + 0x10000000)
+
+/** APB1 peripherals */
+#define TMR2_BASE (APB1PERIPH_BASE + 0x0000)
+#define TMR3_BASE (APB1PERIPH_BASE + 0x0400)
+#define TMR4_BASE (APB1PERIPH_BASE + 0x0800)
+#define TMR5_BASE (APB1PERIPH_BASE + 0x0C00)
+#define TMR6_BASE (APB1PERIPH_BASE + 0x1000)
+#define TMR7_BASE (APB1PERIPH_BASE + 0x1400)
+#define TMR12_BASE (APB1PERIPH_BASE + 0x1800)
+#define TMR13_BASE (APB1PERIPH_BASE + 0x1C00)
+#define TMR14_BASE (APB1PERIPH_BASE + 0x2000)
+#define RTC_BASE (APB1PERIPH_BASE + 0x2800)
+#define WWDT_BASE (APB1PERIPH_BASE + 0x2C00)
+#define IWDT_BASE (APB1PERIPH_BASE + 0x3000)
+#define I2S2ext_BASE (APB1PERIPH_BASE + 0x3400)
+#define SPI2_BASE (APB1PERIPH_BASE + 0x3800)
+#define SPI3_BASE (APB1PERIPH_BASE + 0x3C00)
+#define I2S3ext_BASE (APB1PERIPH_BASE + 0x4000)
+#define USART2_BASE (APB1PERIPH_BASE + 0x4400)
+#define USART3_BASE (APB1PERIPH_BASE + 0x4800)
+#define UART4_BASE (APB1PERIPH_BASE + 0x4C00)
+#define UART5_BASE (APB1PERIPH_BASE + 0x5000)
+#define I2C1_BASE (APB1PERIPH_BASE + 0x5400)
+#define I2C2_BASE (APB1PERIPH_BASE + 0x5800)
+#define I2C3_BASE (APB1PERIPH_BASE + 0x5C00)
+#define CAN1_BASE (APB1PERIPH_BASE + 0x6400)
+#define CAN2_BASE (APB1PERIPH_BASE + 0x6800)
+#define PMU_BASE (APB1PERIPH_BASE + 0x7000)
+#define DAC_BASE (APB1PERIPH_BASE + 0x7400)
+#define UART7_BASE (APB1PERIPH_BASE + 0x7800)
+#define UART8_BASE (APB1PERIPH_BASE + 0x7C00)
+
+/** APB2 peripherals */
+#define TMR1_BASE (APB2PERIPH_BASE + 0x0000)
+#define TMR8_BASE (APB2PERIPH_BASE + 0x0400)
+#define USART1_BASE (APB2PERIPH_BASE + 0x1000)
+#define USART6_BASE (APB2PERIPH_BASE + 0x1400)
+#define UART9_BASE (APB2PERIPH_BASE + 0x1800U)
+#define UART10_BASE (APB2PERIPH_BASE + 0x1C00U)
+#define ADC1_BASE (APB2PERIPH_BASE + 0x2000)
+#define ADC2_BASE (APB2PERIPH_BASE + 0x2100)
+#define ADC3_BASE (APB2PERIPH_BASE + 0x2200)
+#define ADC_BASE (APB2PERIPH_BASE + 0x2300)
+#define SDIO_BASE (APB2PERIPH_BASE + 0x2C00)
+#define SPI1_BASE (APB2PERIPH_BASE + 0x3000)
+#define SPI4_BASE (APB2PERIPH_BASE + 0x3400)
+#define SYSCFG_BASE (APB2PERIPH_BASE + 0x3800)
+#define EINT_BASE (APB2PERIPH_BASE + 0x3C00)
+#define TMR9_BASE (APB2PERIPH_BASE + 0x4000)
+#define TMR10_BASE (APB2PERIPH_BASE + 0x4400)
+#define TMR11_BASE (APB2PERIPH_BASE + 0x4800)
+#define SPI5_BASE (APB2PERIPH_BASE + 0x5000)
+#define SPI6_BASE (APB2PERIPH_BASE + 0x5400)
+#define SAI1_BASE (APB2PERIPH_BASE + 0x5800)
+#define SAI1_Block_A_BASE (SAI1_BASE + 0x004)
+#define SAI1_Block_B_BASE (SAI1_BASE + 0x024)
+
+/** AHB1 peripherals */
+#define GPIOA_BASE (AHB1PERIPH_BASE + 0x0000)
+#define GPIOB_BASE (AHB1PERIPH_BASE + 0x0400)
+#define GPIOC_BASE (AHB1PERIPH_BASE + 0x0800)
+#define GPIOD_BASE (AHB1PERIPH_BASE + 0x0C00)
+#define GPIOE_BASE (AHB1PERIPH_BASE + 0x1000)
+#define GPIOF_BASE (AHB1PERIPH_BASE + 0x1400)
+#define GPIOG_BASE (AHB1PERIPH_BASE + 0x1800)
+#define GPIOH_BASE (AHB1PERIPH_BASE + 0x1C00)
+#define GPIOI_BASE (AHB1PERIPH_BASE + 0x2000)
+#define GPIOJ_BASE (AHB1PERIPH_BASE + 0x2400)
+#define GPIOK_BASE (AHB1PERIPH_BASE + 0x2800)
+#define CRC_BASE (AHB1PERIPH_BASE + 0x3000)
+#define RCM_BASE (AHB1PERIPH_BASE + 0x3800)
+#define FMC_R_BASE (AHB1PERIPH_BASE + 0x3C00)
+#define DMA1_BASE (AHB1PERIPH_BASE + 0x6000)
+#define DMA1_Stream0_BASE (DMA1_BASE + 0x010)
+#define DMA1_Stream1_BASE (DMA1_BASE + 0x028)
+#define DMA1_Stream2_BASE (DMA1_BASE + 0x040)
+#define DMA1_Stream3_BASE (DMA1_BASE + 0x058)
+#define DMA1_Stream4_BASE (DMA1_BASE + 0x070)
+#define DMA1_Stream5_BASE (DMA1_BASE + 0x088)
+#define DMA1_Stream6_BASE (DMA1_BASE + 0x0A0)
+#define DMA1_Stream7_BASE (DMA1_BASE + 0x0B8)
+#define DMA2_BASE (AHB1PERIPH_BASE + 0x6400)
+#define DMA2_Stream0_BASE (DMA2_BASE + 0x010)
+#define DMA2_Stream1_BASE (DMA2_BASE + 0x028)
+#define DMA2_Stream2_BASE (DMA2_BASE + 0x040)
+#define DMA2_Stream3_BASE (DMA2_BASE + 0x058)
+#define DMA2_Stream4_BASE (DMA2_BASE + 0x070)
+#define DMA2_Stream5_BASE (DMA2_BASE + 0x088)
+#define DMA2_Stream6_BASE (DMA2_BASE + 0x0A0)
+#define DMA2_Stream7_BASE (DMA2_BASE + 0x0B8)
+#define ETH_BASE (AHB1PERIPH_BASE + 0x8000)
+#define ETH_MAC_BASE (ETH_BASE)
+#define ETH_MMC_BASE (ETH_BASE + 0x0100)
+#define ETH_PTP_BASE (ETH_BASE + 0x0700)
+#define ETH_DMA_BASE (ETH_BASE + 0x1000)
+#define DMA2D_BASE (AHB1PERIPH_BASE + 0xB000)
+
+/** AHB2 peripherals */
+#define DCI_BASE (AHB2PERIPH_BASE + 0x50000)
+#define CRYP_BASE (AHB2PERIPH_BASE + 0x60000)
+#define HASH_BASE (AHB2PERIPH_BASE + 0x60400)
+#define HASH_DIGEST_BASE (AHB2PERIPH_BASE + 0x60710)
+#define RNG_BASE (AHB2PERIPH_BASE + 0x60800)
+
+#define SMC_Bank1_R_BASE (SMC_R_BASE + 0x0000)
+#define SMC_Bank1E_R_BASE (SMC_R_BASE + 0x0104)
+#define SMC_Bank2_R_BASE (SMC_R_BASE + 0x0060)
+#define SMC_Bank3_R_BASE (SMC_R_BASE + 0x0080)
+#define SMC_Bank4_R_BASE (SMC_R_BASE + 0x00A0)
+
+/* Debug MCU registers base address */
+#define DBGMCU_BASE ((uint32_t )0xE0042000)
+/** Big number (BN) registers base address */
+#define BN_BASE ((uint32_t )0x500A0000)
+/** SM3 registers base address */
+#define SM3_BASE ((uint32_t )0x50080000)
+/** SM4 registers base address */
+#define SM4_BASE ((uint32_t )0x50080400)
+
+/**
+ * @}
+ */
+
+/** @defgroup Peripheral_declaration
+ @{
+*/
+
+#define TMR2 ((TMR_T *) TMR2_BASE)
+#define TMR3 ((TMR_T *) TMR3_BASE)
+#define TMR4 ((TMR_T *) TMR4_BASE)
+#define TMR5 ((TMR_T *) TMR5_BASE)
+#define TMR6 ((TMR_T *) TMR6_BASE)
+#define TMR7 ((TMR_T *) TMR7_BASE)
+#define TMR12 ((TMR_T *) TMR12_BASE)
+#define TMR13 ((TMR_T *) TMR13_BASE)
+#define TMR14 ((TMR_T *) TMR14_BASE)
+#define RTC ((RTC_T *) RTC_BASE)
+#define WWDT ((WWDT_T *) WWDT_BASE)
+#define IWDT ((IWDT_T *) IWDT_BASE)
+#define I2S2ext ((SPI_T *) I2S2ext_BASE)
+#define SPI2 ((SPI_T *) SPI2_BASE)
+#define SPI3 ((SPI_T *) SPI3_BASE)
+#define I2S3ext ((SPI_T *) I2S3ext_BASE)
+#define USART2 ((USART_T *) USART2_BASE)
+#define USART3 ((USART_T *) USART3_BASE)
+#define UART4 ((USART_T *) UART4_BASE)
+#define UART5 ((USART_T *) UART5_BASE)
+#define I2C1 ((I2C_T *) I2C1_BASE)
+#define I2C2 ((I2C_T *) I2C2_BASE)
+#define I2C3 ((I2C_T *) I2C3_BASE)
+#define CAN1 ((CAN_T *) CAN1_BASE)
+#define CAN2 ((CAN_T *) CAN2_BASE)
+#define PMU ((PMU_T *) PMU_BASE)
+#define DAC ((DAC_T *) DAC_BASE)
+#define UART7 ((USART_T *) UART7_BASE)
+#define UART8 ((USART_T *) UART8_BASE)
+#define UART9 ((USART_T *) UART9_BASE)
+#define UART10 ((USART_T *) UART10_BASE)
+#define TMR1 ((TMR_T *) TMR1_BASE)
+#define TMR8 ((TMR_T *) TMR8_BASE)
+#define USART1 ((USART_T *) USART1_BASE)
+#define USART6 ((USART_T *) USART6_BASE)
+#define ADC ((ADC_Common_T *) ADC_BASE)
+#define ADC1 ((ADC_T *) ADC1_BASE)
+#define ADC2 ((ADC_T *) ADC2_BASE)
+#define ADC3 ((ADC_T *) ADC3_BASE)
+#define SDIO ((SDIO_T *) SDIO_BASE)
+#define SPI1 ((SPI_T *) SPI1_BASE)
+#define SPI4 ((SPI_T *) SPI4_BASE)
+#define SYSCFG ((SYSCFG_T *) SYSCFG_BASE)
+#define EINT ((EINT_T *) EINT_BASE)
+#define TMR9 ((TMR_T *) TMR9_BASE)
+#define TMR10 ((TMR_T *) TMR10_BASE)
+#define TMR11 ((TMR_T *) TMR11_BASE)
+#define SPI5 ((SPI_T *) SPI5_BASE)
+#define SPI6 ((SPI_T *) SPI6_BASE)
+#define SAI1 ((SAI_T *) SAI1_BASE)
+#define SAI1_Block_A ((SAI_Block_T *)SAI1_Block_A_BASE)
+#define SAI1_Block_B ((SAI_Block_T *)SAI1_Block_B_BASE)
+#define GPIOA ((GPIO_T *) GPIOA_BASE)
+#define GPIOB ((GPIO_T *) GPIOB_BASE)
+#define GPIOC ((GPIO_T *) GPIOC_BASE)
+#define GPIOD ((GPIO_T *) GPIOD_BASE)
+#define GPIOE ((GPIO_T *) GPIOE_BASE)
+#define GPIOF ((GPIO_T *) GPIOF_BASE)
+#define GPIOG ((GPIO_T *) GPIOG_BASE)
+#define GPIOH ((GPIO_T *) GPIOH_BASE)
+#define GPIOI ((GPIO_T *) GPIOI_BASE)
+#define GPIOJ ((GPIO_T *) GPIOJ_BASE)
+#define GPIOK ((GPIO_T *) GPIOK_BASE)
+#define CRC ((CRC_T *) CRC_BASE)
+#define RCM ((RCM_T *) RCM_BASE)
+#define FMC ((FMC_T *) FMC_R_BASE)
+#define DMA1 ((DMA_T *) DMA1_BASE)
+#define DMA1_Stream0 ((DMA_Stream_T *) DMA1_Stream0_BASE)
+#define DMA1_Stream1 ((DMA_Stream_T *) DMA1_Stream1_BASE)
+#define DMA1_Stream2 ((DMA_Stream_T *) DMA1_Stream2_BASE)
+#define DMA1_Stream3 ((DMA_Stream_T *) DMA1_Stream3_BASE)
+#define DMA1_Stream4 ((DMA_Stream_T *) DMA1_Stream4_BASE)
+#define DMA1_Stream5 ((DMA_Stream_T *) DMA1_Stream5_BASE)
+#define DMA1_Stream6 ((DMA_Stream_T *) DMA1_Stream6_BASE)
+#define DMA1_Stream7 ((DMA_Stream_T *) DMA1_Stream7_BASE)
+#define DMA2 ((DMA_T *) DMA2_BASE)
+#define DMA2_Stream0 ((DMA_Stream_T *) DMA2_Stream0_BASE)
+#define DMA2_Stream1 ((DMA_Stream_T *) DMA2_Stream1_BASE)
+#define DMA2_Stream2 ((DMA_Stream_T *) DMA2_Stream2_BASE)
+#define DMA2_Stream3 ((DMA_Stream_T *) DMA2_Stream3_BASE)
+#define DMA2_Stream4 ((DMA_Stream_T *) DMA2_Stream4_BASE)
+#define DMA2_Stream5 ((DMA_Stream_T *) DMA2_Stream5_BASE)
+#define DMA2_Stream6 ((DMA_Stream_T *) DMA2_Stream6_BASE)
+#define DMA2_Stream7 ((DMA_Stream_T *) DMA2_Stream7_BASE)
+#define ETH ((ETH_T *) ETH_BASE)
+#define DMA2D ((DMA2D_T *)DMA2D_BASE)
+#define DCI ((DCI_T *) DCI_BASE)
+#define CRYP ((CRYP_T *) CRYP_BASE)
+#define HASH ((HASH_T *) HASH_BASE)
+#define RNG ((RNG_T *) RNG_BASE)
+
+#define SMC_Bank1 ((SMC_Bank1_T *) SMC_Bank1_R_BASE)
+#define SMC_Bank1E ((SMC_Bank1E_T *) SMC_Bank1E_R_BASE)
+#define SMC_Bank2 ((SMC_Bank2_T *) SMC_Bank2_R_BASE)
+#define SMC_Bank3 ((SMC_Bank3_T *) SMC_Bank3_R_BASE)
+#define SMC_Bank4 ((SMC_Bank4_T *) SMC_Bank4_R_BASE)
+
+#define DBGMCU ((DBGMCU_T *) DBGMCU_BASE)
+#define BN ((BN_T *) BN_BASE)
+#define SM3 ((SM3_T *) SM3_BASE)
+#define SM4 ((SM4_T *) SM4_BASE)
+#define DMC ((DMC_T *) DMC_BASE)
+
+/**
+ * @}
+ */
+
+/** @defgroup Exported_Macros
+ @{
+*/
+
+/* Define one bit mask */
+#define BIT0 ((uint32_t)0x00000001)
+#define BIT1 ((uint32_t)0x00000002)
+#define BIT2 ((uint32_t)0x00000004)
+#define BIT3 ((uint32_t)0x00000008)
+#define BIT4 ((uint32_t)0x00000010)
+#define BIT5 ((uint32_t)0x00000020)
+#define BIT6 ((uint32_t)0x00000040)
+#define BIT7 ((uint32_t)0x00000080)
+#define BIT8 ((uint32_t)0x00000100)
+#define BIT9 ((uint32_t)0x00000200)
+#define BIT10 ((uint32_t)0x00000400)
+#define BIT11 ((uint32_t)0x00000800)
+#define BIT12 ((uint32_t)0x00001000)
+#define BIT13 ((uint32_t)0x00002000)
+#define BIT14 ((uint32_t)0x00004000)
+#define BIT15 ((uint32_t)0x00008000)
+#define BIT16 ((uint32_t)0x00010000)
+#define BIT17 ((uint32_t)0x00020000)
+#define BIT18 ((uint32_t)0x00040000)
+#define BIT19 ((uint32_t)0x00080000)
+#define BIT20 ((uint32_t)0x00100000)
+#define BIT21 ((uint32_t)0x00200000)
+#define BIT22 ((uint32_t)0x00400000)
+#define BIT23 ((uint32_t)0x00800000)
+#define BIT24 ((uint32_t)0x01000000)
+#define BIT25 ((uint32_t)0x02000000)
+#define BIT26 ((uint32_t)0x04000000)
+#define BIT27 ((uint32_t)0x08000000)
+#define BIT28 ((uint32_t)0x10000000)
+#define BIT29 ((uint32_t)0x20000000)
+#define BIT30 ((uint32_t)0x40000000)
+#define BIT31 ((uint32_t)0x80000000)
+
+#define SET_BIT(REG, BIT) ((REG) |= (BIT))
+
+#define CLEAR_BIT(REG, BIT) ((REG) &= ~(BIT))
+
+#define READ_BIT(REG, BIT) ((REG) & (BIT))
+
+#define CLEAR_REG(REG) ((REG) = (0x0))
+
+#define WRITE_REG(REG, VAL) ((REG) = (VAL))
+
+#define READ_REG(REG) ((REG))
+
+#define MODIFY_REG(REG, CLEARMASK, SETMASK) WRITE_REG((REG), (((READ_REG(REG)) & (~(CLEARMASK))) | (SETMASK)))
+
+/**
+ * @}
+ */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* __APM32F4XX_H */
+
+/**@} end of group APM32F4xx */
+/**@} end of group CMSIS */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Include/system_apm32f4xx.h b/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Include/system_apm32f4xx.h
new file mode 100644
index 0000000000..ce73fe260d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Include/system_apm32f4xx.h
@@ -0,0 +1,68 @@
+/*!
+ * @file system_apm32f4xx.h
+ *
+ * @brief CMSIS Cortex-M4 Device Peripheral Access Layer System Source File
+ *
+ * @version V1.0.2
+ *
+ * @date 2022-06-23
+ *
+ * @attention
+ *
+ * Copyright (C) 2021-2022 Geehy Semiconductor
+ *
+ * You may not use this file except in compliance with the
+ * GEEHY COPYRIGHT NOTICE (GEEHY SOFTWARE PACKAGE LICENSE).
+ *
+ * The program is only for reference, which is distributed in the hope
+ * that it will be useful and instructional for customers to develop
+ * their software. Unless required by applicable law or agreed to in
+ * writing, the program is distributed on an "AS IS" BASIS, WITHOUT
+ * ANY WARRANTY OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the GEEHY SOFTWARE PACKAGE LICENSE for the governing permissions
+ * and limitations under the License.
+ */
+
+/* Define to prevent recursive inclusion */
+#ifndef __SYSTEM_APM32F4XX_H
+#define __SYSTEM_APM32F4XX_H
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/** @addtogroup CMSIS
+ @{
+*/
+
+/** @addtogroup APM32F4xx_System
+ @{
+*/
+
+/** @defgroup System_Variables
+ @{
+*/
+
+/* System Clock Frequency (Core Clock) */
+extern uint32_t SystemCoreClock;
+
+/**
+ * @}
+ */
+
+/** @defgroup System_Functions
+ @{
+*/
+
+extern void SystemInit(void);
+extern void SystemCoreClockUpdate(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*__SYSTEM_APM32F4XX_H */
+
+/**@} end of group System_Functions */
+/**@} end of group APM32F4xx_System */
+/**@} end of group CMSIS */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Source/arm/startup_apm32f40x.s b/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Source/arm/startup_apm32f40x.s
new file mode 100644
index 0000000000..2ee2fe417d
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Source/arm/startup_apm32f40x.s
@@ -0,0 +1,424 @@
+;/*!
+; * @file startup_apm32f40x.s
+; *
+; * @brief CMSIS Cortex-M4 based Core Device Startup File for Device startup_apm32f40x
+; *
+; * @version V1.0.2
+; *
+; * @date 2022-06-23
+; *
+; * @attention
+; *
+; * Copyright (C) 2021-2022 Geehy Semiconductor
+; *
+; * You may not use this file except in compliance with the
+; * GEEHY COPYRIGHT NOTICE (GEEHY SOFTWARE PACKAGE LICENSE).
+; *
+; * The program is only for reference, which is distributed in the hope
+; * that it will be useful and instructional for customers to develop
+; * their software. Unless required by applicable law or agreed to in
+; * writing, the program is distributed on an "AS IS" BASIS, WITHOUT
+; * ANY WARRANTY OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the GEEHY SOFTWARE PACKAGE LICENSE for the governing permissions
+; * and limitations under the License.
+; */
+
+; Stack Configuration
+; Stack Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Stack_Size EQU 0x00000400
+
+ AREA STACK, NOINIT, READWRITE, ALIGN=3
+Stack_Mem SPACE Stack_Size
+__initial_sp
+
+; Heap Configuration
+; Heap Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Heap_Size EQU 0x00000200
+
+ AREA HEAP, NOINIT, READWRITE, ALIGN=3
+__heap_base
+Heap_Mem SPACE Heap_Size
+__heap_limit
+
+ PRESERVE8
+ THUMB
+
+; Vector Table Mapped to Address 0 at Reset
+ AREA RESET, DATA, READONLY
+ EXPORT __Vectors
+ EXPORT __Vectors_End
+ EXPORT __Vectors_Size
+
+__Vectors DCD __initial_sp ; Top of Stack
+ DCD Reset_Handler ; Reset Handler
+ DCD NMI_Handler ; NMI Handler
+ DCD HardFault_Handler ; Hard Fault Handler
+ DCD MemManage_Handler ; MPU Fault Handler
+ DCD BusFault_Handler ; Bus Fault Handler
+ DCD UsageFault_Handler ; Usage Fault Handler
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD SVC_Handler ; SVCall Handler
+ DCD DebugMon_Handler ; Debug Monitor Handler
+ DCD 0 ; Reserved
+ DCD PendSV_Handler ; PendSV Handler
+ DCD SysTick_Handler ; SysTick Handler
+
+ ; External Interrupts
+ DCD WWDT_IRQHandler ; Window WatchDog
+ DCD PVD_IRQHandler ; PVD through EINT Line detection
+ DCD TAMP_STAMP_IRQHandler ; Tamper and TimeStamps through the EINT line
+ DCD RTC_WKUP_IRQHandler ; RTC Wakeup through the EINT line
+ DCD FLASH_IRQHandler ; FLASH
+ DCD RCM_IRQHandler ; RCM
+ DCD EINT0_IRQHandler ; EINT Line0
+ DCD EINT1_IRQHandler ; EINT Line1
+ DCD EINT2_IRQHandler ; EINT Line2
+ DCD EINT3_IRQHandler ; EINT Line3
+ DCD EINT4_IRQHandler ; EINT Line4
+ DCD DMA1_STR0_IRQHandler ; DMA1 Stream 0
+ DCD DMA1_STR1_IRQHandler ; DMA1 Stream 1
+ DCD DMA1_STR2_IRQHandler ; DMA1 Stream 2
+ DCD DMA1_STR3_IRQHandler ; DMA1 Stream 3
+ DCD DMA1_STR4_IRQHandler ; DMA1 Stream 4
+ DCD DMA1_STR5_IRQHandler ; DMA1 Stream 5
+ DCD DMA1_STR6_IRQHandler ; DMA1 Stream 6
+ DCD ADC_IRQHandler ; ADC1, ADC2 and ADC3s
+ DCD CAN1_TX_IRQHandler ; CAN1 TX
+ DCD CAN1_RX0_IRQHandler ; CAN1 RX0
+ DCD CAN1_RX1_IRQHandler ; CAN1 RX1
+ DCD CAN1_SCE_IRQHandler ; CAN1 SCE
+ DCD EINT9_5_IRQHandler ; External Line[9:5]s
+ DCD TMR1_BRK_TMR9_IRQHandler ; TMR1 Break and TMR9
+ DCD TMR1_UP_TMR10_IRQHandler ; TMR1 Update and TMR10
+ DCD TMR1_TRG_COM_TMR11_IRQHandler ; TMR1 Trigger and Commutation and TMR11
+ DCD TMR1_CC_IRQHandler ; TMR1 Capture Compare
+ DCD TMR2_IRQHandler ; TMR2
+ DCD TMR3_IRQHandler ; TMR3
+ DCD TMR4_IRQHandler ; TMR4
+ DCD I2C1_EV_IRQHandler ; I2C1 Event
+ DCD I2C1_ER_IRQHandler ; I2C1 Error
+ DCD I2C2_EV_IRQHandler ; I2C2 Event
+ DCD I2C2_ER_IRQHandler ; I2C2 Error
+ DCD SPI1_IRQHandler ; SPI1
+ DCD SPI2_IRQHandler ; SPI2
+ DCD USART1_IRQHandler ; USART1
+ DCD USART2_IRQHandler ; USART2
+ DCD USART3_IRQHandler ; USART3
+ DCD EINT15_10_IRQHandler ; External Line[15:10]s
+ DCD RTC_Alarm_IRQHandler ; RTC Alarm (A and B) through EINT Line
+ DCD OTG_FS_WKUP_IRQHandler ; OTG_FS Wakeup through EINT line
+ DCD TMR8_BRK_TMR12_IRQHandler ; TMR8 Break and TMR12
+ DCD TMR8_UP_TMR13_IRQHandler ; TMR8 Update and TMR13
+ DCD TMR8_TRG_COM_TMR14_IRQHandler ; TMR8 Trigger and Commutation and TMR14
+ DCD TMR8_CC_IRQHandler ; TMR8 Capture Compare
+ DCD DMA1_STR7_IRQHandler ; DMA1 Stream 7
+ DCD EMMC_IRQHandler ; EMMC
+ DCD SDIO_IRQHandler ; SDIO
+ DCD TMR5_IRQHandler ; TMR5
+ DCD SPI3_IRQHandler ; SPI3
+ DCD UART4_IRQHandler ; UART4
+ DCD UART5_IRQHandler ; UART5
+ DCD TMR6_DAC_IRQHandler ; TMR6 and DAC1&2 underrun errors
+ DCD TMR7_IRQHandler ; TMR7
+ DCD DMA2_STR0_IRQHandler ; DMA2 Stream 0
+ DCD DMA2_STR1_IRQHandler ; DMA2 Stream 1
+ DCD DMA2_STR2_IRQHandler ; DMA2 Stream 2
+ DCD DMA2_STR3_IRQHandler ; DMA2 Stream 3
+ DCD DMA2_STR4_IRQHandler ; DMA2 Stream 4
+ DCD ETH_IRQHandler ; Ethernet
+ DCD ETH_WKUP_IRQHandler ; Ethernet Wakeup through EINT line
+ DCD CAN2_TX_IRQHandler ; CAN2 TX
+ DCD CAN2_RX0_IRQHandler ; CAN2 RX0
+ DCD CAN2_RX1_IRQHandler ; CAN2 RX1
+ DCD CAN2_SCE_IRQHandler ; CAN2 SCE
+ DCD OTG_FS_IRQHandler ; OTG_FS
+ DCD DMA2_STR5_IRQHandler ; DMA2 Stream 5
+ DCD DMA2_STR6_IRQHandler ; DMA2 Stream 6
+ DCD DMA2_STR7_IRQHandler ; DMA2 Stream 7
+ DCD USART6_IRQHandler ; USART6
+ DCD I2C3_EV_IRQHandler ; I2C3 event
+ DCD I2C3_ER_IRQHandler ; I2C3 error
+ DCD OTG_HS1_EP1_OUT_IRQHandler ; OTG_HS1 End Point 1 Out
+ DCD OTG_HS1_EP1_IN_IRQHandler ; OTG_HS1 End Point 1 In
+ DCD OTG_HS1_WKUP_IRQHandler ; OTG_HS1 Wakeup through EINT
+ DCD OTG_HS1_IRQHandler ; OTG_HS1
+ DCD DCI_IRQHandler ; DCI
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD FPU_IRQHandler ; FPU
+ DCD SM3_IRQHandler ; SM3
+ DCD SM4_IRQHandler ; SM4
+ DCD BN_IRQHandler ; BN
+__Vectors_End
+
+__Vectors_Size EQU __Vectors_End - __Vectors
+
+ AREA |.text|, CODE, READONLY
+
+; Reset handler
+Reset_Handler PROC
+ EXPORT Reset_Handler [WEAK]
+ IMPORT SystemInit
+ IMPORT __main
+ LDR R0, =SystemInit
+ BLX R0
+ LDR R0, =__main
+ BX R0
+ ENDP
+
+; Dummy Exception Handlers (infinite loops which can be modified)
+
+NMI_Handler PROC
+ EXPORT NMI_Handler [WEAK]
+ B .
+ ENDP
+HardFault_Handler\
+ PROC
+ EXPORT HardFault_Handler [WEAK]
+ B .
+ ENDP
+MemManage_Handler\
+ PROC
+ EXPORT MemManage_Handler [WEAK]
+ B .
+ ENDP
+BusFault_Handler\
+ PROC
+ EXPORT BusFault_Handler [WEAK]
+ B .
+ ENDP
+UsageFault_Handler\
+ PROC
+ EXPORT UsageFault_Handler [WEAK]
+ B .
+ ENDP
+SVC_Handler PROC
+ EXPORT SVC_Handler [WEAK]
+ B .
+ ENDP
+DebugMon_Handler\
+ PROC
+ EXPORT DebugMon_Handler [WEAK]
+ B .
+ ENDP
+PendSV_Handler PROC
+ EXPORT PendSV_Handler [WEAK]
+ B .
+ ENDP
+SysTick_Handler PROC
+ EXPORT SysTick_Handler [WEAK]
+ B .
+ ENDP
+
+Default_Handler PROC
+
+ EXPORT WWDT_IRQHandler [WEAK]
+ EXPORT PVD_IRQHandler [WEAK]
+ EXPORT TAMP_STAMP_IRQHandler [WEAK]
+ EXPORT RTC_WKUP_IRQHandler [WEAK]
+ EXPORT FLASH_IRQHandler [WEAK]
+ EXPORT RCM_IRQHandler [WEAK]
+ EXPORT EINT0_IRQHandler [WEAK]
+ EXPORT EINT1_IRQHandler [WEAK]
+ EXPORT EINT2_IRQHandler [WEAK]
+ EXPORT EINT3_IRQHandler [WEAK]
+ EXPORT EINT4_IRQHandler [WEAK]
+ EXPORT DMA1_STR0_IRQHandler [WEAK]
+ EXPORT DMA1_STR1_IRQHandler [WEAK]
+ EXPORT DMA1_STR2_IRQHandler [WEAK]
+ EXPORT DMA1_STR3_IRQHandler [WEAK]
+ EXPORT DMA1_STR4_IRQHandler [WEAK]
+ EXPORT DMA1_STR5_IRQHandler [WEAK]
+ EXPORT DMA1_STR6_IRQHandler [WEAK]
+ EXPORT ADC_IRQHandler [WEAK]
+ EXPORT CAN1_TX_IRQHandler [WEAK]
+ EXPORT CAN1_RX0_IRQHandler [WEAK]
+ EXPORT CAN1_RX1_IRQHandler [WEAK]
+ EXPORT CAN1_SCE_IRQHandler [WEAK]
+ EXPORT EINT9_5_IRQHandler [WEAK]
+ EXPORT TMR1_BRK_TMR9_IRQHandler [WEAK]
+ EXPORT TMR1_UP_TMR10_IRQHandler [WEAK]
+ EXPORT TMR1_TRG_COM_TMR11_IRQHandler [WEAK]
+ EXPORT TMR1_CC_IRQHandler [WEAK]
+ EXPORT TMR2_IRQHandler [WEAK]
+ EXPORT TMR3_IRQHandler [WEAK]
+ EXPORT TMR4_IRQHandler [WEAK]
+ EXPORT I2C1_EV_IRQHandler [WEAK]
+ EXPORT I2C1_ER_IRQHandler [WEAK]
+ EXPORT I2C2_EV_IRQHandler [WEAK]
+ EXPORT I2C2_ER_IRQHandler [WEAK]
+ EXPORT SPI1_IRQHandler [WEAK]
+ EXPORT SPI2_IRQHandler [WEAK]
+ EXPORT USART1_IRQHandler [WEAK]
+ EXPORT USART2_IRQHandler [WEAK]
+ EXPORT USART3_IRQHandler [WEAK]
+ EXPORT EINT15_10_IRQHandler [WEAK]
+ EXPORT RTC_Alarm_IRQHandler [WEAK]
+ EXPORT OTG_FS_WKUP_IRQHandler [WEAK]
+ EXPORT TMR8_BRK_TMR12_IRQHandler [WEAK]
+ EXPORT TMR8_UP_TMR13_IRQHandler [WEAK]
+ EXPORT TMR8_TRG_COM_TMR14_IRQHandler [WEAK]
+ EXPORT TMR8_CC_IRQHandler [WEAK]
+ EXPORT DMA1_STR7_IRQHandler [WEAK]
+ EXPORT EMMC_IRQHandler [WEAK]
+ EXPORT SDIO_IRQHandler [WEAK]
+ EXPORT TMR5_IRQHandler [WEAK]
+ EXPORT SPI3_IRQHandler [WEAK]
+ EXPORT UART4_IRQHandler [WEAK]
+ EXPORT UART5_IRQHandler [WEAK]
+ EXPORT TMR6_DAC_IRQHandler [WEAK]
+ EXPORT TMR7_IRQHandler [WEAK]
+ EXPORT DMA2_STR0_IRQHandler [WEAK]
+ EXPORT DMA2_STR1_IRQHandler [WEAK]
+ EXPORT DMA2_STR2_IRQHandler [WEAK]
+ EXPORT DMA2_STR3_IRQHandler [WEAK]
+ EXPORT DMA2_STR4_IRQHandler [WEAK]
+ EXPORT ETH_IRQHandler [WEAK]
+ EXPORT ETH_WKUP_IRQHandler [WEAK]
+ EXPORT CAN2_TX_IRQHandler [WEAK]
+ EXPORT CAN2_RX0_IRQHandler [WEAK]
+ EXPORT CAN2_RX1_IRQHandler [WEAK]
+ EXPORT CAN2_SCE_IRQHandler [WEAK]
+ EXPORT OTG_FS_IRQHandler [WEAK]
+ EXPORT DMA2_STR5_IRQHandler [WEAK]
+ EXPORT DMA2_STR6_IRQHandler [WEAK]
+ EXPORT DMA2_STR7_IRQHandler [WEAK]
+ EXPORT USART6_IRQHandler [WEAK]
+ EXPORT I2C3_EV_IRQHandler [WEAK]
+ EXPORT I2C3_ER_IRQHandler [WEAK]
+ EXPORT OTG_HS1_EP1_OUT_IRQHandler [WEAK]
+ EXPORT OTG_HS1_EP1_IN_IRQHandler [WEAK]
+ EXPORT OTG_HS1_WKUP_IRQHandler [WEAK]
+ EXPORT OTG_HS1_IRQHandler [WEAK]
+ EXPORT DCI_IRQHandler [WEAK]
+ EXPORT FPU_IRQHandler [WEAK]
+ EXPORT SM3_IRQHandler [WEAK]
+ EXPORT SM4_IRQHandler [WEAK]
+ EXPORT BN_IRQHandler [WEAK]
+
+WWDT_IRQHandler
+PVD_IRQHandler
+TAMP_STAMP_IRQHandler
+RTC_WKUP_IRQHandler
+FLASH_IRQHandler
+RCM_IRQHandler
+EINT0_IRQHandler
+EINT1_IRQHandler
+EINT2_IRQHandler
+EINT3_IRQHandler
+EINT4_IRQHandler
+DMA1_STR0_IRQHandler
+DMA1_STR1_IRQHandler
+DMA1_STR2_IRQHandler
+DMA1_STR3_IRQHandler
+DMA1_STR4_IRQHandler
+DMA1_STR5_IRQHandler
+DMA1_STR6_IRQHandler
+ADC_IRQHandler
+CAN1_TX_IRQHandler
+CAN1_RX0_IRQHandler
+CAN1_RX1_IRQHandler
+CAN1_SCE_IRQHandler
+EINT9_5_IRQHandler
+TMR1_BRK_TMR9_IRQHandler
+TMR1_UP_TMR10_IRQHandler
+TMR1_TRG_COM_TMR11_IRQHandler
+TMR1_CC_IRQHandler
+TMR2_IRQHandler
+TMR3_IRQHandler
+TMR4_IRQHandler
+I2C1_EV_IRQHandler
+I2C1_ER_IRQHandler
+I2C2_EV_IRQHandler
+I2C2_ER_IRQHandler
+SPI1_IRQHandler
+SPI2_IRQHandler
+USART1_IRQHandler
+USART2_IRQHandler
+USART3_IRQHandler
+EINT15_10_IRQHandler
+RTC_Alarm_IRQHandler
+OTG_FS_WKUP_IRQHandler
+TMR8_BRK_TMR12_IRQHandler
+TMR8_UP_TMR13_IRQHandler
+TMR8_TRG_COM_TMR14_IRQHandler
+TMR8_CC_IRQHandler
+DMA1_STR7_IRQHandler
+EMMC_IRQHandler
+SDIO_IRQHandler
+TMR5_IRQHandler
+SPI3_IRQHandler
+UART4_IRQHandler
+UART5_IRQHandler
+TMR6_DAC_IRQHandler
+TMR7_IRQHandler
+DMA2_STR0_IRQHandler
+DMA2_STR1_IRQHandler
+DMA2_STR2_IRQHandler
+DMA2_STR3_IRQHandler
+DMA2_STR4_IRQHandler
+ETH_IRQHandler
+ETH_WKUP_IRQHandler
+CAN2_TX_IRQHandler
+CAN2_RX0_IRQHandler
+CAN2_RX1_IRQHandler
+CAN2_SCE_IRQHandler
+OTG_FS_IRQHandler
+DMA2_STR5_IRQHandler
+DMA2_STR6_IRQHandler
+DMA2_STR7_IRQHandler
+USART6_IRQHandler
+I2C3_EV_IRQHandler
+I2C3_ER_IRQHandler
+OTG_HS1_EP1_OUT_IRQHandler
+OTG_HS1_EP1_IN_IRQHandler
+OTG_HS1_WKUP_IRQHandler
+OTG_HS1_IRQHandler
+DCI_IRQHandler
+FPU_IRQHandler
+SM3_IRQHandler
+SM4_IRQHandler
+BN_IRQHandler
+
+ B .
+
+ ENDP
+
+ ALIGN
+
+;*******************************************************************************
+; User Stack and Heap initialization
+;*******************************************************************************
+ IF :DEF:__MICROLIB
+
+ EXPORT __initial_sp
+ EXPORT __heap_base
+ EXPORT __heap_limit
+
+ ELSE
+
+ IMPORT __use_two_region_memory
+ EXPORT __user_initial_stackheap
+
+__user_initial_stackheap
+
+ LDR R0, = Heap_Mem
+ LDR R1, =(Stack_Mem + Stack_Size)
+ LDR R2, = (Heap_Mem + Heap_Size)
+ LDR R3, = Stack_Mem
+ BX LR
+
+ ALIGN
+
+ ENDIF
+
+ END
+
+;************************ (C) COPYRIGHT Geehy Semiconductor Co.,Ltd *****END OF FILE*****
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Source/arm/startup_apm32f41x.s b/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Source/arm/startup_apm32f41x.s
new file mode 100644
index 0000000000..a350e0a60c
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Source/arm/startup_apm32f41x.s
@@ -0,0 +1,427 @@
+;/*!
+; * @file startup_apm32f41x.s
+; *
+; * @brief CMSIS Cortex-M4 based Core Device Startup File for Device startup_apm32f41x
+; *
+; * @version V1.0.2
+; *
+; * @date 2022-06-23
+; *
+; * @attention
+; *
+; * Copyright (C) 2021-2022 Geehy Semiconductor
+; *
+; * You may not use this file except in compliance with the
+; * GEEHY COPYRIGHT NOTICE (GEEHY SOFTWARE PACKAGE LICENSE).
+; *
+; * The program is only for reference, which is distributed in the hope
+; * that it will be useful and instructional for customers to develop
+; * their software. Unless required by applicable law or agreed to in
+; * writing, the program is distributed on an "AS IS" BASIS, WITHOUT
+; * ANY WARRANTY OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the GEEHY SOFTWARE PACKAGE LICENSE for the governing permissions
+; * and limitations under the License.
+; */
+
+; Stack Configuration
+; Stack Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Stack_Size EQU 0x00000400
+
+ AREA STACK, NOINIT, READWRITE, ALIGN=3
+Stack_Mem SPACE Stack_Size
+__initial_sp
+
+; Heap Configuration
+; Heap Size (in Bytes) <0x0-0xFFFFFFFF:8>
+;
+
+Heap_Size EQU 0x00000200
+
+ AREA HEAP, NOINIT, READWRITE, ALIGN=3
+__heap_base
+Heap_Mem SPACE Heap_Size
+__heap_limit
+
+ PRESERVE8
+ THUMB
+
+; Vector Table Mapped to Address 0 at Reset
+ AREA RESET, DATA, READONLY
+ EXPORT __Vectors
+ EXPORT __Vectors_End
+ EXPORT __Vectors_Size
+
+__Vectors DCD __initial_sp ; Top of Stack
+ DCD Reset_Handler ; Reset Handler
+ DCD NMI_Handler ; NMI Handler
+ DCD HardFault_Handler ; Hard Fault Handler
+ DCD MemManage_Handler ; MPU Fault Handler
+ DCD BusFault_Handler ; Bus Fault Handler
+ DCD UsageFault_Handler ; Usage Fault Handler
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD SVC_Handler ; SVCall Handler
+ DCD DebugMon_Handler ; Debug Monitor Handler
+ DCD 0 ; Reserved
+ DCD PendSV_Handler ; PendSV Handler
+ DCD SysTick_Handler ; SysTick Handler
+
+ ; External Interrupts
+ DCD WWDT_IRQHandler ; Window WatchDog
+ DCD PVD_IRQHandler ; PVD through EINT Line detection
+ DCD TAMP_STAMP_IRQHandler ; Tamper and TimeStamps through the EINT line
+ DCD RTC_WKUP_IRQHandler ; RTC Wakeup through the EINT line
+ DCD FLASH_IRQHandler ; FLASH
+ DCD RCM_IRQHandler ; RCM
+ DCD EINT0_IRQHandler ; EINT Line0
+ DCD EINT1_IRQHandler ; EINT Line1
+ DCD EINT2_IRQHandler ; EINT Line2
+ DCD EINT3_IRQHandler ; EINT Line3
+ DCD EINT4_IRQHandler ; EINT Line4
+ DCD DMA1_STR0_IRQHandler ; DMA1 Stream 0
+ DCD DMA1_STR1_IRQHandler ; DMA1 Stream 1
+ DCD DMA1_STR2_IRQHandler ; DMA1 Stream 2
+ DCD DMA1_STR3_IRQHandler ; DMA1 Stream 3
+ DCD DMA1_STR4_IRQHandler ; DMA1 Stream 4
+ DCD DMA1_STR5_IRQHandler ; DMA1 Stream 5
+ DCD DMA1_STR6_IRQHandler ; DMA1 Stream 6
+ DCD ADC_IRQHandler ; ADC1, ADC2 and ADC3s
+ DCD CAN1_TX_IRQHandler ; CAN1 TX
+ DCD CAN1_RX0_IRQHandler ; CAN1 RX0
+ DCD CAN1_RX1_IRQHandler ; CAN1 RX1
+ DCD CAN1_SCE_IRQHandler ; CAN1 SCE
+ DCD EINT9_5_IRQHandler ; External Line[9:5]s
+ DCD TMR1_BRK_TMR9_IRQHandler ; TMR1 Break and TMR9
+ DCD TMR1_UP_TMR10_IRQHandler ; TMR1 Update and TMR10
+ DCD TMR1_TRG_COM_TMR11_IRQHandler ; TMR1 Trigger and Commutation and TMR11
+ DCD TMR1_CC_IRQHandler ; TMR1 Capture Compare
+ DCD TMR2_IRQHandler ; TMR2
+ DCD TMR3_IRQHandler ; TMR3
+ DCD TMR4_IRQHandler ; TMR4
+ DCD I2C1_EV_IRQHandler ; I2C1 Event
+ DCD I2C1_ER_IRQHandler ; I2C1 Error
+ DCD I2C2_EV_IRQHandler ; I2C2 Event
+ DCD I2C2_ER_IRQHandler ; I2C2 Error
+ DCD SPI1_IRQHandler ; SPI1
+ DCD SPI2_IRQHandler ; SPI2
+ DCD USART1_IRQHandler ; USART1
+ DCD USART2_IRQHandler ; USART2
+ DCD USART3_IRQHandler ; USART3
+ DCD EINT15_10_IRQHandler ; External Line[15:10]s
+ DCD RTC_Alarm_IRQHandler ; RTC Alarm (A and B) through EINT Line
+ DCD OTG_FS_WKUP_IRQHandler ; OTG_FS Wakeup through EINT line
+ DCD TMR8_BRK_TMR12_IRQHandler ; TMR8 Break and TMR12
+ DCD TMR8_UP_TMR13_IRQHandler ; TMR8 Update and TMR13
+ DCD TMR8_TRG_COM_TMR14_IRQHandler ; TMR8 Trigger and Commutation and TMR14
+ DCD TMR8_CC_IRQHandler ; TMR8 Capture Compare
+ DCD DMA1_STR7_IRQHandler ; DMA1 Stream 7
+ DCD EMMC_IRQHandler ; EMMC
+ DCD SDIO_IRQHandler ; SDIO
+ DCD TMR5_IRQHandler ; TMR5
+ DCD SPI3_IRQHandler ; SPI3
+ DCD UART4_IRQHandler ; UART4
+ DCD UART5_IRQHandler ; UART5
+ DCD TMR6_DAC_IRQHandler ; TMR6 and DAC1&2 underrun errors
+ DCD TMR7_IRQHandler ; TMR7
+ DCD DMA2_STR0_IRQHandler ; DMA2 Stream 0
+ DCD DMA2_STR1_IRQHandler ; DMA2 Stream 1
+ DCD DMA2_STR2_IRQHandler ; DMA2 Stream 2
+ DCD DMA2_STR3_IRQHandler ; DMA2 Stream 3
+ DCD DMA2_STR4_IRQHandler ; DMA2 Stream 4
+ DCD ETH_IRQHandler ; Ethernet
+ DCD ETH_WKUP_IRQHandler ; Ethernet Wakeup through EINT line
+ DCD CAN2_TX_IRQHandler ; CAN2 TX
+ DCD CAN2_RX0_IRQHandler ; CAN2 RX0
+ DCD CAN2_RX1_IRQHandler ; CAN2 RX1
+ DCD CAN2_SCE_IRQHandler ; CAN2 SCE
+ DCD OTG_FS_IRQHandler ; OTG_FS
+ DCD DMA2_STR5_IRQHandler ; DMA2 Stream 5
+ DCD DMA2_STR6_IRQHandler ; DMA2 Stream 6
+ DCD DMA2_STR7_IRQHandler ; DMA2 Stream 7
+ DCD USART6_IRQHandler ; USART6
+ DCD I2C3_EV_IRQHandler ; I2C3 event
+ DCD I2C3_ER_IRQHandler ; I2C3 error
+ DCD OTG_HS1_EP1_OUT_IRQHandler ; OTG_HS1 End Point 1 Out
+ DCD OTG_HS1_EP1_IN_IRQHandler ; OTG_HS1 End Point 1 In
+ DCD OTG_HS1_WKUP_IRQHandler ; OTG_HS1 Wakeup through EINT
+ DCD OTG_HS1_IRQHandler ; OTG_HS1
+ DCD DCI_IRQHandler ; DCI
+ DCD CRYP_IRQHandler ; CRYP crypto
+ DCD HASH_RNG_IRQHandler ; Hash and Rng
+ DCD FPU_IRQHandler ; FPU
+ DCD SM3_IRQHandler ; SM3
+ DCD SM4_IRQHandler ; SM4
+ DCD BN_IRQHandler ; BN
+__Vectors_End
+
+__Vectors_Size EQU __Vectors_End - __Vectors
+
+ AREA |.text|, CODE, READONLY
+
+; Reset handler
+Reset_Handler PROC
+ EXPORT Reset_Handler [WEAK]
+ IMPORT SystemInit
+ IMPORT __main
+ LDR R0, =SystemInit
+ BLX R0
+ LDR R0, =__main
+ BX R0
+ ENDP
+
+; Dummy Exception Handlers (infinite loops which can be modified)
+
+NMI_Handler PROC
+ EXPORT NMI_Handler [WEAK]
+ B .
+ ENDP
+HardFault_Handler\
+ PROC
+ EXPORT HardFault_Handler [WEAK]
+ B .
+ ENDP
+MemManage_Handler\
+ PROC
+ EXPORT MemManage_Handler [WEAK]
+ B .
+ ENDP
+BusFault_Handler\
+ PROC
+ EXPORT BusFault_Handler [WEAK]
+ B .
+ ENDP
+UsageFault_Handler\
+ PROC
+ EXPORT UsageFault_Handler [WEAK]
+ B .
+ ENDP
+SVC_Handler PROC
+ EXPORT SVC_Handler [WEAK]
+ B .
+ ENDP
+DebugMon_Handler\
+ PROC
+ EXPORT DebugMon_Handler [WEAK]
+ B .
+ ENDP
+PendSV_Handler PROC
+ EXPORT PendSV_Handler [WEAK]
+ B .
+ ENDP
+SysTick_Handler PROC
+ EXPORT SysTick_Handler [WEAK]
+ B .
+ ENDP
+
+Default_Handler PROC
+
+ EXPORT WWDT_IRQHandler [WEAK]
+ EXPORT PVD_IRQHandler [WEAK]
+ EXPORT TAMP_STAMP_IRQHandler [WEAK]
+ EXPORT RTC_WKUP_IRQHandler [WEAK]
+ EXPORT FLASH_IRQHandler [WEAK]
+ EXPORT RCM_IRQHandler [WEAK]
+ EXPORT EINT0_IRQHandler [WEAK]
+ EXPORT EINT1_IRQHandler [WEAK]
+ EXPORT EINT2_IRQHandler [WEAK]
+ EXPORT EINT3_IRQHandler [WEAK]
+ EXPORT EINT4_IRQHandler [WEAK]
+ EXPORT DMA1_STR0_IRQHandler [WEAK]
+ EXPORT DMA1_STR1_IRQHandler [WEAK]
+ EXPORT DMA1_STR2_IRQHandler [WEAK]
+ EXPORT DMA1_STR3_IRQHandler [WEAK]
+ EXPORT DMA1_STR4_IRQHandler [WEAK]
+ EXPORT DMA1_STR5_IRQHandler [WEAK]
+ EXPORT DMA1_STR6_IRQHandler [WEAK]
+ EXPORT ADC_IRQHandler [WEAK]
+ EXPORT CAN1_TX_IRQHandler [WEAK]
+ EXPORT CAN1_RX0_IRQHandler [WEAK]
+ EXPORT CAN1_RX1_IRQHandler [WEAK]
+ EXPORT CAN1_SCE_IRQHandler [WEAK]
+ EXPORT EINT9_5_IRQHandler [WEAK]
+ EXPORT TMR1_BRK_TMR9_IRQHandler [WEAK]
+ EXPORT TMR1_UP_TMR10_IRQHandler [WEAK]
+ EXPORT TMR1_TRG_COM_TMR11_IRQHandler [WEAK]
+ EXPORT TMR1_CC_IRQHandler [WEAK]
+ EXPORT TMR2_IRQHandler [WEAK]
+ EXPORT TMR3_IRQHandler [WEAK]
+ EXPORT TMR4_IRQHandler [WEAK]
+ EXPORT I2C1_EV_IRQHandler [WEAK]
+ EXPORT I2C1_ER_IRQHandler [WEAK]
+ EXPORT I2C2_EV_IRQHandler [WEAK]
+ EXPORT I2C2_ER_IRQHandler [WEAK]
+ EXPORT SPI1_IRQHandler [WEAK]
+ EXPORT SPI2_IRQHandler [WEAK]
+ EXPORT USART1_IRQHandler [WEAK]
+ EXPORT USART2_IRQHandler [WEAK]
+ EXPORT USART3_IRQHandler [WEAK]
+ EXPORT EINT15_10_IRQHandler [WEAK]
+ EXPORT RTC_Alarm_IRQHandler [WEAK]
+ EXPORT OTG_FS_WKUP_IRQHandler [WEAK]
+ EXPORT TMR8_BRK_TMR12_IRQHandler [WEAK]
+ EXPORT TMR8_UP_TMR13_IRQHandler [WEAK]
+ EXPORT TMR8_TRG_COM_TMR14_IRQHandler [WEAK]
+ EXPORT TMR8_CC_IRQHandler [WEAK]
+ EXPORT DMA1_STR7_IRQHandler [WEAK]
+ EXPORT EMMC_IRQHandler [WEAK]
+ EXPORT SDIO_IRQHandler [WEAK]
+ EXPORT TMR5_IRQHandler [WEAK]
+ EXPORT SPI3_IRQHandler [WEAK]
+ EXPORT UART4_IRQHandler [WEAK]
+ EXPORT UART5_IRQHandler [WEAK]
+ EXPORT TMR6_DAC_IRQHandler [WEAK]
+ EXPORT TMR7_IRQHandler [WEAK]
+ EXPORT DMA2_STR0_IRQHandler [WEAK]
+ EXPORT DMA2_STR1_IRQHandler [WEAK]
+ EXPORT DMA2_STR2_IRQHandler [WEAK]
+ EXPORT DMA2_STR3_IRQHandler [WEAK]
+ EXPORT DMA2_STR4_IRQHandler [WEAK]
+ EXPORT ETH_IRQHandler [WEAK]
+ EXPORT ETH_WKUP_IRQHandler [WEAK]
+ EXPORT CAN2_TX_IRQHandler [WEAK]
+ EXPORT CAN2_RX0_IRQHandler [WEAK]
+ EXPORT CAN2_RX1_IRQHandler [WEAK]
+ EXPORT CAN2_SCE_IRQHandler [WEAK]
+ EXPORT OTG_FS_IRQHandler [WEAK]
+ EXPORT DMA2_STR5_IRQHandler [WEAK]
+ EXPORT DMA2_STR6_IRQHandler [WEAK]
+ EXPORT DMA2_STR7_IRQHandler [WEAK]
+ EXPORT USART6_IRQHandler [WEAK]
+ EXPORT I2C3_EV_IRQHandler [WEAK]
+ EXPORT I2C3_ER_IRQHandler [WEAK]
+ EXPORT OTG_HS1_EP1_OUT_IRQHandler [WEAK]
+ EXPORT OTG_HS1_EP1_IN_IRQHandler [WEAK]
+ EXPORT OTG_HS1_WKUP_IRQHandler [WEAK]
+ EXPORT OTG_HS1_IRQHandler [WEAK]
+ EXPORT DCI_IRQHandler [WEAK]
+ EXPORT CRYP_IRQHandler [WEAK]
+ EXPORT HASH_RNG_IRQHandler [WEAK]
+ EXPORT FPU_IRQHandler [WEAK]
+ EXPORT SM3_IRQHandler [WEAK]
+ EXPORT SM4_IRQHandler [WEAK]
+ EXPORT BN_IRQHandler [WEAK]
+WWDT_IRQHandler
+PVD_IRQHandler
+TAMP_STAMP_IRQHandler
+RTC_WKUP_IRQHandler
+FLASH_IRQHandler
+RCM_IRQHandler
+EINT0_IRQHandler
+EINT1_IRQHandler
+EINT2_IRQHandler
+EINT3_IRQHandler
+EINT4_IRQHandler
+DMA1_STR0_IRQHandler
+DMA1_STR1_IRQHandler
+DMA1_STR2_IRQHandler
+DMA1_STR3_IRQHandler
+DMA1_STR4_IRQHandler
+DMA1_STR5_IRQHandler
+DMA1_STR6_IRQHandler
+ADC_IRQHandler
+CAN1_TX_IRQHandler
+CAN1_RX0_IRQHandler
+CAN1_RX1_IRQHandler
+CAN1_SCE_IRQHandler
+EINT9_5_IRQHandler
+TMR1_BRK_TMR9_IRQHandler
+TMR1_UP_TMR10_IRQHandler
+TMR1_TRG_COM_TMR11_IRQHandler
+TMR1_CC_IRQHandler
+TMR2_IRQHandler
+TMR3_IRQHandler
+TMR4_IRQHandler
+I2C1_EV_IRQHandler
+I2C1_ER_IRQHandler
+I2C2_EV_IRQHandler
+I2C2_ER_IRQHandler
+SPI1_IRQHandler
+SPI2_IRQHandler
+USART1_IRQHandler
+USART2_IRQHandler
+USART3_IRQHandler
+EINT15_10_IRQHandler
+RTC_Alarm_IRQHandler
+OTG_FS_WKUP_IRQHandler
+TMR8_BRK_TMR12_IRQHandler
+TMR8_UP_TMR13_IRQHandler
+TMR8_TRG_COM_TMR14_IRQHandler
+TMR8_CC_IRQHandler
+DMA1_STR7_IRQHandler
+EMMC_IRQHandler
+SDIO_IRQHandler
+TMR5_IRQHandler
+SPI3_IRQHandler
+UART4_IRQHandler
+UART5_IRQHandler
+TMR6_DAC_IRQHandler
+TMR7_IRQHandler
+DMA2_STR0_IRQHandler
+DMA2_STR1_IRQHandler
+DMA2_STR2_IRQHandler
+DMA2_STR3_IRQHandler
+DMA2_STR4_IRQHandler
+ETH_IRQHandler
+ETH_WKUP_IRQHandler
+CAN2_TX_IRQHandler
+CAN2_RX0_IRQHandler
+CAN2_RX1_IRQHandler
+CAN2_SCE_IRQHandler
+OTG_FS_IRQHandler
+DMA2_STR5_IRQHandler
+DMA2_STR6_IRQHandler
+DMA2_STR7_IRQHandler
+USART6_IRQHandler
+I2C3_EV_IRQHandler
+I2C3_ER_IRQHandler
+OTG_HS1_EP1_OUT_IRQHandler
+OTG_HS1_EP1_IN_IRQHandler
+OTG_HS1_WKUP_IRQHandler
+OTG_HS1_IRQHandler
+DCI_IRQHandler
+CRYP_IRQHandler
+HASH_RNG_IRQHandler
+FPU_IRQHandler
+SM3_IRQHandler
+SM4_IRQHandler
+BN_IRQHandler
+
+ B .
+
+ ENDP
+
+ ALIGN
+
+;*******************************************************************************
+; User Stack and Heap initialization
+;*******************************************************************************
+ IF :DEF:__MICROLIB
+
+ EXPORT __initial_sp
+ EXPORT __heap_base
+ EXPORT __heap_limit
+
+ ELSE
+
+ IMPORT __use_two_region_memory
+ EXPORT __user_initial_stackheap
+
+__user_initial_stackheap
+
+ LDR R0, = Heap_Mem
+ LDR R1, =(Stack_Mem + Stack_Size)
+ LDR R2, = (Heap_Mem + Heap_Size)
+ LDR R3, = Stack_Mem
+ BX LR
+
+ ALIGN
+
+ ENDIF
+
+ END
+
+;************************ (C) COPYRIGHT Geehy Semiconductor Co.,Ltd *****END OF FILE*****
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Source/iar/startup_apm32f40x.s b/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Source/iar/startup_apm32f40x.s
new file mode 100644
index 0000000000..befb78b012
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Source/iar/startup_apm32f40x.s
@@ -0,0 +1,618 @@
+;/*!
+; * @file startup_apm32f40x.s
+; *
+; * @brief CMSIS Cortex-M4 based Core Device Startup File for Device startup_apm32f40x
+; *
+; * @version V1.0.2
+; *
+; * @date 2022-06-23
+; *
+; * @attention
+; *
+; * Copyright (C) 2021-2022 Geehy Semiconductor
+; *
+; * You may not use this file except in compliance with the
+; * GEEHY COPYRIGHT NOTICE (GEEHY SOFTWARE PACKAGE LICENSE).
+; *
+; * The program is only for reference, which is distributed in the hope
+; * that it will be useful and instructional for customers to develop
+; * their software. Unless required by applicable law or agreed to in
+; * writing, the program is distributed on an "AS IS" BASIS, WITHOUT
+; * ANY WARRANTY OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the GEEHY SOFTWARE PACKAGE LICENSE for the governing permissions
+; * and limitations under the License.
+; */
+
+ MODULE ?cstartup
+
+ ;; Forward declaration of sections.
+ SECTION CSTACK:DATA:NOROOT(3)
+
+ SECTION .intvec:CODE:NOROOT(2)
+
+ EXTERN __iar_program_start
+ EXTERN SystemInit
+ PUBLIC __vector_table
+
+ DATA
+__vector_table
+ DCD sfe(CSTACK)
+ DCD Reset_Handler ; Reset Handler
+
+ DCD NMI_Handler ; NMI Handler
+ DCD HardFault_Handler ; Hard Fault Handler
+ DCD MemManage_Handler ; MPU Fault Handler
+ DCD BusFault_Handler ; Bus Fault Handler
+ DCD UsageFault_Handler ; Usage Fault Handler
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD SVC_Handler ; SVCall Handler
+ DCD DebugMon_Handler ; Debug Monitor Handler
+ DCD 0 ; Reserved
+ DCD PendSV_Handler ; PendSV Handler
+ DCD SysTick_Handler ; SysTick Handler
+
+ ; External Interrupts
+ DCD WWDT_IRQHandler ; Window WatchDog
+ DCD PVD_IRQHandler ; PVD through EINT Line detection
+ DCD TAMP_STAMP_IRQHandler ; Tamper and TimeStamps through the EINT line
+ DCD RTC_WKUP_IRQHandler ; RTC Wakeup through the EINT line
+ DCD FLASH_IRQHandler ; FLASH
+ DCD RCM_IRQHandler ; RCM
+ DCD EINT0_IRQHandler ; EINT Line0
+ DCD EINT1_IRQHandler ; EINT Line1
+ DCD EINT2_IRQHandler ; EINT Line2
+ DCD EINT3_IRQHandler ; EINT Line3
+ DCD EINT4_IRQHandler ; EINT Line4
+ DCD DMA1_STR0_IRQHandler ; DMA1 Stream 0
+ DCD DMA1_STR1_IRQHandler ; DMA1 Stream 1
+ DCD DMA1_STR2_IRQHandler ; DMA1 Stream 2
+ DCD DMA1_STR3_IRQHandler ; DMA1 Stream 3
+ DCD DMA1_STR4_IRQHandler ; DMA1 Stream 4
+ DCD DMA1_STR5_IRQHandler ; DMA1 Stream 5
+ DCD DMA1_STR6_IRQHandler ; DMA1 Stream 6
+ DCD ADC_IRQHandler ; ADC1, ADC2 and ADC3s
+ DCD CAN1_TX_IRQHandler ; CAN1 TX
+ DCD CAN1_RX0_IRQHandler ; CAN1 RX0
+ DCD CAN1_RX1_IRQHandler ; CAN1 RX1
+ DCD CAN1_SCE_IRQHandler ; CAN1 SCE
+ DCD EINT9_5_IRQHandler ; External Line[9:5]s
+ DCD TMR1_BRK_TMR9_IRQHandler ; TMR1 Break and TMR9
+ DCD TMR1_UP_TMR10_IRQHandler ; TMR1 Update and TMR10
+ DCD TMR1_TRG_COM_TMR11_IRQHandler ; TMR1 Trigger and Commutation and TMR11
+ DCD TMR1_CC_IRQHandler ; TMR1 Capture Compare
+ DCD TMR2_IRQHandler ; TMR2
+ DCD TMR3_IRQHandler ; TMR3
+ DCD TMR4_IRQHandler ; TMR4
+ DCD I2C1_EV_IRQHandler ; I2C1 Event
+ DCD I2C1_ER_IRQHandler ; I2C1 Error
+ DCD I2C2_EV_IRQHandler ; I2C2 Event
+ DCD I2C2_ER_IRQHandler ; I2C2 Error
+ DCD SPI1_IRQHandler ; SPI1
+ DCD SPI2_IRQHandler ; SPI2
+ DCD USART1_IRQHandler ; USART1
+ DCD USART2_IRQHandler ; USART2
+ DCD USART3_IRQHandler ; USART3
+ DCD EINT15_10_IRQHandler ; External Line[15:10]s
+ DCD RTC_Alarm_IRQHandler ; RTC Alarm (A and B) through EINT Line
+ DCD OTG_FS_WKUP_IRQHandler ; OTG_FS Wakeup through EINT line
+ DCD TMR8_BRK_TMR12_IRQHandler ; TMR8 Break and TMR12
+ DCD TMR8_UP_TMR13_IRQHandler ; TMR8 Update and TMR13
+ DCD TMR8_TRG_COM_TMR14_IRQHandler ; TMR8 Trigger and Commutation and TMR14
+ DCD TMR8_CC_IRQHandler ; TMR8 Capture Compare
+ DCD DMA1_STR7_IRQHandler ; DMA1 Stream 7
+ DCD EMMC_IRQHandler ; EMMC
+ DCD SDIO_IRQHandler ; SDIO
+ DCD TMR5_IRQHandler ; TMR5
+ DCD SPI3_IRQHandler ; SPI3
+ DCD UART4_IRQHandler ; UART4
+ DCD UART5_IRQHandler ; UART5
+ DCD TMR6_DAC_IRQHandler ; TMR6 and DAC1&2 underrun errors
+ DCD TMR7_IRQHandler ; TMR7
+ DCD DMA2_STR0_IRQHandler ; DMA2 Stream 0
+ DCD DMA2_STR1_IRQHandler ; DMA2 Stream 1
+ DCD DMA2_STR2_IRQHandler ; DMA2 Stream 2
+ DCD DMA2_STR3_IRQHandler ; DMA2 Stream 3
+ DCD DMA2_STR4_IRQHandler ; DMA2 Stream 4
+ DCD ETH_IRQHandler ; Ethernet
+ DCD ETH_WKUP_IRQHandler ; Ethernet Wakeup through EINT line
+ DCD CAN2_TX_IRQHandler ; CAN2 TX
+ DCD CAN2_RX0_IRQHandler ; CAN2 RX0
+ DCD CAN2_RX1_IRQHandler ; CAN2 RX1
+ DCD CAN2_SCE_IRQHandler ; CAN2 SCE
+ DCD OTG_FS_IRQHandler ; OTG_FS
+ DCD DMA2_STR5_IRQHandler ; DMA2 Stream 5
+ DCD DMA2_STR6_IRQHandler ; DMA2 Stream 6
+ DCD DMA2_STR7_IRQHandler ; DMA2 Stream 7
+ DCD USART6_IRQHandler ; USART6
+ DCD I2C3_EV_IRQHandler ; I2C3 event
+ DCD I2C3_ER_IRQHandler ; I2C3 error
+ DCD OTG_HS1_EP1_OUT_IRQHandler ; OTG_HS1 End Point 1 Out
+ DCD OTG_HS1_EP1_IN_IRQHandler ; OTG_HS1 End Point 1 In
+ DCD OTG_HS1_WKUP_IRQHandler ; OTG_HS1 Wakeup through EINT
+ DCD OTG_HS1_IRQHandler ; OTG_HS1
+ DCD DCI_IRQHandler ; DCI
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD FPU_IRQHandler ; FPU
+ DCD SM3_IRQHandler ; SM3
+ DCD SM4_IRQHandler ; SM4
+ DCD BN_IRQHandler ; BN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Default interrupt handlers.
+;;
+ THUMB
+ PUBWEAK Reset_Handler
+ SECTION .text:CODE:REORDER:NOROOT(2)
+Reset_Handler
+
+ LDR R0, =SystemInit
+ BLX R0
+ LDR R0, =__iar_program_start
+ BX R0
+
+ PUBWEAK NMI_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+NMI_Handler
+ B NMI_Handler
+
+ PUBWEAK HardFault_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+HardFault_Handler
+ B HardFault_Handler
+
+ PUBWEAK MemManage_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+MemManage_Handler
+ B MemManage_Handler
+
+ PUBWEAK BusFault_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+BusFault_Handler
+ B BusFault_Handler
+
+ PUBWEAK UsageFault_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+UsageFault_Handler
+ B UsageFault_Handler
+
+ PUBWEAK SVC_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SVC_Handler
+ B SVC_Handler
+
+ PUBWEAK DebugMon_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DebugMon_Handler
+ B DebugMon_Handler
+
+ PUBWEAK PendSV_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+PendSV_Handler
+ B PendSV_Handler
+
+ PUBWEAK SysTick_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SysTick_Handler
+ B SysTick_Handler
+
+ PUBWEAK WWDT_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+WWDT_IRQHandler
+ B WWDT_IRQHandler
+
+ PUBWEAK PVD_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+PVD_IRQHandler
+ B PVD_IRQHandler
+
+ PUBWEAK TAMP_STAMP_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TAMP_STAMP_IRQHandler
+ B TAMP_STAMP_IRQHandler
+
+ PUBWEAK RTC_WKUP_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+RTC_WKUP_IRQHandler
+ B RTC_WKUP_IRQHandler
+
+ PUBWEAK FLASH_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+FLASH_IRQHandler
+ B FLASH_IRQHandler
+
+ PUBWEAK RCM_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+RCM_IRQHandler
+ B RCM_IRQHandler
+
+ PUBWEAK EINT0_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EINT0_IRQHandler
+ B EINT0_IRQHandler
+
+ PUBWEAK EINT1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EINT1_IRQHandler
+ B EINT1_IRQHandler
+
+ PUBWEAK EINT2_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EINT2_IRQHandler
+ B EINT2_IRQHandler
+
+ PUBWEAK EINT3_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EINT3_IRQHandler
+ B EINT3_IRQHandler
+
+ PUBWEAK EINT4_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EINT4_IRQHandler
+ B EINT4_IRQHandler
+
+ PUBWEAK DMA1_STR0_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR0_IRQHandler
+ B DMA1_STR0_IRQHandler
+
+ PUBWEAK DMA1_STR1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR1_IRQHandler
+ B DMA1_STR1_IRQHandler
+
+ PUBWEAK DMA1_STR2_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR2_IRQHandler
+ B DMA1_STR2_IRQHandler
+
+ PUBWEAK DMA1_STR3_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR3_IRQHandler
+ B DMA1_STR3_IRQHandler
+
+ PUBWEAK DMA1_STR4_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR4_IRQHandler
+ B DMA1_STR4_IRQHandler
+
+ PUBWEAK DMA1_STR5_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR5_IRQHandler
+ B DMA1_STR5_IRQHandler
+
+ PUBWEAK DMA1_STR6_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR6_IRQHandler
+ B DMA1_STR6_IRQHandler
+
+ PUBWEAK ADC_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+ADC_IRQHandler
+ B ADC_IRQHandler
+
+ PUBWEAK CAN1_TX_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN1_TX_IRQHandler
+ B CAN1_TX_IRQHandler
+
+ PUBWEAK CAN1_RX0_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN1_RX0_IRQHandler
+ B CAN1_RX0_IRQHandler
+
+ PUBWEAK CAN1_RX1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN1_RX1_IRQHandler
+ B CAN1_RX1_IRQHandler
+
+ PUBWEAK CAN1_SCE_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN1_SCE_IRQHandler
+ B CAN1_SCE_IRQHandler
+
+ PUBWEAK EINT9_5_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EINT9_5_IRQHandler
+ B EINT9_5_IRQHandler
+
+ PUBWEAK TMR1_BRK_TMR9_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR1_BRK_TMR9_IRQHandler
+ B TMR1_BRK_TMR9_IRQHandler
+
+ PUBWEAK TMR1_UP_TMR10_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR1_UP_TMR10_IRQHandler
+ B TMR1_UP_TMR10_IRQHandler
+
+ PUBWEAK TMR1_TRG_COM_TMR11_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR1_TRG_COM_TMR11_IRQHandler
+ B TMR1_TRG_COM_TMR11_IRQHandler
+
+ PUBWEAK TMR1_CC_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR1_CC_IRQHandler
+ B TMR1_CC_IRQHandler
+
+ PUBWEAK TMR2_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR2_IRQHandler
+ B TMR2_IRQHandler
+
+ PUBWEAK TMR3_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR3_IRQHandler
+ B TMR3_IRQHandler
+
+ PUBWEAK TMR4_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR4_IRQHandler
+ B TMR4_IRQHandler
+
+ PUBWEAK I2C1_EV_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+I2C1_EV_IRQHandler
+ B I2C1_EV_IRQHandler
+
+ PUBWEAK I2C1_ER_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+I2C1_ER_IRQHandler
+ B I2C1_ER_IRQHandler
+
+ PUBWEAK I2C2_EV_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+I2C2_EV_IRQHandler
+ B I2C2_EV_IRQHandler
+
+ PUBWEAK I2C2_ER_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+I2C2_ER_IRQHandler
+ B I2C2_ER_IRQHandler
+
+ PUBWEAK SPI1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SPI1_IRQHandler
+ B SPI1_IRQHandler
+
+ PUBWEAK SPI2_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SPI2_IRQHandler
+ B SPI2_IRQHandler
+
+ PUBWEAK USART1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+USART1_IRQHandler
+ B USART1_IRQHandler
+
+ PUBWEAK USART2_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+USART2_IRQHandler
+ B USART2_IRQHandler
+
+ PUBWEAK USART3_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+USART3_IRQHandler
+ B USART3_IRQHandler
+
+ PUBWEAK EINT15_10_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EINT15_10_IRQHandler
+ B EINT15_10_IRQHandler
+
+ PUBWEAK RTC_Alarm_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+RTC_Alarm_IRQHandler
+ B RTC_Alarm_IRQHandler
+
+ PUBWEAK OTG_FS_WKUP_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+OTG_FS_WKUP_IRQHandler
+ B OTG_FS_WKUP_IRQHandler
+
+ PUBWEAK TMR8_BRK_TMR12_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR8_BRK_TMR12_IRQHandler
+ B TMR8_BRK_TMR12_IRQHandler
+
+ PUBWEAK TMR8_UP_TMR13_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR8_UP_TMR13_IRQHandler
+ B TMR8_UP_TMR13_IRQHandler
+
+ PUBWEAK TMR8_TRG_COM_TMR14_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR8_TRG_COM_TMR14_IRQHandler
+ B TMR8_TRG_COM_TMR14_IRQHandler
+
+ PUBWEAK TMR8_CC_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR8_CC_IRQHandler
+ B TMR8_CC_IRQHandler
+
+ PUBWEAK DMA1_STR7_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR7_IRQHandler
+ B DMA1_STR7_IRQHandler
+
+ PUBWEAK EMMC_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EMMC_IRQHandler
+ B EMMC_IRQHandler
+
+ PUBWEAK SDIO_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SDIO_IRQHandler
+ B SDIO_IRQHandler
+
+ PUBWEAK TMR5_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR5_IRQHandler
+ B TMR5_IRQHandler
+
+ PUBWEAK SPI3_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SPI3_IRQHandler
+ B SPI3_IRQHandler
+
+ PUBWEAK UART4_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+UART4_IRQHandler
+ B UART4_IRQHandler
+
+ PUBWEAK UART5_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+UART5_IRQHandler
+ B UART5_IRQHandler
+
+ PUBWEAK TMR6_DAC_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR6_DAC_IRQHandler
+ B TMR6_DAC_IRQHandler
+
+ PUBWEAK TMR7_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR7_IRQHandler
+ B TMR7_IRQHandler
+
+ PUBWEAK DMA2_STR0_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR0_IRQHandler
+ B DMA2_STR0_IRQHandler
+
+ PUBWEAK DMA2_STR1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR1_IRQHandler
+ B DMA2_STR1_IRQHandler
+
+ PUBWEAK DMA2_STR2_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR2_IRQHandler
+ B DMA2_STR2_IRQHandler
+
+ PUBWEAK DMA2_STR3_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR3_IRQHandler
+ B DMA2_STR3_IRQHandler
+
+ PUBWEAK DMA2_STR4_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR4_IRQHandler
+ B DMA2_STR4_IRQHandler
+
+ PUBWEAK ETH_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+ETH_IRQHandler
+ B ETH_IRQHandler
+
+ PUBWEAK ETH_WKUP_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+ETH_WKUP_IRQHandler
+ B ETH_WKUP_IRQHandler
+
+ PUBWEAK CAN2_TX_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN2_TX_IRQHandler
+ B CAN2_TX_IRQHandler
+
+ PUBWEAK CAN2_RX0_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN2_RX0_IRQHandler
+ B CAN2_RX0_IRQHandler
+
+ PUBWEAK CAN2_RX1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN2_RX1_IRQHandler
+ B CAN2_RX1_IRQHandler
+
+ PUBWEAK CAN2_SCE_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN2_SCE_IRQHandler
+ B CAN2_SCE_IRQHandler
+
+ PUBWEAK OTG_FS_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+OTG_FS_IRQHandler
+ B OTG_FS_IRQHandler
+
+ PUBWEAK DMA2_STR5_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR5_IRQHandler
+ B DMA2_STR5_IRQHandler
+
+ PUBWEAK DMA2_STR6_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR6_IRQHandler
+ B DMA2_STR6_IRQHandler
+
+ PUBWEAK DMA2_STR7_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR7_IRQHandler
+ B DMA2_STR7_IRQHandler
+
+ PUBWEAK USART6_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+USART6_IRQHandler
+ B USART6_IRQHandler
+
+ PUBWEAK I2C3_EV_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+I2C3_EV_IRQHandler
+ B I2C3_EV_IRQHandler
+
+ PUBWEAK I2C3_ER_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+I2C3_ER_IRQHandler
+ B I2C3_ER_IRQHandler
+
+ PUBWEAK OTG_HS1_EP1_OUT_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+OTG_HS1_EP1_OUT_IRQHandler
+ B OTG_HS1_EP1_OUT_IRQHandler
+
+ PUBWEAK OTG_HS1_EP1_IN_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+OTG_HS1_EP1_IN_IRQHandler
+ B OTG_HS1_EP1_IN_IRQHandler
+
+ PUBWEAK OTG_HS1_WKUP_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+OTG_HS1_WKUP_IRQHandler
+ B OTG_HS1_WKUP_IRQHandler
+
+ PUBWEAK OTG_HS1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+OTG_HS1_IRQHandler
+ B OTG_HS1_IRQHandler
+
+ PUBWEAK DCI_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DCI_IRQHandler
+ B DCI_IRQHandler
+
+ PUBWEAK FPU_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+FPU_IRQHandler
+ B FPU_IRQHandler
+
+ PUBWEAK SM3_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SM3_IRQHandler
+ B SM3_IRQHandler
+
+ PUBWEAK SM4_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SM4_IRQHandler
+ B SM4_IRQHandler
+
+ PUBWEAK BN_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+BN_IRQHandler
+ B BN_IRQHandler
+
+ END
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Source/iar/startup_apm32f41x.s b/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Source/iar/startup_apm32f41x.s
new file mode 100644
index 0000000000..2c3eafadd2
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Source/iar/startup_apm32f41x.s
@@ -0,0 +1,628 @@
+;/*!
+; * @file startup_apm32f41x.s
+; *
+; * @brief CMSIS Cortex-M4 based Core Device Startup File for Device startup_apm32f41x
+; *
+; * @version V1.0.2
+; *
+; * @date 2022-06-23
+; *
+; * @attention
+; *
+; * Copyright (C) 2021-2022 Geehy Semiconductor
+; *
+; * You may not use this file except in compliance with the
+; * GEEHY COPYRIGHT NOTICE (GEEHY SOFTWARE PACKAGE LICENSE).
+; *
+; * The program is only for reference, which is distributed in the hope
+; * that it will be useful and instructional for customers to develop
+; * their software. Unless required by applicable law or agreed to in
+; * writing, the program is distributed on an "AS IS" BASIS, WITHOUT
+; * ANY WARRANTY OR CONDITIONS OF ANY KIND, either express or implied.
+; * See the GEEHY SOFTWARE PACKAGE LICENSE for the governing permissions
+; * and limitations under the License.
+; */
+
+ MODULE ?cstartup
+
+ ;; Forward declaration of sections.
+ SECTION CSTACK:DATA:NOROOT(3)
+
+ SECTION .intvec:CODE:NOROOT(2)
+
+ EXTERN __iar_program_start
+ EXTERN SystemInit
+ PUBLIC __vector_table
+
+ DATA
+__vector_table
+ DCD sfe(CSTACK)
+ DCD Reset_Handler ; Reset Handler
+
+ DCD NMI_Handler ; NMI Handler
+ DCD HardFault_Handler ; Hard Fault Handler
+ DCD MemManage_Handler ; MPU Fault Handler
+ DCD BusFault_Handler ; Bus Fault Handler
+ DCD UsageFault_Handler ; Usage Fault Handler
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD 0 ; Reserved
+ DCD SVC_Handler ; SVCall Handler
+ DCD DebugMon_Handler ; Debug Monitor Handler
+ DCD 0 ; Reserved
+ DCD PendSV_Handler ; PendSV Handler
+ DCD SysTick_Handler ; SysTick Handler
+
+ ; External Interrupts
+ DCD WWDT_IRQHandler ; Window WatchDog
+ DCD PVD_IRQHandler ; PVD through EINT Line detection
+ DCD TAMP_STAMP_IRQHandler ; Tamper and TimeStamps through the EINT line
+ DCD RTC_WKUP_IRQHandler ; RTC Wakeup through the EINT line
+ DCD FLASH_IRQHandler ; FLASH
+ DCD RCM_IRQHandler ; RCM
+ DCD EINT0_IRQHandler ; EINT Line0
+ DCD EINT1_IRQHandler ; EINT Line1
+ DCD EINT2_IRQHandler ; EINT Line2
+ DCD EINT3_IRQHandler ; EINT Line3
+ DCD EINT4_IRQHandler ; EINT Line4
+ DCD DMA1_STR0_IRQHandler ; DMA1 Stream 0
+ DCD DMA1_STR1_IRQHandler ; DMA1 Stream 1
+ DCD DMA1_STR2_IRQHandler ; DMA1 Stream 2
+ DCD DMA1_STR3_IRQHandler ; DMA1 Stream 3
+ DCD DMA1_STR4_IRQHandler ; DMA1 Stream 4
+ DCD DMA1_STR5_IRQHandler ; DMA1 Stream 5
+ DCD DMA1_STR6_IRQHandler ; DMA1 Stream 6
+ DCD ADC_IRQHandler ; ADC1, ADC2 and ADC3s
+ DCD CAN1_TX_IRQHandler ; CAN1 TX
+ DCD CAN1_RX0_IRQHandler ; CAN1 RX0
+ DCD CAN1_RX1_IRQHandler ; CAN1 RX1
+ DCD CAN1_SCE_IRQHandler ; CAN1 SCE
+ DCD EINT9_5_IRQHandler ; External Line[9:5]s
+ DCD TMR1_BRK_TMR9_IRQHandler ; TMR1 Break and TMR9
+ DCD TMR1_UP_TMR10_IRQHandler ; TMR1 Update and TMR10
+ DCD TMR1_TRG_COM_TMR11_IRQHandler ; TMR1 Trigger and Commutation and TMR11
+ DCD TMR1_CC_IRQHandler ; TMR1 Capture Compare
+ DCD TMR2_IRQHandler ; TMR2
+ DCD TMR3_IRQHandler ; TMR3
+ DCD TMR4_IRQHandler ; TMR4
+ DCD I2C1_EV_IRQHandler ; I2C1 Event
+ DCD I2C1_ER_IRQHandler ; I2C1 Error
+ DCD I2C2_EV_IRQHandler ; I2C2 Event
+ DCD I2C2_ER_IRQHandler ; I2C2 Error
+ DCD SPI1_IRQHandler ; SPI1
+ DCD SPI2_IRQHandler ; SPI2
+ DCD USART1_IRQHandler ; USART1
+ DCD USART2_IRQHandler ; USART2
+ DCD USART3_IRQHandler ; USART3
+ DCD EINT15_10_IRQHandler ; External Line[15:10]s
+ DCD RTC_Alarm_IRQHandler ; RTC Alarm (A and B) through EINT Line
+ DCD OTG_FS_WKUP_IRQHandler ; OTG_FS Wakeup through EINT line
+ DCD TMR8_BRK_TMR12_IRQHandler ; TMR8 Break and TMR12
+ DCD TMR8_UP_TMR13_IRQHandler ; TMR8 Update and TMR13
+ DCD TMR8_TRG_COM_TMR14_IRQHandler ; TMR8 Trigger and Commutation and TMR14
+ DCD TMR8_CC_IRQHandler ; TMR8 Capture Compare
+ DCD DMA1_STR7_IRQHandler ; DMA1 Stream 7
+ DCD EMMC_IRQHandler ; EMMC
+ DCD SDIO_IRQHandler ; SDIO
+ DCD TMR5_IRQHandler ; TMR5
+ DCD SPI3_IRQHandler ; SPI3
+ DCD UART4_IRQHandler ; UART4
+ DCD UART5_IRQHandler ; UART5
+ DCD TMR6_DAC_IRQHandler ; TMR6 and DAC1&2 underrun errors
+ DCD TMR7_IRQHandler ; TMR7
+ DCD DMA2_STR0_IRQHandler ; DMA2 Stream 0
+ DCD DMA2_STR1_IRQHandler ; DMA2 Stream 1
+ DCD DMA2_STR2_IRQHandler ; DMA2 Stream 2
+ DCD DMA2_STR3_IRQHandler ; DMA2 Stream 3
+ DCD DMA2_STR4_IRQHandler ; DMA2 Stream 4
+ DCD ETH_IRQHandler ; Ethernet
+ DCD ETH_WKUP_IRQHandler ; Ethernet Wakeup through EINT line
+ DCD CAN2_TX_IRQHandler ; CAN2 TX
+ DCD CAN2_RX0_IRQHandler ; CAN2 RX0
+ DCD CAN2_RX1_IRQHandler ; CAN2 RX1
+ DCD CAN2_SCE_IRQHandler ; CAN2 SCE
+ DCD OTG_FS_IRQHandler ; OTG_FS
+ DCD DMA2_STR5_IRQHandler ; DMA2 Stream 5
+ DCD DMA2_STR6_IRQHandler ; DMA2 Stream 6
+ DCD DMA2_STR7_IRQHandler ; DMA2 Stream 7
+ DCD USART6_IRQHandler ; USART6
+ DCD I2C3_EV_IRQHandler ; I2C3 event
+ DCD I2C3_ER_IRQHandler ; I2C3 error
+ DCD OTG_HS1_EP1_OUT_IRQHandler ; OTG_HS1 End Point 1 Out
+ DCD OTG_HS1_EP1_IN_IRQHandler ; OTG_HS1 End Point 1 In
+ DCD OTG_HS1_WKUP_IRQHandler ; OTG_HS1 Wakeup through EINT
+ DCD OTG_HS1_IRQHandler ; OTG_HS1
+ DCD DCI_IRQHandler ; DCI
+ DCD CRYP_IRQHandler ; CRYP crypto
+ DCD HASH_RNG_IRQHandler ; Hash and Rng
+ DCD FPU_IRQHandler ; FPU
+ DCD SM3_IRQHandler ; SM3
+ DCD SM4_IRQHandler ; SM4
+ DCD BN_IRQHandler ; BN
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;;
+;; Default interrupt handlers.
+;;
+ THUMB
+ PUBWEAK Reset_Handler
+ SECTION .text:CODE:REORDER:NOROOT(2)
+Reset_Handler
+
+ LDR R0, =SystemInit
+ BLX R0
+ LDR R0, =__iar_program_start
+ BX R0
+
+ PUBWEAK NMI_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+NMI_Handler
+ B NMI_Handler
+
+ PUBWEAK HardFault_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+HardFault_Handler
+ B HardFault_Handler
+
+ PUBWEAK MemManage_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+MemManage_Handler
+ B MemManage_Handler
+
+ PUBWEAK BusFault_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+BusFault_Handler
+ B BusFault_Handler
+
+ PUBWEAK UsageFault_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+UsageFault_Handler
+ B UsageFault_Handler
+
+ PUBWEAK SVC_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SVC_Handler
+ B SVC_Handler
+
+ PUBWEAK DebugMon_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DebugMon_Handler
+ B DebugMon_Handler
+
+ PUBWEAK PendSV_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+PendSV_Handler
+ B PendSV_Handler
+
+ PUBWEAK SysTick_Handler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SysTick_Handler
+ B SysTick_Handler
+
+ PUBWEAK WWDT_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+WWDT_IRQHandler
+ B WWDT_IRQHandler
+
+ PUBWEAK PVD_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+PVD_IRQHandler
+ B PVD_IRQHandler
+
+ PUBWEAK TAMP_STAMP_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TAMP_STAMP_IRQHandler
+ B TAMP_STAMP_IRQHandler
+
+ PUBWEAK RTC_WKUP_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+RTC_WKUP_IRQHandler
+ B RTC_WKUP_IRQHandler
+
+ PUBWEAK FLASH_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+FLASH_IRQHandler
+ B FLASH_IRQHandler
+
+ PUBWEAK RCM_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+RCM_IRQHandler
+ B RCM_IRQHandler
+
+ PUBWEAK EINT0_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EINT0_IRQHandler
+ B EINT0_IRQHandler
+
+ PUBWEAK EINT1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EINT1_IRQHandler
+ B EINT1_IRQHandler
+
+ PUBWEAK EINT2_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EINT2_IRQHandler
+ B EINT2_IRQHandler
+
+ PUBWEAK EINT3_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EINT3_IRQHandler
+ B EINT3_IRQHandler
+
+ PUBWEAK EINT4_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EINT4_IRQHandler
+ B EINT4_IRQHandler
+
+ PUBWEAK DMA1_STR0_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR0_IRQHandler
+ B DMA1_STR0_IRQHandler
+
+ PUBWEAK DMA1_STR1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR1_IRQHandler
+ B DMA1_STR1_IRQHandler
+
+ PUBWEAK DMA1_STR2_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR2_IRQHandler
+ B DMA1_STR2_IRQHandler
+
+ PUBWEAK DMA1_STR3_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR3_IRQHandler
+ B DMA1_STR3_IRQHandler
+
+ PUBWEAK DMA1_STR4_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR4_IRQHandler
+ B DMA1_STR4_IRQHandler
+
+ PUBWEAK DMA1_STR5_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR5_IRQHandler
+ B DMA1_STR5_IRQHandler
+
+ PUBWEAK DMA1_STR6_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR6_IRQHandler
+ B DMA1_STR6_IRQHandler
+
+ PUBWEAK ADC_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+ADC_IRQHandler
+ B ADC_IRQHandler
+
+ PUBWEAK CAN1_TX_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN1_TX_IRQHandler
+ B CAN1_TX_IRQHandler
+
+ PUBWEAK CAN1_RX0_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN1_RX0_IRQHandler
+ B CAN1_RX0_IRQHandler
+
+ PUBWEAK CAN1_RX1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN1_RX1_IRQHandler
+ B CAN1_RX1_IRQHandler
+
+ PUBWEAK CAN1_SCE_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN1_SCE_IRQHandler
+ B CAN1_SCE_IRQHandler
+
+ PUBWEAK EINT9_5_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EINT9_5_IRQHandler
+ B EINT9_5_IRQHandler
+
+ PUBWEAK TMR1_BRK_TMR9_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR1_BRK_TMR9_IRQHandler
+ B TMR1_BRK_TMR9_IRQHandler
+
+ PUBWEAK TMR1_UP_TMR10_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR1_UP_TMR10_IRQHandler
+ B TMR1_UP_TMR10_IRQHandler
+
+ PUBWEAK TMR1_TRG_COM_TMR11_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR1_TRG_COM_TMR11_IRQHandler
+ B TMR1_TRG_COM_TMR11_IRQHandler
+
+ PUBWEAK TMR1_CC_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR1_CC_IRQHandler
+ B TMR1_CC_IRQHandler
+
+ PUBWEAK TMR2_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR2_IRQHandler
+ B TMR2_IRQHandler
+
+ PUBWEAK TMR3_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR3_IRQHandler
+ B TMR3_IRQHandler
+
+ PUBWEAK TMR4_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR4_IRQHandler
+ B TMR4_IRQHandler
+
+ PUBWEAK I2C1_EV_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+I2C1_EV_IRQHandler
+ B I2C1_EV_IRQHandler
+
+ PUBWEAK I2C1_ER_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+I2C1_ER_IRQHandler
+ B I2C1_ER_IRQHandler
+
+ PUBWEAK I2C2_EV_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+I2C2_EV_IRQHandler
+ B I2C2_EV_IRQHandler
+
+ PUBWEAK I2C2_ER_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+I2C2_ER_IRQHandler
+ B I2C2_ER_IRQHandler
+
+ PUBWEAK SPI1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SPI1_IRQHandler
+ B SPI1_IRQHandler
+
+ PUBWEAK SPI2_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SPI2_IRQHandler
+ B SPI2_IRQHandler
+
+ PUBWEAK USART1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+USART1_IRQHandler
+ B USART1_IRQHandler
+
+ PUBWEAK USART2_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+USART2_IRQHandler
+ B USART2_IRQHandler
+
+ PUBWEAK USART3_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+USART3_IRQHandler
+ B USART3_IRQHandler
+
+ PUBWEAK EINT15_10_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EINT15_10_IRQHandler
+ B EINT15_10_IRQHandler
+
+ PUBWEAK RTC_Alarm_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+RTC_Alarm_IRQHandler
+ B RTC_Alarm_IRQHandler
+
+ PUBWEAK OTG_FS_WKUP_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+OTG_FS_WKUP_IRQHandler
+ B OTG_FS_WKUP_IRQHandler
+
+ PUBWEAK TMR8_BRK_TMR12_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR8_BRK_TMR12_IRQHandler
+ B TMR8_BRK_TMR12_IRQHandler
+
+ PUBWEAK TMR8_UP_TMR13_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR8_UP_TMR13_IRQHandler
+ B TMR8_UP_TMR13_IRQHandler
+
+ PUBWEAK TMR8_TRG_COM_TMR14_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR8_TRG_COM_TMR14_IRQHandler
+ B TMR8_TRG_COM_TMR14_IRQHandler
+
+ PUBWEAK TMR8_CC_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR8_CC_IRQHandler
+ B TMR8_CC_IRQHandler
+
+ PUBWEAK DMA1_STR7_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA1_STR7_IRQHandler
+ B DMA1_STR7_IRQHandler
+
+ PUBWEAK EMMC_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+EMMC_IRQHandler
+ B EMMC_IRQHandler
+
+ PUBWEAK SDIO_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SDIO_IRQHandler
+ B SDIO_IRQHandler
+
+ PUBWEAK TMR5_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR5_IRQHandler
+ B TMR5_IRQHandler
+
+ PUBWEAK SPI3_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SPI3_IRQHandler
+ B SPI3_IRQHandler
+
+ PUBWEAK UART4_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+UART4_IRQHandler
+ B UART4_IRQHandler
+
+ PUBWEAK UART5_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+UART5_IRQHandler
+ B UART5_IRQHandler
+
+ PUBWEAK TMR6_DAC_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR6_DAC_IRQHandler
+ B TMR6_DAC_IRQHandler
+
+ PUBWEAK TMR7_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+TMR7_IRQHandler
+ B TMR7_IRQHandler
+
+ PUBWEAK DMA2_STR0_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR0_IRQHandler
+ B DMA2_STR0_IRQHandler
+
+ PUBWEAK DMA2_STR1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR1_IRQHandler
+ B DMA2_STR1_IRQHandler
+
+ PUBWEAK DMA2_STR2_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR2_IRQHandler
+ B DMA2_STR2_IRQHandler
+
+ PUBWEAK DMA2_STR3_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR3_IRQHandler
+ B DMA2_STR3_IRQHandler
+
+ PUBWEAK DMA2_STR4_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR4_IRQHandler
+ B DMA2_STR4_IRQHandler
+
+ PUBWEAK ETH_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+ETH_IRQHandler
+ B ETH_IRQHandler
+
+ PUBWEAK ETH_WKUP_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+ETH_WKUP_IRQHandler
+ B ETH_WKUP_IRQHandler
+
+ PUBWEAK CAN2_TX_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN2_TX_IRQHandler
+ B CAN2_TX_IRQHandler
+
+ PUBWEAK CAN2_RX0_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN2_RX0_IRQHandler
+ B CAN2_RX0_IRQHandler
+
+ PUBWEAK CAN2_RX1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN2_RX1_IRQHandler
+ B CAN2_RX1_IRQHandler
+
+ PUBWEAK CAN2_SCE_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CAN2_SCE_IRQHandler
+ B CAN2_SCE_IRQHandler
+
+ PUBWEAK OTG_FS_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+OTG_FS_IRQHandler
+ B OTG_FS_IRQHandler
+
+ PUBWEAK DMA2_STR5_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR5_IRQHandler
+ B DMA2_STR5_IRQHandler
+
+ PUBWEAK DMA2_STR6_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR6_IRQHandler
+ B DMA2_STR6_IRQHandler
+
+ PUBWEAK DMA2_STR7_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DMA2_STR7_IRQHandler
+ B DMA2_STR7_IRQHandler
+
+ PUBWEAK USART6_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+USART6_IRQHandler
+ B USART6_IRQHandler
+
+ PUBWEAK I2C3_EV_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+I2C3_EV_IRQHandler
+ B I2C3_EV_IRQHandler
+
+ PUBWEAK I2C3_ER_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+I2C3_ER_IRQHandler
+ B I2C3_ER_IRQHandler
+
+ PUBWEAK OTG_HS1_EP1_OUT_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+OTG_HS1_EP1_OUT_IRQHandler
+ B OTG_HS1_EP1_OUT_IRQHandler
+
+ PUBWEAK OTG_HS1_EP1_IN_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+OTG_HS1_EP1_IN_IRQHandler
+ B OTG_HS1_EP1_IN_IRQHandler
+
+ PUBWEAK OTG_HS1_WKUP_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+OTG_HS1_WKUP_IRQHandler
+ B OTG_HS1_WKUP_IRQHandler
+
+ PUBWEAK OTG_HS1_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+OTG_HS1_IRQHandler
+ B OTG_HS1_IRQHandler
+
+ PUBWEAK DCI_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+DCI_IRQHandler
+ B DCI_IRQHandler
+
+ PUBWEAK CRYP_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+CRYP_IRQHandler
+ B CRYP_IRQHandler
+
+ PUBWEAK HASH_RNG_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+HASH_RNG_IRQHandler
+ B HASH_RNG_IRQHandler
+
+ PUBWEAK FPU_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+FPU_IRQHandler
+ B FPU_IRQHandler
+
+ PUBWEAK SM3_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SM3_IRQHandler
+ B SM3_IRQHandler
+
+ PUBWEAK SM4_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+SM4_IRQHandler
+ B SM4_IRQHandler
+
+ PUBWEAK BN_IRQHandler
+ SECTION .text:CODE:REORDER:NOROOT(1)
+BN_IRQHandler
+ B BN_IRQHandler
+
+ END
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Source/system_apm32f4xx.c b/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Source/system_apm32f4xx.c
new file mode 100644
index 0000000000..6db877aaf8
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/Device/Geehy/APM32F4xx/Source/system_apm32f4xx.c
@@ -0,0 +1,337 @@
+/*!
+ * @file system_apm32f4xx.c
+ *
+ * @brief CMSIS Cortex-M4 Device Peripheral Access Layer System Source File
+ *
+ * @version V1.0.2
+ *
+ * @date 2022-06-23
+ *
+ * @attention
+ *
+ * Copyright (C) 2021-2022 Geehy Semiconductor
+ *
+ * You may not use this file except in compliance with the
+ * GEEHY COPYRIGHT NOTICE (GEEHY SOFTWARE PACKAGE LICENSE).
+ *
+ * The program is only for reference, which is distributed in the hope
+ * that it will be useful and instructional for customers to develop
+ * their software. Unless required by applicable law or agreed to in
+ * writing, the program is distributed on an "AS IS" BASIS, WITHOUT
+ * ANY WARRANTY OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the GEEHY SOFTWARE PACKAGE LICENSE for the governing permissions
+ * and limitations under the License.
+ */
+
+#include "apm32f4xx.h"
+#include "system_apm32f4xx.h"
+
+/** @addtogroup CMSIS
+ @{
+*/
+
+/** @addtogroup APM32F4xx_System
+ * @brief APM32F4xx system configuration
+ @{
+*/
+
+/** @defgroup System_Macros
+ @{
+*/
+
+/* Uncomment the following line if you need to use external SRAM as data memory */
+/* #define DATA_IN_ExtSRAM */
+
+/* Uncomment the following line if you need to relocate your vector Table in Internal SRAM. */
+/* #define VECT_TAB_SRAM */
+
+/* Vector Table base offset field. This value must be a multiple of 0x200. */
+#define VECT_TAB_OFFSET 0x00
+
+/* PLL_VCO = (HSE_VALUE or HSI_VALUE / PLL_B) * PLL_A */
+#define PLL_B 8
+
+/* USB OTG FS, SDIO and RNG Clock = PLL_VCO / PLL_D */
+#define PLL_D 7
+
+#define PLL_A 336
+/* SYSCLK = PLL_VCO / PLL_C */
+#define PLL_C 2
+
+/**@} end of group System_Macros*/
+
+/** @defgroup System_Variables
+ @{
+*/
+
+/**
+ * @brief APM32F4xx_System_Private_Variables
+ */
+
+uint32_t SystemCoreClock = 168000000;
+
+__I uint8_t AHBPrescTable[16] = {0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 3, 4, 6, 7, 8, 9};
+
+/**@} end of group System_Variables */
+
+/** @defgroup System_Functions
+ @{
+*/
+
+/**
+ * @brief APM32F4xx_System_Private_FunctionPrototypes
+ */
+
+static void SystemClockConfig(void);
+
+#if defined(DATA_IN_ExtSRAM)
+static void SystemInit_ExtSRAM(void);
+#endif /* DATA_IN_ExtSRAM */
+
+/*!
+ * @brief Setup the microcontroller system
+ *
+ * @param None
+ *
+ * @retval None
+ */
+void SystemInit(void)
+{
+ /* FPU settings */
+ #if (__FPU_PRESENT == 1) && (__FPU_USED == 1)
+ SCB->CPACR |= ((3UL << 10*2)|(3UL << 11*2)); //!< set CP10 and CP11 Full Access
+ #endif
+ /* Reset the RCM clock configuration to the default reset state */
+ /* Set HSIEN bit */
+ RCM->CTRL_B.HSIEN = BIT_SET;
+
+ /* Reset CFG register */
+ RCM->CFG = 0x00000000;
+
+ /* Reset HSEEN, CSSEN and PLL1EN bits */
+ RCM->CTRL &= (uint32_t)0xFEF6FFFF;
+
+ /* Reset PLL1CFG register */
+ RCM->PLL1CFG = 0x24003010;
+
+ /* Reset HSEBCFG bit */
+ RCM->CTRL &= (uint32_t)0xFFFBFFFF;
+
+ /* Disable all interrupts */
+ RCM->INT = 0x00000000;
+
+ #if defined(DATA_IN_ExtSRAM)
+ SystemInit_ExtSRAM();
+ #endif /* DATA_IN_ExtSRAM */
+
+ SystemClockConfig();
+
+ /* Configure the Vector Table location add offset address */
+ #ifdef VECT_TAB_SRAM
+ SCB->VTOR = SRAM_BASE | VECT_TAB_OFFSET; /* Vector Table Relocation in Internal SRAM */
+ #else
+ SCB->VTOR = FMC_BASE | VECT_TAB_OFFSET; /* Vector Table Relocation in Internal FLASH */
+ #endif
+}
+
+/*!
+ * @brief Update SystemCoreClock variable according to Clock Register Values
+ * The SystemCoreClock variable contains the core clock (HCLK)
+ *
+ * @param None
+ *
+ * @retval None
+ */
+void SystemCoreClockUpdate(void)
+{
+ uint32_t sysClock, pllvco, pllc, pllClock, pllb;
+
+ /* Get SYSCLK source */
+ sysClock = RCM->CFG_B.SCLKSWSTS;
+
+ switch (sysClock)
+ {
+ case 0:
+ SystemCoreClock = HSI_VALUE;
+ break;
+
+ case 1:
+ SystemCoreClock = HSE_VALUE;
+ break;
+
+ case 2:
+ pllClock = RCM->PLL1CFG_B.PLL1CLKS;
+ pllb = RCM->PLL1CFG_B.PLLB;
+
+ if (pllClock == 0)
+ {
+ /* HSI used as PLL clock source */
+ pllvco = (HSI_VALUE / pllb) * (RCM->PLL1CFG_B.PLL1A); //!< PLL_VCO = (HSI_VALUE / PLL_B) * PLL_A
+ }
+ else
+ {
+ /* HSE used as PLL clock source */
+ pllvco = (HSE_VALUE / pllb) * (RCM->PLL1CFG_B.PLL1A); //!< PLL_VCO = (HSE_VALUE / PLL_B) * PLL_A
+ }
+
+ pllc = ((RCM->PLL1CFG_B.PLL1C) + 1 ) *2;
+ SystemCoreClock = pllvco/pllc;
+ break;
+
+ default:
+ SystemCoreClock = HSI_VALUE;
+ break;
+ }
+
+ /* Compute HCLK frequency */
+ /* Get HCLK prescaler */
+ sysClock = AHBPrescTable[(RCM->CFG_B.AHBPSC)];
+ /* HCLK frequency */
+ SystemCoreClock >>= sysClock;
+}
+
+/*!
+ * @brief Configures the System clock source, PLL Multiplier and Divider factors,
+ * AHB/APBx prescalers and Flash settings
+ *
+ * @param None
+ *
+ * @retval None
+ */
+static void SystemClockConfig(void)
+{
+ __IO uint32_t i;
+
+ RCM->CTRL_B.HSEEN = BIT_SET;
+
+ for (i = 0; i < HSE_STARTUP_TIMEOUT; i++)
+ {
+ if (RCM->CTRL_B.HSERDYFLG)
+ {
+ break;
+ }
+ }
+
+ if (RCM->CTRL_B.HSERDYFLG)
+ {
+ /* Select regulator voltage output Scale 1 mode */
+ RCM->APB1CLKEN_B.PMUEN = BIT_SET;
+ PMU->CTRL_B.VOSSEL = BIT_SET;
+
+ /* HCLK = SYSCLK / 1*/
+ RCM->CFG_B.AHBPSC = 0x0000;
+
+ /* PCLK2 = HCLK / 2*/
+ RCM->CFG_B.APB2PSC = 0x04;
+
+ /* PCLK1 = HCLK / 4*/
+ RCM->CFG_B.APB1PSC = 0x05;
+
+ /* Configure the main PLL */
+ RCM->PLL1CFG = PLL_B | (PLL_A << 6) | (((PLL_C >> 1) -1) << 16) |(PLL_D << 24);
+ RCM->PLL1CFG_B.PLL1CLKS = 0x01;
+
+ /* Enable the main PLL */
+ RCM->CTRL_B.PLL1EN = BIT_SET;
+
+ /* Wait till the main PLL is ready */
+ while (RCM->CTRL_B.PLL1RDYFLG == 0)
+ {
+ }
+
+ /* Configure Flash prefetch, Instruction cache, Data cache and wait state */
+ FMC->ACCTRL = 0x05 | BIT8 | BIT9 | BIT10;
+
+ /* Select the main PLL as system clock source */
+ RCM->CFG_B.SCLKSEL = RESET;
+ RCM->CFG_B.SCLKSEL = 0x02;
+
+ /* Wait till the main PLL is used as system clock source */
+ while ((RCM->CFG_B.SCLKSWSTS) != 0x02)
+ {
+ }
+ }
+ else
+ {
+ /* If HSE fails to start-up, the application will have wrong clock configuration. */
+ }
+}
+
+#if defined (DATA_IN_ExtSRAM)
+
+/*!
+ * @brief Setup the external memory controller. Called in startup_apm32f4xx.s before jump to main.
+ * This function configures the external SRAM
+ * This SRAM will be used as program data memory (including heap and stack).
+ *
+ * @param None
+ *
+ * @retval None
+ */
+void SystemInit_ExtSRAM(void)
+{
+ /* Enable GPIOD, GPIOE, GPIOF and GPIOG interface clock */
+ RCM->AHB1CLKEN |= 0x00000078;
+
+ /* Connect PDx pins to FMC Alternate function */
+ GPIOD->ALFL = 0x00cc00cc;
+ GPIOD->ALFH = 0xcccccccc;
+ /* Configure PDx pins in Alternate function mode */
+ GPIOD->MODE = 0xaaaa0a0a;
+ /* Configure PDx pins speed to 100 MHz */
+ GPIOD->OSSEL = 0xffff0f0f;
+ /* Configure PDx pins Output type to push-pull */
+ GPIOD->OMODE = 0x00000000;
+ /* No pull-up, pull-down for PDx pins */
+ GPIOD->PUPD = 0x00000000;
+
+ /* Connect PEx pins to FMC Alternate function */
+ GPIOE->ALFL = 0xcccccccc;
+ GPIOE->ALFH = 0xcccccccc;
+ /* Configure PEx pins in Alternate function mode */
+ GPIOE->MODE = 0xaaaaaaaa;
+ /* Configure PEx pins speed to 100 MHz */
+ GPIOE->OSSEL = 0xffffffff;
+ /* Configure PEx pins Output type to push-pull */
+ GPIOE->OMODE = 0x00000000;
+ /* No pull-up, pull-down for PEx pins */
+ GPIOE->PUPD = 0x00000000;
+
+ /* Connect PFx pins to FMC Alternate function */
+ GPIOF->ALFL = 0x00cccccc;
+ GPIOF->ALFH = 0xcccc0000;
+ /* Configure PFx pins in Alternate function mode */
+ GPIOF->MODE = 0xaa000aaa;
+ /* Configure PFx pins speed to 100 MHz */
+ GPIOF->OSSEL = 0xff000fff;
+ /* Configure PFx pins Output type to push-pull */
+ GPIOF->OMODE = 0x00000000;
+ /* No pull-up, pull-down for PFx pins */
+ GPIOF->PUPD = 0x00000000;
+
+ /* Connect PGx pins to FMC Alternate function */
+ GPIOG->ALFL = 0x00cccccc;
+ GPIOG->ALFH = 0x000000c0;
+ /* Configure PGx pins in Alternate function mode */
+ GPIOG->MODE = 0x00080aaa;
+ /* Configure PGx pins speed to 100 MHz */
+ GPIOG->OSSEL = 0x000c0fff;
+ /* Configure PGx pins Output type to push-pull */
+ GPIOG->OMODE = 0x00000000;
+ /* No pull-up, pull-down for PGx pins */
+ GPIOG->PUPD = 0x00000000;
+
+ /* FMC Configuration */
+ /* Enable the FMC/SMC interface clock */
+ RCM->AHB3CLKEN |= 0x00000001;
+
+ /* Configure and enable Bank1_SRAM2 */
+ SMC_Bank1->CSCTRL2 = 0x00001011;
+ SMC_Bank1->CSTIM2 = 0x00000201;
+ SMC_Bank1E->WRTTIM2 = 0x0fffffff;
+}
+#endif /* DATA_IN_ExtSRAM */
+
+/**@} end of group System_Functions */
+/**@} end of group APM32F4xx_System */
+/**@} end of group CMSIS */
diff --git a/bsp/apm32/libraries/APM32F4xx_Library/SConscript b/bsp/apm32/libraries/APM32F4xx_Library/SConscript
new file mode 100644
index 0000000000..8785ae70e9
--- /dev/null
+++ b/bsp/apm32/libraries/APM32F4xx_Library/SConscript
@@ -0,0 +1,45 @@
+import rtconfig
+Import('RTT_ROOT')
+from building import *
+
+# get current directory
+cwd = GetCurrentDir()
+
+# The set of source files associated with this SConscript file.
+src = Split("""
+Device/Geehy/APM32F4xx/Source/system_apm32f4xx.c
+APM32F4xx_StdPeriphDriver/src/apm32f4xx_gpio.c
+APM32F4xx_StdPeriphDriver/src/apm32f4xx_misc.c
+APM32F4xx_StdPeriphDriver/src/apm32f4xx_rcm.c
+APM32F4xx_StdPeriphDriver/src/apm32f4xx_usart.c
+APM32F4xx_StdPeriphDriver/src/apm32f4xx_eint.c
+""")
+
+if GetDepend(['RT_USING_ADC']):
+ src += ['APM32F4xx_StdPeriphDriver/src/apm32f4xx_adc.c']
+
+if GetDepend(['RT_USING_DAC']):
+ src += ['APM32F4xx_StdPeriphDriver/src/apm32f4xx_dac.c']
+
+if GetDepend(['RT_USING_RTC']):
+ src += ['APM32F4xx_StdPeriphDriver/src/apm32f4xx_rtc.c']
+ src += ['APM32F4xx_StdPeriphDriver/src/apm32f4xx_pmu.c']
+
+if GetDepend(['RT_USING_SPI']):
+ src += ['APM32F4xx_StdPeriphDriver/src/apm32f4xx_spi.c']
+
+if GetDepend(['RT_USING_HWTIMER']) or GetDepend(['RT_USING_PWM']):
+ src += ['APM32F4xx_StdPeriphDriver/src/apm32f4xx_tmr.c']
+
+if GetDepend(['RT_USING_WDT']):
+ src += ['APM32F4xx_StdPeriphDriver/src/apm32f4xx_wwdt.c']
+ src += ['APM32F4xx_StdPeriphDriver/src/apm32f4xx_iwdt.c']
+
+path = [cwd + '/Device/Geehy/APM32F4xx/Include',
+ cwd + '/APM32F4xx_StdPeriphDriver/inc',
+ cwd + '/CMSIS/Include']
+
+CPPDEFINES = ['USE_STDPERIPH_DRIVER']
+group = DefineGroup('Libraries', src, depend = [''], CPPPATH = path, CPPDEFINES = CPPDEFINES)
+
+Return('group')
diff --git a/bsp/apm32/libraries/Drivers/drv_adc.c b/bsp/apm32/libraries/Drivers/drv_adc.c
index 162f5afbdd..40b796db06 100644
--- a/bsp/apm32/libraries/Drivers/drv_adc.c
+++ b/bsp/apm32/libraries/Drivers/drv_adc.c
@@ -6,12 +6,14 @@
* Change Logs:
* Date Author Notes
* 2022-03-04 stevetong459 first version
+ * 2022-07-15 Aligagago add apm32F4 series MCU support
*/
#include
#if defined(BSP_USING_ADC1) || defined(BSP_USING_ADC2) || defined(BSP_USING_ADC3)
+//#define DRV_DEBUG
#define LOG_TAG "drv.adc"
#define DBG_LVL DBG_INFO
#include
@@ -30,7 +32,7 @@ struct apm32_adc
rt_base_t channel_pin[DRV_ADC_CHANNEL_MAX_NUM];
struct rt_adc_device adc_dev;
};
-
+#ifdef APM32F10X_HD
static struct apm32_adc adc_config[] =
{
#ifdef BSP_USING_ADC1
@@ -93,11 +95,78 @@ static struct apm32_adc adc_config[] =
},
#endif
};
-
+#elif APM32F40X
+static struct apm32_adc adc_config[] =
+{
+#ifdef BSP_USING_ADC1
+ {
+ "adc1",
+ ADC1,
+ {
+ ADC_RESOLUTION_12BIT,
+ DISABLE,
+ DISABLE,
+ ADC_EXT_TRIG_EDGE_NONE,
+ ADC_EXT_TRIG_CONV_TMR1_CC1,
+ ADC_DATA_ALIGN_RIGHT,
+ 1
+ },
+ {
+ GET_PIN(A, 0), GET_PIN(A, 1), GET_PIN(A, 2), GET_PIN(A, 3), GET_PIN(A, 4),
+ GET_PIN(A, 5), GET_PIN(A, 6), GET_PIN(A, 7), GET_PIN(B, 0), GET_PIN(B, 1),
+ GET_PIN(C, 0), GET_PIN(C, 1), GET_PIN(C, 2), GET_PIN(C, 3)
+ },
+ RT_NULL
+ },
+#endif
+#ifdef BSP_USING_ADC2
+ {
+ "adc2",
+ ADC2,
+ {
+ ADC_RESOLUTION_12BIT,
+ DISABLE,
+ DISABLE,
+ ADC_EXT_TRIG_EDGE_NONE,
+ ADC_EXT_TRIG_CONV_TMR1_CC1,
+ ADC_DATA_ALIGN_RIGHT,
+ 1
+ },
+ {
+ GET_PIN(A, 0), GET_PIN(A, 1), GET_PIN(A, 2), GET_PIN(A, 3), GET_PIN(A, 4),
+ GET_PIN(A, 5), GET_PIN(A, 6), GET_PIN(A, 7), GET_PIN(B, 0), GET_PIN(B, 1),
+ GET_PIN(C, 0), GET_PIN(C, 1), GET_PIN(C, 2), GET_PIN(C, 3)
+ },
+ RT_NULL
+ },
+#endif
+#ifdef BSP_USING_ADC3
+ {
+ "adc3",
+ ADC3,
+ {
+ ADC_RESOLUTION_12BIT,
+ DISABLE,
+ DISABLE,
+ ADC_EXT_TRIG_EDGE_NONE,
+ ADC_EXT_TRIG_CONV_TMR1_CC1,
+ ADC_DATA_ALIGN_RIGHT,
+ 1
+ },
+ {
+ GET_PIN(A, 0), GET_PIN(A, 1), GET_PIN(A, 2), GET_PIN(A, 3), GET_PIN(F, 6),
+ GET_PIN(F, 7), GET_PIN(F, 8), GET_PIN(F, 9), GET_PIN(F, 10), GET_PIN(F, 3),
+ GET_PIN(C, 0), GET_PIN(C, 1), GET_PIN(C, 2), GET_PIN(C, 3)
+ },
+ RT_NULL
+ },
+#endif
+};
+#endif
static rt_err_t _adc_channel_check(struct rt_adc_device *device, rt_uint32_t channel)
{
struct apm32_adc *adc_cfg = ((struct apm32_adc *)device->parent.user_data);
-
+#ifdef APM32F10X_HD
if (adc_cfg->adc == ADC3)
{
if (channel <= 8)
@@ -112,7 +181,12 @@ static rt_err_t _adc_channel_check(struct rt_adc_device *device, rt_uint32_t cha
return RT_EOK;
}
}
-
+#elif APM32F40X
+ if (channel <= 13)
+ {
+ return RT_EOK;
+ }
+#endif
LOG_E("channel %d of %s is not supported.", channel, adc_cfg->name);
return -RT_ERROR;
}
@@ -126,10 +200,13 @@ static rt_err_t _adc_gpio_init(struct rt_adc_device *device, rt_uint32_t channel
{
return -RT_ERROR;
}
-
+#ifdef APM32F10X_HD
RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOA << ((adc_cfg->channel_pin[channel] >> 4) & 0xFu));
-
hw_gpio_config.mode = GPIO_MODE_ANALOG;
+#elif APM32F40X
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOA << ((adc_cfg->channel_pin[channel] >> 4) & 0xFu));
+ hw_gpio_config.mode = GPIO_MODE_AN;
+#endif
hw_gpio_config.pin = _ADC_GET_PIN(adc_cfg->channel_pin[channel]);
GPIO_Config(_ADC_GET_PORT(adc_cfg->channel_pin[channel]), &hw_gpio_config);
@@ -207,6 +284,7 @@ static rt_err_t _adc_get_value(struct rt_adc_device *device, rt_uint32_t channel
{
return -RT_ERROR;
}
+#ifdef APM32F10X_HD
ADC_ConfigRegularChannel(adc_cfg->adc, channel, 1, ADC_SAMPLETIME_13CYCLES5);
ADC_StartCalibration(adc_cfg->adc);
@@ -220,7 +298,10 @@ static rt_err_t _adc_get_value(struct rt_adc_device *device, rt_uint32_t channel
}
ADC_EnableSoftwareStartConv(adc_cfg->adc);
-
+#elif APM32F40X
+ ADC_ConfigRegularChannel(adc_cfg->adc, channel, 1, ADC_SAMPLETIME_15CYCLES);
+ ADC_SoftwareStartConv(adc_cfg->adc);
+#endif
counter = 0;
while (!ADC_ReadStatusFlag(adc_cfg->adc, ADC_FLAG_EOC))
{
diff --git a/bsp/apm32/libraries/Drivers/drv_common.c b/bsp/apm32/libraries/Drivers/drv_common.c
index 3395028375..f6caf333db 100644
--- a/bsp/apm32/libraries/Drivers/drv_common.c
+++ b/bsp/apm32/libraries/Drivers/drv_common.c
@@ -91,7 +91,7 @@ void rt_hw_us_delay(rt_uint32_t us)
}
/**
- * This function will config the board for initialization.
+ * This function will initialize the board.
*/
RT_WEAK void rt_hw_board_init()
{
diff --git a/bsp/apm32/libraries/Drivers/drv_dac.c b/bsp/apm32/libraries/Drivers/drv_dac.c
index 6c49a47ea3..0091b8ada4 100644
--- a/bsp/apm32/libraries/Drivers/drv_dac.c
+++ b/bsp/apm32/libraries/Drivers/drv_dac.c
@@ -6,6 +6,7 @@
* Change Logs:
* Date Author Notes
* 2022-03-04 stevetong459 first version
+ * 2022-07-15 Aligagago add apm32F4 series MCU support
*/
#include
@@ -54,10 +55,13 @@ static rt_err_t _dac_enabled(struct rt_dac_device *device, rt_uint32_t channel)
{
GPIO_Config_T GPIO_ConfigStruct;
struct apm32_dac *cfg = (struct apm32_dac *)device->parent.user_data;
-
+#ifdef APM32F10X_HD
RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOA);
GPIO_ConfigStruct.mode = GPIO_MODE_ANALOG;
-
+#elif APM32F40X
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOA);
+ GPIO_ConfigStruct.mode = GPIO_MODE_AN;
+#endif
if (channel == 1)
{
GPIO_ConfigStruct.pin = GPIO_PIN_4;
diff --git a/bsp/apm32/libraries/Drivers/drv_gpio.c b/bsp/apm32/libraries/Drivers/drv_gpio.c
index d45f2f71b3..0aff55e26e 100644
--- a/bsp/apm32/libraries/Drivers/drv_gpio.c
+++ b/bsp/apm32/libraries/Drivers/drv_gpio.c
@@ -5,7 +5,8 @@
*
* Change Logs:
* Date Author Notes
- * 2020-08-20 Abbcc first version
+ * 2020-08-20 Abbcc first version
+ * 2022-07-15 Aligagago add apm32F4 series MCU support
*/
#include
@@ -172,6 +173,7 @@ static void _pin_mode(rt_device_t dev, rt_base_t pin, rt_base_t mode)
}
/* Configure gpioConfigure */
+#if defined(APM32F10X_HD)
gpioConfig.pin = PIN_APMPIN(pin);
gpioConfig.mode = GPIO_MODE_OUT_PP;
gpioConfig.speed = GPIO_SPEED_50MHz;
@@ -201,7 +203,43 @@ static void _pin_mode(rt_device_t dev, rt_base_t pin, rt_base_t mode)
/* output setting: od. */
gpioConfig.mode = GPIO_MODE_OUT_OD;
}
+#elif defined( APM32F40X)
+ gpioConfig.pin = PIN_APMPIN(pin);
+ gpioConfig.mode = GPIO_MODE_OUT;
+ gpioConfig.otype = GPIO_OTYPE_PP;
+ gpioConfig.speed = GPIO_SPEED_50MHz;
+ if (mode == PIN_MODE_OUTPUT)
+ {
+ /* output setting */
+ gpioConfig.mode = GPIO_MODE_OUT;
+ gpioConfig.otype = GPIO_OTYPE_PP;
+ }
+ else if (mode == PIN_MODE_INPUT)
+ {
+ /* input setting: not pull. */
+ gpioConfig.mode = GPIO_MODE_IN;
+ gpioConfig.pupd = GPIO_PUPD_NOPULL;
+ }
+ else if (mode == PIN_MODE_INPUT_PULLUP)
+ {
+ /* input setting: pull up. */
+ gpioConfig.mode = GPIO_MODE_IN;
+ gpioConfig.pupd = GPIO_PUPD_UP;
+ }
+ else if (mode == PIN_MODE_INPUT_PULLDOWN)
+ {
+ /* input setting: pull down. */
+ gpioConfig.mode = GPIO_MODE_IN;
+ gpioConfig.pupd = GPIO_PUPD_DOWN;
+ }
+ else if (mode == PIN_MODE_OUTPUT_OD)
+ {
+ /* output setting: od. */
+ gpioConfig.mode = GPIO_MODE_OUT;
+ gpioConfig.otype = GPIO_OTYPE_OD;
+ }
+#endif
GPIO_Config(PIN_APMPORT(pin), &gpioConfig);
}
@@ -335,15 +373,30 @@ static rt_err_t _pin_irq_enable(struct rt_device *device, rt_base_t pin,
gpioConfig.speed = GPIO_SPEED_50MHz;
switch (pin_irq_hdr_tab[irqindex].mode)
{
- case PIN_IRQ_MODE_RISING:
- gpioConfig.mode = GPIO_MODE_IN_PD;
- break;
- case PIN_IRQ_MODE_FALLING:
- gpioConfig.mode = GPIO_MODE_IN_PU;
- break;
- case PIN_IRQ_MODE_RISING_FALLING:
- gpioConfig.mode = GPIO_MODE_IN_FLOATING;
- break;
+ #if defined(APM32F10X_HD)
+ case PIN_IRQ_MODE_RISING:
+ gpioConfig.mode = GPIO_MODE_IN_PD;
+ break;
+ case PIN_IRQ_MODE_FALLING:
+ gpioConfig.mode = GPIO_MODE_IN_PU;
+ break;
+ case PIN_IRQ_MODE_RISING_FALLING:
+ gpioConfig.mode = GPIO_MODE_IN_FLOATING;
+ break;
+ #elif defined( APM32F40X)
+ case PIN_IRQ_MODE_RISING:
+ gpioConfig.mode = GPIO_MODE_IN;
+ gpioConfig.pupd = GPIO_PUPD_DOWN;
+ break;
+ case PIN_IRQ_MODE_FALLING:
+ gpioConfig.mode = GPIO_MODE_IN;
+ gpioConfig.pupd = GPIO_PUPD_UP;
+ break;
+ case PIN_IRQ_MODE_RISING_FALLING:
+ gpioConfig.mode = GPIO_MODE_IN;
+ gpioConfig.pupd = GPIO_PUPD_NOPULL;
+ break;
+ #endif
}
GPIO_Config(PIN_APMPORT(pin), &gpioConfig);
@@ -543,29 +596,64 @@ void EINT15_10_IRQHandler(void)
int rt_hw_pin_init(void)
{
-#ifdef GPIOA
- RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOA);
+#if defined(APM32F10X_HD)
+ #ifdef GPIOA
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOA);
+ #endif
+ #ifdef GPIOB
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOB);
+ #endif
+ #ifdef GPIOC
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOC);
+ #endif
+ #ifdef GPIOD
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOD);
+ #endif
+ #ifdef GPIOE
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOE);
+ #endif
+ #ifdef GPIOF
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOF);
+ #endif
+ #ifdef GPIOG
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOG);
+ #endif
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_AFIO);
+#elif defined(APM32F40X)
+ #ifdef GPIOA
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOA);
+ #endif
+ #ifdef GPIOB
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOB);
+ #endif
+ #ifdef GPIOC
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOC);
+ #endif
+ #ifdef GPIOD
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOD);
+ #endif
+ #ifdef GPIOE
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOE);
+ #endif
+ #ifdef GPIOF
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOF);
+ #endif
+ #ifdef GPIOG
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOG);
+ #endif
+ #ifdef GPIOH
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOH);
+ #endif
+ #ifdef GPIOI
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOI);
+ #endif
+ #ifdef GPIOJ
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOJ);
+ #endif
+ #ifdef GPIOK
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOK);
+ #endif
#endif
-#ifdef GPIOB
- RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOB);
-#endif
-#ifdef GPIOC
- RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOC);
-#endif
-#ifdef GPIOD
- RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOD);
-#endif
-#ifdef GPIOE
- RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOE);
-#endif
-#ifdef GPIOF
- RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOF);
-#endif
-#ifdef GPIOG
- RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_GPIOG);
-#endif
-
- RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_AFIO);
return rt_device_pin_register("pin", &_apm32_pin_ops, RT_NULL);
}
diff --git a/bsp/apm32/libraries/Drivers/drv_hwtimer.c b/bsp/apm32/libraries/Drivers/drv_hwtimer.c
index 3cf7d1580b..6bdd6e7500 100644
--- a/bsp/apm32/libraries/Drivers/drv_hwtimer.c
+++ b/bsp/apm32/libraries/Drivers/drv_hwtimer.c
@@ -6,6 +6,7 @@
* Change Logs:
* Date Author Notes
* 2022-03-04 stevetong459 first version
+ * 2022-07-15 Aligagago add apm32F4 series MCU support
*/
#include
@@ -59,6 +60,24 @@ enum
#ifdef BSP_USING_TMR8
TMR8_INDEX,
#endif
+#ifdef BSP_USING_TMR9
+ TMR9_INDEX,
+#endif
+#ifdef BSP_USING_TMR10
+ TMR10_INDEX,
+#endif
+#ifdef BSP_USING_TMR11
+ TMR11_INDEX,
+#endif
+#ifdef BSP_USING_TMR12
+ TMR12_INDEX,
+#endif
+#ifdef BSP_USING_TMR13
+ TMR13_INDEX,
+#endif
+#ifdef BSP_USING_TMR14
+ TMR14_INDEX,
+#endif
};
static struct apm32_timer tmr_config[] =
@@ -67,7 +86,11 @@ static struct apm32_timer tmr_config[] =
{
"timer1",
TMR1,
+ #ifdef APM32F10X_HD
TMR1_UP_IRQn,
+ #elif APM32F40X
+ TMR1_UP_TMR10_IRQn,
+ #endif
},
#endif
#ifdef BSP_USING_TMR2
@@ -102,7 +125,11 @@ static struct apm32_timer tmr_config[] =
{
"timer6",
TMR6,
+ #ifdef APM32F10X_HD
TMR6_IRQn,
+ #elif APM32F40X
+ TMR6_DAC_IRQn
+ #endif
},
#endif
#ifdef BSP_USING_TMR7
@@ -116,18 +143,71 @@ static struct apm32_timer tmr_config[] =
{
"timer8",
TMR8,
+ #ifdef APM32F10X_HD
TMR8_UP_IRQn,
+ #elif APM32F40X
+ TMR8_UP_TMR13_IRQn,
+ #endif
+ },
+#endif
+#ifdef BSP_USING_TMR9
+ {
+ "timer9",
+ TMR9,
+ TMR1_BRK_TMR9_IRQn,
+ },
+#endif
+#ifdef BSP_USING_TMR10
+ {
+ "timer10",
+ TMR10,
+ TMR1_UP_TMR10_IRQn,
+ },
+#endif
+#ifdef BSP_USING_TMR11
+ {
+ "timer11",
+ TMR11,
+ TMR1_TRG_COM_TMR11_IRQn,
+ },
+#endif
+#ifdef BSP_USING_TMR12
+ {
+ "timer12",
+ TMR12,
+ TMR8_BRK_TMR12_IRQn,
+ },
+#endif
+#ifdef BSP_USING_TMR13
+ {
+ "timer13",
+ TMR13,
+ TMR8_UP_TMR13_IRQn,
+ },
+#endif
+#ifdef BSP_USING_TMR14
+ {
+ "timer14",
+ TMR14,
+ TMR8_TRG_COM_TMR14_IRQn,
},
#endif
};
static rt_uint32_t _hwtimer_clock_get(TMR_T *tmr)
{
- uint32_t pclk1;
+ uint32_t pclk1,pclk2;
- RCM_ReadPCLKFreq(&pclk1, NULL);
+ RCM_ReadPCLKFreq(&pclk1, &pclk2);
- return (rt_uint32_t)(pclk1 * ((RCM->CFG_B.APB1PSC != RCM_APB_DIV_1) ? 2 : 1));
+ if (tmr == TMR1 || tmr== TMR8 || tmr== TMR9 || tmr== TMR10 || tmr== TMR11)
+ {
+ return (rt_uint32_t)(pclk2 * ((RCM->CFG_B.APB2PSC != RCM_APB_DIV_1) ? 2 : 1));
+ }
+ else
+ {
+ return (rt_uint32_t)(pclk1 * ((RCM->CFG_B.APB1PSC != RCM_APB_DIV_1) ? 2 : 1));
+ }
}
static void _hwtimer_init(struct rt_hwtimer_device *timer, rt_uint32_t state)
@@ -141,15 +221,7 @@ static void _hwtimer_init(struct rt_hwtimer_device *timer, rt_uint32_t state)
if (state)
{
timer_config = (struct apm32_timer *)timer->parent.user_data;
- if (timer_config->tmr == TMR1)
- {
- RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_TMR1);
- }
- else if (timer_config->tmr == TMR8)
- {
- RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_TMR8);
- }
- else if (timer_config->tmr == TMR2)
+ if (timer_config->tmr == TMR2)
{
RCM_EnableAPB1PeriphClock(RCM_APB1_PERIPH_TMR2);
}
@@ -173,7 +245,50 @@ static void _hwtimer_init(struct rt_hwtimer_device *timer, rt_uint32_t state)
{
RCM_EnableAPB1PeriphClock(RCM_APB1_PERIPH_TMR7);
}
-
+#ifdef APM32F10X_HD
+ else if (timer_config->tmr == TMR1)
+ {
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_TMR1);
+ }
+ else if (timer_config->tmr == TMR8)
+ {
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_TMR8);
+ }
+#endif
+#ifdef APM32F40X
+ else if (timer_config->tmr == TMR1)
+ {
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_TMR1);
+ }
+ else if (timer_config->tmr == TMR8)
+ {
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_TMR8);
+ }
+ else if (timer_config->tmr == TMR9)
+ {
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_TMR9);
+ }
+ else if (timer_config->tmr == TMR10)
+ {
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_TMR10);
+ }
+ else if (timer_config->tmr == TMR11)
+ {
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_TMR11);
+ }
+ else if (timer_config->tmr == TMR12)
+ {
+ RCM_EnableAPB1PeriphClock(RCM_APB1_PERIPH_TMR12);
+ }
+ else if (timer_config->tmr == TMR13)
+ {
+ RCM_EnableAPB1PeriphClock(RCM_APB1_PERIPH_TMR13);
+ }
+ else if (timer_config->tmr == TMR14)
+ {
+ RCM_EnableAPB1PeriphClock(RCM_APB1_PERIPH_TMR14);
+ }
+#endif
prescaler = (uint32_t)(_hwtimer_clock_get(timer_config->tmr) / 10000) - 1;
base_config.period = 10000 - 1;
@@ -227,8 +342,9 @@ static rt_err_t _hwtimer_start(rt_hwtimer_t *timer, rt_uint32_t t, rt_hwtimer_mo
TMR_EnableInterrupt(timer_config->tmr, TMR_INT_UPDATE);
- if (timer_config->tmr == TMR1 || timer_config->tmr == TMR8 || timer_config->tmr == TMR2 ||
- timer_config->tmr == TMR3 || timer_config->tmr == TMR4 || timer_config->tmr == TMR5)
+ if (timer_config->tmr == TMR1 || timer_config->tmr == TMR8 || timer_config->tmr == TMR2 || \
+ timer_config->tmr == TMR3 || timer_config->tmr == TMR4 || timer_config->tmr == TMR5 || \
+ timer_config->tmr == TMR9 || timer_config->tmr == TMR12)
{
if (timer_config->tmr->SMCTRL_B.SMFSEL != TMR_SLAVE_MODE_TRIGGER)
{
@@ -302,6 +418,7 @@ static const struct rt_hwtimer_ops _hwtimer_ops =
.control = _hwtimer_ctrl,
};
+#ifdef APM32F10X_HD
#ifdef BSP_USING_TMR1
void TMR1_UP_IRQHandler(void)
{
@@ -311,6 +428,26 @@ void TMR1_UP_IRQHandler(void)
rt_interrupt_leave();
}
#endif
+#elif APM32F40X
+#if (defined(BSP_USING_TMR1) || defined(BSP_USING_TMR10))
+void TMR1_UP_TMR10_IRQHandler(void)
+{
+ rt_interrupt_enter();
+ if (TMR_ReadIntFlag(TMR1, TMR_INT_UPDATE))
+ {
+ rt_device_hwtimer_isr(&tmr_config[TMR1_INDEX].device);
+ TMR_ClearIntFlag(TMR1, TMR_INT_UPDATE);
+ }
+ if (TMR_ReadIntFlag(TMR10, TMR_INT_UPDATE))
+ {
+ rt_device_hwtimer_isr(&tmr_config[TMR10_INDEX].device);
+ TMR_ClearIntFlag(TMR10, TMR_INT_UPDATE);
+ }
+ rt_interrupt_leave();
+}
+#endif
+#endif
+
#ifdef BSP_USING_TMR2
void TMR2_IRQHandler(void)
{
@@ -320,6 +457,7 @@ void TMR2_IRQHandler(void)
rt_interrupt_leave();
}
#endif
+
#ifdef BSP_USING_TMR3
void TMR3_IRQHandler(void)
{
@@ -329,6 +467,7 @@ void TMR3_IRQHandler(void)
rt_interrupt_leave();
}
#endif
+
#ifdef BSP_USING_TMR4
void TMR4_IRQHandler(void)
{
@@ -338,6 +477,7 @@ void TMR4_IRQHandler(void)
rt_interrupt_leave();
}
#endif
+
#ifdef BSP_USING_TMR5
void TMR5_IRQHandler(void)
{
@@ -347,8 +487,13 @@ void TMR5_IRQHandler(void)
rt_interrupt_leave();
}
#endif
+
#ifdef BSP_USING_TMR6
+#ifdef APM32F10X_HD
void TMR6_IRQHandler(void)
+#elif APM32F40X
+void TMR6_DAC_IRQHandler(void)
+#endif
{
rt_interrupt_enter();
rt_device_hwtimer_isr(&tmr_config[TMR6_INDEX].device);
@@ -356,6 +501,7 @@ void TMR6_IRQHandler(void)
rt_interrupt_leave();
}
#endif
+
#ifdef BSP_USING_TMR7
void TMR7_IRQHandler(void)
{
@@ -365,6 +511,8 @@ void TMR7_IRQHandler(void)
rt_interrupt_leave();
}
#endif
+
+#ifdef APM32F10X_HD
#ifdef BSP_USING_TMR8
void TMR8_UP_IRQHandler(void)
{
@@ -374,6 +522,65 @@ void TMR8_UP_IRQHandler(void)
rt_interrupt_leave();
}
#endif
+#elif APM32F40X
+#if (defined(BSP_USING_TMR8) || defined(BSP_USING_TMR13))
+void TMR8_UP_TMR13_IRQHandler(void)
+{
+ rt_interrupt_enter();
+ if (TMR_ReadIntFlag(TMR8, TMR_INT_UPDATE))
+ {
+ rt_device_hwtimer_isr(&tmr_config[TMR8_INDEX].device);
+ TMR_ClearIntFlag(TMR8, TMR_INT_UPDATE);
+ }
+ if (TMR_ReadIntFlag(TMR13, TMR_INT_UPDATE))
+ {
+ rt_device_hwtimer_isr(&tmr_config[TMR13_INDEX].device);
+ TMR_ClearIntFlag(TMR13, TMR_INT_UPDATE);
+ }
+ rt_interrupt_leave();
+}
+#endif
+#endif
+
+#ifdef BSP_USING_TMR9
+void TMR1_BRK_TMR9_IRQHandler(void)
+{
+ rt_interrupt_enter();
+ rt_device_hwtimer_isr(&tmr_config[TMR9_INDEX].device);
+ TMR_ClearIntFlag(TMR9, TMR_INT_UPDATE);
+ rt_interrupt_leave();
+}
+#endif
+
+#ifdef BSP_USING_TMR11
+void TMR1_TRG_COM_TMR11_IRQHandler(void)
+{
+ rt_interrupt_enter();
+ rt_device_hwtimer_isr(&tmr_config[TMR11_INDEX].device);
+ TMR_ClearIntFlag(TMR11, TMR_INT_UPDATE);
+ rt_interrupt_leave();
+}
+#endif
+
+#ifdef BSP_USING_TMR12
+void TMR8_BRK_TMR12_IRQHandler(void)
+{
+ rt_interrupt_enter();
+ rt_device_hwtimer_isr(&tmr_config[TMR12_INDEX].device);
+ TMR_ClearIntFlag(TMR12, TMR_INT_UPDATE);
+ rt_interrupt_leave();
+}
+#endif
+
+#ifdef BSP_USING_TMR14
+void TMR8_TRG_COM_TMR14_IRQHandler(void)
+{
+ rt_interrupt_enter();
+ rt_device_hwtimer_isr(&tmr_config[TMR14_INDEX].device);
+ TMR_ClearIntFlag(TMR14, TMR_INT_UPDATE);
+ rt_interrupt_leave();
+}
+#endif
static int rt_hw_hwtimer_init(void)
{
diff --git a/bsp/apm32/libraries/Drivers/drv_pwm.c b/bsp/apm32/libraries/Drivers/drv_pwm.c
index 5ddef39e42..49ce486270 100644
--- a/bsp/apm32/libraries/Drivers/drv_pwm.c
+++ b/bsp/apm32/libraries/Drivers/drv_pwm.c
@@ -6,6 +6,7 @@
* Change Logs:
* Date Author Notes
* 2022-03-04 stevetong459 first version
+ * 2022-07-15 Aligagago add apm32F4 series MCU support
*/
#include
@@ -21,6 +22,7 @@
#define MIN_PERIOD 3
#define MIN_PULSE 2
+#ifdef APM32F10X_HD
#define _PWM_GPIO_INIT(port_num, pin_num) \
do \
{ \
@@ -30,6 +32,18 @@ do \
gpio_config->speed = GPIO_SPEED_50MHz; \
GPIO_Config(GPIO##port_num, gpio_config); \
} while (0)
+#elif APM32F40X
+#define _PWM_GPIO_INIT(port_num, pin_num) \
+do \
+{ \
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIO##port_num); \
+ gpio_config->pin = GPIO_PIN_##pin_num; \
+ gpio_config->mode = GPIO_MODE_AF; \
+ gpio_config->otype = GPIO_OTYPE_PP; \
+ gpio_config->speed = GPIO_SPEED_50MHz; \
+ GPIO_Config(GPIO##port_num, gpio_config); \
+} while (0)
+#endif
enum
{
@@ -51,6 +65,24 @@ enum
#ifdef BSP_USING_PWM8
PWM8_INDEX,
#endif
+#ifdef BSP_USING_PWM9
+ PWM9_INDEX,
+#endif
+#ifdef BSP_USING_PWM10
+ PWM10_INDEX,
+#endif
+#ifdef BSP_USING_PWM11
+ PWM11_INDEX,
+#endif
+#ifdef BSP_USING_PWM12
+ PWM12_INDEX,
+#endif
+#ifdef BSP_USING_PWM13
+ PWM13_INDEX,
+#endif
+#ifdef BSP_USING_PWM14
+ PWM14_INDEX,
+#endif
};
struct apm32_pwm
@@ -105,6 +137,48 @@ static struct apm32_pwm pwm_config[] =
0,
},
#endif
+#ifdef BSP_USING_PWM9
+ {
+ "pwm9",
+ TMR9,
+ 0,
+ },
+#endif
+#ifdef BSP_USING_PWM10
+ {
+ "pwm10",
+ TMR10,
+ 0,
+ },
+#endif
+#ifdef BSP_USING_PWM11
+ {
+ "pwm11",
+ TMR11,
+ 0,
+ },
+#endif
+#ifdef BSP_USING_PWM12
+ {
+ "pwm12",
+ TMR12,
+ 0,
+ },
+#endif
+#ifdef BSP_USING_PWM13
+ {
+ "pwm13",
+ TMR13,
+ 0,
+ },
+#endif
+#ifdef BSP_USING_PWM14
+ {
+ "pwm14",
+ TMR14,
+ 0,
+ },
+#endif
};
static void _pwm_channel_init(GPIO_Config_T *gpio_config)
@@ -141,6 +215,8 @@ static void _pwm_channel_init(GPIO_Config_T *gpio_config)
pwm_config[PWM2_INDEX].channel |= 1 << 3;
_PWM_GPIO_INIT(A, 3);
#endif
+
+#ifdef APM32F10X_HD
#ifdef BSP_USING_PWM3_CH1
pwm_config[PWM3_INDEX].channel |= 1 << 0;
GPIO_ConfigPinRemap(GPIO_FULL_REMAP_TMR3);
@@ -161,6 +237,29 @@ static void _pwm_channel_init(GPIO_Config_T *gpio_config)
GPIO_ConfigPinRemap(GPIO_FULL_REMAP_TMR3);
_PWM_GPIO_INIT(C, 9);
#endif
+#elif APM32F40X
+#ifdef BSP_USING_PWM3_CH1
+ pwm_config[PWM3_INDEX].channel |= 1 << 0;
+ GPIO_ConfigPinAF(GPIOC,GPIO_PIN_SOURCE_6,GPIO_AF_TMR3);
+ _PWM_GPIO_INIT(C, 6);
+#endif
+#ifdef BSP_USING_PWM3_CH2
+ pwm_config[PWM3_INDEX].channel |= 1 << 1;
+ GPIO_ConfigPinAF(GPIOC,GPIO_PIN_SOURCE_7,GPIO_AF_TMR3);
+ _PWM_GPIO_INIT(C, 7);
+#endif
+#ifdef BSP_USING_PWM3_CH3
+ pwm_config[PWM3_INDEX].channel |= 1 << 2;
+ GPIO_ConfigPinAF(GPIOC,GPIO_PIN_SOURCE_8,GPIO_AF_TMR3);
+ _PWM_GPIO_INIT(C, 8);
+#endif
+#ifdef BSP_USING_PWM3_CH4
+ pwm_config[PWM3_INDEX].channel |= 1 << 3;
+ GPIO_ConfigPinAF(GPIOC,GPIO_PIN_SOURCE_9,GPIO_AF_TMR3);
+ _PWM_GPIO_INIT(C, 9);
+#endif
+#endif
+
#ifdef BSP_USING_PWM4_CH1
pwm_config[PWM4_INDEX].channel |= 1 << 0;
_PWM_GPIO_INIT(B, 6);
@@ -209,6 +308,40 @@ static void _pwm_channel_init(GPIO_Config_T *gpio_config)
pwm_config[PWM8_INDEX].channel |= 1 << 3;
_PWM_GPIO_INIT(C, 9);
#endif
+#ifdef APM32F40X
+#ifdef BSP_USING_PWM9_CH1
+ pwm_config[PWM9_INDEX].channel |= 1 << 0;
+ _PWM_GPIO_INIT(E, 5);
+#endif
+#ifdef BSP_USING_PWM9_CH2
+ pwm_config[PWM9_INDEX].channel |= 1 << 1;
+ _PWM_GPIO_INIT(E, 6);
+#endif
+#ifdef BSP_USING_PWM10_CH1
+ pwm_config[PWM10_INDEX].channel |= 1 << 0;
+ _PWM_GPIO_INIT(F, 6);
+#endif
+#ifdef BSP_USING_PWM11_CH1
+ pwm_config[PWM11_INDEX].channel |= 1 << 0;
+ _PWM_GPIO_INIT(F, 7);
+#endif
+#ifdef BSP_USING_PWM12_CH1
+ pwm_config[PWM12_INDEX].channel |= 1 << 0;
+ _PWM_GPIO_INIT(H, 6);
+#endif
+#ifdef BSP_USING_PWM12_CH2
+ pwm_config[PWM12_INDEX].channel |= 1 << 1;
+ _PWM_GPIO_INIT(H, 9);
+#endif
+#ifdef BSP_USING_PWM13_CH1
+ pwm_config[PWM13_INDEX].channel |= 1 << 0;
+ _PWM_GPIO_INIT(F, 8);
+#endif
+#ifdef BSP_USING_PWM14_CH1
+ pwm_config[PWM14_INDEX].channel |= 1 << 0;
+ _PWM_GPIO_INIT(F, 9);
+#endif
+#endif
}
static rt_err_t _pwm_hw_init(struct apm32_pwm *device)
diff --git a/bsp/apm32/libraries/Drivers/drv_rtc.c b/bsp/apm32/libraries/Drivers/drv_rtc.c
index 5eee7d29ed..59c553a8e9 100644
--- a/bsp/apm32/libraries/Drivers/drv_rtc.c
+++ b/bsp/apm32/libraries/Drivers/drv_rtc.c
@@ -6,6 +6,7 @@
* Change Logs:
* Date Author Notes
* 2022-03-04 stevetong459 first version
+ * 2022-07-15 Aligagago add apm32F4 series MCU support
*/
#include "board.h"
@@ -27,6 +28,7 @@
#define DRV_RTC_TIME_OUT 0xFFF
static rt_rtc_dev_t apm32_rtc_dev;
+static rt_uint8_t rtc_init_flag = RESET;
/**
* @brief This function will initialize the rtc on chip.
@@ -38,7 +40,12 @@ static rt_err_t _rtc_init(void)
volatile rt_uint32_t counter = 0;
/* Enable RTC Clock */
+#ifdef APM32F10X_HD
RCM_EnableAPB1PeriphClock(RCM_APB1_PERIPH_PMU | RCM_APB1_PERIPH_BAKR);
+#elif defined(APM32F40X)
+ RCM_EnableAPB1PeriphClock(RCM_APB1_PERIPH_PMU);
+#endif
+
PMU_EnableBackupAccess();
/* Config RTC clock */
@@ -65,9 +72,9 @@ static rt_err_t _rtc_init(void)
#endif
RCM_EnableRTCCLK();
+ RTC_WaitForSynchro();
- RTC_WaitForSynchor();
-
+#ifdef APM32F10X_HD
counter = 0;
while (!RTC_ReadStatusFlag(RTC_FLAG_OC))
{
@@ -81,15 +88,26 @@ static rt_err_t _rtc_init(void)
RTC_ClearStatusFlag(RTC_FLAG_OVR | RTC_FLAG_ALR | RTC_FLAG_SEC);
-#ifdef BSP_RTC_USING_LSI
- RTC_ConfigPrescaler(LSI_VALUE - 1);
-#else
- RTC_ConfigPrescaler(LSE_VALUE - 1);
+ #ifdef BSP_RTC_USING_LSI
+ RTC_ConfigPrescaler(LSI_VALUE - 1);
+ #else
+ RTC_ConfigPrescaler(LSE_VALUE - 1);
+ #endif
+#elif defined(APM32F40X)
+ RTC_EnableInit();
+ RTC_Config_T rtcConfig;
+ RTC_ConfigStructInit(&rtcConfig);
+ RTC_Config(&rtcConfig);
#endif
+ if(!rtc_init_flag)
+ {
+ rtc_init_flag = SET;
+ }
return RT_EOK;
}
+#ifdef APM32F10X_HD
/**
* @brief This function will initialize the rtc on chip.
*
@@ -116,6 +134,11 @@ static rt_err_t _rtc_set_secs(void *args)
{
volatile rt_uint32_t counter = 0;
+ if(!rtc_init_flag)
+ {
+ _rtc_init();
+ }
+
while (!RTC_ReadStatusFlag(RTC_FLAG_OC))
{
if (++counter > DRV_RTC_TIME_OUT)
@@ -128,7 +151,83 @@ static rt_err_t _rtc_set_secs(void *args)
return RT_EOK;
}
+#elif defined(APM32F40X)
+static void get_rtc_timeval(struct timeval *tv)
+{
+ RTC_TimeConfig_T timeConfig;
+ RTC_DateConfig_T dateConfig;
+ struct tm tm_new = {0};
+ RTC_ReadTime(RTC_FORMAT_BIN, &timeConfig);
+ RTC_ReadDate(RTC_FORMAT_BIN, &dateConfig);
+
+ tm_new.tm_sec = timeConfig.seconds;
+ tm_new.tm_min = timeConfig.minutes;
+ tm_new.tm_hour = timeConfig.hours;
+ tm_new.tm_mday = dateConfig.date;
+ tm_new.tm_mon = dateConfig.month - 1;
+ tm_new.tm_year = dateConfig.year + 100;
+
+ tv->tv_sec = timegm(&tm_new);
+}
+
+static rt_err_t set_rtc_timeval(time_t time_stamp)
+{
+ RTC_TimeConfig_T timeConfig;
+ RTC_DateConfig_T dateConfig;
+ struct tm tm = {0};
+
+ if(!rtc_init_flag)
+ {
+ _rtc_init();
+ }
+
+ gmtime_r(&time_stamp, &tm);
+ if (tm.tm_year < 100)
+ {
+ return -RT_ERROR;
+ }
+
+ timeConfig.seconds = tm.tm_sec ;
+ timeConfig.minutes = tm.tm_min ;
+ timeConfig.hours = tm.tm_hour;
+ dateConfig.date = tm.tm_mday;
+ dateConfig.month = tm.tm_mon + 1 ;
+ dateConfig.year = tm.tm_year - 100;
+ dateConfig.weekday = tm.tm_wday + 1;
+
+ RTC_ConfigTime(RTC_FORMAT_BIN,&timeConfig);
+ RTC_ConfigDate(RTC_FORMAT_BIN,&dateConfig);
+
+ return RT_EOK;
+}
+
+/**
+ * @brief This function will initialize the rtc on chip.
+ *
+ * @return RT_EOK indicates successful initialize, other value indicates failed;
+ */
+static rt_err_t _rtc_get_secs(void *args)
+{
+ struct timeval tv;
+ get_rtc_timeval(&tv);
+ *(rt_uint32_t *) args = tv.tv_sec;
+
+ return RT_EOK;
+}
+
+static rt_err_t _rtc_set_secs(void *args)
+{
+ rt_err_t result = RT_EOK;
+
+ if (set_rtc_timeval(*(rt_uint32_t *)args))
+ {
+ result = -RT_ERROR;
+ }
+
+ return result;
+}
+#endif
static const struct rt_rtc_ops _rtc_ops =
{
diff --git a/bsp/apm32/libraries/Drivers/drv_spi.c b/bsp/apm32/libraries/Drivers/drv_spi.c
index db4dc734e0..e7ac69cd53 100644
--- a/bsp/apm32/libraries/Drivers/drv_spi.c
+++ b/bsp/apm32/libraries/Drivers/drv_spi.c
@@ -6,6 +6,7 @@
* Change Logs:
* Date Author Notes
* 2022-03-04 stevetong459 first version
+ * 2022-07-15 Aligagago add apm32F4 series MCU support
*/
#include "drv_spi.h"
@@ -195,6 +196,7 @@ static int rt_hw_spi_init(void)
int result = 0;
GPIO_Config_T gpio_config;
+#ifdef APM32F10X_HD
#ifdef BSP_USING_SPI1
static struct rt_spi_bus spi_bus1;
spi_bus1.parent.user_data = (void *)SPI1;
@@ -256,6 +258,83 @@ static int rt_hw_spi_init(void)
gpio_config.speed = GPIO_SPEED_50MHz;
gpio_config.pin = GPIO_PIN_4;
GPIO_Config(GPIOB, &gpio_config);
+#endif
+#elif defined(APM32F40X)
+#ifdef BSP_USING_SPI1
+ static struct rt_spi_bus spi_bus1;
+ spi_bus1.parent.user_data = (void *)SPI1;
+
+ result = rt_spi_bus_register(&spi_bus1, "spi1", &_spi_ops);
+
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOA);
+ RCM_EnableAPB2PeriphClock(RCM_APB2_PERIPH_SPI1);
+
+ /* Config SPI1 PinAF */
+ GPIO_ConfigPinAF(GPIOA, GPIO_PIN_SOURCE_4, GPIO_AF_SPI1);
+ GPIO_ConfigPinAF(GPIOA, GPIO_PIN_SOURCE_5, GPIO_AF_SPI1);
+ GPIO_ConfigPinAF(GPIOA, GPIO_PIN_SOURCE_6, GPIO_AF_SPI1);
+ GPIO_ConfigPinAF(GPIOA, GPIO_PIN_SOURCE_7, GPIO_AF_SPI1);
+
+ /* SPI1_NSS(PA4) SPI1_SCK(PA5) SPI1_MISO(PA6) SPI1_MOSI(PA7) */
+ gpio_config.mode = GPIO_MODE_AF;
+ gpio_config.speed = GPIO_SPEED_100MHz;
+ gpio_config.otype = GPIO_OTYPE_PP;
+ gpio_config.pupd = GPIO_PUPD_NOPULL;
+ gpio_config.pin = GPIO_PIN_4 | GPIO_PIN_5 | GPIO_PIN_6 | GPIO_PIN_7;
+ GPIO_Config(GPIOA, &gpio_config);
+#endif
+
+#ifdef BSP_USING_SPI2
+ static struct rt_spi_bus spi_bus2;
+ spi_bus2.parent.user_data = (void *)SPI2;
+
+ result = rt_spi_bus_register(&spi_bus2, "spi2", &_spi_ops);
+
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOB);
+ RCM_EnableAPB1PeriphClock(RCM_APB1_PERIPH_SPI2);
+
+ /* Config SPI2 PinAF */
+ GPIO_ConfigPinAF(GPIOB, GPIO_PIN_SOURCE_12, GPIO_AF_SPI2);
+ GPIO_ConfigPinAF(GPIOB, GPIO_PIN_SOURCE_13, GPIO_AF_SPI2);
+ GPIO_ConfigPinAF(GPIOB, GPIO_PIN_SOURCE_14, GPIO_AF_SPI2);
+ GPIO_ConfigPinAF(GPIOB, GPIO_PIN_SOURCE_15, GPIO_AF_SPI2);
+
+ /* SPI2_NSS(PB12) SPI2_SCK(PB13) SPI2_MISO(PB14) SPI2_MOSI(PB15) */
+ gpio_config.mode = GPIO_MODE_AF;
+ gpio_config.speed = GPIO_SPEED_100MHz;
+ gpio_config.otype = GPIO_OTYPE_PP;
+ gpio_config.pupd = GPIO_PUPD_NOPULL;
+ gpio_config.pin = GPIO_PIN_12 | GPIO_PIN_13 | GPIO_PIN_14 | GPIO_PIN_15;
+ GPIO_Config(GPIOB, &gpio_config);
+#endif
+
+#ifdef BSP_USING_SPI3
+ static struct rt_spi_bus spi_bus3;
+ spi_bus3.parent.user_data = (void *)SPI3;
+
+ result = rt_spi_bus_register(&spi_bus3, "spi3", &_spi_ops);
+
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOA);
+ RCM_EnableAHB1PeriphClock(RCM_AHB1_PERIPH_GPIOB);
+ RCM_EnableAPB1PeriphClock(RCM_APB1_PERIPH_SPI3);
+
+ /* Config SPI3 PinAF */
+ GPIO_ConfigPinAF(GPIOA, GPIO_PIN_SOURCE_4, GPIO_AF_SPI3);
+ GPIO_ConfigPinAF(GPIOB, GPIO_PIN_SOURCE_3, GPIO_AF_SPI3);
+ GPIO_ConfigPinAF(GPIOB, GPIO_PIN_SOURCE_4, GPIO_AF_SPI3);
+ GPIO_ConfigPinAF(GPIOB, GPIO_PIN_SOURCE_5, GPIO_AF_SPI3);
+
+ /* SPI3_SCK(PB3) SPI3_MISO(PB4) SPI3_MOSI(PB5) */
+ gpio_config.mode = GPIO_MODE_AF;
+ gpio_config.speed = GPIO_SPEED_100MHz;
+ gpio_config.otype = GPIO_OTYPE_PP;
+ gpio_config.pupd = GPIO_PUPD_NOPULL;
+ gpio_config.pin = GPIO_PIN_3 | GPIO_PIN_4 |GPIO_PIN_5;
+ GPIO_Config(GPIOB, &gpio_config);
+ /* SPI3_NSS(PA4) */
+ gpio_config.pin = GPIO_PIN_4;
+ GPIO_Config(GPIOA, &gpio_config);
+#endif
#endif
return result;
}
diff --git a/bsp/apm32/libraries/Drivers/drv_usart.c b/bsp/apm32/libraries/Drivers/drv_usart.c
index 174bd8ebba..d7d27de8a6 100644
--- a/bsp/apm32/libraries/Drivers/drv_usart.c
+++ b/bsp/apm32/libraries/Drivers/drv_usart.c
@@ -231,8 +231,6 @@ static void usart_isr(struct rt_serial_device *serial)
}
}
-
-
#if defined(BSP_USING_UART1)
void USART1_IRQHandler(void)
{
@@ -245,7 +243,6 @@ void USART1_IRQHandler(void)
rt_interrupt_leave();
}
-
#endif /* BSP_USING_UART1 */
#if defined(BSP_USING_UART2)
diff --git a/bsp/apm32/libraries/Kconfig b/bsp/apm32/libraries/Kconfig
index bb335315fa..8ebcd050fd 100644
--- a/bsp/apm32/libraries/Kconfig
+++ b/bsp/apm32/libraries/Kconfig
@@ -6,3 +6,7 @@ config SOC_SERIES_APM32F1
select ARCH_ARM_CORTEX_M3
select SOC_FAMILY_APM32
+config SOC_SERIES_APM32F4
+ bool
+ select ARCH_ARM_CORTEX_M4
+ select SOC_FAMILY_APM32
\ No newline at end of file