+ * The first argument is the "order" of the filter and is always one less than the desired length.
+ * The second argument is the normalized cutoff frequency. This is in the range 0 (DC) to 1.0 (Nyquist).
+ * A 6 kHz cutoff with a Nyquist frequency of 24 kHz lies at a normalized frequency of 6/24 = 0.25.
+ * The CMSIS FIR filter function requires the coefficients to be in time reversed order.
+ *
+ * where, x[n] is the input sequence, N is the number of input samples, and
+ * x' is the mean value of the input sequence, x[n].
+ * \par
+ * The mean value x' is defined as:
+ *
x' = (x[0] + x[1] + ... + x[n-1]) / N
+ *
+ * \par Block Diagram:
+ * \par
+ * \image html Variance.gif
+ *
+ *
+ * \par Variables Description:
+ * \par
+ * \li \c testInput_f32 points to the input data
+ * \li \c wire1, \c wir2, \c wire3 temporary buffers
+ * \li \c blockSize number of samples processed at a time
+ * \li \c refVarianceOut reference variance value
+ *
+ * \par CMSIS DSP Software Library Functions Used:
+ * \par
+ * - arm_dot_prod_f32()
+ * - arm_mult_f32()
+ * - arm_sub_f32()
+ * - arm_fill_f32()
+ * - arm_copy_f32()
+ *
+ * Refer
+ * \link arm_variance_example_f32.c \endlink
+ *
+ */
+
+
+/** \example arm_variance_example_f32.c
+ */
+#include
+#include "arm_math.h"
+
+/* ----------------------------------------------------------------------
+* Defines each of the tests performed
+* ------------------------------------------------------------------- */
+#define MAX_BLOCKSIZE 32
+#define DELTA (0.000001f)
+
+
+/* ----------------------------------------------------------------------
+* Declare I/O buffers
+* ------------------------------------------------------------------- */
+float32_t wire1[MAX_BLOCKSIZE];
+float32_t wire2[MAX_BLOCKSIZE];
+float32_t wire3[MAX_BLOCKSIZE];
+
+/* ----------------------------------------------------------------------
+* Test input data for Floating point Variance example for 32-blockSize
+* Generated by the MATLAB randn() function
+* ------------------------------------------------------------------- */
+
+float32_t testInput_f32[32] =
+{
+-0.432564811528221, -1.665584378238097, 0.125332306474831, 0.287676420358549,
+-1.146471350681464, 1.190915465642999, 1.189164201652103, -0.037633276593318,
+0.327292361408654, 0.174639142820925, -0.186708577681439, 0.725790548293303,
+-0.588316543014189, 2.183185818197101, -0.136395883086596, 0.113931313520810,
+1.066768211359189, 0.059281460523605, -0.095648405483669, -0.832349463650022,
+0.294410816392640, -1.336181857937804, 0.714324551818952, 1.623562064446271,
+-0.691775701702287, 0.857996672828263, 1.254001421602532, -1.593729576447477,
+-1.440964431901020, 0.571147623658178, -0.399885577715363, 0.689997375464345
+
+};
+
+/* ----------------------------------------------------------------------
+* Declare Global variables
+* ------------------------------------------------------------------- */
+uint32_t blockSize = 32;
+float32_t refVarianceOut = 0.903941793931839;
+
+/* ----------------------------------------------------------------------
+* Variance calculation test
+* ------------------------------------------------------------------- */
+
+int32_t main(void)
+{
+ arm_status status;
+ float32_t mean, oneByBlockSize;
+ float32_t variance;
+ float32_t diff;
+
+ status = ARM_MATH_SUCCESS;
+
+ /* Calculation of mean value of input */
+
+ /* x' = 1/blockSize * (x(0)* 1 + x(1) * 1 + ... + x(n-1) * 1) */
+
+ /* Fill wire1 buffer with 1.0 value */
+ arm_fill_f32(1.0, wire1, blockSize);
+
+ /* Calculate the dot product of wire1 and wire2 */
+ /* (x(0)* 1 + x(1) * 1 + ...+ x(n-1) * 1) */
+ arm_dot_prod_f32(testInput_f32, wire1, blockSize, &mean);
+
+ /* Calculation of 1/blockSize */
+ oneByBlockSize = 1.0 / (blockSize);
+
+ /* 1/blockSize * (x(0)* 1 + x(1) * 1 + ... + x(n-1) * 1) */
+ arm_mult_f32(&mean, &oneByBlockSize, &mean, 1);
+
+
+ /* Calculation of variance value of input */
+
+ /* (1/blockSize) * (x(0) - x') * (x(0) - x') + (x(1) - x') * (x(1) - x') + ... + (x(n-1) - x') * (x(n-1) - x') */
+
+ /* Fill wire2 with mean value x' */
+ arm_fill_f32(mean, wire2, blockSize);
+
+ /* wire3 contains (x-x') */
+ arm_sub_f32(testInput_f32, wire2, wire3, blockSize);
+
+ /* wire2 contains (x-x') */
+ arm_copy_f32(wire3, wire2, blockSize);
+
+ /* (x(0) - x') * (x(0) - x') + (x(1) - x') * (x(1) - x') + ... + (x(n-1) - x') * (x(n-1) - x') */
+ arm_dot_prod_f32(wire2, wire3, blockSize, &variance);
+
+ /* Calculation of 1/blockSize */
+ oneByBlockSize = 1.0 / (blockSize - 1);
+
+ /* Calculation of variance */
+ arm_mult_f32(&variance, &oneByBlockSize, &variance, 1);
+
+ /* absolute value of difference between ref and test */
+ diff = fabsf(refVarianceOut - variance);
+
+ /* Comparison of variance value with reference */
+ if(diff > DELTA)
+ {
+ status = ARM_MATH_TEST_FAILURE;
+ }
+
+ if( status != ARM_MATH_SUCCESS)
+ {
+ while(1);
+ }
+
+ while(1); /* main function does not return */
+}
+
+ /** \endlink */
+
diff --git a/CMSIS/DSP_Lib/Source/ARM/arm_cortexMx_math_Build.bat b/CMSIS/DSP_Lib/Source/ARM/arm_cortexMx_math_Build.bat
new file mode 100644
index 0000000..84515bf
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/ARM/arm_cortexMx_math_Build.bat
@@ -0,0 +1,29 @@
+
+SET TMP=C:\Temp
+SET TEMP=C:\Temp
+
+SET UVEXE=C:\Keil\UV4\UV4.EXE
+
+@echo Building DSP Library for Cortex-M0 Little Endian
+%UVEXE% -rb arm_cortexM0x_math.uvproj -t"DSP_Lib CM0 LE" -o"DSP_Lib CM0 LE.txt" -j0
+
+@echo Building DSP Library for Cortex-M0 Big Endian
+%UVEXE% -rb arm_cortexM0x_math.uvproj -t"DSP_Lib CM0 BE" -o"DSP_Lib CM0 BE.txt" -j0
+
+@echo Building DSP Library for Cortex-M3 Little Endian
+%UVEXE% -rb arm_cortexM3x_math.uvproj -t"DSP_Lib CM3 LE" -o"DSP_Lib CM3 LE.txt" -j0
+
+@echo Building DSP Library for Cortex-M3 Big Endian
+%UVEXE% -rb arm_cortexM3x_math.uvproj -t"DSP_Lib CM3 BE" -o"DSP_Lib CM3 BE.txt" -j0
+
+@echo Building DSP Library for Cortex-M4 Little Endian
+%UVEXE% -rb arm_cortexM4x_math.uvproj -t"DSP_Lib CM4 LE" -o"DSP_Lib CM4 LE.txt" -j0
+
+@echo Building DSP Library for Cortex-M4 Big Endian
+%UVEXE% -rb arm_cortexM4x_math.uvproj -t"DSP_Lib CM4 BE" -o"DSP_Lib CM4 BE.txt" -j0
+
+@echo Building DSP Library for Cortex-M4 with FPU Little Endian
+%UVEXE% -rb arm_cortexM4x_math.uvproj -t"DSP_Lib CM4 LE FPU" -o"DSP_Lib CM4 LE FPU.txt" -j0
+
+@echo Building DSP Library for Cortex-M4 with FPU Big Endian
+%UVEXE% -rb arm_cortexM4x_math.uvproj -t"DSP_Lib CM4 BE FPU" -o"DSP_Lib CM4 BE FPU.txt" -j0
\ No newline at end of file
diff --git a/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_abs_f32.c b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_abs_f32.c
new file mode 100644
index 0000000..bccb09b
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/BasicMathFunctions/arm_abs_f32.c
@@ -0,0 +1,158 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:56a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_abs_f32.c
+*
+* Description: Vector absolute value.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+#include
+
+/**
+ * @ingroup groupMath
+ */
+
+/**
+ * @defgroup BasicAbs Vector Absolute Value
+ *
+ * Computes the absolute value of a vector on an element-by-element basis.
+ *
+ *
+ *
+ * In the fixed-point Q7, Q15, and Q31 functions, scale is represented by
+ * a fractional multiplication scaleFract and an arithmetic shift shift.
+ * The shift allows the gain of the scaling operation to exceed 1.0.
+ * The algorithm used with fixed-point data is:
+ *
+ *
+ * where b1x and a1x are the coefficients for the first stage,
+ * b2x and a2x are the coefficients for the second stage,
+ * and so on. The pCoeffs array contains a total of 5*numStages values.
+ *
+ * \par
+ * The pState points to state variables array and size of each state variable is 1.63 format.
+ * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2].
+ * The state variables are arranged in the state array as:
+ *
+ * {x[n-1], x[n-2], y[n-1], y[n-2]}
+ *
+ * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+ * The state array has a total length of 4*numStages values.
+ * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ */
+
+void arm_biquad_cas_df1_32x64_init_q31(
+ arm_biquad_cas_df1_32x64_ins_q31 * S,
+ uint8_t numStages,
+ q31_t * pCoeffs,
+ q63_t * pState,
+ uint8_t postShift)
+{
+ /* Assign filter stages */
+ S->numStages = numStages;
+
+ /* Assign postShift to be applied to the output */
+ S->postShift = postShift;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and size is always 4 * numStages */
+ memset(pState, 0, (4u * (uint32_t) numStages) * sizeof(q63_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ * @} end of BiquadCascadeDF1_32x64 group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c
new file mode 100644
index 0000000..9f6a221
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_32x64_q31.c
@@ -0,0 +1,552 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_biquad_cascade_df1_32x64_q31.c
+*
+* Description: High precision Q31 Biquad cascade filter processing function
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup BiquadCascadeDF1_32x64 High Precision Q31 Biquad Cascade Filter
+ *
+ * This function implements a high precision Biquad cascade filter which operates on
+ * Q31 data values. The filter coefficients are in 1.31 format and the state variables
+ * are in 1.63 format. The double precision state variables reduce quantization noise
+ * in the filter and provide a cleaner output.
+ * These filters are particularly useful when implementing filters in which the
+ * singularities are close to the unit circle. This is common for low pass or high
+ * pass filters with very low cutoff frequencies.
+ *
+ * The function operates on blocks of input and output data
+ * and each call to the function processes blockSize samples through
+ * the filter. pSrc and pDst points to input and output arrays
+ * containing blockSize Q31 values.
+ *
+ * \par Algorithm
+ * Each Biquad stage implements a second order filter using the difference equation:
+ *
+ * A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
+ * \image html Biquad.gif "Single Biquad filter stage"
+ * Coefficients b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients.
+ * Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients.
+ * Pay careful attention to the sign of the feedback coefficients.
+ * Some design tools use the difference equation
+ *
+ * In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library.
+ *
+ * \par
+ * Higher order filters are realized as a cascade of second order sections.
+ * numStages refers to the number of second order stages used.
+ * For example, an 8th order filter would be realized with numStages=4 second order stages.
+ * \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
+ * A 9th order filter would be realized with numStages=5 second order stages with the coefficients for one of the stages configured as a first order filter (b2=0 and a2=0).
+ *
+ * \par
+ * The pState points to state variables array .
+ * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2] and each state variable in 1.63 format to improve precision.
+ * The state variables are arranged in the array as:
+ *
+ * {x[n-1], x[n-2], y[n-1], y[n-2]}
+ *
+ *
+ * \par
+ * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+ * The state array has a total length of 4*numStages values of data in 1.63 format.
+ * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ *
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ *
+ * \par Init Function
+ * There is also an associated initialization function which performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Set the values in the state buffer to zeros before static initialization.
+ * For example, to statically initialize the filter instance structure use
+ *
+ * where numStages is the number of Biquad stages in the filter; pState is the address of the state buffer;
+ * pCoeffs is the address of the coefficient buffer; postShift shift to be applied which is described in detail below.
+ * \par Fixed-Point Behavior
+ * Care must be taken while using Biquad Cascade 32x64 filter function.
+ * Following issues must be considered:
+ * - Scaling of coefficients
+ * - Filter gain
+ * - Overflow and saturation
+ *
+ * \par
+ * Filter coefficients are represented as fractional values and
+ * restricted to lie in the range [-1 +1).
+ * The processing function has an additional scaling parameter postShift
+ * which allows the filter coefficients to exceed the range [+1 -1).
+ * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits.
+ * \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
+ * This essentially scales the filter coefficients by 2^postShift.
+ * For example, to realize the coefficients
+ *
+ * {1.5, -0.8, 1.2, 1.6, -0.9}
+ *
+ * set the Coefficient array to:
+ *
+ * {0.75, -0.4, 0.6, 0.8, -0.45}
+ *
+ * and set postShift=1
+ *
+ * \par
+ * The second thing to keep in mind is the gain through the filter.
+ * The frequency response of a Biquad filter is a function of its coefficients.
+ * It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies.
+ * This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter.
+ * To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed.
+ *
+ * \par
+ * The third item to consider is the overflow and saturation behavior of the fixed-point Q31 version.
+ * This is described in the function specific documentation below.
+ */
+
+/**
+ * @addtogroup BiquadCascadeDF1_32x64
+ * @{
+ */
+
+/**
+ * @details
+
+ * @param[in] *S points to an instance of the high precision Q31 Biquad cascade filter.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ *
+ * \par
+ * The function is implemented using an internal 64-bit accumulator.
+ * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ * Thus, if the accumulator result overflows it wraps around rather than clip.
+ * In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
+ * After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by postShift bits and the result truncated to
+ * 1.31 format by discarding the low 32 bits.
+ *
+ * \par
+ * Two related functions are provided in the CMSIS DSP library.
+ * arm_biquad_cascade_df1_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q63 accumulator.
+ * arm_biquad_cascade_df1_fast_q31() implements a Biquad cascade with 32-bit coefficients and state variables with a Q31 accumulator.
+ */
+
+void arm_biquad_cas_df1_32x64_q31(
+ const arm_biquad_cas_df1_32x64_ins_q31 * S,
+ q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t *pIn = pSrc; /* input pointer initialization */
+ q31_t *pOut = pDst; /* output pointer initialization */
+ q63_t *pState = S->pState; /* state pointer initialization */
+ q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
+ q63_t acc; /* accumulator */
+ q31_t Xn1, Xn2; /* Input Filter state variables */
+ q63_t Yn1, Yn2; /* Output Filter state variables */
+ q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
+ q31_t Xn; /* temporary input */
+ int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
+ uint32_t sample, stage = S->numStages; /* loop counters */
+ q31_t acc_l, acc_h; /* temporary output */
+ uint32_t uShift = ((uint32_t) S->postShift + 1u);
+ uint32_t lShift = 32u - uShift; /* Shift to be applied to the output */
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ do
+ {
+ /* Reading the coefficients */
+ b0 = *pCoeffs++;
+ b1 = *pCoeffs++;
+ b2 = *pCoeffs++;
+ a1 = *pCoeffs++;
+ a2 = *pCoeffs++;
+
+ /* Reading the state values */
+ Xn1 = (q31_t) (pState[0]);
+ Xn2 = (q31_t) (pState[1]);
+ Yn1 = pState[2];
+ Yn2 = pState[3];
+
+ /* Apply loop unrolling and compute 4 output values simultaneously. */
+ /* The variable acc hold output value that is being computed and
+ * stored in the destination buffer
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ */
+
+ sample = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(sample > 0u)
+ {
+ /* Read the input */
+ Xn = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+
+ /* acc = b0 * x[n] */
+ acc = (q63_t) Xn *b0;
+
+ /* acc += b1 * x[n-1] */
+ acc += (q63_t) Xn1 *b1;
+
+ /* acc += b[2] * x[n-2] */
+ acc += (q63_t) Xn2 *b2;
+
+ /* acc += a1 * y[n-1] */
+ acc += mult32x64(Yn1, a1);
+
+ /* acc += a2 * y[n-2] */
+ acc += mult32x64(Yn2, a2);
+
+ /* The result is converted to 1.63 , Yn2 variable is reused */
+ Yn2 = acc << shift;
+
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Store the output in the destination buffer in 1.31 format. */
+ *pOut = acc_h;
+
+ /* Read the second input into Xn2, to reuse the value */
+ Xn2 = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+
+ /* acc += b1 * x[n-1] */
+ acc = (q63_t) Xn *b1;
+
+ /* acc = b0 * x[n] */
+ acc += (q63_t) Xn2 *b0;
+
+ /* acc += b[2] * x[n-2] */
+ acc += (q63_t) Xn1 *b2;
+
+ /* acc += a1 * y[n-1] */
+ acc += mult32x64(Yn2, a1);
+
+ /* acc += a2 * y[n-2] */
+ acc += mult32x64(Yn1, a2);
+
+ /* The result is converted to 1.63, Yn1 variable is reused */
+ Yn1 = acc << shift;
+
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Read the third input into Xn1, to reuse the value */
+ Xn1 = *pIn++;
+
+ /* The result is converted to 1.31 */
+ /* Store the output in the destination buffer. */
+ *(pOut + 1u) = acc_h;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+
+ /* acc = b0 * x[n] */
+ acc = (q63_t) Xn1 *b0;
+
+ /* acc += b1 * x[n-1] */
+ acc += (q63_t) Xn2 *b1;
+
+ /* acc += b[2] * x[n-2] */
+ acc += (q63_t) Xn *b2;
+
+ /* acc += a1 * y[n-1] */
+ acc += mult32x64(Yn1, a1);
+
+ /* acc += a2 * y[n-2] */
+ acc += mult32x64(Yn2, a2);
+
+ /* The result is converted to 1.63, Yn2 variable is reused */
+ Yn2 = acc << shift;
+
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Store the output in the destination buffer in 1.31 format. */
+ *(pOut + 2u) = acc_h;
+
+ /* Read the fourth input into Xn, to reuse the value */
+ Xn = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ /* acc = b0 * x[n] */
+ acc = (q63_t) Xn *b0;
+
+ /* acc += b1 * x[n-1] */
+ acc += (q63_t) Xn1 *b1;
+
+ /* acc += b[2] * x[n-2] */
+ acc += (q63_t) Xn2 *b2;
+
+ /* acc += a1 * y[n-1] */
+ acc += mult32x64(Yn2, a1);
+
+ /* acc += a2 * y[n-2] */
+ acc += mult32x64(Yn1, a2);
+
+ /* The result is converted to 1.63, Yn1 variable is reused */
+ Yn1 = acc << shift;
+
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Store the output in the destination buffer in 1.31 format. */
+ *(pOut + 3u) = acc_h;
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ Xn2 = Xn1;
+ Xn1 = Xn;
+
+ /* update output pointer */
+ pOut += 4u;
+
+ /* decrement the loop counter */
+ sample--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ sample = (blockSize & 0x3u);
+
+ while(sample > 0u)
+ {
+ /* Read the input */
+ Xn = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+
+ /* acc = b0 * x[n] */
+ acc = (q63_t) Xn *b0;
+ /* acc += b1 * x[n-1] */
+ acc += (q63_t) Xn1 *b1;
+ /* acc += b[2] * x[n-2] */
+ acc += (q63_t) Xn2 *b2;
+ /* acc += a1 * y[n-1] */
+ acc += mult32x64(Yn1, a1);
+ /* acc += a2 * y[n-2] */
+ acc += mult32x64(Yn2, a2);
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ Xn2 = Xn1;
+ Xn1 = Xn;
+ Yn2 = Yn1;
+ /* The result is converted to 1.63, Yn1 variable is reused */
+ Yn1 = acc << shift;
+
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Store the output in the destination buffer in 1.31 format. */
+ *pOut++ = acc_h;
+ //Yn1 = acc << shift;
+
+ /* Store the output in the destination buffer in 1.31 format. */
+// *pOut++ = (q31_t) (acc >> (32 - shift));
+
+ /* decrement the loop counter */
+ sample--;
+ }
+
+ /* The first stage output is given as input to the second stage. */
+ pIn = pDst;
+
+ /* Reset to destination buffer working pointer */
+ pOut = pDst;
+
+ /* Store the updated state variables back into the pState array */
+ /* Store the updated state variables back into the pState array */
+ *pState++ = (q63_t) Xn1;
+ *pState++ = (q63_t) Xn2;
+ *pState++ = Yn1;
+ *pState++ = Yn2;
+
+ } while(--stage);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ do
+ {
+ /* Reading the coefficients */
+ b0 = *pCoeffs++;
+ b1 = *pCoeffs++;
+ b2 = *pCoeffs++;
+ a1 = *pCoeffs++;
+ a2 = *pCoeffs++;
+
+ /* Reading the state values */
+ Xn1 = pState[0];
+ Xn2 = pState[1];
+ Yn1 = pState[2];
+ Yn2 = pState[3];
+
+ /* The variable acc hold output value that is being computed and
+ * stored in the destination buffer
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ */
+
+ sample = blockSize;
+
+ while(sample > 0u)
+ {
+ /* Read the input */
+ Xn = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ /* acc = b0 * x[n] */
+ acc = (q63_t) Xn *b0;
+ /* acc += b1 * x[n-1] */
+ acc += (q63_t) Xn1 *b1;
+ /* acc += b[2] * x[n-2] */
+ acc += (q63_t) Xn2 *b2;
+ /* acc += a1 * y[n-1] */
+ acc += mult32x64(Yn1, a1);
+ /* acc += a2 * y[n-2] */
+ acc += mult32x64(Yn2, a2);
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ Xn2 = Xn1;
+ Xn1 = Xn;
+ Yn2 = Yn1;
+
+ /* The result is converted to 1.63, Yn1 variable is reused */
+ Yn1 = acc << shift;
+
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ acc_h = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Store the output in the destination buffer in 1.31 format. */
+ *pOut++ = acc_h;
+
+ //Yn1 = acc << shift;
+
+ /* Store the output in the destination buffer in 1.31 format. */
+ //*pOut++ = (q31_t) (acc >> (32 - shift));
+
+ /* decrement the loop counter */
+ sample--;
+ }
+
+ /* The first stage output is given as input to the second stage. */
+ pIn = pDst;
+
+ /* Reset to destination buffer working pointer */
+ pOut = pDst;
+
+ /* Store the updated state variables back into the pState array */
+ *pState++ = (q63_t) Xn1;
+ *pState++ = (q63_t) Xn2;
+ *pState++ = Yn1;
+ *pState++ = Yn2;
+
+ } while(--stage);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+}
+
+ /**
+ * @} end of BiquadCascadeDF1_32x64 group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c
new file mode 100644
index 0000000..2e8d700
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_f32.c
@@ -0,0 +1,420 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_biquad_cascade_df1_f32.c
+*
+* Description: Processing function for the
+* floating-point Biquad cascade DirectFormI(DF1) filter.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup BiquadCascadeDF1 Biquad Cascade IIR Filters Using Direct Form I Structure
+ *
+ * This set of functions implements arbitrary order recursive (IIR) filters.
+ * The filters are implemented as a cascade of second order Biquad sections.
+ * The functions support Q15, Q31 and floating-point data types.
+ * Fast version of Q15 and Q31 also supported on CortexM4 and Cortex-M3.
+ *
+ * The functions operate on blocks of input and output data and each call to the function
+ * processes blockSize samples through the filter.
+ * pSrc points to the array of input data and
+ * pDst points to the array of output data.
+ * Both arrays contain blockSize values.
+ *
+ * \par Algorithm
+ * Each Biquad stage implements a second order filter using the difference equation:
+ *
+ * A Direct Form I algorithm is used with 5 coefficients and 4 state variables per stage.
+ * \image html Biquad.gif "Single Biquad filter stage"
+ * Coefficients b0, b1 and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients.
+ * Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients.
+ * Pay careful attention to the sign of the feedback coefficients.
+ * Some design tools use the difference equation
+ *
+ * In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library.
+ *
+ * \par
+ * Higher order filters are realized as a cascade of second order sections.
+ * numStages refers to the number of second order stages used.
+ * For example, an 8th order filter would be realized with numStages=4 second order stages.
+ * \image html BiquadCascade.gif "8th order filter using a cascade of Biquad stages"
+ * A 9th order filter would be realized with numStages=5 second order stages with the coefficients for one of the stages configured as a first order filter (b2=0 and a2=0).
+ *
+ * \par
+ * The pState points to state variables array.
+ * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2].
+ * The state variables are arranged in the pState array as:
+ *
+ * {x[n-1], x[n-2], y[n-1], y[n-2]}
+ *
+ *
+ * \par
+ * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+ * The state array has a total length of 4*numStages values.
+ * The state variables are updated after each block of data is processed, the coefficients are untouched.
+ *
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Init Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Set the values in the state buffer to zeros before static initialization.
+ * The code below statically initializes each of the 3 different data type filter instance structures
+ *
+ * where numStages is the number of Biquad stages in the filter; pState is the address of the state buffer;
+ * pCoeffs is the address of the coefficient buffer; postShift shift to be applied.
+ *
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the Q15 and Q31 versions of the Biquad Cascade filter functions.
+ * Following issues must be considered:
+ * - Scaling of coefficients
+ * - Filter gain
+ * - Overflow and saturation
+ *
+ * \par
+ * Scaling of coefficients:
+ * Filter coefficients are represented as fractional values and
+ * coefficients are restricted to lie in the range [-1 +1).
+ * The fixed-point functions have an additional scaling parameter postShift
+ * which allow the filter coefficients to exceed the range [+1 -1).
+ * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits.
+ * \image html BiquadPostshift.gif "Fixed-point Biquad with shift by postShift bits after accumulator"
+ * This essentially scales the filter coefficients by 2^postShift.
+ * For example, to realize the coefficients
+ *
+ * {1.5, -0.8, 1.2, 1.6, -0.9}
+ *
+ * set the pCoeffs array to:
+ *
+ * {0.75, -0.4, 0.6, 0.8, -0.45}
+ *
+ * and set postShift=1
+ *
+ * \par
+ * Filter gain:
+ * The frequency response of a Biquad filter is a function of its coefficients.
+ * It is possible for the gain through the filter to exceed 1.0 meaning that the filter increases the amplitude of certain frequencies.
+ * This means that an input signal with amplitude < 1.0 may result in an output > 1.0 and these are saturated or overflowed based on the implementation of the filter.
+ * To avoid this behavior the filter needs to be scaled down such that its peak gain < 1.0 or the input signal must be scaled down so that the combination of input and filter are never overflowed.
+ *
+ * \par
+ * Overflow and saturation:
+ * For Q15 and Q31 versions, it is described separately as part of the function specific documentation below.
+ */
+
+/**
+ * @addtogroup BiquadCascadeDF1
+ * @{
+ */
+
+/**
+ * @param[in] *S points to an instance of the floating-point Biquad cascade structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process per call.
+ * @return none.
+ *
+ */
+
+void arm_biquad_cascade_df1_f32(
+ const arm_biquad_casd_df1_inst_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pIn = pSrc; /* source pointer */
+ float32_t *pOut = pDst; /* destination pointer */
+ float32_t *pState = S->pState; /* pState pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* coefficient pointer */
+ float32_t acc; /* Simulates the accumulator */
+ float32_t b0, b1, b2, a1, a2; /* Filter coefficients */
+ float32_t Xn1, Xn2, Yn1, Yn2; /* Filter pState variables */
+ float32_t Xn; /* temporary input */
+ uint32_t sample, stage = S->numStages; /* loop counters */
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ do
+ {
+ /* Reading the coefficients */
+ b0 = *pCoeffs++;
+ b1 = *pCoeffs++;
+ b2 = *pCoeffs++;
+ a1 = *pCoeffs++;
+ a2 = *pCoeffs++;
+
+ /* Reading the pState values */
+ Xn1 = pState[0];
+ Xn2 = pState[1];
+ Yn1 = pState[2];
+ Yn2 = pState[3];
+
+ /* Apply loop unrolling and compute 4 output values simultaneously. */
+ /* The variable acc hold output values that are being computed:
+ *
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ */
+
+ sample = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(sample > 0u)
+ {
+ /* Read the first input */
+ Xn = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ Yn2 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = Yn2;
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+
+ /* Read the second input */
+ Xn2 = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ Yn1 = (b0 * Xn2) + (b1 * Xn) + (b2 * Xn1) + (a1 * Yn2) + (a2 * Yn1);
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = Yn1;
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+
+ /* Read the third input */
+ Xn1 = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ Yn2 = (b0 * Xn1) + (b1 * Xn2) + (b2 * Xn) + (a1 * Yn1) + (a2 * Yn2);
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = Yn2;
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+
+ /* Read the forth input */
+ Xn = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ Yn1 = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn2) + (a2 * Yn1);
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = Yn1;
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ Xn2 = Xn1;
+ Xn1 = Xn;
+
+ /* decrement the loop counter */
+ sample--;
+
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ sample = blockSize & 0x3u;
+
+ while(sample > 0u)
+ {
+ /* Read the input */
+ Xn = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = acc;
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ Xn2 = Xn1;
+ Xn1 = Xn;
+ Yn2 = Yn1;
+ Yn1 = acc;
+
+ /* decrement the loop counter */
+ sample--;
+
+ }
+
+ /* Store the updated state variables back into the pState array */
+ *pState++ = Xn1;
+ *pState++ = Xn2;
+ *pState++ = Yn1;
+ *pState++ = Yn2;
+
+ /* The first stage goes from the input buffer to the output buffer. */
+ /* Subsequent numStages occur in-place in the output buffer */
+ pIn = pDst;
+
+ /* Reset the output pointer */
+ pOut = pDst;
+
+ /* decrement the loop counter */
+ stage--;
+
+ } while(stage > 0u);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ do
+ {
+ /* Reading the coefficients */
+ b0 = *pCoeffs++;
+ b1 = *pCoeffs++;
+ b2 = *pCoeffs++;
+ a1 = *pCoeffs++;
+ a2 = *pCoeffs++;
+
+ /* Reading the pState values */
+ Xn1 = pState[0];
+ Xn2 = pState[1];
+ Yn1 = pState[2];
+ Yn2 = pState[3];
+
+ /* The variables acc holds the output value that is computed:
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ */
+
+ sample = blockSize;
+
+ while(sample > 0u)
+ {
+ /* Read the input */
+ Xn = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ acc = (b0 * Xn) + (b1 * Xn1) + (b2 * Xn2) + (a1 * Yn1) + (a2 * Yn2);
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = acc;
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ Xn2 = Xn1;
+ Xn1 = Xn;
+ Yn2 = Yn1;
+ Yn1 = acc;
+
+ /* decrement the loop counter */
+ sample--;
+ }
+
+ /* Store the updated state variables back into the pState array */
+ *pState++ = Xn1;
+ *pState++ = Xn2;
+ *pState++ = Yn1;
+ *pState++ = Yn2;
+
+ /* The first stage goes from the input buffer to the output buffer. */
+ /* Subsequent numStages occur in-place in the output buffer */
+ pIn = pDst;
+
+ /* Reset the output pointer */
+ pOut = pDst;
+
+ /* decrement the loop counter */
+ stage--;
+
+ } while(stage > 0u);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+
+ /**
+ * @} end of BiquadCascadeDF1 group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c
new file mode 100644
index 0000000..7c82896
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q15.c
@@ -0,0 +1,282 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_biquad_cascade_df1_fast_q15.c
+*
+* Description: Fast processing function for the
+* Q15 Biquad cascade filter.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.9 2010/08/16
+* Initial version
+*
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup BiquadCascadeDF1
+ * @{
+ */
+
+/**
+ * @details
+ * @param[in] *S points to an instance of the Q15 Biquad cascade structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process per call.
+ * @return none.
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * This fast version uses a 32-bit accumulator with 2.30 format.
+ * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ * Thus, if the accumulator result overflows it wraps around and distorts the result.
+ * In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25).
+ * The 2.30 accumulator is then shifted by postShift bits and the result truncated to 1.15 format by discarding the low 16 bits.
+ *
+ * \par
+ * Refer to the function arm_biquad_cascade_df1_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure.
+ * Use the function arm_biquad_cascade_df1_init_q15() to initialize the filter structure.
+ *
+ */
+
+void arm_biquad_cascade_df1_fast_q15(
+ const arm_biquad_casd_df1_inst_q15 * S,
+ q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pIn = pSrc; /* Source pointer */
+ q15_t *pOut = pDst; /* Destination pointer */
+ q31_t in; /* Temporary variable to hold input value */
+ q31_t out; /* Temporary variable to hold output value */
+ q31_t b0; /* Temporary variable to hold bo value */
+ q31_t b1, a1; /* Filter coefficients */
+ q31_t state_in, state_out; /* Filter state variables */
+ q31_t acc; /* Accumulator */
+ int32_t shift = (int32_t) (15 - S->postShift); /* Post shift */
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ uint32_t sample, stage = S->numStages; /* Stage loop counter */
+
+
+
+ do
+ {
+
+ /* Read the b0 and 0 coefficients using SIMD */
+ b0 = *__SIMD32(pCoeffs)++;
+
+ /* Read the b1 and b2 coefficients using SIMD */
+ b1 = *__SIMD32(pCoeffs)++;
+
+ /* Read the a1 and a2 coefficients using SIMD */
+ a1 = *__SIMD32(pCoeffs)++;
+
+ /* Read the input state values from the state buffer: x[n-1], x[n-2] */
+ state_in = *__SIMD32(pState)++;
+
+ /* Read the output state values from the state buffer: y[n-1], y[n-2] */
+ state_out = *__SIMD32(pState)--;
+
+ /* Apply loop unrolling and compute 2 output values simultaneously. */
+ /* The variable acc hold output values that are being computed:
+ *
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ */
+ sample = blockSize >> 1u;
+
+ /* First part of the processing with loop unrolling. Compute 2 outputs at a time.
+ ** a second loop below computes the remaining 1 sample. */
+ while(sample > 0u)
+ {
+
+ /* Read the input */
+ in = *__SIMD32(pIn)++;
+
+ /* out = b0 * x[n] + 0 * 0 */
+ out = __SMUAD(b0, in);
+ /* acc = b1 * x[n-1] + acc += b2 * x[n-2] + out */
+ acc = __SMLAD(b1, state_in, out);
+ /* acc += a1 * y[n-1] + acc += a2 * y[n-2] */
+ acc = __SMLAD(a1, state_out, acc);
+
+ /* The result is converted from 3.29 to 1.31 and then saturation is applied */
+ out = __SSAT((acc >> shift), 16);
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
+ /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ state_in = __PKHBT(in, state_in, 16);
+ state_out = __PKHBT(out, state_out, 16);
+
+#else
+
+ state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
+ state_out = __PKHBT(state_out >> 16, (out), 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* out = b0 * x[n] + 0 * 0 */
+ out = __SMUADX(b0, in);
+ /* acc0 = b1 * x[n-1] , acc0 += b2 * x[n-2] + out */
+ acc = __SMLAD(b1, state_in, out);
+ /* acc += a1 * y[n-1] + acc += a2 * y[n-2] */
+ acc = __SMLAD(a1, state_out, acc);
+
+ /* The result is converted from 3.29 to 1.31 and then saturation is applied */
+ out = __SSAT((acc >> shift), 16);
+
+
+ /* Store the output in the destination buffer. */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);
+
+#else
+
+ *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
+ /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ state_in = __PKHBT(in >> 16, state_in, 16);
+ state_out = __PKHBT(out, state_out, 16);
+
+#else
+
+ state_in = __PKHBT(state_in >> 16, in, 16);
+ state_out = __PKHBT(state_out >> 16, out, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+
+ /* Decrement the loop counter */
+ sample--;
+
+ }
+
+ /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+
+ if((blockSize & 0x1u) != 0u)
+ {
+ /* Read the input */
+ in = *pIn++;
+
+ /* out = b0 * x[n] + 0 * 0 */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ out = __SMUAD(b0, in);
+
+#else
+
+ out = __SMUADX(b0, in);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* acc = b1 * x[n-1], acc += b2 * x[n-2] + out */
+ acc = __SMLAD(b1, state_in, out);
+ /* acc += a1 * y[n-1] + acc += a2 * y[n-2] */
+ acc = __SMLAD(a1, state_out, acc);
+
+ /* The result is converted from 3.29 to 1.31 and then saturation is applied */
+ out = __SSAT((acc >> shift), 16);
+
+ /* Store the output in the destination buffer. */
+ *pOut++ = (q15_t) out;
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
+ /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ state_in = __PKHBT(in, state_in, 16);
+ state_out = __PKHBT(out, state_out, 16);
+
+#else
+
+ state_in = __PKHBT(state_in >> 16, in, 16);
+ state_out = __PKHBT(state_out >> 16, out, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ }
+
+ /* The first stage goes from the input buffer to the output buffer. */
+ /* Subsequent (numStages - 1) occur in-place in the output buffer */
+ pIn = pDst;
+
+ /* Reset the output pointer */
+ pOut = pDst;
+
+ /* Store the updated state variables back into the state array */
+ *__SIMD32(pState)++ = state_in;
+ *__SIMD32(pState)++ = state_out;
+
+
+ /* Decrement the loop counter */
+ stage--;
+
+ } while(stage > 0u);
+}
+
+
+/**
+ * @} end of BiquadCascadeDF1 group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c
new file mode 100644
index 0000000..863e513
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_fast_q31.c
@@ -0,0 +1,274 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_biquad_cascade_df1_fast_q31.c
+*
+* Description: Processing function for the
+* Q31 Fast Biquad cascade DirectFormI(DF1) filter.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.9 2010/08/27
+* Initial version
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup BiquadCascadeDF1
+ * @{
+ */
+
+/**
+ * @details
+ *
+ * @param[in] *S points to an instance of the Q31 Biquad cascade structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process per call.
+ * @return none.
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
+ * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
+ * These intermediate results are added to a 2.30 accumulator.
+ * Finally, the accumulator is saturated and converted to a 1.31 result.
+ * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
+ * In order to avoid overflows completely the input signal must be scaled down by two bits and lie in the range [-0.25 +0.25). Use the intialization function
+ * arm_biquad_cascade_df1_init_q31() to initialize filter structure.
+ *
+ * \par
+ * Refer to the function arm_biquad_cascade_df1_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision. Both the slow and the fast versions use the same instance structure.
+ * Use the function arm_biquad_cascade_df1_init_q31() to initialize the filter structure.
+ */
+
+void arm_biquad_cascade_df1_fast_q31(
+ const arm_biquad_casd_df1_inst_q31 * S,
+ q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t acc; /* accumulator */
+ q31_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */
+ q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
+ q31_t *pIn = pSrc; /* input pointer initialization */
+ q31_t *pOut = pDst; /* output pointer initialization */
+ q31_t *pState = S->pState; /* pState pointer initialization */
+ q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
+ q31_t Xn; /* temporary input */
+ int32_t shift = (int32_t) S->postShift + 1; /* Shift to be applied to the output */
+ uint32_t sample, stage = S->numStages; /* loop counters */
+
+
+ do
+ {
+ /* Reading the coefficients */
+ b0 = *pCoeffs++;
+ b1 = *pCoeffs++;
+ b2 = *pCoeffs++;
+ a1 = *pCoeffs++;
+ a2 = *pCoeffs++;
+
+ /* Reading the state values */
+ Xn1 = pState[0];
+ Xn2 = pState[1];
+ Yn1 = pState[2];
+ Yn2 = pState[3];
+
+ /* Apply loop unrolling and compute 4 output values simultaneously. */
+ /* The variables acc ... acc3 hold output values that are being computed:
+ *
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ */
+
+ sample = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(sample > 0u)
+ {
+ /* Read the input */
+ Xn = *pIn;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ /* acc = b0 * x[n] */
+ acc = (q31_t) (((q63_t) b1 * Xn1) >> 32);
+ /* acc += b1 * x[n-1] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b0 * (Xn))) >> 32);
+ /* acc += b[2] * x[n-2] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);
+ /* acc += a1 * y[n-1] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);
+ /* acc += a2 * y[n-2] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);
+
+ /* The result is converted to 1.31 , Yn2 variable is reused */
+ Yn2 = acc << shift;
+
+ /* Read the second input */
+ Xn2 = *(pIn + 1u);
+
+ /* Store the output in the destination buffer. */
+ *pOut = Yn2;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ /* acc = b0 * x[n] */
+ acc = (q31_t) (((q63_t) b0 * (Xn2)) >> 32);
+ /* acc += b1 * x[n-1] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn))) >> 32);
+ /* acc += b[2] * x[n-2] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn1))) >> 32);
+ /* acc += a1 * y[n-1] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);
+ /* acc += a2 * y[n-2] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);
+
+ /* The result is converted to 1.31, Yn1 variable is reused */
+ Yn1 = acc << shift;
+
+ /* Read the third input */
+ Xn1 = *(pIn + 2u);
+
+ /* Store the output in the destination buffer. */
+ *(pOut + 1u) = Yn1;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ /* acc = b0 * x[n] */
+ acc = (q31_t) (((q63_t) b0 * (Xn1)) >> 32);
+ /* acc += b1 * x[n-1] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn2))) >> 32);
+ /* acc += b[2] * x[n-2] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn))) >> 32);
+ /* acc += a1 * y[n-1] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);
+ /* acc += a2 * y[n-2] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);
+
+ /* The result is converted to 1.31, Yn2 variable is reused */
+ Yn2 = acc << shift;
+
+ /* Read the forth input */
+ Xn = *(pIn + 3u);
+
+ /* Store the output in the destination buffer. */
+ *(pOut + 2u) = Yn2;
+ pIn += 4u;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ /* acc = b0 * x[n] */
+ acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);
+ /* acc += b1 * x[n-1] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);
+ /* acc += b[2] * x[n-2] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);
+ /* acc += a1 * y[n-1] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn2))) >> 32);
+ /* acc += a2 * y[n-2] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn1))) >> 32);
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ Xn2 = Xn1;
+
+ /* The result is converted to 1.31, Yn1 variable is reused */
+ Yn1 = acc << shift;
+
+ /* Xn1 = Xn */
+ Xn1 = Xn;
+
+ /* Store the output in the destination buffer. */
+ *(pOut + 3u) = Yn1;
+ pOut += 4u;
+
+ /* decrement the loop counter */
+ sample--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ sample = (blockSize & 0x3u);
+
+ while(sample > 0u)
+ {
+ /* Read the input */
+ Xn = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ /* acc = b0 * x[n] */
+ acc = (q31_t) (((q63_t) b0 * (Xn)) >> 32);
+ /* acc += b1 * x[n-1] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b1 * (Xn1))) >> 32);
+ /* acc += b[2] * x[n-2] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) b2 * (Xn2))) >> 32);
+ /* acc += a1 * y[n-1] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a1 * (Yn1))) >> 32);
+ /* acc += a2 * y[n-2] */
+ acc = (q31_t) ((((q63_t) acc << 32) + ((q63_t) a2 * (Yn2))) >> 32);
+ /* The result is converted to 1.31 */
+ acc = acc << shift;
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ Xn2 = Xn1;
+ Xn1 = Xn;
+ Yn2 = Yn1;
+ Yn1 = acc;
+
+ /* Store the output in the destination buffer. */
+ *pOut++ = acc;
+
+ /* decrement the loop counter */
+ sample--;
+ }
+
+ /* The first stage goes from the input buffer to the output buffer. */
+ /* Subsequent stages occur in-place in the output buffer */
+ pIn = pDst;
+
+ /* Reset to destination pointer */
+ pOut = pDst;
+
+ /* Store the updated state variables back into the pState array */
+ *pState++ = Xn1;
+ *pState++ = Xn2;
+ *pState++ = Yn1;
+ *pState++ = Yn2;
+
+ } while(--stage);
+}
+
+/**
+ * @} end of BiquadCascadeDF1 group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c
new file mode 100644
index 0000000..6334e42
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_f32.c
@@ -0,0 +1,106 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_biquad_cascade_df1_init_f32.c
+*
+* Description: floating-point Biquad cascade DirectFormI(DF1) filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup BiquadCascadeDF1
+ * @{
+ */
+
+/**
+ * @details
+ * @brief Initialization function for the floating-point Biquad cascade filter.
+ * @param[in,out] *S points to an instance of the floating-point Biquad cascade structure.
+ * @param[in] numStages number of 2nd order stages in the filter.
+ * @param[in] *pCoeffs points to the filter coefficients array.
+ * @param[in] *pState points to the state array.
+ * @return none
+ *
+ *
+ * Coefficient and State Ordering:
+ *
+ * \par
+ * The coefficients are stored in the array pCoeffs in the following order:
+ *
+ *
+ * \par
+ * where b1x and a1x are the coefficients for the first stage,
+ * b2x and a2x are the coefficients for the second stage,
+ * and so on. The pCoeffs array contains a total of 5*numStages values.
+ *
+ * \par
+ * The pState is a pointer to state array.
+ * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2].
+ * The state variables are arranged in the pState array as:
+ *
+ * {x[n-1], x[n-2], y[n-1], y[n-2]}
+ *
+ * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+ * The state array has a total length of 4*numStages values.
+ * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ *
+ */
+
+void arm_biquad_cascade_df1_init_f32(
+ arm_biquad_casd_df1_inst_f32 * S,
+ uint8_t numStages,
+ float32_t * pCoeffs,
+ float32_t * pState)
+{
+ /* Assign filter stages */
+ S->numStages = numStages;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and size is always 4 * numStages */
+ memset(pState, 0, (4u * (uint32_t) numStages) * sizeof(float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ * @} end of BiquadCascadeDF1 group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c
new file mode 100644
index 0000000..c959d72
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q15.c
@@ -0,0 +1,108 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_biquad_cascade_df1_init_q15.c
+*
+* Description: Q15 Biquad cascade DirectFormI(DF1) filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup BiquadCascadeDF1
+ * @{
+ */
+
+/**
+ * @details
+ *
+ * @param[in,out] *S points to an instance of the Q15 Biquad cascade structure.
+ * @param[in] numStages number of 2nd order stages in the filter.
+ * @param[in] *pCoeffs points to the filter coefficients.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] postShift Shift to be applied to the accumulator result. Varies according to the coefficients format
+ * @return none
+ *
+ * Coefficient and State Ordering:
+ *
+ * \par
+ * The coefficients are stored in the array pCoeffs in the following order:
+ *
+ * where b1x and a1x are the coefficients for the first stage,
+ * b2x and a2x are the coefficients for the second stage,
+ * and so on. The pCoeffs array contains a total of 6*numStages values.
+ * The zero coefficient between b1 and b2 facilities use of 16-bit SIMD instructions on the Cortex-M4.
+ *
+ * \par
+ * The state variables are stored in the array pState.
+ * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2].
+ * The state variables are arranged in the pState array as:
+ *
+ * {x[n-1], x[n-2], y[n-1], y[n-2]}
+ *
+ * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+ * The state array has a total length of 4*numStages values.
+ * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ */
+
+void arm_biquad_cascade_df1_init_q15(
+ arm_biquad_casd_df1_inst_q15 * S,
+ uint8_t numStages,
+ q15_t * pCoeffs,
+ q15_t * pState,
+ int8_t postShift)
+{
+ /* Assign filter stages */
+ S->numStages = numStages;
+
+ /* Assign postShift to be applied to the output */
+ S->postShift = postShift;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and size is always 4 * numStages */
+ memset(pState, 0, (4u * (uint32_t) numStages) * sizeof(q15_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ * @} end of BiquadCascadeDF1 group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c
new file mode 100644
index 0000000..3174fd5
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_init_q31.c
@@ -0,0 +1,108 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_biquad_cascade_df1_init_q31.c
+*
+* Description: Q31 Biquad cascade DirectFormI(DF1) filter initialization function.
+*
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup BiquadCascadeDF1
+ * @{
+ */
+
+/**
+ * @details
+ *
+ * @param[in,out] *S points to an instance of the Q31 Biquad cascade structure.
+ * @param[in] numStages number of 2nd order stages in the filter.
+ * @param[in] *pCoeffs points to the filter coefficients buffer.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] postShift Shift to be applied after the accumulator. Varies according to the coefficients format
+ * @return none
+ *
+ * Coefficient and State Ordering:
+ *
+ * \par
+ * The coefficients are stored in the array pCoeffs in the following order:
+ *
+ * where b1x and a1x are the coefficients for the first stage,
+ * b2x and a2x are the coefficients for the second stage,
+ * and so on. The pCoeffs array contains a total of 5*numStages values.
+ *
+ * \par
+ * The pState points to state variables array.
+ * Each Biquad stage has 4 state variables x[n-1], x[n-2], y[n-1], and y[n-2].
+ * The state variables are arranged in the pState array as:
+ *
+ * {x[n-1], x[n-2], y[n-1], y[n-2]}
+ *
+ * The 4 state variables for stage 1 are first, then the 4 state variables for stage 2, and so on.
+ * The state array has a total length of 4*numStages values.
+ * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ */
+
+void arm_biquad_cascade_df1_init_q31(
+ arm_biquad_casd_df1_inst_q31 * S,
+ uint8_t numStages,
+ q31_t * pCoeffs,
+ q31_t * pState,
+ int8_t postShift)
+{
+ /* Assign filter stages */
+ S->numStages = numStages;
+
+ /* Assign postShift to be applied to the output */
+ S->postShift = postShift;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and size is always 4 * numStages */
+ memset(pState, 0, (4u * (uint32_t) numStages) * sizeof(q31_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ * @} end of BiquadCascadeDF1 group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
new file mode 100644
index 0000000..69c42b6
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q15.c
@@ -0,0 +1,407 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_biquad_cascade_df1_q15.c
+*
+* Description: Processing function for the
+* Q15 Biquad cascade DirectFormI(DF1) filter.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup BiquadCascadeDF1
+ * @{
+ */
+
+/**
+ * @brief Processing function for the Q15 Biquad cascade filter.
+ * @param[in] *S points to an instance of the Q15 Biquad cascade structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the location where the output result is written.
+ * @param[in] blockSize number of samples to process per call.
+ * @return none.
+ *
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+ * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ * The accumulator is then shifted by postShift bits to truncate the result to 1.15 format by discarding the low 16 bits.
+ * Finally, the result is saturated to 1.15 format.
+ *
+ * \par
+ * Refer to the function arm_biquad_cascade_df1_fast_q15() for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.
+ */
+
+void arm_biquad_cascade_df1_q15(
+ const arm_biquad_casd_df1_inst_q15 * S,
+ q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q15_t *pIn = pSrc; /* Source pointer */
+ q15_t *pOut = pDst; /* Destination pointer */
+ q31_t in; /* Temporary variable to hold input value */
+ q31_t out; /* Temporary variable to hold output value */
+ q31_t b0; /* Temporary variable to hold bo value */
+ q31_t b1, a1; /* Filter coefficients */
+ q31_t state_in, state_out; /* Filter state variables */
+ q31_t acc_l, acc_h;
+ q63_t acc; /* Accumulator */
+ int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */
+ int32_t uShift = (32 - lShift);
+
+ do
+ {
+ /* Read the b0 and 0 coefficients using SIMD */
+ b0 = *__SIMD32(pCoeffs)++;
+
+ /* Read the b1 and b2 coefficients using SIMD */
+ b1 = *__SIMD32(pCoeffs)++;
+
+ /* Read the a1 and a2 coefficients using SIMD */
+ a1 = *__SIMD32(pCoeffs)++;
+
+ /* Read the input state values from the state buffer: x[n-1], x[n-2] */
+ state_in = *__SIMD32(pState)++;
+
+ /* Read the output state values from the state buffer: y[n-1], y[n-2] */
+ state_out = *__SIMD32(pState)--;
+
+ /* Apply loop unrolling and compute 2 output values simultaneously. */
+ /* The variable acc hold output values that are being computed:
+ *
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ */
+ sample = blockSize >> 1u;
+
+ /* First part of the processing with loop unrolling. Compute 2 outputs at a time.
+ ** a second loop below computes the remaining 1 sample. */
+ while(sample > 0u)
+ {
+
+ /* Read the input */
+ in = *__SIMD32(pIn)++;
+
+ /* out = b0 * x[n] + 0 * 0 */
+ out = __SMUAD(b0, in);
+
+ /* acc += b1 * x[n-1] + b2 * x[n-2] + out */
+ acc = __SMLALD(b1, state_in, out);
+ /* acc += a1 * y[n-1] + a2 * y[n-2] */
+ acc = __SMLALD(a1, state_out, acc);
+
+ /* The result is converted from 3.29 to 1.31 if postShift = 1, and then saturation is applied */
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ out = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ out = __SSAT(out, 16);
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
+ /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ state_in = __PKHBT(in, state_in, 16);
+ state_out = __PKHBT(out, state_out, 16);
+
+#else
+
+ state_in = __PKHBT(state_in >> 16, (in >> 16), 16);
+ state_out = __PKHBT(state_out >> 16, (out), 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* out = b0 * x[n] + 0 * 0 */
+ out = __SMUADX(b0, in);
+ /* acc += b1 * x[n-1] + b2 * x[n-2] + out */
+ acc = __SMLALD(b1, state_in, out);
+ /* acc += a1 * y[n-1] + a2 * y[n-2] */
+ acc = __SMLALD(a1, state_out, acc);
+
+ /* The result is converted from 3.29 to 1.31 if postShift = 1, and then saturation is applied */
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ out = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ out = __SSAT(out, 16);
+
+ /* Store the output in the destination buffer. */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pOut)++ = __PKHBT(state_out, out, 16);
+
+#else
+
+ *__SIMD32(pOut)++ = __PKHBT(out, state_out >> 16, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
+ /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ state_in = __PKHBT(in >> 16, state_in, 16);
+ state_out = __PKHBT(out, state_out, 16);
+
+#else
+
+ state_in = __PKHBT(state_in >> 16, in, 16);
+ state_out = __PKHBT(state_out >> 16, out, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+
+ /* Decrement the loop counter */
+ sample--;
+
+ }
+
+ /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+
+ if((blockSize & 0x1u) != 0u)
+ {
+ /* Read the input */
+ in = *pIn++;
+
+ /* out = b0 * x[n] + 0 * 0 */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ out = __SMUAD(b0, in);
+
+#else
+
+ out = __SMUADX(b0, in);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* acc = b1 * x[n-1] + b2 * x[n-2] + out */
+ acc = __SMLALD(b1, state_in, out);
+ /* acc += a1 * y[n-1] + a2 * y[n-2] */
+ acc = __SMLALD(a1, state_out, acc);
+
+ /* The result is converted from 3.29 to 1.31 if postShift = 1, and then saturation is applied */
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ out = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ out = __SSAT(out, 16);
+
+ /* Store the output in the destination buffer. */
+ *pOut++ = (q15_t) out;
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ /* x[n-N], x[n-N-1] are packed together to make state_in of type q31 */
+ /* y[n-N], y[n-N-1] are packed together to make state_out of type q31 */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ state_in = __PKHBT(in, state_in, 16);
+ state_out = __PKHBT(out, state_out, 16);
+
+#else
+
+ state_in = __PKHBT(state_in >> 16, in, 16);
+ state_out = __PKHBT(state_out >> 16, out, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ }
+
+ /* The first stage goes from the input wire to the output wire. */
+ /* Subsequent numStages occur in-place in the output wire */
+ pIn = pDst;
+
+ /* Reset the output pointer */
+ pOut = pDst;
+
+ /* Store the updated state variables back into the state array */
+ *__SIMD32(pState)++ = state_in;
+ *__SIMD32(pState)++ = state_out;
+
+
+ /* Decrement the loop counter */
+ stage--;
+
+ } while(stage > 0u);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ q15_t *pIn = pSrc; /* Source pointer */
+ q15_t *pOut = pDst; /* Destination pointer */
+ q15_t b0, b1, b2, a1, a2; /* Filter coefficients */
+ q15_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */
+ q15_t Xn; /* temporary input */
+ q63_t acc; /* Accumulator */
+ int32_t shift = (15 - (int32_t) S->postShift); /* Post shift */
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ uint32_t sample, stage = (uint32_t) S->numStages; /* Stage loop counter */
+
+ do
+ {
+ /* Reading the coefficients */
+ b0 = *pCoeffs++;
+ b1 = *pCoeffs++;
+ b2 = *pCoeffs++;
+ a1 = *pCoeffs++;
+ a2 = *pCoeffs++;
+
+ /* Reading the state values */
+ Xn1 = pState[0];
+ Xn2 = pState[1];
+ Yn1 = pState[2];
+ Yn2 = pState[3];
+
+ /* The variables acc holds the output value that is computed:
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ */
+
+ sample = blockSize;
+
+ while(sample > 0u)
+ {
+ /* Read the input */
+ Xn = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ /* acc = b0 * x[n] */
+ acc = (q31_t) b0 *Xn;
+
+ /* acc += b1 * x[n-1] */
+ acc += (q31_t) b1 *Xn1;
+ /* acc += b[2] * x[n-2] */
+ acc += (q31_t) b2 *Xn2;
+ /* acc += a1 * y[n-1] */
+ acc += (q31_t) a1 *Yn1;
+ /* acc += a2 * y[n-2] */
+ acc += (q31_t) a2 *Yn2;
+
+ /* The result is converted to 1.31 */
+ acc = __SSAT((acc >> shift), 16);
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ Xn2 = Xn1;
+ Xn1 = Xn;
+ Yn2 = Yn1;
+ Yn1 = (q15_t) acc;
+
+ /* Store the output in the destination buffer. */
+ *pOut++ = (q15_t) acc;
+
+ /* decrement the loop counter */
+ sample--;
+ }
+
+ /* The first stage goes from the input buffer to the output buffer. */
+ /* Subsequent stages occur in-place in the output buffer */
+ pIn = pDst;
+
+ /* Reset to destination pointer */
+ pOut = pDst;
+
+ /* Store the updated state variables back into the pState array */
+ *pState++ = Xn1;
+ *pState++ = Xn2;
+ *pState++ = Yn1;
+ *pState++ = Yn2;
+
+ } while(--stage);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+
+/**
+ * @} end of BiquadCascadeDF1 group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
new file mode 100644
index 0000000..a2f0afb
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df1_q31.c
@@ -0,0 +1,399 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_biquad_cascade_df1_q31.c
+*
+* Description: Processing function for the
+* Q31 Biquad cascade filter
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup BiquadCascadeDF1
+ * @{
+ */
+
+/**
+ * @brief Processing function for the Q31 Biquad cascade filter.
+ * @param[in] *S points to an instance of the Q31 Biquad cascade structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process per call.
+ * @return none.
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function is implemented using an internal 64-bit accumulator.
+ * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ * Thus, if the accumulator result overflows it wraps around rather than clip.
+ * In order to avoid overflows completely the input signal must be scaled down by 2 bits and lie in the range [-0.25 +0.25).
+ * After all 5 multiply-accumulates are performed, the 2.62 accumulator is shifted by postShift bits and the result truncated to
+ * 1.31 format by discarding the low 32 bits.
+ *
+ * \par
+ * Refer to the function arm_biquad_cascade_df1_fast_q31() for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.
+ */
+
+void arm_biquad_cascade_df1_q31(
+ const arm_biquad_casd_df1_inst_q31 * S,
+ q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q63_t acc; /* accumulator */
+ uint32_t uShift = ((uint32_t) S->postShift + 1u);
+ uint32_t lShift = 32u - uShift; /* Shift to be applied to the output */
+ q31_t *pIn = pSrc; /* input pointer initialization */
+ q31_t *pOut = pDst; /* output pointer initialization */
+ q31_t *pState = S->pState; /* pState pointer initialization */
+ q31_t *pCoeffs = S->pCoeffs; /* coeff pointer initialization */
+ q31_t Xn1, Xn2, Yn1, Yn2; /* Filter state variables */
+ q31_t b0, b1, b2, a1, a2; /* Filter coefficients */
+ q31_t Xn; /* temporary input */
+ uint32_t sample, stage = S->numStages; /* loop counters */
+
+
+#ifndef ARM_MATH_CM0
+
+ q31_t acc_l, acc_h; /* temporary output variables */
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ do
+ {
+ /* Reading the coefficients */
+ b0 = *pCoeffs++;
+ b1 = *pCoeffs++;
+ b2 = *pCoeffs++;
+ a1 = *pCoeffs++;
+ a2 = *pCoeffs++;
+
+ /* Reading the state values */
+ Xn1 = pState[0];
+ Xn2 = pState[1];
+ Yn1 = pState[2];
+ Yn2 = pState[3];
+
+ /* Apply loop unrolling and compute 4 output values simultaneously. */
+ /* The variable acc hold output values that are being computed:
+ *
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ */
+
+ sample = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(sample > 0u)
+ {
+ /* Read the input */
+ Xn = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+
+ /* acc = b0 * x[n] */
+ acc = (q63_t) b0 *Xn;
+ /* acc += b1 * x[n-1] */
+ acc += (q63_t) b1 *Xn1;
+ /* acc += b[2] * x[n-2] */
+ acc += (q63_t) b2 *Xn2;
+ /* acc += a1 * y[n-1] */
+ acc += (q63_t) a1 *Yn1;
+ /* acc += a2 * y[n-2] */
+ acc += (q63_t) a2 *Yn2;
+
+ /* The result is converted to 1.31 , Yn2 variable is reused */
+
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Store the output in the destination buffer. */
+ *pOut++ = Yn2;
+
+ /* Read the second input */
+ Xn2 = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+
+ /* acc = b0 * x[n] */
+ acc = (q63_t) b0 *Xn2;
+ /* acc += b1 * x[n-1] */
+ acc += (q63_t) b1 *Xn;
+ /* acc += b[2] * x[n-2] */
+ acc += (q63_t) b2 *Xn1;
+ /* acc += a1 * y[n-1] */
+ acc += (q63_t) a1 *Yn2;
+ /* acc += a2 * y[n-2] */
+ acc += (q63_t) a2 *Yn1;
+
+
+ /* The result is converted to 1.31, Yn1 variable is reused */
+
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Store the output in the destination buffer. */
+ *pOut++ = Yn1;
+
+ /* Read the third input */
+ Xn1 = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+
+ /* acc = b0 * x[n] */
+ acc = (q63_t) b0 *Xn1;
+ /* acc += b1 * x[n-1] */
+ acc += (q63_t) b1 *Xn2;
+ /* acc += b[2] * x[n-2] */
+ acc += (q63_t) b2 *Xn;
+ /* acc += a1 * y[n-1] */
+ acc += (q63_t) a1 *Yn1;
+ /* acc += a2 * y[n-2] */
+ acc += (q63_t) a2 *Yn2;
+
+ /* The result is converted to 1.31, Yn2 variable is reused */
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ Yn2 = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Store the output in the destination buffer. */
+ *pOut++ = Yn2;
+
+ /* Read the forth input */
+ Xn = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+
+ /* acc = b0 * x[n] */
+ acc = (q63_t) b0 *Xn;
+ /* acc += b1 * x[n-1] */
+ acc += (q63_t) b1 *Xn1;
+ /* acc += b[2] * x[n-2] */
+ acc += (q63_t) b2 *Xn2;
+ /* acc += a1 * y[n-1] */
+ acc += (q63_t) a1 *Yn2;
+ /* acc += a2 * y[n-2] */
+ acc += (q63_t) a2 *Yn1;
+
+ /* The result is converted to 1.31, Yn1 variable is reused */
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ Yn1 = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ Xn2 = Xn1;
+ Xn1 = Xn;
+
+ /* Store the output in the destination buffer. */
+ *pOut++ = Yn1;
+
+ /* decrement the loop counter */
+ sample--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ sample = (blockSize & 0x3u);
+
+ while(sample > 0u)
+ {
+ /* Read the input */
+ Xn = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+
+ /* acc = b0 * x[n] */
+ acc = (q63_t) b0 *Xn;
+ /* acc += b1 * x[n-1] */
+ acc += (q63_t) b1 *Xn1;
+ /* acc += b[2] * x[n-2] */
+ acc += (q63_t) b2 *Xn2;
+ /* acc += a1 * y[n-1] */
+ acc += (q63_t) a1 *Yn1;
+ /* acc += a2 * y[n-2] */
+ acc += (q63_t) a2 *Yn2;
+
+ /* The result is converted to 1.31 */
+ acc = acc >> lShift;
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ Xn2 = Xn1;
+ Xn1 = Xn;
+ Yn2 = Yn1;
+ Yn1 = (q31_t) acc;
+
+ /* Store the output in the destination buffer. */
+ *pOut++ = (q31_t) acc;
+
+ /* decrement the loop counter */
+ sample--;
+ }
+
+ /* The first stage goes from the input buffer to the output buffer. */
+ /* Subsequent stages occur in-place in the output buffer */
+ pIn = pDst;
+
+ /* Reset to destination pointer */
+ pOut = pDst;
+
+ /* Store the updated state variables back into the pState array */
+ *pState++ = Xn1;
+ *pState++ = Xn2;
+ *pState++ = Yn1;
+ *pState++ = Yn2;
+
+ } while(--stage);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ do
+ {
+ /* Reading the coefficients */
+ b0 = *pCoeffs++;
+ b1 = *pCoeffs++;
+ b2 = *pCoeffs++;
+ a1 = *pCoeffs++;
+ a2 = *pCoeffs++;
+
+ /* Reading the state values */
+ Xn1 = pState[0];
+ Xn2 = pState[1];
+ Yn1 = pState[2];
+ Yn2 = pState[3];
+
+ /* The variables acc holds the output value that is computed:
+ * acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2]
+ */
+
+ sample = blockSize;
+
+ while(sample > 0u)
+ {
+ /* Read the input */
+ Xn = *pIn++;
+
+ /* acc = b0 * x[n] + b1 * x[n-1] + b2 * x[n-2] + a1 * y[n-1] + a2 * y[n-2] */
+ /* acc = b0 * x[n] */
+ acc = (q63_t) b0 *Xn;
+
+ /* acc += b1 * x[n-1] */
+ acc += (q63_t) b1 *Xn1;
+ /* acc += b[2] * x[n-2] */
+ acc += (q63_t) b2 *Xn2;
+ /* acc += a1 * y[n-1] */
+ acc += (q63_t) a1 *Yn1;
+ /* acc += a2 * y[n-2] */
+ acc += (q63_t) a2 *Yn2;
+
+ /* The result is converted to 1.31 */
+ acc = acc >> lShift;
+
+ /* Every time after the output is computed state should be updated. */
+ /* The states should be updated as: */
+ /* Xn2 = Xn1 */
+ /* Xn1 = Xn */
+ /* Yn2 = Yn1 */
+ /* Yn1 = acc */
+ Xn2 = Xn1;
+ Xn1 = Xn;
+ Yn2 = Yn1;
+ Yn1 = (q31_t) acc;
+
+ /* Store the output in the destination buffer. */
+ *pOut++ = (q31_t) acc;
+
+ /* decrement the loop counter */
+ sample--;
+ }
+
+ /* The first stage goes from the input buffer to the output buffer. */
+ /* Subsequent stages occur in-place in the output buffer */
+ pIn = pDst;
+
+ /* Reset to destination pointer */
+ pOut = pDst;
+
+ /* Store the updated state variables back into the pState array */
+ *pState++ = Xn1;
+ *pState++ = Xn2;
+ *pState++ = Yn1;
+ *pState++ = Yn2;
+
+ } while(--stage);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+}
+
+/**
+ * @} end of BiquadCascadeDF1 group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c
new file mode 100644
index 0000000..62db8f3
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_biquad_cascade_df2T_f32.c
@@ -0,0 +1,376 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_biquad_cascade_df2T_f32.c
+*
+* Description: Processing function for the floating-point transposed
+* direct form II Biquad cascade filter.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup BiquadCascadeDF2T Biquad Cascade IIR Filters Using a Direct Form II Transposed Structure
+ *
+ * This set of functions implements arbitrary order recursive (IIR) filters using a transposed direct form II structure.
+ * The filters are implemented as a cascade of second order Biquad sections.
+ * These functions provide a slight memory savings as compared to the direct form I Biquad filter functions.
+ * Only floating-point data is supported.
+ *
+ * This function operate on blocks of input and output data and each call to the function
+ * processes blockSize samples through the filter.
+ * pSrc points to the array of input data and
+ * pDst points to the array of output data.
+ * Both arrays contain blockSize values.
+ *
+ * \par Algorithm
+ * Each Biquad stage implements a second order filter using the difference equation:
+ *
+ * where d1 and d2 represent the two state values.
+ *
+ * \par
+ * A Biquad filter using a transposed Direct Form II structure is shown below.
+ * \image html BiquadDF2Transposed.gif "Single transposed Direct Form II Biquad"
+ * Coefficients b0, b1, and b2 multiply the input signal x[n] and are referred to as the feedforward coefficients.
+ * Coefficients a1 and a2 multiply the output signal y[n] and are referred to as the feedback coefficients.
+ * Pay careful attention to the sign of the feedback coefficients.
+ * Some design tools flip the sign of the feedback coefficients:
+ *
+ * In this case the feedback coefficients a1 and a2 must be negated when used with the CMSIS DSP Library.
+ *
+ * \par
+ * Higher order filters are realized as a cascade of second order sections.
+ * numStages refers to the number of second order stages used.
+ * For example, an 8th order filter would be realized with numStages=4 second order stages.
+ * A 9th order filter would be realized with numStages=5 second order stages with the
+ * coefficients for one of the stages configured as a first order filter (b2=0 and a2=0).
+ *
+ * \par
+ * pState points to the state variable array.
+ * Each Biquad stage has 2 state variables d1 and d2.
+ * The state variables are arranged in the pState array as:
+ *
+ * {d11, d12, d21, d22, ...}
+ *
+ * where d1x refers to the state variables for the first Biquad and
+ * d2x refers to the state variables for the second Biquad.
+ * The state array has a total length of 2*numStages values.
+ * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ *
+ * \par
+ * The CMSIS library contains Biquad filters in both Direct Form I and transposed Direct Form II.
+ * The advantage of the Direct Form I structure is that it is numerically more robust for fixed-point data types.
+ * That is why the Direct Form I structure supports Q15 and Q31 data types.
+ * The transposed Direct Form II structure, on the other hand, requires a wide dynamic range for the state variables d1 and d2.
+ * Because of this, the CMSIS library only has a floating-point version of the Direct Form II Biquad.
+ * The advantage of the Direct Form II Biquad is that it requires half the number of state variables, 2 rather than 4, per Biquad stage.
+ *
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ *
+ * \par Init Functions
+ * There is also an associated initialization function.
+ * The initialization function performs following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Set the values in the state buffer to zeros before static initialization.
+ * For example, to statically initialize the instance structure use
+ *
+ *
+ * \par
+ * where b1x and a1x are the coefficients for the first stage,
+ * b2x and a2x are the coefficients for the second stage,
+ * and so on. The pCoeffs array contains a total of 5*numStages values.
+ *
+ * \par
+ * The pState is a pointer to state array.
+ * Each Biquad stage has 2 state variables d1, and d2.
+ * The 2 state variables for stage 1 are first, then the 2 state variables for stage 2, and so on.
+ * The state array has a total length of 2*numStages values.
+ * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ */
+
+void arm_biquad_cascade_df2T_init_f32(
+ arm_biquad_cascade_df2T_instance_f32 * S,
+ uint8_t numStages,
+ float32_t * pCoeffs,
+ float32_t * pState)
+{
+ /* Assign filter stages */
+ S->numStages = numStages;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and size is always 2 * numStages */
+ memset(pState, 0, (2u * (uint32_t) numStages) * sizeof(float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+}
+
+/**
+ * @} end of BiquadCascadeDF2T group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_f32.c
new file mode 100644
index 0000000..0e91fc8
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_f32.c
@@ -0,0 +1,645 @@
+/* ----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_f32.c
+*
+* Description: Convolution of floating-point sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup Conv Convolution
+ *
+ * Convolution is a mathematical operation that operates on two finite length vectors to generate a finite length output vector.
+ * Convolution is similar to correlation and is frequently used in filtering and data analysis.
+ * The CMSIS DSP library contains functions for convolving Q7, Q15, Q31, and floating-point data types.
+ * The library also provides fast versions of the Q15 and Q31 functions on Cortex-M4 and Cortex-M3.
+ *
+ * \par Algorithm
+ * Let a[n] and b[n] be sequences of length srcALen and srcBLen samples respectively.
+ * Then the convolution
+ *
+ *
+ * c[n] = a[n] * b[n]
+ *
+ *
+ * \par
+ * is defined as
+ * \image html ConvolutionEquation.gif
+ * \par
+ * Note that c[n] is of length srcALen + srcBLen - 1 and is defined over the interval n=0, 1, 2, ..., srcALen + srcBLen - 2.
+ * pSrcA points to the first input vector of length srcALen and
+ * pSrcB points to the second input vector of length srcBLen.
+ * The output result is written to pDst and the calling function must allocate srcALen+srcBLen-1 words for the result.
+ *
+ * \par
+ * Conceptually, when two signals a[n] and b[n] are convolved,
+ * the signal b[n] slides over a[n].
+ * For each offset \c n, the overlapping portions of a[n] and b[n] are multiplied and summed together.
+ *
+ * \par
+ * Note that convolution is a commutative operation:
+ *
+ *
+ * a[n] * b[n] = b[n] * a[n].
+ *
+ *
+ * \par
+ * This means that switching the A and B arguments to the convolution functions has no effect.
+ *
+ * Fixed-Point Behavior
+ *
+ * \par
+ * Convolution requires summing up a large number of intermediate products.
+ * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
+ * Refer to the function specific documentation below for further details of the particular algorithm used.
+ *
+ *
+ * Fast Versions
+ *
+ * \par
+ * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of conv and the design requires
+ * the input signals should be scaled down to avoid intermediate overflows.
+ *
+ *
+ * Opt Versions
+ *
+ * \par
+ * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
+ * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions
+ */
+
+/**
+ * @addtogroup Conv
+ * @{
+ */
+
+/**
+ * @brief Convolution of floating-point sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ * @return none.
+ */
+
+void arm_conv_f32(
+ float32_t * pSrcA,
+ uint32_t srcALen,
+ float32_t * pSrcB,
+ uint32_t srcBLen,
+ float32_t * pDst)
+{
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ float32_t *pIn1; /* inputA pointer */
+ float32_t *pIn2; /* inputB pointer */
+ float32_t *pOut = pDst; /* output pointer */
+ float32_t *px; /* Intermediate inputA pointer */
+ float32_t *py; /* Intermediate inputB pointer */
+ float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
+ float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counters */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1u;
+ blockSize2 = srcALen - (srcBLen - 1u);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1u;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while(blockSize1 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[0] * y[srcBLen - 1] */
+ sum += *px++ * *py--;
+
+ /* x[1] * y[srcBLen - 2] */
+ sum += *px++ * *py--;
+
+ /* x[2] * y[srcBLen - 3] */
+ sum += *px++ * *py--;
+
+ /* x[3] * y[srcBLen - 4] */
+ sum += *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + count;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2u;
+
+ while(blkCnt > 0u)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0.0f;
+ acc1 = 0.0f;
+ acc2 = 0.0f;
+ acc3 = 0.0f;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *(px++);
+ x1 = *(px++);
+ x2 = *(px++);
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[srcBLen - 1] sample */
+ c0 = *(py--);
+
+ /* Read x[3] sample */
+ x3 = *(px);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[0] * y[srcBLen - 1] */
+ acc0 += x0 * c0;
+
+ /* acc1 += x[1] * y[srcBLen - 1] */
+ acc1 += x1 * c0;
+
+ /* acc2 += x[2] * y[srcBLen - 1] */
+ acc2 += x2 * c0;
+
+ /* acc3 += x[3] * y[srcBLen - 1] */
+ acc3 += x3 * c0;
+
+ /* Read y[srcBLen - 2] sample */
+ c0 = *(py--);
+
+ /* Read x[4] sample */
+ x0 = *(px + 1u);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[1] * y[srcBLen - 2] */
+ acc0 += x1 * c0;
+ /* acc1 += x[2] * y[srcBLen - 2] */
+ acc1 += x2 * c0;
+ /* acc2 += x[3] * y[srcBLen - 2] */
+ acc2 += x3 * c0;
+ /* acc3 += x[4] * y[srcBLen - 2] */
+ acc3 += x0 * c0;
+
+ /* Read y[srcBLen - 3] sample */
+ c0 = *(py--);
+
+ /* Read x[5] sample */
+ x1 = *(px + 2u);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[2] * y[srcBLen - 3] */
+ acc0 += x2 * c0;
+ /* acc1 += x[3] * y[srcBLen - 2] */
+ acc1 += x3 * c0;
+ /* acc2 += x[4] * y[srcBLen - 2] */
+ acc2 += x0 * c0;
+ /* acc3 += x[5] * y[srcBLen - 2] */
+ acc3 += x1 * c0;
+
+ /* Read y[srcBLen - 4] sample */
+ c0 = *(py--);
+
+ /* Read x[6] sample */
+ x2 = *(px + 3u);
+ px += 4u;
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[3] * y[srcBLen - 4] */
+ acc0 += x3 * c0;
+ /* acc1 += x[4] * y[srcBLen - 4] */
+ acc1 += x0 * c0;
+ /* acc2 += x[5] * y[srcBLen - 4] */
+ acc2 += x1 * c0;
+ /* acc3 += x[6] * y[srcBLen - 4] */
+ acc3 += x2 * c0;
+
+
+ } while(--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *(py--);
+
+ /* Read x[7] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[srcBLen - 5] */
+ acc0 += x0 * c0;
+ /* acc1 += x[5] * y[srcBLen - 5] */
+ acc1 += x1 * c0;
+ /* acc2 += x[6] * y[srcBLen - 5] */
+ acc2 += x2 * c0;
+ /* acc3 += x[7] * y[srcBLen - 5] */
+ acc3 += x3 * c0;
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = acc0;
+ *pOut++ = acc1;
+ *pOut++ = acc2;
+ *pOut++ = acc3;
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += *px++ * *py--;
+ sum += *px++ * *py--;
+ sum += *px++ * *py--;
+ sum += *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The blockSize3 variable holds the number of MAC operations performed */
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while(blockSize3 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = blockSize3 >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ sum += *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum += *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ sum += *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum += *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = blockSize3 % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum += *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ float32_t *pIn1 = pSrcA; /* inputA pointer */
+ float32_t *pIn2 = pSrcB; /* inputB pointer */
+ float32_t sum; /* Accumulator */
+ uint32_t i, j; /* loop counters */
+
+ /* Loop to calculate convolution for output length number of times */
+ for (i = 0u; i < ((srcALen + srcBLen) - 1u); i++)
+ {
+ /* Initialize sum with zero to carry out MAC operations */
+ sum = 0.0f;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0u; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if((((i - j) < srcBLen) && (j < srcALen)))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += pIn1[j] * pIn2[i - j];
+ }
+ }
+ /* Store the output in the destination buffer */
+ pDst[i] = sum;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of Conv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_opt_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_opt_q15.c
new file mode 100644
index 0000000..a1ccf8a
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_opt_q15.c
@@ -0,0 +1,537 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_fast_opt_q15.c
+*
+* Description: Fast Q15 Convolution.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Conv
+ * @{
+ */
+
+/**
+ * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
+ * @return none.
+ *
+ * \par Restrictions
+ * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
+ * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
+ *
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * This fast version uses a 32-bit accumulator with 2.30 format.
+ * The accumulator maintains full precision of the intermediate multiplication results
+ * but provides only a single guard bit. There is no saturation on intermediate additions.
+ * Thus, if the accumulator overflows it wraps around and distorts the result.
+ * The input signals should be scaled down to avoid intermediate overflows.
+ * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
+ * as maximum of min(srcALen, srcBLen) number of additions are carried internally.
+ * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
+ *
+ * \par
+ * See arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
+ */
+
+void arm_conv_fast_opt_q15(
+ q15_t * pSrcA,
+ uint32_t srcALen,
+ q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
+{
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
+ q31_t y1, y2; /* State variables */
+ q15_t *pOut = pDst; /* output pointer */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, k, blkCnt; /* loop counter */
+ uint32_t tapCnt; /* loop count */
+#ifdef UNALIGNED_SUPPORT_DISABLE
+
+ q15_t a, b;
+
+#endif /* #ifdef UNALIGNED_SUPPORT_DISABLE */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Pointer to take end of scratch2 buffer */
+ pScr2 = pScratch2 + srcBLen - 1;
+
+ /* points to smaller length sequence */
+ px = pIn2;
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+
+ /* Copy smaller length input sequence in reverse order into second scratch buffer */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr2-- = *px++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Initialze temporary scratch pointer */
+ pScr1 = pScratch1;
+
+ /* Assuming scratch1 buffer is aligned by 32-bit */
+ /* Fill (srcBLen - 1u) zeros in scratch1 buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1u);
+
+ /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Copy (srcALen) samples in scratch buffer */
+ arm_copy_q15(pIn1, pScr1, srcALen);
+
+ /* Update pointers */
+ pScr1 += srcALen;
+
+#else
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcALen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr1++ = *pIn1++;
+ *pScr1++ = *pIn1++;
+ *pScr1++ = *pIn1++;
+ *pScr1++ = *pIn1++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcALen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr1++ = *pIn1++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update pointer */
+ pScr1 += (srcBLen - 1u);
+
+#else
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = (srcBLen - 1u) >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = (srcBLen - 1u) % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr1++ = 0;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* Temporary pointer for scratch2 */
+ py = pScratch2;
+
+
+ /* Initialization of pIn2 pointer */
+ pIn2 = py;
+
+ /* First part of the processing with loop unrolling process 4 data points at a time.
+ ** a second loop below process for the remaining 1 to 3 samples. */
+
+ /* Actual convolution process starts here */
+ blkCnt = (srcALen + srcBLen - 1u) >> 2;
+
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x1 = *__SIMD32(pScr1)++;
+
+ /* Read next two samples from scratch1 buffer */
+ x2 = *__SIMD32(pScr1)++;
+
+ tapCnt = (srcBLen) >> 2u;
+
+ while(tapCnt > 0u)
+ {
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Read four samples from smaller buffer */
+ y1 = _SIMD32_OFFSET(pIn2);
+ y2 = _SIMD32_OFFSET(pIn2 + 2u);
+
+ /* multiply and accumlate */
+ acc0 = __SMLAD(x1, y1, acc0);
+ acc2 = __SMLAD(x2, y1, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ /* multiply and accumlate */
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = _SIMD32_OFFSET(pScr1);
+
+ /* multiply and accumlate */
+ acc0 = __SMLAD(x2, y2, acc0);
+ acc2 = __SMLAD(x1, y2, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+ acc1 = __SMLADX(x3, y2, acc1);
+
+ x2 = _SIMD32_OFFSET(pScr1 + 2u);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y2, acc3);
+
+#else
+
+ /* Read four samples from smaller buffer */
+ a = *pIn2;
+ b = *(pIn2 + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ y1 = __PKHBT(a, b, 16);
+#else
+ y1 = __PKHBT(b, a, 16);
+#endif
+
+ a = *(pIn2 + 2);
+ b = *(pIn2 + 3);
+#ifndef ARM_MATH_BIG_ENDIAN
+ y2 = __PKHBT(a, b, 16);
+#else
+ y2 = __PKHBT(b, a, 16);
+#endif
+
+ acc0 = __SMLAD(x1, y1, acc0);
+
+ acc2 = __SMLAD(x2, y1, acc2);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ a = *pScr1;
+ b = *(pScr1 + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(a, b, 16);
+#else
+ x1 = __PKHBT(b, a, 16);
+#endif
+
+ acc0 = __SMLAD(x2, y2, acc0);
+
+ acc2 = __SMLAD(x1, y2, acc2);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+
+ acc1 = __SMLADX(x3, y2, acc1);
+
+ a = *(pScr1 + 2);
+ b = *(pScr1 + 3);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x2 = __PKHBT(a, b, 16);
+#else
+ x2 = __PKHBT(b, a, 16);
+#endif
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y2, acc3);
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* update scratch pointers */
+ pIn2 += 4u;
+ pScr1 += 4u;
+
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr1 -= 4u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pIn2);
+ acc1 += (*pScr1++ * *pIn2);
+ acc2 += (*pScr1++ * *pIn2);
+ acc3 += (*pScr1++ * *pIn2++);
+
+ pScr1 -= 3u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+
+ /* Store the results in the accumulators in the destination buffer. */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
+
+
+#else
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
+
+
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 4u;
+
+ }
+
+
+ blkCnt = (srcALen + srcBLen - 1u) & 0x3;
+
+ /* Calculate convolution for remaining samples of Bigger length sequence */
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1u;
+
+ while(tapCnt > 0u)
+ {
+
+ acc0 += (*pScr1++ * *pIn2++);
+ acc0 += (*pScr1++ * *pIn2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pIn2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ ** Then store the output in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 1u;
+
+ }
+
+}
+
+/**
+ * @} end of Conv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q15.c
new file mode 100644
index 0000000..1bc120e
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q15.c
@@ -0,0 +1,1404 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_fast_q15.c
+*
+* Description: Fast Q15 Convolution.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Conv
+ * @{
+ */
+
+/**
+ * @brief Convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ * @return none.
+ *
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * This fast version uses a 32-bit accumulator with 2.30 format.
+ * The accumulator maintains full precision of the intermediate multiplication results
+ * but provides only a single guard bit. There is no saturation on intermediate additions.
+ * Thus, if the accumulator overflows it wraps around and distorts the result.
+ * The input signals should be scaled down to avoid intermediate overflows.
+ * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
+ * as maximum of min(srcALen, srcBLen) number of additions are carried internally.
+ * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
+ *
+ * \par
+ * See arm_conv_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
+ */
+
+void arm_conv_fast_q15(
+ q15_t * pSrcA,
+ uint32_t srcALen,
+ q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst)
+{
+#ifndef UNALIGNED_SUPPORT_DISABLE
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q15_t *pOut = pDst; /* output pointer */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
+ q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1u;
+ blockSize2 = srcALen - (srcBLen - 1u);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1u;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations less than 4 */
+ /* Second part of this stage computes the MAC operations greater than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ while((count < 4u) && (blockSize1 > 0u))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over number of MAC operations between
+ * inputA samples and inputB samples */
+ k = count;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + count;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* The second part of the stage starts here */
+ /* The internal loop, over count, is unrolled by 4 */
+ /* To, read the last two inputB samples using SIMD:
+ * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
+ py = py - 1;
+
+ while(blockSize1 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ * So, py is incremented by 1 */
+ py = py + 1u;
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + (count - 1u);
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* count is the index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+
+ /* --------------------
+ * Stage2 process
+ * -------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2u;
+
+ while(blkCnt > 0u)
+ {
+ py = py - 1u;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+
+ /* read x[0], x[1] samples */
+ x0 = *__SIMD32(px);
+ /* read x[1], x[2] samples */
+ x1 = _SIMD32_OFFSET(px+1);
+ px+= 2u;
+
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the last two inputB samples using SIMD:
+ * y[srcBLen - 1] and y[srcBLen - 2] */
+ c0 = *__SIMD32(py)--;
+
+ /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
+ acc0 = __SMLADX(x0, c0, acc0);
+
+ /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
+ acc1 = __SMLADX(x1, c0, acc1);
+
+ /* Read x[2], x[3] */
+ x2 = *__SIMD32(px);
+
+ /* Read x[3], x[4] */
+ x3 = _SIMD32_OFFSET(px+1);
+
+ /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
+ acc2 = __SMLADX(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
+ acc3 = __SMLADX(x3, c0, acc3);
+
+ /* Read y[srcBLen - 3] and y[srcBLen - 4] */
+ c0 = *__SIMD32(py)--;
+
+ /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
+ acc0 = __SMLADX(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
+ acc1 = __SMLADX(x3, c0, acc1);
+
+ /* Read x[4], x[5] */
+ x0 = _SIMD32_OFFSET(px+2);
+
+ /* Read x[5], x[6] */
+ x1 = _SIMD32_OFFSET(px+3);
+ px += 4u;
+
+ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
+ acc2 = __SMLADX(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
+ acc3 = __SMLADX(x1, c0, acc3);
+
+ } while(--k);
+
+ /* For the next MAC operations, SIMD is not used
+ * So, the 16 bit pointer if inputB, py is updated */
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ if(k == 1u)
+ {
+ /* Read y[srcBLen - 5] */
+ c0 = *(py+1);
+
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+
+#else
+
+ c0 = c0 & 0x0000FFFF;
+
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7] */
+ x3 = *__SIMD32(px);
+ px++;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc1 = __SMLAD(x1, c0, acc1);
+ acc2 = __SMLADX(x1, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ if(k == 2u)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = _SIMD32_OFFSET(py);
+
+ /* Read x[7], x[8] */
+ x3 = *__SIMD32(px);
+
+ /* Read x[9] */
+ x2 = _SIMD32_OFFSET(px+1);
+ px += 2u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x0, c0, acc0);
+ acc1 = __SMLADX(x1, c0, acc1);
+ acc2 = __SMLADX(x3, c0, acc2);
+ acc3 = __SMLADX(x2, c0, acc3);
+ }
+
+ if(k == 3u)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = _SIMD32_OFFSET(py);
+
+ /* Read x[7], x[8] */
+ x3 = *__SIMD32(px);
+
+ /* Read x[9] */
+ x2 = _SIMD32_OFFSET(px+1);
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x0, c0, acc0);
+ acc1 = __SMLADX(x1, c0, acc1);
+ acc2 = __SMLADX(x3, c0, acc2);
+ acc3 = __SMLADX(x2, c0, acc3);
+
+ /* Read y[srcBLen - 7] */
+ c0 = *(py-1);
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+#else
+
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[10] */
+ x3 = _SIMD32_OFFSET(px+2);
+ px += 3u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x1, c0, acc0);
+ acc1 = __SMLAD(x2, c0, acc1);
+ acc2 = __SMLADX(x2, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ /* Store the results in the accumulators in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pOut)++ = __PKHBT((acc0 >> 15), (acc1 >> 15), 16);
+ *__SIMD32(pOut)++ = __PKHBT((acc2 >> 15), (acc3 >> 15), 16);
+
+#else
+
+ *__SIMD32(pOut)++ = __PKHBT((acc1 >> 15), (acc0 >> 15), 16);
+ *__SIMD32(pOut)++ = __PKHBT((acc3 >> 15), (acc2 >> 15), 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The blockSize3 variable holds the number of MAC operations performed */
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ pIn2 = pSrc2 - 1u;
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations greater than 4 */
+ /* Second part of this stage computes the MAC operations less than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ j = blockSize3 >> 2u;
+
+ while((j > 0u) && (blockSize3 > 0u))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = blockSize3 >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
+ * with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
+ * with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ * So, py is incremented by 1 */
+ py = py + 1u;
+
+ /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = blockSize3 % 0x4u;
+
+ while(k > 0u)
+ {
+ /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+
+ j--;
+ }
+
+ /* The second part of the stage starts here */
+ /* SIMD is not used for the next MAC operations,
+ * so pointer py is updated to read only one sample at a time */
+ py = py + 1u;
+
+ while(blockSize3 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = blockSize3;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+#else
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q15_t *pOut = pDst; /* output pointer */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
+ q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */
+ q15_t a, b;
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1u;
+ blockSize2 = srcALen - (srcBLen - 1u);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1u;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations less than 4 */
+ /* Second part of this stage computes the MAC operations greater than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ while((count < 4u) && (blockSize1 > 0u))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over number of MAC operations between
+ * inputA samples and inputB samples */
+ k = count;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + count;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* The second part of the stage starts here */
+ /* The internal loop, over count, is unrolled by 4 */
+ /* To, read the last two inputB samples using SIMD:
+ * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
+ py = py - 1;
+
+ while(blockSize1 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ py++;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + (count - 1u);
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* count is the index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+
+ /* --------------------
+ * Stage2 process
+ * -------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2u;
+
+ while(blkCnt > 0u)
+ {
+ py = py - 1u;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1] samples */
+ a = *px++;
+ b = *px++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x0 = __PKHBT(a, b, 16);
+ a = *px;
+ x1 = __PKHBT(b, a, 16);
+
+#else
+
+ x0 = __PKHBT(b, a, 16);
+ a = *px;
+ x1 = __PKHBT(a, b, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the last two inputB samples using SIMD:
+ * y[srcBLen - 1] and y[srcBLen - 2] */
+ a = *py;
+ b = *(py+1);
+ py -= 2;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ c0 = __PKHBT(a, b, 16);
+
+#else
+
+ c0 = __PKHBT(b, a, 16);;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
+ acc0 = __SMLADX(x0, c0, acc0);
+
+ /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
+ acc1 = __SMLADX(x1, c0, acc1);
+
+ a = *px;
+ b = *(px + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x2 = __PKHBT(a, b, 16);
+ a = *(px + 2);
+ x3 = __PKHBT(b, a, 16);
+
+#else
+
+ x2 = __PKHBT(b, a, 16);
+ a = *(px + 2);
+ x3 = __PKHBT(a, b, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
+ acc2 = __SMLADX(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
+ acc3 = __SMLADX(x3, c0, acc3);
+
+ /* Read y[srcBLen - 3] and y[srcBLen - 4] */
+ a = *py;
+ b = *(py+1);
+ py -= 2;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ c0 = __PKHBT(a, b, 16);
+
+#else
+
+ c0 = __PKHBT(b, a, 16);;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
+ acc0 = __SMLADX(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
+ acc1 = __SMLADX(x3, c0, acc1);
+
+ /* Read x[4], x[5], x[6] */
+ a = *(px + 2);
+ b = *(px + 3);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x0 = __PKHBT(a, b, 16);
+ a = *(px + 4);
+ x1 = __PKHBT(b, a, 16);
+
+#else
+
+ x0 = __PKHBT(b, a, 16);
+ a = *(px + 4);
+ x1 = __PKHBT(a, b, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ px += 4u;
+
+ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
+ acc2 = __SMLADX(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
+ acc3 = __SMLADX(x1, c0, acc3);
+
+ } while(--k);
+
+ /* For the next MAC operations, SIMD is not used
+ * So, the 16 bit pointer if inputB, py is updated */
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ if(k == 1u)
+ {
+ /* Read y[srcBLen - 5] */
+ c0 = *(py+1);
+
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+
+#else
+
+ c0 = c0 & 0x0000FFFF;
+
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7] */
+ a = *px;
+ b = *(px+1);
+ px++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x3 = __PKHBT(a, b, 16);
+
+#else
+
+ x3 = __PKHBT(b, a, 16);;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc1 = __SMLAD(x1, c0, acc1);
+ acc2 = __SMLADX(x1, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ if(k == 2u)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ a = *py;
+ b = *(py+1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ c0 = __PKHBT(a, b, 16);
+
+#else
+
+ c0 = __PKHBT(b, a, 16);;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7], x[8], x[9] */
+ a = *px;
+ b = *(px + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x3 = __PKHBT(a, b, 16);
+ a = *(px + 2);
+ x2 = __PKHBT(b, a, 16);
+
+#else
+
+ x3 = __PKHBT(b, a, 16);
+ a = *(px + 2);
+ x2 = __PKHBT(a, b, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ px += 2u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x0, c0, acc0);
+ acc1 = __SMLADX(x1, c0, acc1);
+ acc2 = __SMLADX(x3, c0, acc2);
+ acc3 = __SMLADX(x2, c0, acc3);
+ }
+
+ if(k == 3u)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ a = *py;
+ b = *(py+1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ c0 = __PKHBT(a, b, 16);
+
+#else
+
+ c0 = __PKHBT(b, a, 16);;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7], x[8], x[9] */
+ a = *px;
+ b = *(px + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x3 = __PKHBT(a, b, 16);
+ a = *(px + 2);
+ x2 = __PKHBT(b, a, 16);
+
+#else
+
+ x3 = __PKHBT(b, a, 16);
+ a = *(px + 2);
+ x2 = __PKHBT(a, b, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x0, c0, acc0);
+ acc1 = __SMLADX(x1, c0, acc1);
+ acc2 = __SMLADX(x3, c0, acc2);
+ acc3 = __SMLADX(x2, c0, acc3);
+
+ /* Read y[srcBLen - 7] */
+ c0 = *(py-1);
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+#else
+
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[10] */
+ a = *(px+2);
+ b = *(px+3);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x3 = __PKHBT(a, b, 16);
+
+#else
+
+ x3 = __PKHBT(b, a, 16);;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ px += 3u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x1, c0, acc0);
+ acc1 = __SMLAD(x2, c0, acc1);
+ acc2 = __SMLADX(x2, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ /* Store the results in the accumulators in the destination buffer. */
+ *pOut++ = (q15_t)(acc0 >> 15);
+ *pOut++ = (q15_t)(acc1 >> 15);
+ *pOut++ = (q15_t)(acc2 >> 15);
+ *pOut++ = (q15_t)(acc3 >> 15);
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The blockSize3 variable holds the number of MAC operations performed */
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ pIn2 = pSrc2 - 1u;
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations greater than 4 */
+ /* Second part of this stage computes the MAC operations less than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ j = blockSize3 >> 2u;
+
+ while((j > 0u) && (blockSize3 > 0u))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = blockSize3 >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ py++;
+
+ while(k > 0u)
+ {
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = blockSize3 % 0x4u;
+
+ while(k > 0u)
+ {
+ /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+
+ j--;
+ }
+
+ /* The second part of the stage starts here */
+ /* SIMD is not used for the next MAC operations,
+ * so pointer py is updated to read only one sample at a time */
+ py = py + 1u;
+
+ while(blockSize3 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = blockSize3;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+}
+
+/**
+ * @} end of Conv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q31.c
new file mode 100644
index 0000000..d48609f
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_fast_q31.c
@@ -0,0 +1,571 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_fast_q31.c
+*
+* Description: Q31 Convolution (fast version).
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Conv
+ * @{
+ */
+
+/**
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
+ * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
+ * These intermediate results are accumulated in a 32-bit register in 2.30 format.
+ * Finally, the accumulator is saturated and converted to a 1.31 result.
+ *
+ * \par
+ * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
+ * In order to avoid overflows completely the input signals must be scaled down.
+ * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
+ * as maximum of min(srcALen, srcBLen) number of additions are carried internally.
+ *
+ * \par
+ * See arm_conv_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
+ */
+
+void arm_conv_fast_q31(
+ q31_t * pSrcA,
+ uint32_t srcALen,
+ q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst)
+{
+ q31_t *pIn1; /* inputA pointer */
+ q31_t *pIn2; /* inputB pointer */
+ q31_t *pOut = pDst; /* output pointer */
+ q31_t *px; /* Intermediate inputA pointer */
+ q31_t *py; /* Intermediate inputB pointer */
+ q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
+ q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1u;
+ blockSize2 = srcALen - (srcBLen - 1u);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1u;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while(blockSize1 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[0] * y[srcBLen - 1] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* x[1] * y[srcBLen - 2] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* x[2] * y[srcBLen - 3] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* x[3] * y[srcBLen - 4] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + count;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2u;
+
+ while(blkCnt > 0u)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *(px++);
+ x1 = *(px++);
+ x2 = *(px++);
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[srcBLen - 1] sample */
+ c0 = *(py--);
+
+ /* Read x[3] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[0] * y[srcBLen - 1] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+ /* acc1 += x[1] * y[srcBLen - 1] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+
+ /* acc2 += x[2] * y[srcBLen - 1] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+
+ /* acc3 += x[3] * y[srcBLen - 1] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+ /* Read y[srcBLen - 2] sample */
+ c0 = *(py--);
+
+ /* Read x[4] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[1] * y[srcBLen - 2] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc1 += x[2] * y[srcBLen - 2] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc2 += x[3] * y[srcBLen - 2] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
+ /* acc3 += x[4] * y[srcBLen - 2] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+ /* Read y[srcBLen - 3] sample */
+ c0 = *(py--);
+
+ /* Read x[5] sample */
+ x1 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[2] * y[srcBLen - 3] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc1 += x[3] * y[srcBLen - 3] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
+ /* acc2 += x[4] * y[srcBLen - 3] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc3 += x[5] * y[srcBLen - 3] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
+
+ /* Read y[srcBLen - 4] sample */
+ c0 = *(py--);
+
+ /* Read x[6] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[3] * y[srcBLen - 4] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
+ /* acc1 += x[4] * y[srcBLen - 4] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc2 += x[5] * y[srcBLen - 4] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc3 += x[6] * y[srcBLen - 4] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
+
+
+ } while(--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *(py--);
+
+ /* Read x[7] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[srcBLen - 5] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc1 += x[5] * y[srcBLen - 5] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc2 += x[6] * y[srcBLen - 5] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc3 += x[7] * y[srcBLen - 5] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the results in the accumulators in the destination buffer. */
+ *pOut++ = (q31_t) (acc0 << 1);
+ *pOut++ = (q31_t) (acc1 << 1);
+ *pOut++ = (q31_t) (acc2 << 1);
+ *pOut++ = (q31_t) (acc3 << 1);
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The blockSize3 variable holds the number of MAC operations performed */
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while(blockSize3 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = blockSize3 >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = blockSize3 % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+}
+
+/**
+ * @} end of Conv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_opt_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_opt_q15.c
new file mode 100644
index 0000000..a8563d0
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_opt_q15.c
@@ -0,0 +1,543 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_opt_q15.c
+*
+* Description: Convolution of Q15 sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Conv
+ * @{
+ */
+
+/**
+ * @brief Convolution of Q15 sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
+ * @return none.
+ *
+ * \par Restrictions
+ * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
+ * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
+ *
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
+ * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ * This approach provides 33 guard bits and there is no risk of overflow.
+ * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
+ *
+ *
+ * \par
+ * Refer to arm_conv_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
+ *
+ *
+ */
+
+void arm_conv_opt_q15(
+ q15_t * pSrcA,
+ uint32_t srcALen,
+ q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
+{
+ q63_t acc0, acc1, acc2, acc3; /* Accumulator */
+ q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
+ q31_t y1, y2; /* State variables */
+ q15_t *pOut = pDst; /* output pointer */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, k, blkCnt; /* loop counter */
+ uint32_t tapCnt; /* loop count */
+#ifdef UNALIGNED_SUPPORT_DISABLE
+
+ q15_t a, b;
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* pointer to take end of scratch2 buffer */
+ pScr2 = pScratch2 + srcBLen - 1;
+
+ /* points to smaller length sequence */
+ px = pIn2;
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ /* Copy smaller length input sequence in reverse order into second scratch buffer */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr2-- = *px++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Initialze temporary scratch pointer */
+ pScr1 = pScratch1;
+
+ /* Assuming scratch1 buffer is aligned by 32-bit */
+ /* Fill (srcBLen - 1u) zeros in scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1u);
+
+ /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Copy (srcALen) samples in scratch buffer */
+ arm_copy_q15(pIn1, pScr1, srcALen);
+
+ /* Update pointers */
+ pScr1 += srcALen;
+
+#else
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcALen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr1++ = *pIn1++;
+ *pScr1++ = *pIn1++;
+ *pScr1++ = *pIn1++;
+ *pScr1++ = *pIn1++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcALen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr1++ = *pIn1++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+#endif
+
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update pointer */
+ pScr1 += (srcBLen - 1u);
+
+#else
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = (srcBLen - 1u) >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = (srcBLen - 1u) % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr1++ = 0;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+#endif
+
+ /* Temporary pointer for scratch2 */
+ py = pScratch2;
+
+
+ /* Initialization of pIn2 pointer */
+ pIn2 = py;
+
+ /* First part of the processing with loop unrolling process 4 data points at a time.
+ ** a second loop below process for the remaining 1 to 3 samples. */
+
+ /* Actual convolution process starts here */
+ blkCnt = (srcALen + srcBLen - 1u) >> 2;
+
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x1 = *__SIMD32(pScr1)++;
+
+ /* Read next two samples from scratch1 buffer */
+ x2 = *__SIMD32(pScr1)++;
+
+ tapCnt = (srcBLen) >> 2u;
+
+ while(tapCnt > 0u)
+ {
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Read four samples from smaller buffer */
+ y1 = _SIMD32_OFFSET(pIn2);
+ y2 = _SIMD32_OFFSET(pIn2 + 2u);
+
+ /* multiply and accumlate */
+ acc0 = __SMLALD(x1, y1, acc0);
+ acc2 = __SMLALD(x2, y1, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ /* multiply and accumlate */
+ acc1 = __SMLALDX(x3, y1, acc1);
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = _SIMD32_OFFSET(pScr1);
+
+ /* multiply and accumlate */
+ acc0 = __SMLALD(x2, y2, acc0);
+ acc2 = __SMLALD(x1, y2, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLALDX(x3, y1, acc3);
+ acc1 = __SMLALDX(x3, y2, acc1);
+
+ x2 = _SIMD32_OFFSET(pScr1 + 2u);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLALDX(x3, y2, acc3);
+
+#else
+
+ /* Read four samples from smaller buffer */
+ a = *pIn2;
+ b = *(pIn2 + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ y1 = __PKHBT(a, b, 16);
+#else
+ y1 = __PKHBT(b, a, 16);
+#endif
+
+ a = *(pIn2 + 2);
+ b = *(pIn2 + 3);
+#ifndef ARM_MATH_BIG_ENDIAN
+ y2 = __PKHBT(a, b, 16);
+#else
+ y2 = __PKHBT(b, a, 16);
+#endif
+
+ acc0 = __SMLALD(x1, y1, acc0);
+
+ acc2 = __SMLALD(x2, y1, acc2);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc1 = __SMLALDX(x3, y1, acc1);
+
+ a = *pScr1;
+ b = *(pScr1 + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(a, b, 16);
+#else
+ x1 = __PKHBT(b, a, 16);
+#endif
+
+ acc0 = __SMLALD(x2, y2, acc0);
+
+ acc2 = __SMLALD(x1, y2, acc2);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLALDX(x3, y1, acc3);
+
+ acc1 = __SMLALDX(x3, y2, acc1);
+
+ a = *(pScr1 + 2);
+ b = *(pScr1 + 3);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x2 = __PKHBT(a, b, 16);
+#else
+ x2 = __PKHBT(b, a, 16);
+#endif
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLALDX(x3, y2, acc3);
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ pIn2 += 4u;
+ pScr1 += 4u;
+
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr1 -= 4u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pIn2);
+ acc1 += (*pScr1++ * *pIn2);
+ acc2 += (*pScr1++ * *pIn2);
+ acc3 += (*pScr1++ * *pIn2++);
+
+ pScr1 -= 3u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+
+ /* Store the results in the accumulators in the destination buffer. */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
+
+#else
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
+
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 4u;
+
+ }
+
+
+ blkCnt = (srcALen + srcBLen - 1u) & 0x3;
+
+ /* Calculate convolution for remaining samples of Bigger length sequence */
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read next two samples from scratch1 buffer */
+ acc0 += (*pScr1++ * *pIn2++);
+ acc0 += (*pScr1++ * *pIn2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pIn2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ ** Then store the output in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 1u;
+
+ }
+
+}
+
+
+/**
+ * @} end of Conv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_opt_q7.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_opt_q7.c
new file mode 100644
index 0000000..5b6d4cc
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_opt_q7.c
@@ -0,0 +1,433 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_opt_q7.c
+*
+* Description: Convolution of Q7 sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Conv
+ * @{
+ */
+
+/**
+ * @brief Convolution of Q7 sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+ * @return none.
+ *
+ * \par Restrictions
+ * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
+ * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 32-bit internal accumulator.
+ * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
+ * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
+ * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
+ *
+ */
+
+void arm_conv_opt_q7(
+ q7_t * pSrcA,
+ uint32_t srcALen,
+ q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
+{
+
+ q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */
+ q15_t x4; /* Temporary input variable */
+ q7_t *pIn1, *pIn2; /* inputA and inputB pointer */
+ uint32_t j, k, blkCnt, tapCnt; /* loop counter */
+ q7_t *px; /* Temporary input1 pointer */
+ q15_t *py; /* Temporary input2 pointer */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulator */
+ q31_t x1, x2, x3, y1; /* Temporary input variables */
+ q7_t *pOut = pDst; /* output pointer */
+ q7_t out0, out1, out2, out3; /* temporary variables */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* pointer to take end of scratch2 buffer */
+ pScr2 = pScratch2;
+
+ /* points to smaller length sequence */
+ px = pIn2 + srcBLen - 1;
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ x4 = (q15_t) * px--;
+ *pScr2++ = x4;
+ x4 = (q15_t) * px--;
+ *pScr2++ = x4;
+ x4 = (q15_t) * px--;
+ *pScr2++ = x4;
+ x4 = (q15_t) * px--;
+ *pScr2++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ x4 = (q15_t) * px--;
+ *pScr2++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Initialze temporary scratch pointer */
+ pScr1 = pScratch1;
+
+ /* Fill (srcBLen - 1u) zeros in scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1u);
+
+ /* Copy (srcALen) samples in scratch buffer */
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcALen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcALen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update pointer */
+ pScr1 += (srcBLen - 1u);
+
+#else
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = (srcBLen - 1u) >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = (srcBLen - 1u) % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr1++ = 0;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+#endif
+
+ /* Temporary pointer for scratch2 */
+ py = pScratch2;
+
+ /* Initialization of pIn2 pointer */
+ pIn2 = (q7_t *) py;
+
+ pScr2 = py;
+
+ /* Actual convolution process starts here */
+ blkCnt = (srcALen + srcBLen - 1u) >> 2;
+
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x1 = *__SIMD32(pScr1)++;
+
+ /* Read next two samples from scratch1 buffer */
+ x2 = *__SIMD32(pScr1)++;
+
+ tapCnt = (srcBLen) >> 2u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read four samples from smaller buffer */
+ y1 = _SIMD32_OFFSET(pScr2);
+
+ /* multiply and accumlate */
+ acc0 = __SMLAD(x1, y1, acc0);
+ acc2 = __SMLAD(x2, y1, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ /* multiply and accumlate */
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = *__SIMD32(pScr1)++;
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+
+ /* Read four samples from smaller buffer */
+ y1 = _SIMD32_OFFSET(pScr2 + 2u);
+
+ acc0 = __SMLAD(x2, y1, acc0);
+
+ acc2 = __SMLAD(x1, y1, acc2);
+
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ x2 = *__SIMD32(pScr1)++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+
+ pScr2 += 4u;
+
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr1 -= 4u;
+
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pScr2);
+ acc1 += (*pScr1++ * *pScr2);
+ acc2 += (*pScr1++ * *pScr2);
+ acc3 += (*pScr1++ * *pScr2++);
+
+ pScr1 -= 3u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
+ out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
+ out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
+ out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
+
+ *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
+
+ /* Initialization of inputB pointer */
+ pScr2 = py;
+
+ pScratch1 += 4u;
+
+ }
+
+
+ blkCnt = (srcALen + srcBLen - 1u) & 0x3;
+
+ /* Calculate convolution for remaining samples of Bigger length sequence */
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1u;
+
+ while(tapCnt > 0u)
+ {
+ acc0 += (*pScr1++ * *pScr2++);
+ acc0 += (*pScr1++ * *pScr2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pScr2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
+
+ /* Initialization of inputB pointer */
+ pScr2 = py;
+
+ pScratch1 += 1u;
+
+ }
+
+}
+
+
+/**
+ * @} end of Conv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_f32.c
new file mode 100644
index 0000000..0f7a4dc
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_f32.c
@@ -0,0 +1,660 @@
+/* ----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_partial_f32.c
+*
+* Description: Partial convolution of floating-point sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup PartialConv Partial Convolution
+ *
+ * Partial Convolution is equivalent to Convolution except that a subset of the output samples is generated.
+ * Each function has two additional arguments.
+ * firstIndex specifies the starting index of the subset of output samples.
+ * numPoints is the number of output samples to compute.
+ * The function computes the output in the range
+ * [firstIndex, ..., firstIndex+numPoints-1].
+ * The output array pDst contains numPoints values.
+ *
+ * The allowable range of output indices is [0 srcALen+srcBLen-2].
+ * If the requested subset does not fall in this range then the functions return ARM_MATH_ARGUMENT_ERROR.
+ * Otherwise the functions return ARM_MATH_SUCCESS.
+ * \note Refer arm_conv_f32() for details on fixed point behavior.
+ *
+ *
+ * Fast Versions
+ *
+ * \par
+ * Fast versions are supported for Q31 and Q15 of partial convolution. Cycles for Fast versions are less compared to Q31 and Q15 of partial conv and the design requires
+ * the input signals should be scaled down to avoid intermediate overflows.
+ *
+ *
+ * Opt Versions
+ *
+ * \par
+ * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
+ * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions of partial convolution
+ */
+
+/**
+ * @addtogroup PartialConv
+ * @{
+ */
+
+/**
+ * @brief Partial convolution of floating-point sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written.
+ * @param[in] firstIndex is the first output sample to start with.
+ * @param[in] numPoints is the number of output points to be computed.
+ * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+ */
+
+arm_status arm_conv_partial_f32(
+ float32_t * pSrcA,
+ uint32_t srcALen,
+ float32_t * pSrcB,
+ uint32_t srcBLen,
+ float32_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
+{
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ float32_t *pIn1 = pSrcA; /* inputA pointer */
+ float32_t *pIn2 = pSrcB; /* inputB pointer */
+ float32_t *pOut = pDst; /* output pointer */
+ float32_t *px; /* Intermediate inputA pointer */
+ float32_t *py; /* Intermediate inputB pointer */
+ float32_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ float32_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
+ float32_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t j, k, count = 0u, blkCnt, check;
+ int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
+ arm_status status; /* status of Partial convolution */
+
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Conditions to check which loopCounter holds
+ * the first and last indices of the output samples to be calculated. */
+ check = firstIndex + numPoints;
+ blockSize3 = (int32_t) check - (int32_t) srcALen;
+ blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
+ blockSize1 = ((int32_t) srcBLen - 1) - (int32_t) firstIndex;
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
+ (int32_t) numPoints) : 0;
+ blockSize2 = ((int32_t) check - blockSize3) -
+ (blockSize1 + (int32_t) firstIndex);
+ blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* Set the output pointer to point to the firstIndex
+ * of the output sample to be calculated. */
+ pOut = pDst + firstIndex;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed.
+ Since the partial convolution starts from from firstIndex
+ Number of Macs to be performed is firstIndex + 1 */
+ count = 1u + firstIndex;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc1 = pIn2 + firstIndex;
+ py = pSrc1;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while(blockSize1 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[0] * y[srcBLen - 1] */
+ sum += *px++ * *py--;
+
+ /* x[1] * y[srcBLen - 2] */
+ sum += *px++ * *py--;
+
+ /* x[2] * y[srcBLen - 3] */
+ sum += *px++ * *py--;
+
+ /* x[3] * y[srcBLen - 4] */
+ sum += *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc1;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = ((uint32_t) blockSize2 >> 2u);
+
+ while(blkCnt > 0u)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0.0f;
+ acc1 = 0.0f;
+ acc2 = 0.0f;
+ acc3 = 0.0f;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *(px++);
+ x1 = *(px++);
+ x2 = *(px++);
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[srcBLen - 1] sample */
+ c0 = *(py--);
+
+ /* Read x[3] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[0] * y[srcBLen - 1] */
+ acc0 += x0 * c0;
+
+ /* acc1 += x[1] * y[srcBLen - 1] */
+ acc1 += x1 * c0;
+
+ /* acc2 += x[2] * y[srcBLen - 1] */
+ acc2 += x2 * c0;
+
+ /* acc3 += x[3] * y[srcBLen - 1] */
+ acc3 += x3 * c0;
+
+ /* Read y[srcBLen - 2] sample */
+ c0 = *(py--);
+
+ /* Read x[4] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[1] * y[srcBLen - 2] */
+ acc0 += x1 * c0;
+ /* acc1 += x[2] * y[srcBLen - 2] */
+ acc1 += x2 * c0;
+ /* acc2 += x[3] * y[srcBLen - 2] */
+ acc2 += x3 * c0;
+ /* acc3 += x[4] * y[srcBLen - 2] */
+ acc3 += x0 * c0;
+
+ /* Read y[srcBLen - 3] sample */
+ c0 = *(py--);
+
+ /* Read x[5] sample */
+ x1 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[2] * y[srcBLen - 3] */
+ acc0 += x2 * c0;
+ /* acc1 += x[3] * y[srcBLen - 2] */
+ acc1 += x3 * c0;
+ /* acc2 += x[4] * y[srcBLen - 2] */
+ acc2 += x0 * c0;
+ /* acc3 += x[5] * y[srcBLen - 2] */
+ acc3 += x1 * c0;
+
+ /* Read y[srcBLen - 4] sample */
+ c0 = *(py--);
+
+ /* Read x[6] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[3] * y[srcBLen - 4] */
+ acc0 += x3 * c0;
+ /* acc1 += x[4] * y[srcBLen - 4] */
+ acc1 += x0 * c0;
+ /* acc2 += x[5] * y[srcBLen - 4] */
+ acc2 += x1 * c0;
+ /* acc3 += x[6] * y[srcBLen - 4] */
+ acc3 += x2 * c0;
+
+
+ } while(--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *(py--);
+
+ /* Read x[7] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[srcBLen - 5] */
+ acc0 += x0 * c0;
+ /* acc1 += x[5] * y[srcBLen - 5] */
+ acc1 += x1 * c0;
+ /* acc2 += x[6] * y[srcBLen - 5] */
+ acc2 += x2 * c0;
+ /* acc3 += x[7] * y[srcBLen - 5] */
+ acc3 += x3 * c0;
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = acc0;
+ *pOut++ = acc1;
+ *pOut++ = acc2;
+ *pOut++ = acc3;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = (uint32_t) blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += *px++ * *py--;
+ sum += *px++ * *py--;
+ sum += *px++ * *py--;
+ sum += *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = (uint32_t) blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1u;
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ while(blockSize3 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ sum += *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum += *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ sum += *px++ * *py--;
+
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum += *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum += *px++ * *py--;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+
+ }
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ float32_t *pIn1 = pSrcA; /* inputA pointer */
+ float32_t *pIn2 = pSrcB; /* inputB pointer */
+ float32_t sum; /* Accumulator */
+ uint32_t i, j; /* loop counters */
+ arm_status status; /* status of Partial convolution */
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* Loop to calculate convolution for output length number of values */
+ for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0.0f;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0u; j <= i; j++)
+ {
+ /* Check the array limitations for inputs */
+ if((((i - j) < srcBLen) && (j < srcALen)))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += pIn1[j] * pIn2[i - j];
+ }
+ }
+ /* Store the output in the destination buffer */
+ pDst[i] = sum;
+ }
+ /* set status as ARM_SUCCESS as there are no argument errors */
+ status = ARM_MATH_SUCCESS;
+ }
+ return (status);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of PartialConv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c
new file mode 100644
index 0000000..3bdd8f1
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_opt_q15.c
@@ -0,0 +1,762 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_partial_fast_opt_q15.c
+*
+* Description: Fast Q15 Partial convolution.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup PartialConv
+ * @{
+ */
+
+/**
+ * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written.
+ * @param[in] firstIndex is the first output sample to start with.
+ * @param[in] numPoints is the number of output points to be computed.
+ * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
+ * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+ *
+ * See arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
+ *
+ * \par Restrictions
+ * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
+ * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
+ *
+ */
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+arm_status arm_conv_partial_fast_opt_q15(
+ q15_t * pSrcA,
+ uint32_t srcALen,
+ q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
+{
+
+ q15_t *pOut = pDst; /* output pointer */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulator */
+ q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
+ q31_t y1, y2; /* State variables */
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, k, blkCnt; /* loop counter */
+ arm_status status;
+
+ uint32_t tapCnt; /* loop count */
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Temporary pointer for scratch2 */
+ py = pScratch2;
+
+ /* pointer to take end of scratch2 buffer */
+ pScr2 = pScratch2 + srcBLen - 1;
+
+ /* points to smaller length sequence */
+ px = pIn2;
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+
+ /* Copy smaller length input sequence in reverse order into second scratch buffer */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr2-- = *px++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Initialze temporary scratch pointer */
+ pScr1 = pScratch1;
+
+ /* Assuming scratch1 buffer is aligned by 32-bit */
+ /* Fill (srcBLen - 1u) zeros in scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1u);
+
+ /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
+
+ /* Copy (srcALen) samples in scratch buffer */
+ arm_copy_q15(pIn1, pScr1, srcALen);
+
+ /* Update pointers */
+ pScr1 += srcALen;
+
+ /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update pointer */
+ pScr1 += (srcBLen - 1u);
+
+ /* Initialization of pIn2 pointer */
+ pIn2 = py;
+
+ pScratch1 += firstIndex;
+
+ pOut = pDst + firstIndex;
+
+ /* First part of the processing with loop unrolling process 4 data points at a time.
+ ** a second loop below process for the remaining 1 to 3 samples. */
+
+ /* Actual convolution process starts here */
+ blkCnt = (numPoints) >> 2;
+
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x1 = *__SIMD32(pScr1)++;
+
+ /* Read next two samples from scratch1 buffer */
+ x2 = *__SIMD32(pScr1)++;
+
+ tapCnt = (srcBLen) >> 2u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read four samples from smaller buffer */
+ y1 = _SIMD32_OFFSET(pIn2);
+ y2 = _SIMD32_OFFSET(pIn2 + 2u);
+
+ /* multiply and accumlate */
+ acc0 = __SMLAD(x1, y1, acc0);
+ acc2 = __SMLAD(x2, y1, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ /* multiply and accumlate */
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = _SIMD32_OFFSET(pScr1);
+
+ /* multiply and accumlate */
+ acc0 = __SMLAD(x2, y2, acc0);
+
+ acc2 = __SMLAD(x1, y2, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+ acc1 = __SMLADX(x3, y2, acc1);
+
+ x2 = _SIMD32_OFFSET(pScr1 + 2u);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y2, acc3);
+
+ /* update scratch pointers */
+ pIn2 += 4u;
+ pScr1 += 4u;
+
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr1 -= 4u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pIn2);
+ acc1 += (*pScr1++ * *pIn2);
+ acc2 += (*pScr1++ * *pIn2);
+ acc3 += (*pScr1++ * *pIn2++);
+
+ pScr1 -= 3u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+
+ /* Store the results in the accumulators in the destination buffer. */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
+
+#else
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 4u;
+
+ }
+
+
+ blkCnt = numPoints & 0x3;
+
+ /* Calculate convolution for remaining samples of Bigger length sequence */
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = *__SIMD32(pScr1)++;
+
+ /* Read two samples from smaller buffer */
+ y1 = *__SIMD32(pIn2)++;
+
+ acc0 = __SMLAD(x1, y1, acc0);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pIn2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ ** Then store the output in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 1u;
+
+ }
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+ /* Return to application */
+ return (status);
+}
+
+#else
+
+arm_status arm_conv_partial_fast_opt_q15(
+ q15_t * pSrcA,
+ uint32_t srcALen,
+ q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
+{
+
+ q15_t *pOut = pDst; /* output pointer */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulator */
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, k, blkCnt; /* loop counter */
+ arm_status status; /* Status variable */
+ uint32_t tapCnt; /* loop count */
+ q15_t x10, x11, x20, x21; /* Temporary variables to hold srcA buffer */
+ q15_t y10, y11; /* Temporary variables to hold srcB buffer */
+
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Temporary pointer for scratch2 */
+ py = pScratch2;
+
+ /* pointer to take end of scratch2 buffer */
+ pScr2 = pScratch2 + srcBLen - 1;
+
+ /* points to smaller length sequence */
+ px = pIn2;
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr2-- = *px++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Initialze temporary scratch pointer */
+ pScr1 = pScratch1;
+
+ /* Fill (srcBLen - 1u) zeros in scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1u);
+
+ /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
+
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcALen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr1++ = *pIn1++;
+ *pScr1++ = *pIn1++;
+ *pScr1++ = *pIn1++;
+ *pScr1++ = *pIn1++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcALen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr1++ = *pIn1++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = (srcBLen - 1u) >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = (srcBLen - 1u) % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr1++ = 0;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+
+ /* Initialization of pIn2 pointer */
+ pIn2 = py;
+
+ pScratch1 += firstIndex;
+
+ pOut = pDst + firstIndex;
+
+ /* Actual convolution process starts here */
+ blkCnt = (numPoints) >> 2;
+
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x10 = *pScr1++;
+ x11 = *pScr1++;
+
+ /* Read next two samples from scratch1 buffer */
+ x20 = *pScr1++;
+ x21 = *pScr1++;
+
+ tapCnt = (srcBLen) >> 2u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read two samples from smaller buffer */
+ y10 = *pIn2;
+ y11 = *(pIn2 + 1u);
+
+ /* multiply and accumlate */
+ acc0 += (q31_t) x10 *y10;
+ acc0 += (q31_t) x11 *y11;
+ acc2 += (q31_t) x20 *y10;
+ acc2 += (q31_t) x21 *y11;
+
+ /* multiply and accumlate */
+ acc1 += (q31_t) x11 *y10;
+ acc1 += (q31_t) x20 *y11;
+
+ /* Read next two samples from scratch1 buffer */
+ x10 = *pScr1;
+ x11 = *(pScr1 + 1u);
+
+ /* multiply and accumlate */
+ acc3 += (q31_t) x21 *y10;
+ acc3 += (q31_t) x10 *y11;
+
+ /* Read next two samples from scratch2 buffer */
+ y10 = *(pIn2 + 2u);
+ y11 = *(pIn2 + 3u);
+
+ /* multiply and accumlate */
+ acc0 += (q31_t) x20 *y10;
+ acc0 += (q31_t) x21 *y11;
+ acc2 += (q31_t) x10 *y10;
+ acc2 += (q31_t) x11 *y11;
+ acc1 += (q31_t) x21 *y10;
+ acc1 += (q31_t) x10 *y11;
+
+ /* Read next two samples from scratch1 buffer */
+ x20 = *(pScr1 + 2);
+ x21 = *(pScr1 + 3);
+
+ /* multiply and accumlate */
+ acc3 += (q31_t) x11 *y10;
+ acc3 += (q31_t) x20 *y11;
+
+ /* update scratch pointers */
+ pIn2 += 4u;
+ pScr1 += 4u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr1 -= 4u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3u;
+
+ while(tapCnt > 0u)
+ {
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pIn2);
+ acc1 += (*pScr1++ * *pIn2);
+ acc2 += (*pScr1++ * *pIn2);
+ acc3 += (*pScr1++ * *pIn2++);
+
+ pScr1 -= 3u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+
+ /* Store the results in the accumulators in the destination buffer. */
+ *pOut++ = __SSAT((acc0 >> 15), 16);
+ *pOut++ = __SSAT((acc1 >> 15), 16);
+ *pOut++ = __SSAT((acc2 >> 15), 16);
+ *pOut++ = __SSAT((acc3 >> 15), 16);
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 4u;
+
+ }
+
+
+ blkCnt = numPoints & 0x3;
+
+ /* Calculate convolution for remaining samples of Bigger length sequence */
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read next two samples from scratch1 buffer */
+ x10 = *pScr1++;
+ x11 = *pScr1++;
+
+ /* Read two samples from smaller buffer */
+ y10 = *pIn2++;
+ y11 = *pIn2++;
+
+ /* multiply and accumlate */
+ acc0 += (q31_t) x10 *y10;
+ acc0 += (q31_t) x11 *y11;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pIn2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 1u;
+
+ }
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+/**
+ * @} end of PartialConv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_q15.c
new file mode 100644
index 0000000..734bb6e
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_q15.c
@@ -0,0 +1,1472 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_partial_fast_q15.c
+*
+* Description: Fast Q15 Partial convolution.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup PartialConv
+ * @{
+ */
+
+/**
+ * @brief Partial convolution of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written.
+ * @param[in] firstIndex is the first output sample to start with.
+ * @param[in] numPoints is the number of output points to be computed.
+ * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+ *
+ * See arm_conv_partial_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
+ */
+
+
+arm_status arm_conv_partial_fast_q15(
+ q15_t * pSrcA,
+ uint32_t srcALen,
+ q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
+{
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q15_t *pOut = pDst; /* output pointer */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
+ q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0;
+ uint32_t j, k, count, check, blkCnt;
+ int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
+ arm_status status; /* status of Partial convolution */
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >=srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Conditions to check which loopCounter holds
+ * the first and last indices of the output samples to be calculated. */
+ check = firstIndex + numPoints;
+ blockSize3 = ((int32_t) check - (int32_t) srcALen);
+ blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
+ blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
+ (int32_t) numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
+ (int32_t) firstIndex);
+ blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* Set the output pointer to point to the firstIndex
+ * of the output sample to be calculated. */
+ pOut = pDst + firstIndex;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed.
+ Since the partial convolution starts from firstIndex
+ Number of Macs to be performed is firstIndex + 1 */
+ count = 1u + firstIndex;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + firstIndex;
+ py = pSrc2;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations less than 4 */
+ /* Second part of this stage computes the MAC operations greater than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ while((count < 4u) && (blockSize1 > 0))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over number of MAC operations between
+ * inputA samples and inputB samples */
+ k = count;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* The second part of the stage starts here */
+ /* The internal loop, over count, is unrolled by 4 */
+ /* To, read the last two inputB samples using SIMD:
+ * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
+ py = py - 1;
+
+ while(blockSize1 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ * So, py is incremented by 1 */
+ py = py + 1u;
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2 - 1u;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* count is the index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+
+ /* --------------------
+ * Stage2 process
+ * -------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = ((uint32_t) blockSize2 >> 2u);
+
+ while(blkCnt > 0u)
+ {
+ py = py - 1u;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+
+ /* read x[0], x[1] samples */
+ x0 = *__SIMD32(px);
+ /* read x[1], x[2] samples */
+ x1 = _SIMD32_OFFSET(px+1);
+ px+= 2u;
+
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the last two inputB samples using SIMD:
+ * y[srcBLen - 1] and y[srcBLen - 2] */
+ c0 = *__SIMD32(py)--;
+
+ /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
+ acc0 = __SMLADX(x0, c0, acc0);
+
+ /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
+ acc1 = __SMLADX(x1, c0, acc1);
+
+ /* Read x[2], x[3] */
+ x2 = *__SIMD32(px);
+
+ /* Read x[3], x[4] */
+ x3 = _SIMD32_OFFSET(px+1);
+
+ /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
+ acc2 = __SMLADX(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
+ acc3 = __SMLADX(x3, c0, acc3);
+
+ /* Read y[srcBLen - 3] and y[srcBLen - 4] */
+ c0 = *__SIMD32(py)--;
+
+ /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
+ acc0 = __SMLADX(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
+ acc1 = __SMLADX(x3, c0, acc1);
+
+ /* Read x[4], x[5] */
+ x0 = _SIMD32_OFFSET(px+2);
+
+ /* Read x[5], x[6] */
+ x1 = _SIMD32_OFFSET(px+3);
+ px += 4u;
+
+ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
+ acc2 = __SMLADX(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
+ acc3 = __SMLADX(x1, c0, acc3);
+
+ } while(--k);
+
+ /* For the next MAC operations, SIMD is not used
+ * So, the 16 bit pointer if inputB, py is updated */
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ if(k == 1u)
+ {
+ /* Read y[srcBLen - 5] */
+ c0 = *(py+1);
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+
+#else
+
+ c0 = c0 & 0x0000FFFF;
+
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7] */
+ x3 = *__SIMD32(px);
+ px++;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc1 = __SMLAD(x1, c0, acc1);
+ acc2 = __SMLADX(x1, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ if(k == 2u)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = _SIMD32_OFFSET(py);
+
+ /* Read x[7], x[8] */
+ x3 = *__SIMD32(px);
+
+ /* Read x[9] */
+ x2 = _SIMD32_OFFSET(px+1);
+ px += 2u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x0, c0, acc0);
+ acc1 = __SMLADX(x1, c0, acc1);
+ acc2 = __SMLADX(x3, c0, acc2);
+ acc3 = __SMLADX(x2, c0, acc3);
+ }
+
+ if(k == 3u)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = _SIMD32_OFFSET(py);
+
+ /* Read x[7], x[8] */
+ x3 = *__SIMD32(px);
+
+ /* Read x[9] */
+ x2 = _SIMD32_OFFSET(px+1);
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x0, c0, acc0);
+ acc1 = __SMLADX(x1, c0, acc1);
+ acc2 = __SMLADX(x3, c0, acc2);
+ acc3 = __SMLADX(x2, c0, acc3);
+
+ c0 = *(py-1);
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+#else
+
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[10] */
+ x3 = _SIMD32_OFFSET(px+2);
+ px += 3u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x1, c0, acc0);
+ acc1 = __SMLAD(x2, c0, acc1);
+ acc2 = __SMLADX(x2, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ /* Store the results in the accumulators in the destination buffer. */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pOut)++ = __PKHBT(acc0 >> 15, acc1 >> 15, 16);
+ *__SIMD32(pOut)++ = __PKHBT(acc2 >> 15, acc3 >> 15, 16);
+
+#else
+
+ *__SIMD32(pOut)++ = __PKHBT(acc1 >> 15, acc0 >> 15, 16);
+ *__SIMD32(pOut)++ = __PKHBT(acc3 >> 15, acc2 >> 15, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = (uint32_t) blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = (uint32_t) blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1u;
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ pIn2 = pSrc2 - 1u;
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations greater than 4 */
+ /* Second part of this stage computes the MAC operations less than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ j = count >> 2u;
+
+ while((j > 0u) && (blockSize3 > 0))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
+ * with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
+ * with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLADX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ * So, py is incremented by 1 */
+ py = py + 1u;
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+
+ j--;
+ }
+
+ /* The second part of the stage starts here */
+ /* SIMD is not used for the next MAC operations,
+ * so pointer py is updated to read only one sample at a time */
+ py = py + 1u;
+
+ while(blockSize3 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum = __SMLAD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+#else
+
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q15_t *pOut = pDst; /* output pointer */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
+ q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0;
+ uint32_t j, k, count, check, blkCnt;
+ int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
+ arm_status status; /* status of Partial convolution */
+ q15_t a, b;
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >=srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Conditions to check which loopCounter holds
+ * the first and last indices of the output samples to be calculated. */
+ check = firstIndex + numPoints;
+ blockSize3 = ((int32_t) check - (int32_t) srcALen);
+ blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
+ blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
+ (int32_t) numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
+ (int32_t) firstIndex);
+ blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* Set the output pointer to point to the firstIndex
+ * of the output sample to be calculated. */
+ pOut = pDst + firstIndex;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed.
+ Since the partial convolution starts from firstIndex
+ Number of Macs to be performed is firstIndex + 1 */
+ count = 1u + firstIndex;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + firstIndex;
+ py = pSrc2;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations less than 4 */
+ /* Second part of this stage computes the MAC operations greater than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ while((count < 4u) && (blockSize1 > 0u))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over number of MAC operations between
+ * inputA samples and inputB samples */
+ k = count;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* The second part of the stage starts here */
+ /* The internal loop, over count, is unrolled by 4 */
+ /* To, read the last two inputB samples using SIMD:
+ * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
+ py = py - 1;
+
+ while(blockSize1 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ py++;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2 - 1u;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* count is the index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+
+ /* --------------------
+ * Stage2 process
+ * -------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = ((uint32_t) blockSize2 >> 2u);
+
+ while(blkCnt > 0u)
+ {
+ py = py - 1u;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1] samples */
+ a = *px++;
+ b = *px++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x0 = __PKHBT(a, b, 16);
+ a = *px;
+ x1 = __PKHBT(b, a, 16);
+
+#else
+
+ x0 = __PKHBT(b, a, 16);
+ a = *px;
+ x1 = __PKHBT(a, b, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the last two inputB samples using SIMD:
+ * y[srcBLen - 1] and y[srcBLen - 2] */
+ a = *py;
+ b = *(py+1);
+ py -= 2;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ c0 = __PKHBT(a, b, 16);
+
+#else
+
+ c0 = __PKHBT(b, a, 16);;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
+ acc0 = __SMLADX(x0, c0, acc0);
+
+ /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
+ acc1 = __SMLADX(x1, c0, acc1);
+
+ a = *px;
+ b = *(px + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x2 = __PKHBT(a, b, 16);
+ a = *(px + 2);
+ x3 = __PKHBT(b, a, 16);
+
+#else
+
+ x2 = __PKHBT(b, a, 16);
+ a = *(px + 2);
+ x3 = __PKHBT(a, b, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
+ acc2 = __SMLADX(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
+ acc3 = __SMLADX(x3, c0, acc3);
+
+ /* Read y[srcBLen - 3] and y[srcBLen - 4] */
+ a = *py;
+ b = *(py+1);
+ py -= 2;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ c0 = __PKHBT(a, b, 16);
+
+#else
+
+ c0 = __PKHBT(b, a, 16);;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
+ acc0 = __SMLADX(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
+ acc1 = __SMLADX(x3, c0, acc1);
+
+ /* Read x[4], x[5], x[6] */
+ a = *(px + 2);
+ b = *(px + 3);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x0 = __PKHBT(a, b, 16);
+ a = *(px + 4);
+ x1 = __PKHBT(b, a, 16);
+
+#else
+
+ x0 = __PKHBT(b, a, 16);
+ a = *(px + 4);
+ x1 = __PKHBT(a, b, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ px += 4u;
+
+ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
+ acc2 = __SMLADX(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
+ acc3 = __SMLADX(x1, c0, acc3);
+
+ } while(--k);
+
+ /* For the next MAC operations, SIMD is not used
+ * So, the 16 bit pointer if inputB, py is updated */
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ if(k == 1u)
+ {
+ /* Read y[srcBLen - 5] */
+ c0 = *(py+1);
+
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+
+#else
+
+ c0 = c0 & 0x0000FFFF;
+
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7] */
+ a = *px;
+ b = *(px+1);
+ px++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x3 = __PKHBT(a, b, 16);
+
+#else
+
+ x3 = __PKHBT(b, a, 16);;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc1 = __SMLAD(x1, c0, acc1);
+ acc2 = __SMLADX(x1, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ if(k == 2u)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ a = *py;
+ b = *(py+1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ c0 = __PKHBT(a, b, 16);
+
+#else
+
+ c0 = __PKHBT(b, a, 16);;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7], x[8], x[9] */
+ a = *px;
+ b = *(px + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x3 = __PKHBT(a, b, 16);
+ a = *(px + 2);
+ x2 = __PKHBT(b, a, 16);
+
+#else
+
+ x3 = __PKHBT(b, a, 16);
+ a = *(px + 2);
+ x2 = __PKHBT(a, b, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+ px += 2u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x0, c0, acc0);
+ acc1 = __SMLADX(x1, c0, acc1);
+ acc2 = __SMLADX(x3, c0, acc2);
+ acc3 = __SMLADX(x2, c0, acc3);
+ }
+
+ if(k == 3u)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ a = *py;
+ b = *(py+1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ c0 = __PKHBT(a, b, 16);
+
+#else
+
+ c0 = __PKHBT(b, a, 16);;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7], x[8], x[9] */
+ a = *px;
+ b = *(px + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x3 = __PKHBT(a, b, 16);
+ a = *(px + 2);
+ x2 = __PKHBT(b, a, 16);
+
+#else
+
+ x3 = __PKHBT(b, a, 16);
+ a = *(px + 2);
+ x2 = __PKHBT(a, b, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x0, c0, acc0);
+ acc1 = __SMLADX(x1, c0, acc1);
+ acc2 = __SMLADX(x3, c0, acc2);
+ acc3 = __SMLADX(x2, c0, acc3);
+
+ /* Read y[srcBLen - 7] */
+ c0 = *(py-1);
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+#else
+
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[10] */
+ a = *(px+2);
+ b = *(px+3);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x3 = __PKHBT(a, b, 16);
+
+#else
+
+ x3 = __PKHBT(b, a, 16);;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ px += 3u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x1, c0, acc0);
+ acc1 = __SMLAD(x2, c0, acc1);
+ acc2 = __SMLADX(x2, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ /* Store the results in the accumulators in the destination buffer. */
+ *pOut++ = (q15_t)(acc0 >> 15);
+ *pOut++ = (q15_t)(acc1 >> 15);
+ *pOut++ = (q15_t)(acc2 >> 15);
+ *pOut++ = (q15_t)(acc3 >> 15);
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = (uint32_t) blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = (uint32_t) blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1u;
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ pIn2 = pSrc2 - 1u;
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations greater than 4 */
+ /* Second part of this stage computes the MAC operations less than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ j = count >> 2u;
+
+ while((j > 0u) && (blockSize3 > 0))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ py++;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ sum += ((q31_t) * px++ * *py--);
+ /* Decrement the loop counter */
+ k--;
+ }
+
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+
+ j--;
+ }
+
+ /* The second part of the stage starts here */
+ /* SIMD is not used for the next MAC operations,
+ * so pointer py is updated to read only one sample at a time */
+ py = py + 1u;
+
+ while(blockSize3 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (sum >> 15);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+}
+
+/**
+ * @} end of PartialConv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_q31.c
new file mode 100644
index 0000000..68ce6bd
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_fast_q31.c
@@ -0,0 +1,598 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_partial_fast_q31.c
+*
+* Description: Fast Q31 Partial convolution.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup PartialConv
+ * @{
+ */
+
+/**
+ * @brief Partial convolution of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written.
+ * @param[in] firstIndex is the first output sample to start with.
+ * @param[in] numPoints is the number of output points to be computed.
+ * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+ *
+ * \par
+ * See arm_conv_partial_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.
+ */
+
+arm_status arm_conv_partial_fast_q31(
+ q31_t * pSrcA,
+ uint32_t srcALen,
+ q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
+{
+ q31_t *pIn1; /* inputA pointer */
+ q31_t *pIn2; /* inputB pointer */
+ q31_t *pOut = pDst; /* output pointer */
+ q31_t *px; /* Intermediate inputA pointer */
+ q31_t *py; /* Intermediate inputB pointer */
+ q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ q31_t x0, x1, x2, x3, c0;
+ uint32_t j, k, count, check, blkCnt;
+ int32_t blockSize1, blockSize2, blockSize3; /* loop counters */
+ arm_status status; /* status of Partial convolution */
+
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Conditions to check which loopCounter holds
+ * the first and last indices of the output samples to be calculated. */
+ check = firstIndex + numPoints;
+ blockSize3 = ((int32_t) check - (int32_t) srcALen);
+ blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
+ blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
+ (int32_t) numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
+ (int32_t) firstIndex);
+ blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* Set the output pointer to point to the firstIndex
+ * of the output sample to be calculated. */
+ pOut = pDst + firstIndex;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed.
+ Since the partial convolution starts from firstIndex
+ Number of Macs to be performed is firstIndex + 1 */
+ count = 1u + firstIndex;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + firstIndex;
+ py = pSrc2;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first loop starts here */
+ while(blockSize1 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[0] * y[srcBLen - 1] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* x[1] * y[srcBLen - 2] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* x[2] * y[srcBLen - 3] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* x[3] * y[srcBLen - 4] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2 */
+ blkCnt = ((uint32_t) blockSize2 >> 2u);
+
+ while(blkCnt > 0u)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *(px++);
+ x1 = *(px++);
+ x2 = *(px++);
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[srcBLen - 1] sample */
+ c0 = *(py--);
+
+ /* Read x[3] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[0] * y[srcBLen - 1] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+ /* acc1 += x[1] * y[srcBLen - 1] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+
+ /* acc2 += x[2] * y[srcBLen - 1] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+
+ /* acc3 += x[3] * y[srcBLen - 1] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+ /* Read y[srcBLen - 2] sample */
+ c0 = *(py--);
+
+ /* Read x[4] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[1] * y[srcBLen - 2] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc1 += x[2] * y[srcBLen - 2] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc2 += x[3] * y[srcBLen - 2] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
+ /* acc3 += x[4] * y[srcBLen - 2] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+ /* Read y[srcBLen - 3] sample */
+ c0 = *(py--);
+
+ /* Read x[5] sample */
+ x1 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[2] * y[srcBLen - 3] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc1 += x[3] * y[srcBLen - 2] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
+ /* acc2 += x[4] * y[srcBLen - 2] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc3 += x[5] * y[srcBLen - 2] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
+
+ /* Read y[srcBLen - 4] sample */
+ c0 = *(py--);
+
+ /* Read x[6] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[3] * y[srcBLen - 4] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
+ /* acc1 += x[4] * y[srcBLen - 4] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc2 += x[5] * y[srcBLen - 4] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc3 += x[6] * y[srcBLen - 4] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
+
+
+ } while(--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *(py--);
+
+ /* Read x[7] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[srcBLen - 5] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc1 += x[5] * y[srcBLen - 5] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc2 += x[6] * y[srcBLen - 5] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc3 += x[7] * y[srcBLen - 5] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (acc0 << 1);
+ *pOut++ = (q31_t) (acc1 << 1);
+ *pOut++ = (q31_t) (acc2 << 1);
+ *pOut++ = (q31_t) (acc3 << 1);
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = (uint32_t) blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = (uint32_t) blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1u;
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while(blockSize3 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py--))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = sum << 1;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+
+ }
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+}
+
+/**
+ * @} end of PartialConv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_opt_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_opt_q15.c
new file mode 100644
index 0000000..9f3a8f9
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_opt_q15.c
@@ -0,0 +1,763 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_partial_opt_q15.c
+*
+* Description: Partial convolution of Q15 sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup PartialConv
+ * @{
+ */
+
+/**
+ * @brief Partial convolution of Q15 sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written.
+ * @param[in] firstIndex is the first output sample to start with.
+ * @param[in] numPoints is the number of output points to be computed.
+ * @param[in] *pScratch1 points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ * @param[in] *pScratch2 points to scratch buffer of size min(srcALen, srcBLen).
+ * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+ *
+ * \par Restrictions
+ * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
+ * In this case input, output, state buffers should be aligned by 32-bit
+ *
+ * Refer to arm_conv_partial_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
+ *
+ *
+ */
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+arm_status arm_conv_partial_opt_q15(
+ q15_t * pSrcA,
+ uint32_t srcALen,
+ q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
+{
+
+ q15_t *pOut = pDst; /* output pointer */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
+ q63_t acc0, acc1, acc2, acc3; /* Accumulator */
+ q31_t x1, x2, x3; /* Temporary variables to hold state and coefficient values */
+ q31_t y1, y2; /* State variables */
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, k, blkCnt; /* loop counter */
+ arm_status status; /* Status variable */
+ uint32_t tapCnt; /* loop count */
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Temporary pointer for scratch2 */
+ py = pScratch2;
+
+ /* pointer to take end of scratch2 buffer */
+ pScr2 = pScratch2 + srcBLen - 1;
+
+ /* points to smaller length sequence */
+ px = pIn2;
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr2-- = *px++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Initialze temporary scratch pointer */
+ pScr1 = pScratch1;
+
+ /* Fill (srcBLen - 1u) zeros in scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1u);
+
+ /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
+
+ /* Copy (srcALen) samples in scratch buffer */
+ arm_copy_q15(pIn1, pScr1, srcALen);
+
+ /* Update pointers */
+ pScr1 += srcALen;
+
+ /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update pointer */
+ pScr1 += (srcBLen - 1u);
+
+ /* Initialization of pIn2 pointer */
+ pIn2 = py;
+
+ pScratch1 += firstIndex;
+
+ pOut = pDst + firstIndex;
+
+ /* Actual convolution process starts here */
+ blkCnt = (numPoints) >> 2;
+
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x1 = *__SIMD32(pScr1)++;
+
+ /* Read next two samples from scratch1 buffer */
+ x2 = *__SIMD32(pScr1)++;
+
+ tapCnt = (srcBLen) >> 2u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read four samples from smaller buffer */
+ y1 = _SIMD32_OFFSET(pIn2);
+ y2 = _SIMD32_OFFSET(pIn2 + 2u);
+
+ /* multiply and accumlate */
+ acc0 = __SMLALD(x1, y1, acc0);
+ acc2 = __SMLALD(x2, y1, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ /* multiply and accumlate */
+ acc1 = __SMLALDX(x3, y1, acc1);
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = _SIMD32_OFFSET(pScr1);
+
+ /* multiply and accumlate */
+ acc0 = __SMLALD(x2, y2, acc0);
+ acc2 = __SMLALD(x1, y2, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLALDX(x3, y1, acc3);
+ acc1 = __SMLALDX(x3, y2, acc1);
+
+ x2 = _SIMD32_OFFSET(pScr1 + 2u);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLALDX(x3, y2, acc3);
+
+ /* update scratch pointers */
+ pIn2 += 4u;
+ pScr1 += 4u;
+
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr1 -= 4u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3u;
+
+ while(tapCnt > 0u)
+ {
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pIn2);
+ acc1 += (*pScr1++ * *pIn2);
+ acc2 += (*pScr1++ * *pIn2);
+ acc3 += (*pScr1++ * *pIn2++);
+
+ pScr1 -= 3u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+
+ /* Store the results in the accumulators in the destination buffer. */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
+
+#else
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 4u;
+
+ }
+
+
+ blkCnt = numPoints & 0x3;
+
+ /* Calculate convolution for remaining samples of Bigger length sequence */
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = *__SIMD32(pScr1)++;
+
+ /* Read two samples from smaller buffer */
+ y1 = *__SIMD32(pIn2)++;
+
+ acc0 = __SMLALD(x1, y1, acc0);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pIn2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 1u;
+
+ }
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#else
+
+arm_status arm_conv_partial_opt_q15(
+ q15_t * pSrcA,
+ uint32_t srcALen,
+ q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
+{
+
+ q15_t *pOut = pDst; /* output pointer */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch1 */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch1 */
+ q63_t acc0, acc1, acc2, acc3; /* Accumulator */
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ uint32_t j, k, blkCnt; /* loop counter */
+ arm_status status; /* Status variable */
+ uint32_t tapCnt; /* loop count */
+ q15_t x10, x11, x20, x21; /* Temporary variables to hold srcA buffer */
+ q15_t y10, y11; /* Temporary variables to hold srcB buffer */
+
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Temporary pointer for scratch2 */
+ py = pScratch2;
+
+ /* pointer to take end of scratch2 buffer */
+ pScr2 = pScratch2 + srcBLen - 1;
+
+ /* points to smaller length sequence */
+ px = pIn2;
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+ *pScr2-- = *px++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr2-- = *px++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Initialze temporary scratch pointer */
+ pScr1 = pScratch1;
+
+ /* Fill (srcBLen - 1u) zeros in scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1u);
+
+ /* Copy bigger length sequence(srcALen) samples in scratch1 buffer */
+
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcALen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr1++ = *pIn1++;
+ *pScr1++ = *pIn1++;
+ *pScr1++ = *pIn1++;
+ *pScr1++ = *pIn1++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcALen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr1++ = *pIn1++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = (srcBLen - 1u) >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = (srcBLen - 1u) % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr1++ = 0;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+
+ /* Initialization of pIn2 pointer */
+ pIn2 = py;
+
+ pScratch1 += firstIndex;
+
+ pOut = pDst + firstIndex;
+
+ /* Actual convolution process starts here */
+ blkCnt = (numPoints) >> 2;
+
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x10 = *pScr1++;
+ x11 = *pScr1++;
+
+ /* Read next two samples from scratch1 buffer */
+ x20 = *pScr1++;
+ x21 = *pScr1++;
+
+ tapCnt = (srcBLen) >> 2u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read two samples from smaller buffer */
+ y10 = *pIn2;
+ y11 = *(pIn2 + 1u);
+
+ /* multiply and accumlate */
+ acc0 += (q63_t) x10 *y10;
+ acc0 += (q63_t) x11 *y11;
+ acc2 += (q63_t) x20 *y10;
+ acc2 += (q63_t) x21 *y11;
+
+ /* multiply and accumlate */
+ acc1 += (q63_t) x11 *y10;
+ acc1 += (q63_t) x20 *y11;
+
+ /* Read next two samples from scratch1 buffer */
+ x10 = *pScr1;
+ x11 = *(pScr1 + 1u);
+
+ /* multiply and accumlate */
+ acc3 += (q63_t) x21 *y10;
+ acc3 += (q63_t) x10 *y11;
+
+ /* Read next two samples from scratch2 buffer */
+ y10 = *(pIn2 + 2u);
+ y11 = *(pIn2 + 3u);
+
+ /* multiply and accumlate */
+ acc0 += (q63_t) x20 *y10;
+ acc0 += (q63_t) x21 *y11;
+ acc2 += (q63_t) x10 *y10;
+ acc2 += (q63_t) x11 *y11;
+ acc1 += (q63_t) x21 *y10;
+ acc1 += (q63_t) x10 *y11;
+
+ /* Read next two samples from scratch1 buffer */
+ x20 = *(pScr1 + 2);
+ x21 = *(pScr1 + 3);
+
+ /* multiply and accumlate */
+ acc3 += (q63_t) x11 *y10;
+ acc3 += (q63_t) x20 *y11;
+
+ /* update scratch pointers */
+ pIn2 += 4u;
+ pScr1 += 4u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr1 -= 4u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3u;
+
+ while(tapCnt > 0u)
+ {
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pIn2);
+ acc1 += (*pScr1++ * *pIn2);
+ acc2 += (*pScr1++ * *pIn2);
+ acc3 += (*pScr1++ * *pIn2++);
+
+ pScr1 -= 3u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+
+ /* Store the results in the accumulators in the destination buffer. */
+ *pOut++ = __SSAT((acc0 >> 15), 16);
+ *pOut++ = __SSAT((acc1 >> 15), 16);
+ *pOut++ = __SSAT((acc2 >> 15), 16);
+ *pOut++ = __SSAT((acc3 >> 15), 16);
+
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 4u;
+
+ }
+
+
+ blkCnt = numPoints & 0x3;
+
+ /* Calculate convolution for remaining samples of Bigger length sequence */
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read next two samples from scratch1 buffer */
+ x10 = *pScr1++;
+ x11 = *pScr1++;
+
+ /* Read two samples from smaller buffer */
+ y10 = *pIn2++;
+ y11 = *pIn2++;
+
+ /* multiply and accumlate */
+ acc0 += (q63_t) x10 *y10;
+ acc0 += (q63_t) x11 *y11;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pIn2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch1 += 1u;
+
+ }
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+
+/**
+ * @} end of PartialConv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_opt_q7.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_opt_q7.c
new file mode 100644
index 0000000..bbd7e51
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_opt_q7.c
@@ -0,0 +1,805 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_partial_opt_q7.c
+*
+* Description: Partial convolution of Q7 sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup PartialConv
+ * @{
+ */
+
+/**
+ * @brief Partial convolution of Q7 sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written.
+ * @param[in] firstIndex is the first output sample to start with.
+ * @param[in] numPoints is the number of output points to be computed.
+ * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+ * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+ *
+ * \par Restrictions
+ * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
+ * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
+ *
+ *
+ *
+ */
+
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+arm_status arm_conv_partial_opt_q7(
+ q7_t * pSrcA,
+ uint32_t srcALen,
+ q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
+{
+
+ q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */
+ q15_t x4; /* Temporary input variable */
+ q7_t *pIn1, *pIn2; /* inputA and inputB pointer */
+ uint32_t j, k, blkCnt, tapCnt; /* loop counter */
+ q7_t *px; /* Temporary input1 pointer */
+ q15_t *py; /* Temporary input2 pointer */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulator */
+ q31_t x1, x2, x3, y1; /* Temporary input variables */
+ arm_status status;
+ q7_t *pOut = pDst; /* output pointer */
+ q7_t out0, out1, out2, out3; /* temporary variables */
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* pointer to take end of scratch2 buffer */
+ pScr2 = pScratch2;
+
+ /* points to smaller length sequence */
+ px = pIn2 + srcBLen - 1;
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ x4 = (q15_t) * px--;
+ *pScr2++ = x4;
+ x4 = (q15_t) * px--;
+ *pScr2++ = x4;
+ x4 = (q15_t) * px--;
+ *pScr2++ = x4;
+ x4 = (q15_t) * px--;
+ *pScr2++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ x4 = (q15_t) * px--;
+ *pScr2++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Initialze temporary scratch pointer */
+ pScr1 = pScratch1;
+
+ /* Fill (srcBLen - 1u) zeros in scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1u);
+
+ /* Copy (srcALen) samples in scratch buffer */
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcALen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcALen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update pointer */
+ pScr1 += (srcBLen - 1u);
+
+
+ /* Temporary pointer for scratch2 */
+ py = pScratch2;
+
+ /* Initialization of pIn2 pointer */
+ pIn2 = (q7_t *) py;
+
+ pScr2 = py;
+
+ pOut = pDst + firstIndex;
+
+ pScratch1 += firstIndex;
+
+ /* Actual convolution process starts here */
+ blkCnt = (numPoints) >> 2;
+
+
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x1 = *__SIMD32(pScr1)++;
+
+ /* Read next two samples from scratch1 buffer */
+ x2 = *__SIMD32(pScr1)++;
+
+ tapCnt = (srcBLen) >> 2u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read four samples from smaller buffer */
+ y1 = _SIMD32_OFFSET(pScr2);
+
+ /* multiply and accumlate */
+ acc0 = __SMLAD(x1, y1, acc0);
+ acc2 = __SMLAD(x2, y1, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ /* multiply and accumlate */
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = *__SIMD32(pScr1)++;
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+
+ /* Read four samples from smaller buffer */
+ y1 = _SIMD32_OFFSET(pScr2 + 2u);
+
+ acc0 = __SMLAD(x2, y1, acc0);
+
+ acc2 = __SMLAD(x1, y1, acc2);
+
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ x2 = *__SIMD32(pScr1)++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+
+ pScr2 += 4u;
+
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr1 -= 4u;
+
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pScr2);
+ acc1 += (*pScr1++ * *pScr2);
+ acc2 += (*pScr1++ * *pScr2);
+ acc3 += (*pScr1++ * *pScr2++);
+
+ pScr1 -= 3u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
+ out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
+ out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
+ out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
+
+ *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
+
+ /* Initialization of inputB pointer */
+ pScr2 = py;
+
+ pScratch1 += 4u;
+
+ }
+
+ blkCnt = (numPoints) & 0x3;
+
+ /* Calculate convolution for remaining samples of Bigger length sequence */
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = *__SIMD32(pScr1)++;
+
+ /* Read two samples from smaller buffer */
+ y1 = *__SIMD32(pScr2)++;
+
+ acc0 = __SMLAD(x1, y1, acc0);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pScr2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
+
+ /* Initialization of inputB pointer */
+ pScr2 = py;
+
+ pScratch1 += 1u;
+
+ }
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+
+
+ }
+
+ return (status);
+
+}
+
+#else
+
+arm_status arm_conv_partial_opt_q7(
+ q7_t * pSrcA,
+ uint32_t srcALen,
+ q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
+{
+
+ q15_t *pScr2, *pScr1; /* Intermediate pointers for scratch pointers */
+ q15_t x4; /* Temporary input variable */
+ q7_t *pIn1, *pIn2; /* inputA and inputB pointer */
+ uint32_t j, k, blkCnt, tapCnt; /* loop counter */
+ q7_t *px; /* Temporary input1 pointer */
+ q15_t *py; /* Temporary input2 pointer */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulator */
+ arm_status status;
+ q7_t *pOut = pDst; /* output pointer */
+ q15_t x10, x11, x20, x21; /* Temporary input variables */
+ q15_t y10, y11; /* Temporary input variables */
+ q7_t out0, out1, out2, out3; /* temporary variables */
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* pointer to take end of scratch2 buffer */
+ pScr2 = pScratch2;
+
+ /* points to smaller length sequence */
+ px = pIn2 + srcBLen - 1;
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ x4 = (q15_t) * px--;
+ *pScr2++ = x4;
+ x4 = (q15_t) * px--;
+ *pScr2++ = x4;
+ x4 = (q15_t) * px--;
+ *pScr2++ = x4;
+ x4 = (q15_t) * px--;
+ *pScr2++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ x4 = (q15_t) * px--;
+ *pScr2++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Initialze temporary scratch pointer */
+ pScr1 = pScratch1;
+
+ /* Fill (srcBLen - 1u) zeros in scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1u);
+
+ /* Copy (srcALen) samples in scratch buffer */
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = srcALen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcALen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = (srcBLen - 1u) >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = (srcBLen - 1u) % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr1++ = 0;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+
+ /* Temporary pointer for scratch2 */
+ py = pScratch2;
+
+ /* Initialization of pIn2 pointer */
+ pIn2 = (q7_t *) py;
+
+ pScr2 = py;
+
+ pOut = pDst + firstIndex;
+
+ pScratch1 += firstIndex;
+
+ /* Actual convolution process starts here */
+ blkCnt = (numPoints) >> 2;
+
+
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x10 = *pScr1++;
+ x11 = *pScr1++;
+
+ /* Read next two samples from scratch1 buffer */
+ x20 = *pScr1++;
+ x21 = *pScr1++;
+
+ tapCnt = (srcBLen) >> 2u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read four samples from smaller buffer */
+ y10 = *pScr2;
+ y11 = *(pScr2 + 1u);
+
+ /* multiply and accumlate */
+ acc0 += (q31_t) x10 *y10;
+ acc0 += (q31_t) x11 *y11;
+ acc2 += (q31_t) x20 *y10;
+ acc2 += (q31_t) x21 *y11;
+
+
+ acc1 += (q31_t) x11 *y10;
+ acc1 += (q31_t) x20 *y11;
+
+ /* Read next two samples from scratch1 buffer */
+ x10 = *pScr1;
+ x11 = *(pScr1 + 1u);
+
+ /* multiply and accumlate */
+ acc3 += (q31_t) x21 *y10;
+ acc3 += (q31_t) x10 *y11;
+
+ /* Read next two samples from scratch2 buffer */
+ y10 = *(pScr2 + 2u);
+ y11 = *(pScr2 + 3u);
+
+ /* multiply and accumlate */
+ acc0 += (q31_t) x20 *y10;
+ acc0 += (q31_t) x21 *y11;
+ acc2 += (q31_t) x10 *y10;
+ acc2 += (q31_t) x11 *y11;
+ acc1 += (q31_t) x21 *y10;
+ acc1 += (q31_t) x10 *y11;
+
+ /* Read next two samples from scratch1 buffer */
+ x20 = *(pScr1 + 2);
+ x21 = *(pScr1 + 3);
+
+ /* multiply and accumlate */
+ acc3 += (q31_t) x11 *y10;
+ acc3 += (q31_t) x20 *y11;
+
+ /* update scratch pointers */
+
+ pScr1 += 4u;
+ pScr2 += 4u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr1 -= 4u;
+
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pScr2);
+ acc1 += (*pScr1++ * *pScr2);
+ acc2 += (*pScr1++ * *pScr2);
+ acc3 += (*pScr1++ * *pScr2++);
+
+ pScr1 -= 3u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ out0 = (q7_t) (__SSAT(acc0 >> 7u, 8));
+ out1 = (q7_t) (__SSAT(acc1 >> 7u, 8));
+ out2 = (q7_t) (__SSAT(acc2 >> 7u, 8));
+ out3 = (q7_t) (__SSAT(acc3 >> 7u, 8));
+
+
+ *__SIMD32(pOut)++ = __PACKq7(out0, out1, out2, out3);
+
+ /* Initialization of inputB pointer */
+ pScr2 = py;
+
+ pScratch1 += 4u;
+
+ }
+
+ blkCnt = (numPoints) & 0x3;
+
+ /* Calculate convolution for remaining samples of Bigger length sequence */
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read next two samples from scratch1 buffer */
+ x10 = *pScr1++;
+ x11 = *pScr1++;
+
+ /* Read two samples from smaller buffer */
+ y10 = *pScr2++;
+ y11 = *pScr2++;
+
+ /* multiply and accumlate */
+ acc0 += (q31_t) x10 *y10;
+ acc0 += (q31_t) x11 *y11;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pScr2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
+
+ /* Initialization of inputB pointer */
+ pScr2 = py;
+
+ pScratch1 += 1u;
+
+ }
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+ return (status);
+
+}
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+
+
+/**
+ * @} end of PartialConv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q15.c
new file mode 100644
index 0000000..3c97c2e
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q15.c
@@ -0,0 +1,777 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_partial_q15.c
+*
+* Description: Partial convolution of Q15 sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup PartialConv
+ * @{
+ */
+
+/**
+ * @brief Partial convolution of Q15 sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written.
+ * @param[in] firstIndex is the first output sample to start with.
+ * @param[in] numPoints is the number of output points to be computed.
+ * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+ *
+ * Refer to arm_conv_partial_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
+ *
+ * \par
+ * Refer the function arm_conv_partial_opt_q15() for a faster implementation of this function using scratch buffers.
+ *
+ */
+
+
+arm_status arm_conv_partial_q15(
+ q15_t * pSrcA,
+ uint32_t srcALen,
+ q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
+{
+
+#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q15_t *pOut = pDst; /* output pointer */
+ q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
+ q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary input variables */
+ uint32_t j, k, count, check, blkCnt;
+ int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
+ arm_status status; /* status of Partial convolution */
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Conditions to check which loopCounter holds
+ * the first and last indices of the output samples to be calculated. */
+ check = firstIndex + numPoints;
+ blockSize3 = ((int32_t) check - (int32_t) srcALen);
+ blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
+ blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
+ (int32_t) numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
+ (int32_t) firstIndex);
+ blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* Set the output pointer to point to the firstIndex
+ * of the output sample to be calculated. */
+ pOut = pDst + firstIndex;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed.
+ Since the partial convolution starts from firstIndex
+ Number of Macs to be performed is firstIndex + 1 */
+ count = 1u + firstIndex;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + firstIndex;
+ py = pSrc2;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations less than 4 */
+ /* Second part of this stage computes the MAC operations greater than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ while((count < 4u) && (blockSize1 > 0))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over number of MAC operations between
+ * inputA samples and inputB samples */
+ k = count;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* The second part of the stage starts here */
+ /* The internal loop, over count, is unrolled by 4 */
+ /* To, read the last two inputB samples using SIMD:
+ * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
+ py = py - 1;
+
+ while(blockSize1 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ * So, py is incremented by 1 */
+ py = py + 1u;
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2 - 1u;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* count is the index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+
+ /* --------------------
+ * Stage2 process
+ * -------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2u;
+
+ while(blkCnt > 0u)
+ {
+ py = py - 1u;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+
+ /* read x[0], x[1] samples */
+ x0 = *__SIMD32(px);
+ /* read x[1], x[2] samples */
+ x1 = _SIMD32_OFFSET(px+1);
+ px+= 2u;
+
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the last two inputB samples using SIMD:
+ * y[srcBLen - 1] and y[srcBLen - 2] */
+ c0 = *__SIMD32(py)--;
+
+ /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
+ acc0 = __SMLALDX(x0, c0, acc0);
+
+ /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
+ acc1 = __SMLALDX(x1, c0, acc1);
+
+ /* Read x[2], x[3] */
+ x2 = *__SIMD32(px);
+
+ /* Read x[3], x[4] */
+ x3 = _SIMD32_OFFSET(px+1);
+
+ /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
+ acc2 = __SMLALDX(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
+ acc3 = __SMLALDX(x3, c0, acc3);
+
+ /* Read y[srcBLen - 3] and y[srcBLen - 4] */
+ c0 = *__SIMD32(py)--;
+
+ /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
+ acc0 = __SMLALDX(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
+ acc1 = __SMLALDX(x3, c0, acc1);
+
+ /* Read x[4], x[5] */
+ x0 = _SIMD32_OFFSET(px+2);
+
+ /* Read x[5], x[6] */
+ x1 = _SIMD32_OFFSET(px+3);
+ px += 4u;
+
+ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
+ acc2 = __SMLALDX(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
+ acc3 = __SMLALDX(x1, c0, acc3);
+
+ } while(--k);
+
+ /* For the next MAC operations, SIMD is not used
+ * So, the 16 bit pointer if inputB, py is updated */
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ if(k == 1u)
+ {
+ /* Read y[srcBLen - 5] */
+ c0 = *(py+1);
+
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+
+#else
+
+ c0 = c0 & 0x0000FFFF;
+
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7] */
+ x3 = *__SIMD32(px);
+ px++;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALD(x0, c0, acc0);
+ acc1 = __SMLALD(x1, c0, acc1);
+ acc2 = __SMLALDX(x1, c0, acc2);
+ acc3 = __SMLALDX(x3, c0, acc3);
+ }
+
+ if(k == 2u)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = _SIMD32_OFFSET(py);
+
+ /* Read x[7], x[8] */
+ x3 = *__SIMD32(px);
+
+ /* Read x[9] */
+ x2 = _SIMD32_OFFSET(px+1);
+ px += 2u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALDX(x0, c0, acc0);
+ acc1 = __SMLALDX(x1, c0, acc1);
+ acc2 = __SMLALDX(x3, c0, acc2);
+ acc3 = __SMLALDX(x2, c0, acc3);
+ }
+
+ if(k == 3u)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = _SIMD32_OFFSET(py);
+
+ /* Read x[7], x[8] */
+ x3 = *__SIMD32(px);
+
+ /* Read x[9] */
+ x2 = _SIMD32_OFFSET(px+1);
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALDX(x0, c0, acc0);
+ acc1 = __SMLALDX(x1, c0, acc1);
+ acc2 = __SMLALDX(x3, c0, acc2);
+ acc3 = __SMLALDX(x2, c0, acc3);
+
+ c0 = *(py-1);
+
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+#else
+
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[10] */
+ x3 = _SIMD32_OFFSET(px+2);
+ px += 3u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALDX(x1, c0, acc0);
+ acc1 = __SMLALD(x2, c0, acc1);
+ acc2 = __SMLALDX(x2, c0, acc2);
+ acc3 = __SMLALDX(x3, c0, acc3);
+ }
+
+
+ /* Store the results in the accumulators in the destination buffer. */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
+
+#else
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = (uint32_t) blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) ((q31_t) * px++ * *py--);
+ sum += (q63_t) ((q31_t) * px++ * *py--);
+ sum += (q63_t) ((q31_t) * px++ * *py--);
+ sum += (q63_t) ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = (uint32_t) blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1u;
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ pIn2 = pSrc2 - 1u;
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations greater than 4 */
+ /* Second part of this stage computes the MAC operations less than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ j = count >> 2u;
+
+ while((j > 0u) && (blockSize3 > 0))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
+ * with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
+ * with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ * So, py is incremented by 1 */
+ py = py + 1u;
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+
+ j--;
+ }
+
+ /* The second part of the stage starts here */
+ /* SIMD is not used for the next MAC operations,
+ * so pointer py is updated to read only one sample at a time */
+ py = py + 1u;
+
+ while(blockSize3 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ q15_t *pIn1 = pSrcA; /* inputA pointer */
+ q15_t *pIn2 = pSrcB; /* inputB pointer */
+ q63_t sum; /* Accumulator */
+ uint32_t i, j; /* loop counters */
+ arm_status status; /* status of Partial convolution */
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* Loop to calculate convolution for output length number of values */
+ for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if(((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q31_t) pIn1[j] * (pIn2[i - j]));
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u);
+ }
+ /* set status as ARM_SUCCESS as there are no argument errors */
+ status = ARM_MATH_SUCCESS;
+ }
+ return (status);
+
+#endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
+
+}
+
+/**
+ * @} end of PartialConv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q31.c
new file mode 100644
index 0000000..c334892
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q31.c
@@ -0,0 +1,598 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_partial_q31.c
+*
+* Description: Partial convolution of Q31 sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup PartialConv
+ * @{
+ */
+
+/**
+ * @brief Partial convolution of Q31 sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written.
+ * @param[in] firstIndex is the first output sample to start with.
+ * @param[in] numPoints is the number of output points to be computed.
+ * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+ *
+ * See arm_conv_partial_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
+ */
+
+arm_status arm_conv_partial_q31(
+ q31_t * pSrcA,
+ uint32_t srcALen,
+ q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
+{
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t *pIn1; /* inputA pointer */
+ q31_t *pIn2; /* inputB pointer */
+ q31_t *pOut = pDst; /* output pointer */
+ q31_t *px; /* Intermediate inputA pointer */
+ q31_t *py; /* Intermediate inputB pointer */
+ q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q63_t sum, acc0, acc1, acc2; /* Accumulator */
+ q31_t x0, x1, x2, c0;
+ uint32_t j, k, count, check, blkCnt;
+ int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
+ arm_status status; /* status of Partial convolution */
+
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Conditions to check which loopCounter holds
+ * the first and last indices of the output samples to be calculated. */
+ check = firstIndex + numPoints;
+ blockSize3 = ((int32_t) check - (int32_t) srcALen);
+ blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
+ blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
+ (int32_t) numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
+ (int32_t) firstIndex);
+ blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* Set the output pointer to point to the firstIndex
+ * of the output sample to be calculated. */
+ pOut = pDst + firstIndex;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed.
+ Since the partial convolution starts from firstIndex
+ Number of Macs to be performed is firstIndex + 1 */
+ count = 1u + firstIndex;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + firstIndex;
+ py = pSrc2;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first loop starts here */
+ while(blockSize1 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[0] * y[srcBLen - 1] */
+ sum += (q63_t) * px++ * (*py--);
+ /* x[1] * y[srcBLen - 2] */
+ sum += (q63_t) * px++ * (*py--);
+ /* x[2] * y[srcBLen - 3] */
+ sum += (q63_t) * px++ * (*py--);
+ /* x[3] * y[srcBLen - 4] */
+ sum += (q63_t) * px++ * (*py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) * px++ * (*py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blkCnt */
+
+ blkCnt = blockSize2 / 3;
+ while(blkCnt > 0u)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+
+ /* read x[0], x[1] samples */
+ x0 = *(px++);
+ x1 = *(px++);
+
+ /* Apply loop unrolling and compute 3 MACs simultaneously. */
+ k = srcBLen / 3;
+
+ /* First part of the processing with loop unrolling. Compute 3 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 2 samples. */
+ do
+ {
+ /* Read y[srcBLen - 1] sample */
+ c0 = *(py);
+
+ /* Read x[2] sample */
+ x2 = *(px);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[0] * y[srcBLen - 1] */
+ acc0 += (q63_t) x0 *c0;
+ /* acc1 += x[1] * y[srcBLen - 1] */
+ acc1 += (q63_t) x1 *c0;
+ /* acc2 += x[2] * y[srcBLen - 1] */
+ acc2 += (q63_t) x2 *c0;
+
+ /* Read y[srcBLen - 2] sample */
+ c0 = *(py - 1u);
+
+ /* Read x[3] sample */
+ x0 = *(px + 1u);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[1] * y[srcBLen - 2] */
+ acc0 += (q63_t) x1 *c0;
+ /* acc1 += x[2] * y[srcBLen - 2] */
+ acc1 += (q63_t) x2 *c0;
+ /* acc2 += x[3] * y[srcBLen - 2] */
+ acc2 += (q63_t) x0 *c0;
+
+ /* Read y[srcBLen - 3] sample */
+ c0 = *(py - 2u);
+
+ /* Read x[4] sample */
+ x1 = *(px + 2u);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[2] * y[srcBLen - 3] */
+ acc0 += (q63_t) x2 *c0;
+ /* acc1 += x[3] * y[srcBLen - 2] */
+ acc1 += (q63_t) x0 *c0;
+ /* acc2 += x[4] * y[srcBLen - 2] */
+ acc2 += (q63_t) x1 *c0;
+
+
+ px += 3u;
+
+ py -= 3u;
+
+ } while(--k);
+
+ /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen - (3 * (srcBLen / 3));
+
+ while(k > 0u)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *(py--);
+
+ /* Read x[7] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[srcBLen - 5] */
+ acc0 += (q63_t) x0 *c0;
+ /* acc1 += x[5] * y[srcBLen - 5] */
+ acc1 += (q63_t) x1 *c0;
+ /* acc2 += x[6] * y[srcBLen - 5] */
+ acc2 += (q63_t) x2 *c0;
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (acc0 >> 31);
+ *pOut++ = (q31_t) (acc1 >> 31);
+ *pOut++ = (q31_t) (acc2 >> 31);
+
+ /* Increment the pointer pIn1 index, count by 3 */
+ count += 3u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 - 3 * (blockSize2 / 3);
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) * px++ * (*py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) * px++ * (*py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = (uint32_t) blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) * px++ * (*py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The blockSize3 variable holds the number of MAC operations performed */
+ count = srcBLen - 1u;
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while(blockSize3 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) * px++ * (*py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) * px++ * (*py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+
+ }
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ q31_t *pIn1 = pSrcA; /* inputA pointer */
+ q31_t *pIn2 = pSrcB; /* inputB pointer */
+ q63_t sum; /* Accumulator */
+ uint32_t i, j; /* loop counters */
+ arm_status status; /* status of Partial convolution */
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* Loop to calculate convolution for output length number of values */
+ for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if(((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q63_t) pIn1[j] * (pIn2[i - j]));
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ pDst[i] = (q31_t) (sum >> 31u);
+ }
+ /* set status as ARM_SUCCESS as there are no argument errors */
+ status = ARM_MATH_SUCCESS;
+ }
+ return (status);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of PartialConv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q7.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q7.c
new file mode 100644
index 0000000..a2939c7
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_partial_q7.c
@@ -0,0 +1,732 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_partial_q7.c
+*
+* Description: Partial convolution of Q7 sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup PartialConv
+ * @{
+ */
+
+/**
+ * @brief Partial convolution of Q7 sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written.
+ * @param[in] firstIndex is the first output sample to start with.
+ * @param[in] numPoints is the number of output points to be computed.
+ * @return Returns either ARM_MATH_SUCCESS if the function completed correctly or ARM_MATH_ARGUMENT_ERROR if the requested subset is not in the range [0 srcALen+srcBLen-2].
+ *
+ * \par
+ * Refer the function arm_conv_partial_opt_q7() for a faster implementation of this function.
+ *
+ */
+
+arm_status arm_conv_partial_q7(
+ q7_t * pSrcA,
+ uint32_t srcALen,
+ q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst,
+ uint32_t firstIndex,
+ uint32_t numPoints)
+{
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q7_t *pIn1; /* inputA pointer */
+ q7_t *pIn2; /* inputB pointer */
+ q7_t *pOut = pDst; /* output pointer */
+ q7_t *px; /* Intermediate inputA pointer */
+ q7_t *py; /* Intermediate inputB pointer */
+ q7_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
+ q31_t input1, input2;
+ q15_t in1, in2;
+ q7_t x0, x1, x2, x3, c0, c1;
+ uint32_t j, k, count, check, blkCnt;
+ int32_t blockSize1, blockSize2, blockSize3; /* loop counter */
+ arm_status status;
+
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_MATH_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* Conditions to check which loopCounter holds
+ * the first and last indices of the output samples to be calculated. */
+ check = firstIndex + numPoints;
+ blockSize3 = ((int32_t) check - (int32_t) srcALen);
+ blockSize3 = (blockSize3 > 0) ? blockSize3 : 0;
+ blockSize1 = (((int32_t) srcBLen - 1) - (int32_t) firstIndex);
+ blockSize1 = (blockSize1 > 0) ? ((check > (srcBLen - 1u)) ? blockSize1 :
+ (int32_t) numPoints) : 0;
+ blockSize2 = (int32_t) check - ((blockSize3 + blockSize1) +
+ (int32_t) firstIndex);
+ blockSize2 = (blockSize2 > 0) ? blockSize2 : 0;
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* Set the output pointer to point to the firstIndex
+ * of the output sample to be calculated. */
+ pOut = pDst + firstIndex;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed.
+ Since the partial convolution starts from from firstIndex
+ Number of Macs to be performed is firstIndex + 1 */
+ count = 1u + firstIndex;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + firstIndex;
+ py = pSrc2;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while(blockSize1 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[0] , x[1] */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[srcBLen - 1] , y[srcBLen - 2] */
+ in1 = (q15_t) * py--;
+ in2 = (q15_t) * py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* x[0] * y[srcBLen - 1] */
+ /* x[1] * y[srcBLen - 2] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* x[2] , x[3] */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[srcBLen - 3] , y[srcBLen - 4] */
+ in1 = (q15_t) * py--;
+ in2 = (q15_t) * py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* x[2] * y[srcBLen - 3] */
+ /* x[3] * y[srcBLen - 4] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = ++pSrc2;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = ((uint32_t) blockSize2 >> 2u);
+
+ while(blkCnt > 0u)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *(px++);
+ x1 = *(px++);
+ x2 = *(px++);
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[srcBLen - 1] sample */
+ c0 = *(py--);
+ /* Read y[srcBLen - 2] sample */
+ c1 = *(py--);
+
+ /* Read x[3] sample */
+ x3 = *(px++);
+
+ /* x[0] and x[1] are packed */
+ in1 = (q15_t) x0;
+ in2 = (q15_t) x1;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[srcBLen - 1] and y[srcBLen - 2] are packed */
+ in1 = (q15_t) c0;
+ in2 = (q15_t) c1;
+
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
+ acc0 = __SMLAD(input1, input2, acc0);
+
+ /* x[1] and x[2] are packed */
+ in1 = (q15_t) x1;
+ in2 = (q15_t) x2;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
+ acc1 = __SMLAD(input1, input2, acc1);
+
+ /* x[2] and x[3] are packed */
+ in1 = (q15_t) x2;
+ in2 = (q15_t) x3;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
+ acc2 = __SMLAD(input1, input2, acc2);
+
+ /* Read x[4] sample */
+ x0 = *(px++);
+
+ /* x[3] and x[4] are packed */
+ in1 = (q15_t) x3;
+ in2 = (q15_t) x0;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
+ acc3 = __SMLAD(input1, input2, acc3);
+
+ /* Read y[srcBLen - 3] sample */
+ c0 = *(py--);
+ /* Read y[srcBLen - 4] sample */
+ c1 = *(py--);
+
+ /* Read x[5] sample */
+ x1 = *(px++);
+
+ /* x[2] and x[3] are packed */
+ in1 = (q15_t) x2;
+ in2 = (q15_t) x3;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
+ in1 = (q15_t) c0;
+ in2 = (q15_t) c1;
+
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
+ acc0 = __SMLAD(input1, input2, acc0);
+
+ /* x[3] and x[4] are packed */
+ in1 = (q15_t) x3;
+ in2 = (q15_t) x0;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
+ acc1 = __SMLAD(input1, input2, acc1);
+
+ /* x[4] and x[5] are packed */
+ in1 = (q15_t) x0;
+ in2 = (q15_t) x1;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
+ acc2 = __SMLAD(input1, input2, acc2);
+
+ /* Read x[6] sample */
+ x2 = *(px++);
+
+ /* x[5] and x[6] are packed */
+ in1 = (q15_t) x1;
+ in2 = (q15_t) x2;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
+ acc3 = __SMLAD(input1, input2, acc3);
+
+ } while(--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *(py--);
+
+ /* Read x[7] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[srcBLen - 5] */
+ acc0 += ((q31_t) x0 * c0);
+ /* acc1 += x[5] * y[srcBLen - 5] */
+ acc1 += ((q31_t) x1 * c0);
+ /* acc2 += x[6] * y[srcBLen - 5] */
+ acc2 += ((q31_t) x2 * c0);
+ /* acc3 += x[7] * y[srcBLen - 5] */
+ acc3 += ((q31_t) x3 * c0);
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(acc0 >> 7, 8));
+ *pOut++ = (q7_t) (__SSAT(acc1 >> 7, 8));
+ *pOut++ = (q7_t) (__SSAT(acc2 >> 7, 8));
+ *pOut++ = (q7_t) (__SSAT(acc3 >> 7, 8));
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = (uint32_t) blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+
+ /* Reading two inputs of SrcA buffer and packing */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Reading two inputs of SrcB buffer and packing */
+ in1 = (q15_t) * py--;
+ in2 = (q15_t) * py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Reading two inputs of SrcA buffer and packing */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Reading two inputs of SrcB buffer and packing */
+ in1 = (q15_t) * py--;
+ in2 = (q15_t) * py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = (uint32_t) blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1u;
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while(blockSize3 > 0)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
+ in1 = (q15_t) * py--;
+ in2 = (q15_t) * py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
+ in1 = (q15_t) * py--;
+ in2 = (q15_t) * py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum += ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7, 8));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+
+ }
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ q7_t *pIn1 = pSrcA; /* inputA pointer */
+ q7_t *pIn2 = pSrcB; /* inputB pointer */
+ q31_t sum; /* Accumulator */
+ uint32_t i, j; /* loop counters */
+ arm_status status; /* status of Partial convolution */
+
+ /* Check for range of output samples to be calculated */
+ if((firstIndex + numPoints) > ((srcALen + (srcBLen - 1u))))
+ {
+ /* Set status as ARM_ARGUMENT_ERROR */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* Loop to calculate convolution for output length number of values */
+ for (i = firstIndex; i <= (firstIndex + numPoints - 1); i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if(((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q15_t) pIn1[j] * (pIn2[i - j]));
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u);
+ }
+ /* set status as ARM_SUCCESS as there are no argument errors */
+ status = ARM_MATH_SUCCESS;
+ }
+ return (status);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of PartialConv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q15.c
new file mode 100644
index 0000000..bbe5a8f
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q15.c
@@ -0,0 +1,732 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_q15.c
+*
+* Description: Convolution of Q15 sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Conv
+ * @{
+ */
+
+/**
+ * @brief Convolution of Q15 sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
+ * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ * This approach provides 33 guard bits and there is no risk of overflow.
+ * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
+ *
+ * \par
+ * Refer to arm_conv_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
+ *
+ * \par
+ * Refer the function arm_conv_opt_q15() for a faster implementation of this function using scratch buffers.
+ *
+ */
+
+void arm_conv_q15(
+ q15_t * pSrcA,
+ uint32_t srcALen,
+ q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst)
+{
+
+#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q15_t *pOut = pDst; /* output pointer */
+ q63_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
+ q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ q15_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t blockSize1, blockSize2, blockSize3, j, k, count, blkCnt; /* loop counter */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1u;
+ blockSize2 = srcALen - (srcBLen - 1u);
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1u;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations less than 4 */
+ /* Second part of this stage computes the MAC operations greater than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ while((count < 4u) && (blockSize1 > 0u))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over number of MAC operations between
+ * inputA samples and inputB samples */
+ k = count;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + count;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* The second part of the stage starts here */
+ /* The internal loop, over count, is unrolled by 4 */
+ /* To, read the last two inputB samples using SIMD:
+ * y[srcBLen] and y[srcBLen-1] coefficients, py is decremented by 1 */
+ py = py - 1;
+
+ while(blockSize1 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* x[0], x[1] are multiplied with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ /* x[2], x[3] are multiplied with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ * So, py is incremented by 1 */
+ py = py + 1u;
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + (count - 1u);
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* count is the index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+
+ /* --------------------
+ * Stage2 process
+ * -------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2u;
+
+ while(blkCnt > 0u)
+ {
+ py = py - 1u;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+
+ /* read x[0], x[1] samples */
+ x0 = *__SIMD32(px);
+ /* read x[1], x[2] samples */
+ x1 = _SIMD32_OFFSET(px+1);
+ px+= 2u;
+
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the last two inputB samples using SIMD:
+ * y[srcBLen - 1] and y[srcBLen - 2] */
+ c0 = *__SIMD32(py)--;
+
+ /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
+ acc0 = __SMLALDX(x0, c0, acc0);
+
+ /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
+ acc1 = __SMLALDX(x1, c0, acc1);
+
+ /* Read x[2], x[3] */
+ x2 = *__SIMD32(px);
+
+ /* Read x[3], x[4] */
+ x3 = _SIMD32_OFFSET(px+1);
+
+ /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
+ acc2 = __SMLALDX(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
+ acc3 = __SMLALDX(x3, c0, acc3);
+
+ /* Read y[srcBLen - 3] and y[srcBLen - 4] */
+ c0 = *__SIMD32(py)--;
+
+ /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
+ acc0 = __SMLALDX(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
+ acc1 = __SMLALDX(x3, c0, acc1);
+
+ /* Read x[4], x[5] */
+ x0 = _SIMD32_OFFSET(px+2);
+
+ /* Read x[5], x[6] */
+ x1 = _SIMD32_OFFSET(px+3);
+ px += 4u;
+
+ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
+ acc2 = __SMLALDX(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
+ acc3 = __SMLALDX(x1, c0, acc3);
+
+ } while(--k);
+
+ /* For the next MAC operations, SIMD is not used
+ * So, the 16 bit pointer if inputB, py is updated */
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ if(k == 1u)
+ {
+ /* Read y[srcBLen - 5] */
+ c0 = *(py+1);
+
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+
+#else
+
+ c0 = c0 & 0x0000FFFF;
+
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+ /* Read x[7] */
+ x3 = *__SIMD32(px);
+ px++;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALD(x0, c0, acc0);
+ acc1 = __SMLALD(x1, c0, acc1);
+ acc2 = __SMLALDX(x1, c0, acc2);
+ acc3 = __SMLALDX(x3, c0, acc3);
+ }
+
+ if(k == 2u)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = _SIMD32_OFFSET(py);
+
+ /* Read x[7], x[8] */
+ x3 = *__SIMD32(px);
+
+ /* Read x[9] */
+ x2 = _SIMD32_OFFSET(px+1);
+ px += 2u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALDX(x0, c0, acc0);
+ acc1 = __SMLALDX(x1, c0, acc1);
+ acc2 = __SMLALDX(x3, c0, acc2);
+ acc3 = __SMLALDX(x2, c0, acc3);
+ }
+
+ if(k == 3u)
+ {
+ /* Read y[srcBLen - 5], y[srcBLen - 6] */
+ c0 = _SIMD32_OFFSET(py);
+
+ /* Read x[7], x[8] */
+ x3 = *__SIMD32(px);
+
+ /* Read x[9] */
+ x2 = _SIMD32_OFFSET(px+1);
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALDX(x0, c0, acc0);
+ acc1 = __SMLALDX(x1, c0, acc1);
+ acc2 = __SMLALDX(x3, c0, acc2);
+ acc3 = __SMLALDX(x2, c0, acc3);
+
+ c0 = *(py-1);
+
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+#else
+
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+ /* Read x[10] */
+ x3 = _SIMD32_OFFSET(px+2);
+ px += 3u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALDX(x1, c0, acc0);
+ acc1 = __SMLALD(x2, c0, acc1);
+ acc2 = __SMLALDX(x2, c0, acc2);
+ acc3 = __SMLALDX(x3, c0, acc3);
+ }
+
+
+ /* Store the results in the accumulators in the destination buffer. */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
+
+#else
+
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
+ *__SIMD32(pOut)++ =
+ __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) ((q31_t) * px++ * *py--);
+ sum += (q63_t) ((q31_t) * px++ * *py--);
+ sum += (q63_t) ((q31_t) * px++ * *py--);
+ sum += (q63_t) ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) ((q31_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT(sum >> 15, 16));
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The blockSize3 variable holds the number of MAC operations performed */
+
+ blockSize3 = srcBLen - 1u;
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ pIn2 = pSrc2 - 1u;
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ /* For loop unrolling by 4, this stage is divided into two. */
+ /* First part of this stage computes the MAC operations greater than 4 */
+ /* Second part of this stage computes the MAC operations less than or equal to 4 */
+
+ /* The first part of the stage starts here */
+ j = blockSize3 >> 2u;
+
+ while((j > 0u) && (blockSize3 > 0u))
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = blockSize3 >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[srcALen - srcBLen + 1], x[srcALen - srcBLen + 2] are multiplied
+ * with y[srcBLen - 1], y[srcBLen - 2] respectively */
+ sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+ /* x[srcALen - srcBLen + 3], x[srcALen - srcBLen + 4] are multiplied
+ * with y[srcBLen - 3], y[srcBLen - 4] respectively */
+ sum = __SMLALDX(*__SIMD32(px)++, *__SIMD32(py)--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* For the next MAC operations, the pointer py is used without SIMD
+ * So, py is incremented by 1 */
+ py = py + 1u;
+
+ /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = blockSize3 % 0x4u;
+
+ while(k > 0u)
+ {
+ /* sum += x[srcALen - srcBLen + 5] * y[srcBLen - 5] */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+
+ j--;
+ }
+
+ /* The second part of the stage starts here */
+ /* SIMD is not used for the next MAC operations,
+ * so pointer py is updated to read only one sample at a time */
+ py = py + 1u;
+
+ while(blockSize3 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = blockSize3;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen-1] * y[srcBLen-1] */
+ sum = __SMLALD(*px++, *py--, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+#else
+
+/* Run the below code for Cortex-M0 */
+
+ q15_t *pIn1 = pSrcA; /* input pointer */
+ q15_t *pIn2 = pSrcB; /* coefficient pointer */
+ q63_t sum; /* Accumulator */
+ uint32_t i, j; /* loop counter */
+
+ /* Loop to calculate output of convolution for output length number of times */
+ for (i = 0; i < (srcALen + srcBLen - 1); i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if(((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += (q31_t) pIn1[j] * (pIn2[i - j]);
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ pDst[i] = (q15_t) __SSAT((sum >> 15u), 16u);
+ }
+
+#endif /* #if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)*/
+
+}
+
+/**
+ * @} end of Conv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q31.c
new file mode 100644
index 0000000..986d5f8
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q31.c
@@ -0,0 +1,563 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_q31.c
+*
+* Description: Convolution of Q31 sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Conv
+ * @{
+ */
+
+/**
+ * @brief Convolution of Q31 sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using an internal 64-bit accumulator.
+ * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ * There is no saturation on intermediate additions.
+ * Thus, if the accumulator overflows it wraps around and distorts the result.
+ * The input signals should be scaled down to avoid intermediate overflows.
+ * Scale down the inputs by log2(min(srcALen, srcBLen)) (log2 is read as log to the base 2) times to avoid overflows,
+ * as maximum of min(srcALen, srcBLen) number of additions are carried internally.
+ * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+ *
+ * \par
+ * See arm_conv_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
+ */
+
+void arm_conv_q31(
+ q31_t * pSrcA,
+ uint32_t srcALen,
+ q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst)
+{
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t *pIn1; /* inputA pointer */
+ q31_t *pIn2; /* inputB pointer */
+ q31_t *pOut = pDst; /* output pointer */
+ q31_t *px; /* Intermediate inputA pointer */
+ q31_t *py; /* Intermediate inputB pointer */
+ q31_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q63_t sum; /* Accumulator */
+ q63_t acc0, acc1, acc2; /* Accumulator */
+ q31_t x0, x1, x2, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (q31_t *) pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = (q31_t *) pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1u;
+ blockSize2 = srcALen - (srcBLen - 1u);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1u;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while(blockSize1 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[0] * y[srcBLen - 1] */
+ sum += (q63_t) * px++ * (*py--);
+ /* x[1] * y[srcBLen - 2] */
+ sum += (q63_t) * px++ * (*py--);
+ /* x[2] * y[srcBLen - 3] */
+ sum += (q63_t) * px++ * (*py--);
+ /* x[3] * y[srcBLen - 4] */
+ sum += (q63_t) * px++ * (*py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) * px++ * (*py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + count;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll by 3 */
+ blkCnt = blockSize2 / 3;
+
+ while(blkCnt > 0u)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *(px++);
+ x1 = *(px++);
+
+ /* Apply loop unrolling and compute 3 MACs simultaneously. */
+ k = srcBLen / 3;
+
+ /* First part of the processing with loop unrolling. Compute 3 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 2 samples. */
+ do
+ {
+ /* Read y[srcBLen - 1] sample */
+ c0 = *(py);
+
+ /* Read x[3] sample */
+ x2 = *(px);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[0] * y[srcBLen - 1] */
+ acc0 += ((q63_t) x0 * c0);
+ /* acc1 += x[1] * y[srcBLen - 1] */
+ acc1 += ((q63_t) x1 * c0);
+ /* acc2 += x[2] * y[srcBLen - 1] */
+ acc2 += ((q63_t) x2 * c0);
+
+ /* Read y[srcBLen - 2] sample */
+ c0 = *(py - 1u);
+
+ /* Read x[4] sample */
+ x0 = *(px + 1u);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[1] * y[srcBLen - 2] */
+ acc0 += ((q63_t) x1 * c0);
+ /* acc1 += x[2] * y[srcBLen - 2] */
+ acc1 += ((q63_t) x2 * c0);
+ /* acc2 += x[3] * y[srcBLen - 2] */
+ acc2 += ((q63_t) x0 * c0);
+
+ /* Read y[srcBLen - 3] sample */
+ c0 = *(py - 2u);
+
+ /* Read x[5] sample */
+ x1 = *(px + 2u);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[2] * y[srcBLen - 3] */
+ acc0 += ((q63_t) x2 * c0);
+ /* acc1 += x[3] * y[srcBLen - 2] */
+ acc1 += ((q63_t) x0 * c0);
+ /* acc2 += x[4] * y[srcBLen - 2] */
+ acc2 += ((q63_t) x1 * c0);
+
+ /* update scratch pointers */
+ px += 3u;
+ py -= 3u;
+
+ } while(--k);
+
+ /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen - (3 * (srcBLen / 3));
+
+ while(k > 0u)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *(py--);
+
+ /* Read x[7] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[srcBLen - 5] */
+ acc0 += ((q63_t) x0 * c0);
+ /* acc1 += x[5] * y[srcBLen - 5] */
+ acc1 += ((q63_t) x1 * c0);
+ /* acc2 += x[6] * y[srcBLen - 5] */
+ acc2 += ((q63_t) x2 * c0);
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the results in the accumulators in the destination buffer. */
+ *pOut++ = (q31_t) (acc0 >> 31);
+ *pOut++ = (q31_t) (acc1 >> 31);
+ *pOut++ = (q31_t) (acc2 >> 31);
+
+ /* Increment the pointer pIn1 index, count by 3 */
+ count += 3u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 - 3 * (blockSize2 / 3);
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) * px++ * (*py--);
+ sum += (q63_t) * px++ * (*py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) * px++ * (*py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) * px++ * (*py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The blockSize3 variable holds the number of MAC operations performed */
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while(blockSize3 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = blockSize3 >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ sum += (q63_t) * px++ * (*py--);
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum += (q63_t) * px++ * (*py--);
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ sum += (q63_t) * px++ * (*py--);
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum += (q63_t) * px++ * (*py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = blockSize3 % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) * px++ * (*py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q31_t) (sum >> 31);
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ q31_t *pIn1 = pSrcA; /* input pointer */
+ q31_t *pIn2 = pSrcB; /* coefficient pointer */
+ q63_t sum; /* Accumulator */
+ uint32_t i, j; /* loop counter */
+
+ /* Loop to calculate output of convolution for output length number of times */
+ for (i = 0; i < (srcALen + srcBLen - 1); i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if(((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q63_t) pIn1[j] * (pIn2[i - j]));
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ pDst[i] = (q31_t) (sum >> 31u);
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of Conv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q7.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q7.c
new file mode 100644
index 0000000..f7f6ebc
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_conv_q7.c
@@ -0,0 +1,688 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_conv_q7.c
+*
+* Description: Convolution of Q7 sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Conv
+ * @{
+ */
+
+/**
+ * @brief Convolution of Q7 sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length srcALen+srcBLen-1.
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 32-bit internal accumulator.
+ * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
+ * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
+ * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and then saturated to 1.7 format.
+ *
+ * \par
+ * Refer the function arm_conv_opt_q7() for a faster implementation of this function.
+ *
+ */
+
+void arm_conv_q7(
+ q7_t * pSrcA,
+ uint32_t srcALen,
+ q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst)
+{
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q7_t *pIn1; /* inputA pointer */
+ q7_t *pIn2; /* inputB pointer */
+ q7_t *pOut = pDst; /* output pointer */
+ q7_t *px; /* Intermediate inputA pointer */
+ q7_t *py; /* Intermediate inputB pointer */
+ q7_t *pSrc1, *pSrc2; /* Intermediate pointers */
+ q7_t x0, x1, x2, x3, c0, c1; /* Temporary variables to hold state and coefficient values */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulator */
+ q31_t input1, input2; /* Temporary input variables */
+ q15_t in1, in2; /* Temporary input variables */
+ uint32_t j, k, count, blkCnt, blockSize1, blockSize2, blockSize3; /* loop counter */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+ }
+
+ /* conv(x,y) at n = x[n] * y[0] + x[n-1] * y[1] + x[n-2] * y[2] + ...+ x[n-N+1] * y[N -1] */
+ /* The function is internally
+ * divided into three stages according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first stage of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second stage of the algorithm, srcBLen number of multiplications are done.
+ * In the third stage of the algorithm, the multiplications decrease by one
+ * for every iteration. */
+
+ /* The algorithm is implemented in three stages.
+ The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1u;
+ blockSize2 = (srcALen - srcBLen) + 1u;
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[0]
+ * sum = x[0] * y[1] + x[1] * y[0]
+ * ....
+ * sum = x[0] * y[srcBlen - 1] + x[1] * y[srcBlen - 2] +...+ x[srcBLen - 1] * y[0]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1u;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while(blockSize1 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[0] , x[1] */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* y[srcBLen - 1] , y[srcBLen - 2] */
+ in1 = (q15_t) * py--;
+ in2 = (q15_t) * py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* x[0] * y[srcBLen - 1] */
+ /* x[1] * y[srcBLen - 2] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* x[2] , x[3] */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* y[srcBLen - 3] , y[srcBLen - 4] */
+ in1 = (q15_t) * py--;
+ in2 = (q15_t) * py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* x[2] * y[srcBLen - 3] */
+ /* x[3] * y[srcBLen - 4] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q15_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pIn2 + count;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[srcBLen-1] + x[1] * y[srcBLen-2] +...+ x[srcBLen-1] * y[0]
+ * sum = x[1] * y[srcBLen-1] + x[2] * y[srcBLen-2] +...+ x[srcBLen] * y[0]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[srcBLen-1] + x[srcALen] * y[srcBLen-2] +...+ x[srcALen-1] * y[0]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2u;
+
+ while(blkCnt > 0u)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *(px++);
+ x1 = *(px++);
+ x2 = *(px++);
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[srcBLen - 1] sample */
+ c0 = *(py--);
+ /* Read y[srcBLen - 2] sample */
+ c1 = *(py--);
+
+ /* Read x[3] sample */
+ x3 = *(px++);
+
+ /* x[0] and x[1] are packed */
+ in1 = (q15_t) x0;
+ in2 = (q15_t) x1;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* y[srcBLen - 1] and y[srcBLen - 2] are packed */
+ in1 = (q15_t) c0;
+ in2 = (q15_t) c1;
+
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* acc0 += x[0] * y[srcBLen - 1] + x[1] * y[srcBLen - 2] */
+ acc0 = __SMLAD(input1, input2, acc0);
+
+ /* x[1] and x[2] are packed */
+ in1 = (q15_t) x1;
+ in2 = (q15_t) x2;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* acc1 += x[1] * y[srcBLen - 1] + x[2] * y[srcBLen - 2] */
+ acc1 = __SMLAD(input1, input2, acc1);
+
+ /* x[2] and x[3] are packed */
+ in1 = (q15_t) x2;
+ in2 = (q15_t) x3;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* acc2 += x[2] * y[srcBLen - 1] + x[3] * y[srcBLen - 2] */
+ acc2 = __SMLAD(input1, input2, acc2);
+
+ /* Read x[4] sample */
+ x0 = *(px++);
+
+ /* x[3] and x[4] are packed */
+ in1 = (q15_t) x3;
+ in2 = (q15_t) x0;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* acc3 += x[3] * y[srcBLen - 1] + x[4] * y[srcBLen - 2] */
+ acc3 = __SMLAD(input1, input2, acc3);
+
+ /* Read y[srcBLen - 3] sample */
+ c0 = *(py--);
+ /* Read y[srcBLen - 4] sample */
+ c1 = *(py--);
+
+ /* Read x[5] sample */
+ x1 = *(px++);
+
+ /* x[2] and x[3] are packed */
+ in1 = (q15_t) x2;
+ in2 = (q15_t) x3;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* y[srcBLen - 3] and y[srcBLen - 4] are packed */
+ in1 = (q15_t) c0;
+ in2 = (q15_t) c1;
+
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* acc0 += x[2] * y[srcBLen - 3] + x[3] * y[srcBLen - 4] */
+ acc0 = __SMLAD(input1, input2, acc0);
+
+ /* x[3] and x[4] are packed */
+ in1 = (q15_t) x3;
+ in2 = (q15_t) x0;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* acc1 += x[3] * y[srcBLen - 3] + x[4] * y[srcBLen - 4] */
+ acc1 = __SMLAD(input1, input2, acc1);
+
+ /* x[4] and x[5] are packed */
+ in1 = (q15_t) x0;
+ in2 = (q15_t) x1;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* acc2 += x[4] * y[srcBLen - 3] + x[5] * y[srcBLen - 4] */
+ acc2 = __SMLAD(input1, input2, acc2);
+
+ /* Read x[6] sample */
+ x2 = *(px++);
+
+ /* x[5] and x[6] are packed */
+ in1 = (q15_t) x1;
+ in2 = (q15_t) x2;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* acc3 += x[5] * y[srcBLen - 3] + x[6] * y[srcBLen - 4] */
+ acc3 = __SMLAD(input1, input2, acc3);
+
+ } while(--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Read y[srcBLen - 5] sample */
+ c0 = *(py--);
+
+ /* Read x[7] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[srcBLen - 5] */
+ acc0 += ((q15_t) x0 * c0);
+ /* acc1 += x[5] * y[srcBLen - 5] */
+ acc1 += ((q15_t) x1 * c0);
+ /* acc2 += x[6] * y[srcBLen - 5] */
+ acc2 += ((q15_t) x2 * c0);
+ /* acc3 += x[7] * y[srcBLen - 5] */
+ acc3 += ((q15_t) x3 * c0);
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(acc0 >> 7u, 8));
+ *pOut++ = (q7_t) (__SSAT(acc1 >> 7u, 8));
+ *pOut++ = (q7_t) (__SSAT(acc2 >> 7u, 8));
+ *pOut++ = (q7_t) (__SSAT(acc3 >> 7u, 8));
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+
+ /* Reading two inputs of SrcA buffer and packing */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* Reading two inputs of SrcB buffer and packing */
+ in1 = (q15_t) * py--;
+ in2 = (q15_t) * py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Reading two inputs of SrcA buffer and packing */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* Reading two inputs of SrcB buffer and packing */
+ in1 = (q15_t) * py--;
+ in2 = (q15_t) * py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q15_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* srcBLen number of MACS should be performed */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q15_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[srcBLen-1] + x[srcALen-srcBLen+2] * y[srcBLen-2] +...+ x[srcALen-1] * y[1]
+ * sum += x[srcALen-srcBLen+2] * y[srcBLen-1] + x[srcALen-srcBLen+3] * y[srcBLen-2] +...+ x[srcALen-1] * y[2]
+ * ....
+ * sum += x[srcALen-2] * y[srcBLen-1] + x[srcALen-1] * y[srcBLen-2]
+ * sum += x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The blockSize3 variable holds the number of MAC operations performed */
+
+ /* Working pointer of inputA */
+ pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ pSrc2 = pIn2 + (srcBLen - 1u);
+ py = pSrc2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while(blockSize3 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = blockSize3 >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Reading two inputs, x[srcALen - srcBLen + 1] and x[srcALen - srcBLen + 2] of SrcA buffer and packing */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* Reading two inputs, y[srcBLen - 1] and y[srcBLen - 2] of SrcB buffer and packing */
+ in1 = (q15_t) * py--;
+ in2 = (q15_t) * py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* sum += x[srcALen - srcBLen + 1] * y[srcBLen - 1] */
+ /* sum += x[srcALen - srcBLen + 2] * y[srcBLen - 2] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Reading two inputs, x[srcALen - srcBLen + 3] and x[srcALen - srcBLen + 4] of SrcA buffer and packing */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* Reading two inputs, y[srcBLen - 3] and y[srcBLen - 4] of SrcB buffer and packing */
+ in1 = (q15_t) * py--;
+ in2 = (q15_t) * py--;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16u);
+
+ /* sum += x[srcALen - srcBLen + 3] * y[srcBLen - 3] */
+ /* sum += x[srcALen - srcBLen + 4] * y[srcBLen - 4] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the blockSize3 is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = blockSize3 % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q15_t) * px++ * *py--);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut++ = (q7_t) (__SSAT(sum >> 7u, 8));
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pSrc2;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ q7_t *pIn1 = pSrcA; /* input pointer */
+ q7_t *pIn2 = pSrcB; /* coefficient pointer */
+ q31_t sum; /* Accumulator */
+ uint32_t i, j; /* loop counter */
+
+ /* Loop to calculate output of convolution for output length number of times */
+ for (i = 0; i < (srcALen + srcBLen - 1); i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if(((i - j) < srcBLen) && (j < srcALen))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += (q15_t) pIn1[j] * (pIn2[i - j]);
+ }
+ }
+
+ /* Store the output in the destination buffer */
+ pDst[i] = (q7_t) __SSAT((sum >> 7u), 8u);
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of Conv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_f32.c
new file mode 100644
index 0000000..c04784e
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_f32.c
@@ -0,0 +1,737 @@
+/* ----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_correlate_f32.c
+*
+* Description: Correlation of floating-point sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup Corr Correlation
+ *
+ * Correlation is a mathematical operation that is similar to convolution.
+ * As with convolution, correlation uses two signals to produce a third signal.
+ * The underlying algorithms in correlation and convolution are identical except that one of the inputs is flipped in convolution.
+ * Correlation is commonly used to measure the similarity between two signals.
+ * It has applications in pattern recognition, cryptanalysis, and searching.
+ * The CMSIS library provides correlation functions for Q7, Q15, Q31 and floating-point data types.
+ * Fast versions of the Q15 and Q31 functions are also provided.
+ *
+ * \par Algorithm
+ * Let a[n] and b[n] be sequences of length srcALen and srcBLen samples respectively.
+ * The convolution of the two signals is denoted by
+ *
+ * c[n] = a[n] * b[n]
+ *
+ * In correlation, one of the signals is flipped in time
+ *
+ * c[n] = a[n] * b[-n]
+ *
+ *
+ * \par
+ * and this is mathematically defined as
+ * \image html CorrelateEquation.gif
+ * \par
+ * The pSrcA points to the first input vector of length srcALen and pSrcB points to the second input vector of length srcBLen.
+ * The result c[n] is of length 2 * max(srcALen, srcBLen) - 1 and is defined over the interval n=0, 1, 2, ..., (2 * max(srcALen, srcBLen) - 2).
+ * The output result is written to pDst and the calling function must allocate 2 * max(srcALen, srcBLen) - 1 words for the result.
+ *
+ * Note
+ * \par
+ * The pDst should be initialized to all zeros before being used.
+ *
+ * Fixed-Point Behavior
+ * \par
+ * Correlation requires summing up a large number of intermediate products.
+ * As such, the Q7, Q15, and Q31 functions run a risk of overflow and saturation.
+ * Refer to the function specific documentation below for further details of the particular algorithm used.
+ *
+ *
+ * Fast Versions
+ *
+ * \par
+ * Fast versions are supported for Q31 and Q15. Cycles for Fast versions are less compared to Q31 and Q15 of correlate and the design requires
+ * the input signals should be scaled down to avoid intermediate overflows.
+ *
+ *
+ * Opt Versions
+ *
+ * \par
+ * Opt versions are supported for Q15 and Q7. Design uses internal scratch buffer for getting good optimisation.
+ * These versions are optimised in cycles and consumes more memory(Scratch memory) compared to Q15 and Q7 versions of correlate
+ */
+
+/**
+ * @addtogroup Corr
+ * @{
+ */
+/**
+ * @brief Correlation of floating-point sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ * @return none.
+ */
+
+void arm_correlate_f32(
+ float32_t * pSrcA,
+ uint32_t srcALen,
+ float32_t * pSrcB,
+ uint32_t srcBLen,
+ float32_t * pDst)
+{
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ float32_t *pIn1; /* inputA pointer */
+ float32_t *pIn2; /* inputB pointer */
+ float32_t *pOut = pDst; /* output pointer */
+ float32_t *px; /* Intermediate inputA pointer */
+ float32_t *py; /* Intermediate inputB pointer */
+ float32_t *pSrc1; /* Intermediate pointers */
+ float32_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ float32_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
+ uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counters */
+ int32_t inc = 1; /* Destination address modifier */
+
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we include zeroes in the output instead of zero padding either of the the inputs*/
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcA;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcB;
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2u * srcALen) - 1u;
+
+ /* When srcALen > srcBLen, zero padding has to be done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1u));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+
+ //while(j > 0u)
+ //{
+ // /* Zero is stored in the destination buffer */
+ // *pOut++ = 0.0f;
+
+ // /* Decrement the loop counter */
+ // j--;
+ //}
+
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization of inputB pointer */
+ pIn2 = pSrcA;
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2u);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+
+ }
+
+ /* The function is internally
+ * divided into three parts according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first part of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second part of the algorithm, srcBLen number of multiplications are done.
+ * In the third part of the algorithm, the multiplications decrease by one
+ * for every iteration.*/
+ /* The algorithm is implemented in three stages.
+ * The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1u;
+ blockSize2 = srcALen - (srcBLen - 1u);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[srcBlen - 1]
+ * sum = x[0] * y[srcBlen-2] + x[1] * y[srcBlen - 1]
+ * ....
+ * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1u;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc1 = pIn2 + (srcBLen - 1u);
+ py = pSrc1;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while(blockSize1 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[0] * y[srcBLen - 4] */
+ sum += *px++ * *py++;
+ /* x[1] * y[srcBLen - 3] */
+ sum += *px++ * *py++;
+ /* x[2] * y[srcBLen - 2] */
+ sum += *px++ * *py++;
+ /* x[3] * y[srcBLen - 1] */
+ sum += *px++ * *py++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ /* x[0] * y[srcBLen - 1] */
+ sum += *px++ * *py++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pSrc1 - count;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2u;
+
+ while(blkCnt > 0u)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0.0f;
+ acc1 = 0.0f;
+ acc2 = 0.0f;
+ acc3 = 0.0f;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *(px++);
+ x1 = *(px++);
+ x2 = *(px++);
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[0] sample */
+ c0 = *(py++);
+
+ /* Read x[3] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[0] * y[0] */
+ acc0 += x0 * c0;
+ /* acc1 += x[1] * y[0] */
+ acc1 += x1 * c0;
+ /* acc2 += x[2] * y[0] */
+ acc2 += x2 * c0;
+ /* acc3 += x[3] * y[0] */
+ acc3 += x3 * c0;
+
+ /* Read y[1] sample */
+ c0 = *(py++);
+
+ /* Read x[4] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[1] * y[1] */
+ acc0 += x1 * c0;
+ /* acc1 += x[2] * y[1] */
+ acc1 += x2 * c0;
+ /* acc2 += x[3] * y[1] */
+ acc2 += x3 * c0;
+ /* acc3 += x[4] * y[1] */
+ acc3 += x0 * c0;
+
+ /* Read y[2] sample */
+ c0 = *(py++);
+
+ /* Read x[5] sample */
+ x1 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[2] * y[2] */
+ acc0 += x2 * c0;
+ /* acc1 += x[3] * y[2] */
+ acc1 += x3 * c0;
+ /* acc2 += x[4] * y[2] */
+ acc2 += x0 * c0;
+ /* acc3 += x[5] * y[2] */
+ acc3 += x1 * c0;
+
+ /* Read y[3] sample */
+ c0 = *(py++);
+
+ /* Read x[6] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[3] * y[3] */
+ acc0 += x3 * c0;
+ /* acc1 += x[4] * y[3] */
+ acc1 += x0 * c0;
+ /* acc2 += x[5] * y[3] */
+ acc2 += x1 * c0;
+ /* acc3 += x[6] * y[3] */
+ acc3 += x2 * c0;
+
+
+ } while(--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Read y[4] sample */
+ c0 = *(py++);
+
+ /* Read x[7] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[4] */
+ acc0 += x0 * c0;
+ /* acc1 += x[5] * y[4] */
+ acc1 += x1 * c0;
+ /* acc2 += x[6] * y[4] */
+ acc2 += x2 * c0;
+ /* acc3 += x[7] * y[4] */
+ acc3 += x3 * c0;
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = acc0;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ *pOut = acc1;
+ pOut += inc;
+
+ *pOut = acc2;
+ pOut += inc;
+
+ *pOut = acc3;
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += *px++ * *py++;
+ sum += *px++ * *py++;
+ sum += *px++ * *py++;
+ sum += *px++ * *py++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+ /* Loop over srcBLen */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *px++ * *py++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * ....
+ * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+ * sum += x[srcALen-1] * y[0]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1u;
+
+ /* Working pointer of inputA */
+ pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while(blockSize3 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0.0f;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen - srcBLen + 4] * y[3] */
+ sum += *px++ * *py++;
+ /* sum += x[srcALen - srcBLen + 3] * y[2] */
+ sum += *px++ * *py++;
+ /* sum += x[srcALen - srcBLen + 2] * y[1] */
+ sum += *px++ * *py++;
+ /* sum += x[srcALen - srcBLen + 1] * y[0] */
+ sum += *px++ * *py++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += *px++ * *py++;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ float32_t *pIn1 = pSrcA; /* inputA pointer */
+ float32_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */
+ float32_t sum; /* Accumulator */
+ uint32_t i = 0u, j; /* loop counters */
+ uint32_t inv = 0u; /* Reverse order flag */
+ uint32_t tot = 0u; /* Length */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and a varaible, inv is set to 1 */
+ /* If lengths are not equal then zero pad has to be done to make the two
+ * inputs of same length. But to improve the performance, we include zeroes
+ * in the output instead of zero padding either of the the inputs*/
+ /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
+ * starting of the output buffer */
+ /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
+ * ending of the output buffer */
+ /* Once the zero padding is done the remaining of the output is calcualted
+ * using convolution but with the shorter signal time shifted. */
+
+ /* Calculate the length of the remaining sequence */
+ tot = ((srcALen + srcBLen) - 2u);
+
+ if(srcALen > srcBLen)
+ {
+ /* Calculating the number of zeros to be padded to the output */
+ j = srcALen - srcBLen;
+
+ /* Initialise the pointer after zero padding */
+ pDst += j;
+ }
+
+ else if(srcALen < srcBLen)
+ {
+ /* Initialization to inputB pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization to the end of inputA pointer */
+ pIn2 = pSrcA + (srcALen - 1u);
+
+ /* Initialisation of the pointer after zero padding */
+ pDst = pDst + tot;
+
+ /* Swapping the lengths */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+
+ /* Setting the reverse flag */
+ inv = 1;
+
+ }
+
+ /* Loop to calculate convolution for output length number of times */
+ for (i = 0u; i <= tot; i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0.0f;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0u; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if((((i - j) < srcBLen) && (j < srcALen)))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += pIn1[j] * pIn2[-((int32_t) i - j)];
+ }
+ }
+ /* Store the output in the destination buffer */
+ if(inv == 1)
+ *pDst-- = sum;
+ else
+ *pDst++ = sum;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of Corr group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c
new file mode 100644
index 0000000..5666c3f
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_opt_q15.c
@@ -0,0 +1,506 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_correlate_fast_opt_q15.c
+*
+* Description: Fast Q15 Correlation.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Corr
+ * @{
+ */
+
+/**
+ * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ * @param[in] *pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ * @return none.
+ *
+ *
+ * \par Restrictions
+ * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
+ * In this case input, output, scratch buffers should be aligned by 32-bit
+ *
+ *
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * This fast version uses a 32-bit accumulator with 2.30 format.
+ * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ * There is no saturation on intermediate additions.
+ * Thus, if the accumulator overflows it wraps around and distorts the result.
+ * The input signals should be scaled down to avoid intermediate overflows.
+ * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a
+ * maximum of min(srcALen, srcBLen) number of additions is carried internally.
+ * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
+ *
+ * \par
+ * See arm_correlate_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
+ */
+
+void arm_correlate_fast_opt_q15(
+ q15_t * pSrcA,
+ uint32_t srcALen,
+ q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ q15_t * pScratch)
+{
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ q15_t *py; /* Intermediate inputB pointer */
+ q31_t x1, x2, x3; /* temporary variables for holding input and coefficient values */
+ uint32_t j, blkCnt, outBlockSize; /* loop counter */
+ int32_t inc = 1; /* Destination address modifier */
+ uint32_t tapCnt;
+ q31_t y1, y2;
+ q15_t *pScr; /* Intermediate pointers */
+ q15_t *pOut = pDst; /* output pointer */
+#ifdef UNALIGNED_SUPPORT_DISABLE
+
+ q15_t a, b;
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we include zeroes in the output instead of zero padding either of the the inputs*/
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcA);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcB);
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2u * srcALen) - 1u;
+
+ /* When srcALen > srcBLen, zero padding is done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1u));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcB);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcA);
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2u);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+
+ }
+
+ pScr = pScratch;
+
+ /* Fill (srcBLen - 1u) zeros in scratch buffer */
+ arm_fill_q15(0, pScr, (srcBLen - 1u));
+
+ /* Update temporary scratch pointer */
+ pScr += (srcBLen - 1u);
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Copy (srcALen) samples in scratch buffer */
+ arm_copy_q15(pIn1, pScr, srcALen);
+
+ /* Update pointers */
+ pScr += srcALen;
+
+#else
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ j = srcALen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(j > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr++ = *pIn1++;
+ *pScr++ = *pIn1++;
+ *pScr++ = *pIn1++;
+ *pScr++ = *pIn1++;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ j = srcALen % 0x4u;
+
+ while(j > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr++ = *pIn1++;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr, (srcBLen - 1u));
+
+ /* Update pointer */
+ pScr += (srcBLen - 1u);
+
+#else
+
+/* Apply loop unrolling and do 4 Copies simultaneously. */
+ j = (srcBLen - 1u) >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(j > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr++ = 0;
+ *pScr++ = 0;
+ *pScr++ = 0;
+ *pScr++ = 0;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ j = (srcBLen - 1u) % 0x4u;
+
+ while(j > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr++ = 0;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* Temporary pointer for scratch2 */
+ py = pIn2;
+
+
+ /* Actual correlation process starts here */
+ blkCnt = (srcALen + srcBLen - 1u) >> 2;
+
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr = pScratch;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read four samples from scratch1 buffer */
+ x1 = *__SIMD32(pScr)++;
+
+ /* Read next four samples from scratch1 buffer */
+ x2 = *__SIMD32(pScr)++;
+
+ tapCnt = (srcBLen) >> 2u;
+
+ while(tapCnt > 0u)
+ {
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Read four samples from smaller buffer */
+ y1 = _SIMD32_OFFSET(pIn2);
+ y2 = _SIMD32_OFFSET(pIn2 + 2u);
+
+ acc0 = __SMLAD(x1, y1, acc0);
+
+ acc2 = __SMLAD(x2, y1, acc2);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ x1 = _SIMD32_OFFSET(pScr);
+
+ acc0 = __SMLAD(x2, y2, acc0);
+
+ acc2 = __SMLAD(x1, y2, acc2);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+
+ acc1 = __SMLADX(x3, y2, acc1);
+
+ x2 = _SIMD32_OFFSET(pScr + 2u);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y2, acc3);
+#else
+
+ /* Read four samples from smaller buffer */
+ a = *pIn2;
+ b = *(pIn2 + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ y1 = __PKHBT(a, b, 16);
+#else
+ y1 = __PKHBT(b, a, 16);
+#endif
+
+ a = *(pIn2 + 2);
+ b = *(pIn2 + 3);
+#ifndef ARM_MATH_BIG_ENDIAN
+ y2 = __PKHBT(a, b, 16);
+#else
+ y2 = __PKHBT(b, a, 16);
+#endif
+
+ acc0 = __SMLAD(x1, y1, acc0);
+
+ acc2 = __SMLAD(x2, y1, acc2);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ a = *pScr;
+ b = *(pScr + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(a, b, 16);
+#else
+ x1 = __PKHBT(b, a, 16);
+#endif
+
+ acc0 = __SMLAD(x2, y2, acc0);
+
+ acc2 = __SMLAD(x1, y2, acc2);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+
+ acc1 = __SMLADX(x3, y2, acc1);
+
+ a = *(pScr + 2);
+ b = *(pScr + 3);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x2 = __PKHBT(a, b, 16);
+#else
+ x2 = __PKHBT(b, a, 16);
+#endif
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y2, acc3);
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ pIn2 += 4u;
+
+ pScr += 4u;
+
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr -= 4u;
+
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr++ * *pIn2);
+ acc1 += (*pScr++ * *pIn2);
+ acc2 += (*pScr++ * *pIn2);
+ acc3 += (*pScr++ * *pIn2++);
+
+ pScr -= 3u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+
+ /* Store the results in the accumulators in the destination buffer. */
+ *pOut = (__SSAT(acc0 >> 15u, 16));
+ pOut += inc;
+ *pOut = (__SSAT(acc1 >> 15u, 16));
+ pOut += inc;
+ *pOut = (__SSAT(acc2 >> 15u, 16));
+ pOut += inc;
+ *pOut = (__SSAT(acc3 >> 15u, 16));
+ pOut += inc;
+
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch += 4u;
+
+ }
+
+
+ blkCnt = (srcALen + srcBLen - 1u) & 0x3;
+
+ /* Calculate correlation for remaining samples of Bigger length sequence */
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr = pScratch;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1u;
+
+ while(tapCnt > 0u)
+ {
+
+ acc0 += (*pScr++ * *pIn2++);
+ acc0 += (*pScr++ * *pIn2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr++ * *pIn2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+
+ *pOut = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+ pOut += inc;
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch += 1u;
+
+ }
+}
+
+/**
+ * @} end of Corr group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q15.c
new file mode 100644
index 0000000..d50c66c
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q15.c
@@ -0,0 +1,1313 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_correlate_fast_q15.c
+*
+* Description: Fast Q15 Correlation.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Corr
+ * @{
+ */
+
+/**
+ * @brief Correlation of Q15 sequences (fast version) for Cortex-M3 and Cortex-M4.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ * @return none.
+ *
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * This fast version uses a 32-bit accumulator with 2.30 format.
+ * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ * There is no saturation on intermediate additions.
+ * Thus, if the accumulator overflows it wraps around and distorts the result.
+ * The input signals should be scaled down to avoid intermediate overflows.
+ * Scale down one of the inputs by 1/min(srcALen, srcBLen) to avoid overflow since a
+ * maximum of min(srcALen, srcBLen) number of additions is carried internally.
+ * The 2.30 accumulator is right shifted by 15 bits and then saturated to 1.15 format to yield the final result.
+ *
+ * \par
+ * See arm_correlate_q15() for a slower implementation of this function which uses a 64-bit accumulator to avoid wrap around distortion.
+ */
+
+void arm_correlate_fast_q15(
+ q15_t * pSrcA,
+ uint32_t srcALen,
+ q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst)
+{
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q15_t *pOut = pDst; /* output pointer */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ q15_t *pSrc1; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
+ uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
+ int32_t inc = 1; /* Destination address modifier */
+
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we include zeroes in the output instead of zero padding either of the the inputs*/
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcA);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcB);
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2u * srcALen) - 1u;
+
+ /* When srcALen > srcBLen, zero padding is done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1u));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcB);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcA);
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2u);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+
+ }
+
+ /* The function is internally
+ * divided into three parts according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first part of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second part of the algorithm, srcBLen number of multiplications are done.
+ * In the third part of the algorithm, the multiplications decrease by one
+ * for every iteration.*/
+ /* The algorithm is implemented in three stages.
+ * The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1u;
+ blockSize2 = srcALen - (srcBLen - 1u);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[srcBlen - 1]
+ * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
+ * ....
+ * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1u;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc1 = pIn2 + (srcBLen - 1u);
+ py = pSrc1;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first loop starts here */
+ while(blockSize1 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
+ sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+ /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
+ sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* x[0] * y[srcBLen - 1] */
+ sum = __SMLAD(*px++, *py++, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (sum >> 15);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pSrc1 - count;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2u;
+
+ while(blkCnt > 0u)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1] samples */
+ x0 = *__SIMD32(px);
+ /* read x[1], x[2] samples */
+ x1 = _SIMD32_OFFSET(px + 1);
+ px += 2u;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the first two inputB samples using SIMD:
+ * y[0] and y[1] */
+ c0 = *__SIMD32(py)++;
+
+ /* acc0 += x[0] * y[0] + x[1] * y[1] */
+ acc0 = __SMLAD(x0, c0, acc0);
+
+ /* acc1 += x[1] * y[0] + x[2] * y[1] */
+ acc1 = __SMLAD(x1, c0, acc1);
+
+ /* Read x[2], x[3] */
+ x2 = *__SIMD32(px);
+
+ /* Read x[3], x[4] */
+ x3 = _SIMD32_OFFSET(px + 1);
+
+ /* acc2 += x[2] * y[0] + x[3] * y[1] */
+ acc2 = __SMLAD(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[0] + x[4] * y[1] */
+ acc3 = __SMLAD(x3, c0, acc3);
+
+ /* Read y[2] and y[3] */
+ c0 = *__SIMD32(py)++;
+
+ /* acc0 += x[2] * y[2] + x[3] * y[3] */
+ acc0 = __SMLAD(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[2] + x[4] * y[3] */
+ acc1 = __SMLAD(x3, c0, acc1);
+
+ /* Read x[4], x[5] */
+ x0 = _SIMD32_OFFSET(px + 2);
+
+ /* Read x[5], x[6] */
+ x1 = _SIMD32_OFFSET(px + 3);
+ px += 4u;
+
+ /* acc2 += x[4] * y[2] + x[5] * y[3] */
+ acc2 = __SMLAD(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[2] + x[6] * y[3] */
+ acc3 = __SMLAD(x1, c0, acc3);
+
+ } while(--k);
+
+ /* For the next MAC operations, SIMD is not used
+ * So, the 16 bit pointer if inputB, py is updated */
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ if(k == 1u)
+ {
+ /* Read y[4] */
+ c0 = *py;
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+
+#else
+
+ c0 = c0 & 0x0000FFFF;
+
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7] */
+ x3 = *__SIMD32(px);
+ px++;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc1 = __SMLAD(x1, c0, acc1);
+ acc2 = __SMLADX(x1, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ if(k == 2u)
+ {
+ /* Read y[4], y[5] */
+ c0 = *__SIMD32(py);
+
+ /* Read x[7], x[8] */
+ x3 = *__SIMD32(px);
+
+ /* Read x[9] */
+ x2 = _SIMD32_OFFSET(px + 1);
+ px += 2u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc1 = __SMLAD(x1, c0, acc1);
+ acc2 = __SMLAD(x3, c0, acc2);
+ acc3 = __SMLAD(x2, c0, acc3);
+ }
+
+ if(k == 3u)
+ {
+ /* Read y[4], y[5] */
+ c0 = *__SIMD32(py)++;
+
+ /* Read x[7], x[8] */
+ x3 = *__SIMD32(px);
+
+ /* Read x[9] */
+ x2 = _SIMD32_OFFSET(px + 1);
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc1 = __SMLAD(x1, c0, acc1);
+ acc2 = __SMLAD(x3, c0, acc2);
+ acc3 = __SMLAD(x2, c0, acc3);
+
+ c0 = (*py);
+ /* Read y[6] */
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+#else
+
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[10] */
+ x3 = _SIMD32_OFFSET(px + 2);
+ px += 3u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x1, c0, acc0);
+ acc1 = __SMLAD(x2, c0, acc1);
+ acc2 = __SMLADX(x2, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (acc0 >> 15);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ *pOut = (q15_t) (acc1 >> 15);
+ pOut += inc;
+
+ *pOut = (q15_t) (acc2 >> 15);
+ pOut += inc;
+
+ *pOut = (q15_t) (acc3 >> 15);
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py++);
+ sum += ((q31_t) * px++ * *py++);
+ sum += ((q31_t) * px++ * *py++);
+ sum += ((q31_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (sum >> 15);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over srcBLen */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q31_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (sum >> 15);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * ....
+ * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+ * sum += x[srcALen-1] * y[0]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1u;
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while(blockSize3 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
+ sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+ /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
+ sum = __SMLAD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(*px++, *py++, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (sum >> 15);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+#else
+
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q15_t *pOut = pDst; /* output pointer */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ q15_t *pSrc1; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
+ uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
+ int32_t inc = 1; /* Destination address modifier */
+ q15_t a, b;
+
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we include zeroes in the output instead of zero padding either of the the inputs*/
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcA);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcB);
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2u * srcALen) - 1u;
+
+ /* When srcALen > srcBLen, zero padding is done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1u));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcB);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcA);
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2u);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+
+ }
+
+ /* The function is internally
+ * divided into three parts according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first part of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second part of the algorithm, srcBLen number of multiplications are done.
+ * In the third part of the algorithm, the multiplications decrease by one
+ * for every iteration.*/
+ /* The algorithm is implemented in three stages.
+ * The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1u;
+ blockSize2 = srcALen - (srcBLen - 1u);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[srcBlen - 1]
+ * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
+ * ....
+ * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1u;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc1 = pIn2 + (srcBLen - 1u);
+ py = pSrc1;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first loop starts here */
+ while(blockSize1 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
+ sum += ((q31_t) * px++ * *py++);
+ sum += ((q31_t) * px++ * *py++);
+ sum += ((q31_t) * px++ * *py++);
+ sum += ((q31_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* x[0] * y[srcBLen - 1] */
+ sum += ((q31_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (sum >> 15);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pSrc1 - count;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2u;
+
+ while(blkCnt > 0u)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1], x[2] samples */
+ a = *px;
+ b = *(px + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x0 = __PKHBT(a, b, 16);
+ a = *(px + 2);
+ x1 = __PKHBT(b, a, 16);
+
+#else
+
+ x0 = __PKHBT(b, a, 16);
+ a = *(px + 2);
+ x1 = __PKHBT(a, b, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ px += 2u;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the first two inputB samples using SIMD:
+ * y[0] and y[1] */
+ a = *py;
+ b = *(py + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ c0 = __PKHBT(a, b, 16);
+
+#else
+
+ c0 = __PKHBT(b, a, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* acc0 += x[0] * y[0] + x[1] * y[1] */
+ acc0 = __SMLAD(x0, c0, acc0);
+
+ /* acc1 += x[1] * y[0] + x[2] * y[1] */
+ acc1 = __SMLAD(x1, c0, acc1);
+
+ /* Read x[2], x[3], x[4] */
+ a = *px;
+ b = *(px + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x2 = __PKHBT(a, b, 16);
+ a = *(px + 2);
+ x3 = __PKHBT(b, a, 16);
+
+#else
+
+ x2 = __PKHBT(b, a, 16);
+ a = *(px + 2);
+ x3 = __PKHBT(a, b, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* acc2 += x[2] * y[0] + x[3] * y[1] */
+ acc2 = __SMLAD(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[0] + x[4] * y[1] */
+ acc3 = __SMLAD(x3, c0, acc3);
+
+ /* Read y[2] and y[3] */
+ a = *(py + 2);
+ b = *(py + 3);
+
+ py += 4u;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ c0 = __PKHBT(a, b, 16);
+
+#else
+
+ c0 = __PKHBT(b, a, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* acc0 += x[2] * y[2] + x[3] * y[3] */
+ acc0 = __SMLAD(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[2] + x[4] * y[3] */
+ acc1 = __SMLAD(x3, c0, acc1);
+
+ /* Read x[4], x[5], x[6] */
+ a = *(px + 2);
+ b = *(px + 3);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x0 = __PKHBT(a, b, 16);
+ a = *(px + 4);
+ x1 = __PKHBT(b, a, 16);
+
+#else
+
+ x0 = __PKHBT(b, a, 16);
+ a = *(px + 4);
+ x1 = __PKHBT(a, b, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ px += 4u;
+
+ /* acc2 += x[4] * y[2] + x[5] * y[3] */
+ acc2 = __SMLAD(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[2] + x[6] * y[3] */
+ acc3 = __SMLAD(x1, c0, acc3);
+
+ } while(--k);
+
+ /* For the next MAC operations, SIMD is not used
+ * So, the 16 bit pointer if inputB, py is updated */
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ if(k == 1u)
+ {
+ /* Read y[4] */
+ c0 = *py;
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+
+#else
+
+ c0 = c0 & 0x0000FFFF;
+
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7] */
+ a = *px;
+ b = *(px + 1);
+
+ px++;;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x3 = __PKHBT(a, b, 16);
+
+#else
+
+ x3 = __PKHBT(b, a, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ px++;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc1 = __SMLAD(x1, c0, acc1);
+ acc2 = __SMLADX(x1, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ if(k == 2u)
+ {
+ /* Read y[4], y[5] */
+ a = *py;
+ b = *(py + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ c0 = __PKHBT(a, b, 16);
+
+#else
+
+ c0 = __PKHBT(b, a, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[7], x[8], x[9] */
+ a = *px;
+ b = *(px + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x3 = __PKHBT(a, b, 16);
+ a = *(px + 2);
+ x2 = __PKHBT(b, a, 16);
+
+#else
+
+ x3 = __PKHBT(b, a, 16);
+ a = *(px + 2);
+ x2 = __PKHBT(a, b, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ px += 2u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc1 = __SMLAD(x1, c0, acc1);
+ acc2 = __SMLAD(x3, c0, acc2);
+ acc3 = __SMLAD(x2, c0, acc3);
+ }
+
+ if(k == 3u)
+ {
+ /* Read y[4], y[5] */
+ a = *py;
+ b = *(py + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ c0 = __PKHBT(a, b, 16);
+
+#else
+
+ c0 = __PKHBT(b, a, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ py += 2u;
+
+ /* Read x[7], x[8], x[9] */
+ a = *px;
+ b = *(px + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x3 = __PKHBT(a, b, 16);
+ a = *(px + 2);
+ x2 = __PKHBT(b, a, 16);
+
+#else
+
+ x3 = __PKHBT(b, a, 16);
+ a = *(px + 2);
+ x2 = __PKHBT(a, b, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc1 = __SMLAD(x1, c0, acc1);
+ acc2 = __SMLAD(x3, c0, acc2);
+ acc3 = __SMLAD(x2, c0, acc3);
+
+ c0 = (*py);
+ /* Read y[6] */
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+#else
+
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+
+ /* Read x[10] */
+ b = *(px + 3);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ x3 = __PKHBT(a, b, 16);
+
+#else
+
+ x3 = __PKHBT(b, a, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ px += 3u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLADX(x1, c0, acc0);
+ acc1 = __SMLAD(x2, c0, acc1);
+ acc2 = __SMLADX(x2, c0, acc2);
+ acc3 = __SMLADX(x3, c0, acc3);
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (acc0 >> 15);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ *pOut = (q15_t) (acc1 >> 15);
+ pOut += inc;
+
+ *pOut = (q15_t) (acc2 >> 15);
+ pOut += inc;
+
+ *pOut = (q15_t) (acc3 >> 15);
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py++);
+ sum += ((q31_t) * px++ * *py++);
+ sum += ((q31_t) * px++ * *py++);
+ sum += ((q31_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (sum >> 15);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over srcBLen */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q31_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (sum >> 15);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * ....
+ * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+ * sum += x[srcALen-1] * y[0]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1u;
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while(blockSize3 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py++);
+ sum += ((q31_t) * px++ * *py++);
+ sum += ((q31_t) * px++ * *py++);
+ sum += ((q31_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q31_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (sum >> 15);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+}
+
+/**
+ * @} end of Corr group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q31.c
new file mode 100644
index 0000000..4afee53
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_fast_q31.c
@@ -0,0 +1,606 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_correlate_fast_q31.c
+*
+* Description: Fast Q31 Correlation.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Corr
+ * @{
+ */
+
+/**
+ * @brief Correlation of Q31 sequences (fast version) for Cortex-M3 and Cortex-M4.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
+ * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
+ * These intermediate results are accumulated in a 32-bit register in 2.30 format.
+ * Finally, the accumulator is saturated and converted to a 1.31 result.
+ *
+ * \par
+ * The fast version has the same overflow behavior as the standard version but provides less precision since it discards the low 32 bits of each multiplication result.
+ * In order to avoid overflows completely the input signals must be scaled down.
+ * The input signals should be scaled down to avoid intermediate overflows.
+ * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a
+ * maximum of min(srcALen, srcBLen) number of additions is carried internally.
+ *
+ * \par
+ * See arm_correlate_q31() for a slower implementation of this function which uses 64-bit accumulation to provide higher precision.
+ */
+
+void arm_correlate_fast_q31(
+ q31_t * pSrcA,
+ uint32_t srcALen,
+ q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst)
+{
+ q31_t *pIn1; /* inputA pointer */
+ q31_t *pIn2; /* inputB pointer */
+ q31_t *pOut = pDst; /* output pointer */
+ q31_t *px; /* Intermediate inputA pointer */
+ q31_t *py; /* Intermediate inputB pointer */
+ q31_t *pSrc1; /* Intermediate pointers */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
+ uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
+ int32_t inc = 1; /* Destination address modifier */
+
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcA);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcB);
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2u * srcALen) - 1u;
+
+ /* When srcALen > srcBLen, zero padding is done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1u));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcB);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcA);
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2u);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+
+ }
+
+ /* The function is internally
+ * divided into three parts according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first part of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second part of the algorithm, srcBLen number of multiplications are done.
+ * In the third part of the algorithm, the multiplications decrease by one
+ * for every iteration.*/
+ /* The algorithm is implemented in three stages.
+ * The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1u;
+ blockSize2 = srcALen - (srcBLen - 1u);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[srcBlen - 1]
+ * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
+ * ....
+ * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1u;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc1 = pIn2 + (srcBLen - 1u);
+ py = pSrc1;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while(blockSize1 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[0] * y[srcBLen - 4] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+ /* x[1] * y[srcBLen - 3] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+ /* x[2] * y[srcBLen - 2] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+ /* x[3] * y[srcBLen - 1] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* x[0] * y[srcBLen - 1] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum << 1;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pSrc1 - count;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2u;
+
+ while(blkCnt > 0u)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *(px++);
+ x1 = *(px++);
+ x2 = *(px++);
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[0] sample */
+ c0 = *(py++);
+
+ /* Read x[3] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[0] * y[0] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc1 += x[1] * y[0] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc2 += x[2] * y[0] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc3 += x[3] * y[0] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+ /* Read y[1] sample */
+ c0 = *(py++);
+
+ /* Read x[4] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[1] * y[1] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc1 += x[2] * y[1] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc2 += x[3] * y[1] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
+ /* acc3 += x[4] * y[1] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+ /* Read y[2] sample */
+ c0 = *(py++);
+
+ /* Read x[5] sample */
+ x1 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[2] * y[2] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc1 += x[3] * y[2] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
+ /* acc2 += x[4] * y[2] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc3 += x[5] * y[2] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
+
+ /* Read y[3] sample */
+ c0 = *(py++);
+
+ /* Read x[6] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[3] * y[3] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
+ /* acc1 += x[4] * y[3] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc2 += x[5] * y[3] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc3 += x[6] * y[3] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
+
+
+ } while(--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Read y[4] sample */
+ c0 = *(py++);
+
+ /* Read x[7] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[4] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ /* acc1 += x[5] * y[4] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+ /* acc2 += x[6] * y[4] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+ /* acc3 += x[7] * y[4] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q31_t) (acc0 << 1);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ *pOut = (q31_t) (acc1 << 1);
+ pOut += inc;
+
+ *pOut = (q31_t) (acc2 << 1);
+ pOut += inc;
+
+ *pOut = (q31_t) (acc3 << 1);
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 4 */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum << 1;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over srcBLen */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum << 1;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * ....
+ * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+ * sum += x[srcALen-1] * y[0]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1u;
+
+ /* Working pointer of inputA */
+ pSrc1 = ((pIn1 + srcALen) - srcBLen) + 1u;
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while(blockSize3 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen - srcBLen + 4] * y[3] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+ /* sum += x[srcALen - srcBLen + 3] * y[2] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+ /* sum += x[srcALen - srcBLen + 2] * y[1] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+ /* sum += x[srcALen - srcBLen + 1] * y[0] */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * px++ * (*py++))) >> 32);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = sum << 1;
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+}
+
+/**
+ * @} end of Corr group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_opt_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_opt_q15.c
new file mode 100644
index 0000000..87a0b1d
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_opt_q15.c
@@ -0,0 +1,511 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_correlate_opt_q15.c
+*
+* Description: Correlation of Q15 sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Corr
+ * @{
+ */
+
+/**
+ * @brief Correlation of Q15 sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ * @param[in] *pScratch points to scratch buffer of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ * @return none.
+ *
+ * \par Restrictions
+ * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
+ * In this case input, output, scratch buffers should be aligned by 32-bit
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
+ * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ * This approach provides 33 guard bits and there is no risk of overflow.
+ * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
+ *
+ * \par
+ * Refer to arm_correlate_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
+ *
+ *
+ */
+
+
+void arm_correlate_opt_q15(
+ q15_t * pSrcA,
+ uint32_t srcALen,
+ q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst,
+ q15_t * pScratch)
+{
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q63_t acc0, acc1, acc2, acc3; /* Accumulators */
+ q15_t *py; /* Intermediate inputB pointer */
+ q31_t x1, x2, x3; /* temporary variables for holding input1 and input2 values */
+ uint32_t j, blkCnt, outBlockSize; /* loop counter */
+ int32_t inc = 1; /* output pointer increment */
+ uint32_t tapCnt;
+ q31_t y1, y2;
+ q15_t *pScr; /* Intermediate pointers */
+ q15_t *pOut = pDst; /* output pointer */
+#ifdef UNALIGNED_SUPPORT_DISABLE
+
+ q15_t a, b;
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we include zeroes in the output instead of zero padding either of the the inputs*/
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcA);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcB);
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2u * srcALen) - 1u;
+
+ /* When srcALen > srcBLen, zero padding is done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1u));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcB);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcA);
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2u);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+
+ }
+
+ pScr = pScratch;
+
+ /* Fill (srcBLen - 1u) zeros in scratch buffer */
+ arm_fill_q15(0, pScr, (srcBLen - 1u));
+
+ /* Update temporary scratch pointer */
+ pScr += (srcBLen - 1u);
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Copy (srcALen) samples in scratch buffer */
+ arm_copy_q15(pIn1, pScr, srcALen);
+
+ /* Update pointers */
+ //pIn1 += srcALen;
+ pScr += srcALen;
+
+#else
+
+ /* Apply loop unrolling and do 4 Copies simultaneously. */
+ j = srcALen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(j > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr++ = *pIn1++;
+ *pScr++ = *pIn1++;
+ *pScr++ = *pIn1++;
+ *pScr++ = *pIn1++;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ j = srcALen % 0x4u;
+
+ while(j > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr++ = *pIn1++;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr, (srcBLen - 1u));
+
+ /* Update pointer */
+ pScr += (srcBLen - 1u);
+
+#else
+
+/* Apply loop unrolling and do 4 Copies simultaneously. */
+ j = (srcBLen - 1u) >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(j > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr++ = 0;
+ *pScr++ = 0;
+ *pScr++ = 0;
+ *pScr++ = 0;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ j = (srcBLen - 1u) % 0x4u;
+
+ while(j > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr++ = 0;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* Temporary pointer for scratch2 */
+ py = pIn2;
+
+
+ /* Actual correlation process starts here */
+ blkCnt = (srcALen + srcBLen - 1u) >> 2;
+
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr = pScratch;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read four samples from scratch1 buffer */
+ x1 = *__SIMD32(pScr)++;
+
+ /* Read next four samples from scratch1 buffer */
+ x2 = *__SIMD32(pScr)++;
+
+ tapCnt = (srcBLen) >> 2u;
+
+ while(tapCnt > 0u)
+ {
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Read four samples from smaller buffer */
+ y1 = _SIMD32_OFFSET(pIn2);
+ y2 = _SIMD32_OFFSET(pIn2 + 2u);
+
+ acc0 = __SMLALD(x1, y1, acc0);
+
+ acc2 = __SMLALD(x2, y1, acc2);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc1 = __SMLALDX(x3, y1, acc1);
+
+ x1 = _SIMD32_OFFSET(pScr);
+
+ acc0 = __SMLALD(x2, y2, acc0);
+
+ acc2 = __SMLALD(x1, y2, acc2);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLALDX(x3, y1, acc3);
+
+ acc1 = __SMLALDX(x3, y2, acc1);
+
+ x2 = _SIMD32_OFFSET(pScr + 2u);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLALDX(x3, y2, acc3);
+
+#else
+
+ /* Read four samples from smaller buffer */
+ a = *pIn2;
+ b = *(pIn2 + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ y1 = __PKHBT(a, b, 16);
+#else
+ y1 = __PKHBT(b, a, 16);
+#endif
+
+ a = *(pIn2 + 2);
+ b = *(pIn2 + 3);
+#ifndef ARM_MATH_BIG_ENDIAN
+ y2 = __PKHBT(a, b, 16);
+#else
+ y2 = __PKHBT(b, a, 16);
+#endif
+
+ acc0 = __SMLALD(x1, y1, acc0);
+
+ acc2 = __SMLALD(x2, y1, acc2);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc1 = __SMLALDX(x3, y1, acc1);
+
+ a = *pScr;
+ b = *(pScr + 1);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(a, b, 16);
+#else
+ x1 = __PKHBT(b, a, 16);
+#endif
+
+ acc0 = __SMLALD(x2, y2, acc0);
+
+ acc2 = __SMLALD(x1, y2, acc2);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLALDX(x3, y1, acc3);
+
+ acc1 = __SMLALDX(x3, y2, acc1);
+
+ a = *(pScr + 2);
+ b = *(pScr + 3);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x2 = __PKHBT(a, b, 16);
+#else
+ x2 = __PKHBT(b, a, 16);
+#endif
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLALDX(x3, y2, acc3);
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ pIn2 += 4u;
+
+ pScr += 4u;
+
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr -= 4u;
+
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr++ * *pIn2);
+ acc1 += (*pScr++ * *pIn2);
+ acc2 += (*pScr++ * *pIn2);
+ acc3 += (*pScr++ * *pIn2++);
+
+ pScr -= 3u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+
+ /* Store the results in the accumulators in the destination buffer. */
+ *pOut = (__SSAT(acc0 >> 15u, 16));
+ pOut += inc;
+ *pOut = (__SSAT(acc1 >> 15u, 16));
+ pOut += inc;
+ *pOut = (__SSAT(acc2 >> 15u, 16));
+ pOut += inc;
+ *pOut = (__SSAT(acc3 >> 15u, 16));
+ pOut += inc;
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch += 4u;
+
+ }
+
+
+ blkCnt = (srcALen + srcBLen - 1u) & 0x3;
+
+ /* Calculate correlation for remaining samples of Bigger length sequence */
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr = pScratch;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1u;
+
+ while(tapCnt > 0u)
+ {
+
+ acc0 += (*pScr++ * *pIn2++);
+ acc0 += (*pScr++ * *pIn2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr++ * *pIn2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+ pOut += inc;
+
+ /* Initialization of inputB pointer */
+ pIn2 = py;
+
+ pScratch += 1u;
+
+ }
+
+
+}
+
+/**
+ * @} end of Corr group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_opt_q7.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_opt_q7.c
new file mode 100644
index 0000000..204212e
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_opt_q7.c
@@ -0,0 +1,462 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_correlate_opt_q7.c
+*
+* Description: Correlation of Q7 sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Corr
+ * @{
+ */
+
+/**
+ * @brief Correlation of Q7 sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ * @param[in] *pScratch1 points to scratch buffer(of type q15_t) of size max(srcALen, srcBLen) + 2*min(srcALen, srcBLen) - 2.
+ * @param[in] *pScratch2 points to scratch buffer (of type q15_t) of size min(srcALen, srcBLen).
+ * @return none.
+ *
+ *
+ * \par Restrictions
+ * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
+ * In this case input, output, scratch1 and scratch2 buffers should be aligned by 32-bit
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 32-bit internal accumulator.
+ * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
+ * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
+ * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
+ *
+ *
+ */
+
+
+
+void arm_correlate_opt_q7(
+ q7_t * pSrcA,
+ uint32_t srcALen,
+ q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst,
+ q15_t * pScratch1,
+ q15_t * pScratch2)
+{
+ q7_t *pOut = pDst; /* output pointer */
+ q15_t *pScr1 = pScratch1; /* Temporary pointer for scratch */
+ q15_t *pScr2 = pScratch2; /* Temporary pointer for scratch */
+ q7_t *pIn1; /* inputA pointer */
+ q7_t *pIn2; /* inputB pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ uint32_t j, k = 0u, blkCnt; /* loop counter */
+ int32_t inc = 1; /* output pointer increment */
+ uint32_t outBlockSize; /* loop counter */
+ q15_t x4; /* Temporary input variable */
+ uint32_t tapCnt; /* loop counter */
+ q31_t x1, x2, x3, y1; /* Temporary input variables */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we include zeroes in the output instead of zero padding either of the the inputs*/
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcA);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcB);
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2u * srcALen) - 1u;
+
+ /* When srcALen > srcBLen, zero padding is done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1u));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcB);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcA);
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2u);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+
+ }
+
+
+ /* Copy (srcBLen) samples in scratch buffer */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ x4 = (q15_t) * pIn2++;
+ *pScr2++ = x4;
+ x4 = (q15_t) * pIn2++;
+ *pScr2++ = x4;
+ x4 = (q15_t) * pIn2++;
+ *pScr2++ = x4;
+ x4 = (q15_t) * pIn2++;
+ *pScr2++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ x4 = (q15_t) * pIn2++;
+ *pScr2++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Fill (srcBLen - 1u) zeros in scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update temporary scratch pointer */
+ pScr1 += (srcBLen - 1u);
+
+ /* Copy (srcALen) samples in scratch buffer */
+ k = srcALen >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = srcALen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ x4 = (q15_t) * pIn1++;
+ *pScr1++ = x4;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Fill (srcBLen - 1u) zeros at end of scratch buffer */
+ arm_fill_q15(0, pScr1, (srcBLen - 1u));
+
+ /* Update pointer */
+ pScr1 += (srcBLen - 1u);
+
+#else
+
+/* Apply loop unrolling and do 4 Copies simultaneously. */
+ k = (srcBLen - 1u) >> 2u;
+
+ /* First part of the processing with loop unrolling copies 4 data points at a time.
+ ** a second loop below copies for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner */
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+ *pScr1++ = 0;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, copy remaining samples here.
+ ** No loop unrolling is used. */
+ k = (srcBLen - 1u) % 0x4u;
+
+ while(k > 0u)
+ {
+ /* copy second buffer in reversal manner for remaining samples */
+ *pScr1++ = 0;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* Temporary pointer for second sequence */
+ py = pScratch2;
+
+ /* Initialization of pScr2 pointer */
+ pScr2 = pScratch2;
+
+ /* Actual correlation process starts here */
+ blkCnt = (srcALen + srcBLen - 1u) >> 2;
+
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Read two samples from scratch1 buffer */
+ x1 = *__SIMD32(pScr1)++;
+
+ /* Read next two samples from scratch1 buffer */
+ x2 = *__SIMD32(pScr1)++;
+
+ tapCnt = (srcBLen) >> 2u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read four samples from smaller buffer */
+ y1 = _SIMD32_OFFSET(pScr2);
+
+ /* multiply and accumlate */
+ acc0 = __SMLAD(x1, y1, acc0);
+ acc2 = __SMLAD(x2, y1, acc2);
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ /* multiply and accumlate */
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ /* Read next two samples from scratch1 buffer */
+ x1 = *__SIMD32(pScr1)++;
+
+ /* pack input data */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x1, x2, 0);
+#else
+ x3 = __PKHBT(x2, x1, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+
+ /* Read four samples from smaller buffer */
+ y1 = _SIMD32_OFFSET(pScr2 + 2u);
+
+ acc0 = __SMLAD(x2, y1, acc0);
+
+ acc2 = __SMLAD(x1, y1, acc2);
+
+ acc1 = __SMLADX(x3, y1, acc1);
+
+ x2 = *__SIMD32(pScr1)++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+ x3 = __PKHBT(x2, x1, 0);
+#else
+ x3 = __PKHBT(x1, x2, 0);
+#endif
+
+ acc3 = __SMLADX(x3, y1, acc3);
+
+ pScr2 += 4u;
+
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+
+
+ /* Update scratch pointer for remaining samples of smaller length sequence */
+ pScr1 -= 4u;
+
+
+ /* apply same above for remaining samples of smaller length sequence */
+ tapCnt = (srcBLen) & 3u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pScr2);
+ acc1 += (*pScr1++ * *pScr2);
+ acc2 += (*pScr1++ * *pScr2);
+ acc3 += (*pScr1++ * *pScr2++);
+
+ pScr1 -= 3u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q7_t) (__SSAT(acc0 >> 7u, 8));
+ pOut += inc;
+ *pOut = (q7_t) (__SSAT(acc1 >> 7u, 8));
+ pOut += inc;
+ *pOut = (q7_t) (__SSAT(acc2 >> 7u, 8));
+ pOut += inc;
+ *pOut = (q7_t) (__SSAT(acc3 >> 7u, 8));
+ pOut += inc;
+
+ /* Initialization of inputB pointer */
+ pScr2 = py;
+
+ pScratch1 += 4u;
+
+ }
+
+
+ blkCnt = (srcALen + srcBLen - 1u) & 0x3;
+
+ /* Calculate correlation for remaining samples of Bigger length sequence */
+ while(blkCnt > 0)
+ {
+ /* Initialze temporary scratch pointer as scratch1 */
+ pScr1 = pScratch1;
+
+ /* Clear Accumlators */
+ acc0 = 0;
+
+ tapCnt = (srcBLen) >> 1u;
+
+ while(tapCnt > 0u)
+ {
+ acc0 += (*pScr1++ * *pScr2++);
+ acc0 += (*pScr1++ * *pScr2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (srcBLen) & 1u;
+
+ /* apply same above for remaining samples of smaller length sequence */
+ while(tapCnt > 0u)
+ {
+
+ /* accumlate the results */
+ acc0 += (*pScr1++ * *pScr2++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ blkCnt--;
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q7_t) (__SSAT(acc0 >> 7u, 8));
+
+ pOut += inc;
+
+ /* Initialization of inputB pointer */
+ pScr2 = py;
+
+ pScratch1 += 1u;
+
+ }
+
+}
+
+/**
+ * @} end of Corr group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q15.c
new file mode 100644
index 0000000..f4df49c
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q15.c
@@ -0,0 +1,717 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_correlate_q15.c
+*
+* Description: Correlation of Q15 sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Corr
+ * @{
+ */
+
+/**
+ * @brief Correlation of Q15 sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * Both inputs are in 1.15 format and multiplications yield a 2.30 result.
+ * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ * This approach provides 33 guard bits and there is no risk of overflow.
+ * The 34.30 result is then truncated to 34.15 format by discarding the low 15 bits and then saturated to 1.15 format.
+ *
+ * \par
+ * Refer to arm_correlate_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
+ *
+ * \par
+ * Refer the function arm_correlate_opt_q15() for a faster implementation of this function using scratch buffers.
+ *
+ */
+
+void arm_correlate_q15(
+ q15_t * pSrcA,
+ uint32_t srcALen,
+ q15_t * pSrcB,
+ uint32_t srcBLen,
+ q15_t * pDst)
+{
+
+#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE)
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q15_t *pIn1; /* inputA pointer */
+ q15_t *pIn2; /* inputB pointer */
+ q15_t *pOut = pDst; /* output pointer */
+ q63_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ q15_t *px; /* Intermediate inputA pointer */
+ q15_t *py; /* Intermediate inputB pointer */
+ q15_t *pSrc1; /* Intermediate pointers */
+ q31_t x0, x1, x2, x3, c0; /* temporary variables for holding input and coefficient values */
+ uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
+ int32_t inc = 1; /* Destination address modifier */
+
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we include zeroes in the output instead of zero padding either of the the inputs*/
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcA);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcB);
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2u * srcALen) - 1u;
+
+ /* When srcALen > srcBLen, zero padding is done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1u));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcB);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcA);
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2u);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+
+ }
+
+ /* The function is internally
+ * divided into three parts according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first part of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second part of the algorithm, srcBLen number of multiplications are done.
+ * In the third part of the algorithm, the multiplications decrease by one
+ * for every iteration.*/
+ /* The algorithm is implemented in three stages.
+ * The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1u;
+ blockSize2 = srcALen - (srcBLen - 1u);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[srcBlen - 1]
+ * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
+ * ....
+ * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1u;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc1 = pIn2 + (srcBLen - 1u);
+ py = pSrc1;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first loop starts here */
+ while(blockSize1 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[0] * y[srcBLen - 4] , x[1] * y[srcBLen - 3] */
+ sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+ /* x[3] * y[srcBLen - 1] , x[2] * y[srcBLen - 2] */
+ sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* x[0] * y[srcBLen - 1] */
+ sum = __SMLALD(*px++, *py++, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (__SSAT((sum >> 15), 16));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pSrc1 - count;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4, to loop unroll the srcBLen loop */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2u;
+
+ while(blkCnt > 0u)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1] samples */
+ x0 = *__SIMD32(px);
+ /* read x[1], x[2] samples */
+ x1 = _SIMD32_OFFSET(px + 1);
+ px += 2u;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read the first two inputB samples using SIMD:
+ * y[0] and y[1] */
+ c0 = *__SIMD32(py)++;
+
+ /* acc0 += x[0] * y[0] + x[1] * y[1] */
+ acc0 = __SMLALD(x0, c0, acc0);
+
+ /* acc1 += x[1] * y[0] + x[2] * y[1] */
+ acc1 = __SMLALD(x1, c0, acc1);
+
+ /* Read x[2], x[3] */
+ x2 = *__SIMD32(px);
+
+ /* Read x[3], x[4] */
+ x3 = _SIMD32_OFFSET(px + 1);
+
+ /* acc2 += x[2] * y[0] + x[3] * y[1] */
+ acc2 = __SMLALD(x2, c0, acc2);
+
+ /* acc3 += x[3] * y[0] + x[4] * y[1] */
+ acc3 = __SMLALD(x3, c0, acc3);
+
+ /* Read y[2] and y[3] */
+ c0 = *__SIMD32(py)++;
+
+ /* acc0 += x[2] * y[2] + x[3] * y[3] */
+ acc0 = __SMLALD(x2, c0, acc0);
+
+ /* acc1 += x[3] * y[2] + x[4] * y[3] */
+ acc1 = __SMLALD(x3, c0, acc1);
+
+ /* Read x[4], x[5] */
+ x0 = _SIMD32_OFFSET(px + 2);
+
+ /* Read x[5], x[6] */
+ x1 = _SIMD32_OFFSET(px + 3);
+
+ px += 4u;
+
+ /* acc2 += x[4] * y[2] + x[5] * y[3] */
+ acc2 = __SMLALD(x0, c0, acc2);
+
+ /* acc3 += x[5] * y[2] + x[6] * y[3] */
+ acc3 = __SMLALD(x1, c0, acc3);
+
+ } while(--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ if(k == 1u)
+ {
+ /* Read y[4] */
+ c0 = *py;
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+
+#else
+
+ c0 = c0 & 0x0000FFFF;
+
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+ /* Read x[7] */
+ x3 = *__SIMD32(px);
+ px++;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALD(x0, c0, acc0);
+ acc1 = __SMLALD(x1, c0, acc1);
+ acc2 = __SMLALDX(x1, c0, acc2);
+ acc3 = __SMLALDX(x3, c0, acc3);
+ }
+
+ if(k == 2u)
+ {
+ /* Read y[4], y[5] */
+ c0 = *__SIMD32(py);
+
+ /* Read x[7], x[8] */
+ x3 = *__SIMD32(px);
+
+ /* Read x[9] */
+ x2 = _SIMD32_OFFSET(px + 1);
+ px += 2u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALD(x0, c0, acc0);
+ acc1 = __SMLALD(x1, c0, acc1);
+ acc2 = __SMLALD(x3, c0, acc2);
+ acc3 = __SMLALD(x2, c0, acc3);
+ }
+
+ if(k == 3u)
+ {
+ /* Read y[4], y[5] */
+ c0 = *__SIMD32(py)++;
+
+ /* Read x[7], x[8] */
+ x3 = *__SIMD32(px);
+
+ /* Read x[9] */
+ x2 = _SIMD32_OFFSET(px + 1);
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALD(x0, c0, acc0);
+ acc1 = __SMLALD(x1, c0, acc1);
+ acc2 = __SMLALD(x3, c0, acc2);
+ acc3 = __SMLALD(x2, c0, acc3);
+
+ c0 = (*py);
+
+ /* Read y[6] */
+#ifdef ARM_MATH_BIG_ENDIAN
+
+ c0 = c0 << 16u;
+#else
+
+ c0 = c0 & 0x0000FFFF;
+#endif /* #ifdef ARM_MATH_BIG_ENDIAN */
+ /* Read x[10] */
+ x3 = _SIMD32_OFFSET(px + 2);
+ px += 3u;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALDX(x1, c0, acc0);
+ acc1 = __SMLALD(x2, c0, acc1);
+ acc2 = __SMLALDX(x2, c0, acc2);
+ acc3 = __SMLALDX(x3, c0, acc3);
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (__SSAT(acc0 >> 15, 16));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ *pOut = (q15_t) (__SSAT(acc1 >> 15, 16));
+ pOut += inc;
+
+ *pOut = (q15_t) (__SSAT(acc2 >> 15, 16));
+ pOut += inc;
+
+ *pOut = (q15_t) (__SSAT(acc3 >> 15, 16));
+ pOut += inc;
+
+ /* Increment the count by 4 as 4 output values are computed */
+ count += 4u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q63_t) * px++ * *py++);
+ sum += ((q63_t) * px++ * *py++);
+ sum += ((q63_t) * px++ * *py++);
+ sum += ((q63_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q63_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (__SSAT(sum >> 15, 16));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment count by 1, as one output value is computed */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over srcBLen */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q63_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (__SSAT(sum >> 15, 16));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * ....
+ * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+ * sum += x[srcALen-1] * y[0]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1u;
+
+ /* Working pointer of inputA */
+ pSrc1 = (pIn1 + srcALen) - (srcBLen - 1u);
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while(blockSize3 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen - srcBLen + 4] * y[3] , sum += x[srcALen - srcBLen + 3] * y[2] */
+ sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+ /* sum += x[srcALen - srcBLen + 2] * y[1] , sum += x[srcALen - srcBLen + 1] * y[0] */
+ sum = __SMLALD(*__SIMD32(px)++, *__SIMD32(py)++, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum = __SMLALD(*px++, *py++, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q15_t) (__SSAT((sum >> 15), 16));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+#else
+
+/* Run the below code for Cortex-M0 */
+
+ q15_t *pIn1 = pSrcA; /* inputA pointer */
+ q15_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */
+ q63_t sum; /* Accumulators */
+ uint32_t i = 0u, j; /* loop counters */
+ uint32_t inv = 0u; /* Reverse order flag */
+ uint32_t tot = 0u; /* Length */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and a varaible, inv is set to 1 */
+ /* If lengths are not equal then zero pad has to be done to make the two
+ * inputs of same length. But to improve the performance, we include zeroes
+ * in the output instead of zero padding either of the the inputs*/
+ /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
+ * starting of the output buffer */
+ /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
+ * ending of the output buffer */
+ /* Once the zero padding is done the remaining of the output is calcualted
+ * using convolution but with the shorter signal time shifted. */
+
+ /* Calculate the length of the remaining sequence */
+ tot = ((srcALen + srcBLen) - 2u);
+
+ if(srcALen > srcBLen)
+ {
+ /* Calculating the number of zeros to be padded to the output */
+ j = srcALen - srcBLen;
+
+ /* Initialise the pointer after zero padding */
+ pDst += j;
+ }
+
+ else if(srcALen < srcBLen)
+ {
+ /* Initialization to inputB pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization to the end of inputA pointer */
+ pIn2 = pSrcA + (srcALen - 1u);
+
+ /* Initialisation of the pointer after zero padding */
+ pDst = pDst + tot;
+
+ /* Swapping the lengths */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+
+ /* Setting the reverse flag */
+ inv = 1;
+
+ }
+
+ /* Loop to calculate convolution for output length number of times */
+ for (i = 0u; i <= tot; i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0u; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if((((i - j) < srcBLen) && (j < srcALen)))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q31_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
+ }
+ }
+ /* Store the output in the destination buffer */
+ if(inv == 1)
+ *pDst-- = (q15_t) __SSAT((sum >> 15u), 16u);
+ else
+ *pDst++ = (q15_t) __SSAT((sum >> 15u), 16u);
+ }
+
+#endif /*#if (defined(ARM_MATH_CM4) || defined(ARM_MATH_CM3)) && !defined(UNALIGNED_SUPPORT_DISABLE) */
+
+}
+
+/**
+ * @} end of Corr group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q31.c
new file mode 100644
index 0000000..f4a6d69
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q31.c
@@ -0,0 +1,663 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_correlate_q31.c
+*
+* Description: Correlation of Q31 sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Corr
+ * @{
+ */
+
+/**
+ * @brief Correlation of Q31 sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using an internal 64-bit accumulator.
+ * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ * There is no saturation on intermediate additions.
+ * Thus, if the accumulator overflows it wraps around and distorts the result.
+ * The input signals should be scaled down to avoid intermediate overflows.
+ * Scale down one of the inputs by 1/min(srcALen, srcBLen)to avoid overflows since a
+ * maximum of min(srcALen, srcBLen) number of additions is carried internally.
+ * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+ *
+ * \par
+ * See arm_correlate_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
+ */
+
+void arm_correlate_q31(
+ q31_t * pSrcA,
+ uint32_t srcALen,
+ q31_t * pSrcB,
+ uint32_t srcBLen,
+ q31_t * pDst)
+{
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t *pIn1; /* inputA pointer */
+ q31_t *pIn2; /* inputB pointer */
+ q31_t *pOut = pDst; /* output pointer */
+ q31_t *px; /* Intermediate inputA pointer */
+ q31_t *py; /* Intermediate inputB pointer */
+ q31_t *pSrc1; /* Intermediate pointers */
+ q63_t sum, acc0, acc1, acc2; /* Accumulators */
+ q31_t x0, x1, x2, c0; /* temporary variables for holding input and coefficient values */
+ uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
+ int32_t inc = 1; /* Destination address modifier */
+
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we include zeroes in the output instead of zero padding either of the the inputs*/
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcA);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcB);
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2u * srcALen) - 1u;
+
+ /* When srcALen > srcBLen, zero padding is done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1u));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcB);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcA);
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2u);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+
+ }
+
+ /* The function is internally
+ * divided into three parts according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first part of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second part of the algorithm, srcBLen number of multiplications are done.
+ * In the third part of the algorithm, the multiplications decrease by one
+ * for every iteration.*/
+ /* The algorithm is implemented in three stages.
+ * The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1u;
+ blockSize2 = srcALen - (srcBLen - 1u);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[srcBlen - 1]
+ * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
+ * ....
+ * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1u;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc1 = pIn2 + (srcBLen - 1u);
+ py = pSrc1;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while(blockSize1 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[0] * y[srcBLen - 4] */
+ sum += (q63_t) * px++ * (*py++);
+ /* x[1] * y[srcBLen - 3] */
+ sum += (q63_t) * px++ * (*py++);
+ /* x[2] * y[srcBLen - 2] */
+ sum += (q63_t) * px++ * (*py++);
+ /* x[3] * y[srcBLen - 1] */
+ sum += (q63_t) * px++ * (*py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* x[0] * y[srcBLen - 1] */
+ sum += (q63_t) * px++ * (*py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q31_t) (sum >> 31);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pSrc1 - count;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll by 3 */
+ blkCnt = blockSize2 / 3;
+
+ while(blkCnt > 0u)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+
+ /* read x[0], x[1] samples */
+ x0 = *(px++);
+ x1 = *(px++);
+
+ /* Apply loop unrolling and compute 3 MACs simultaneously. */
+ k = srcBLen / 3;
+
+ /* First part of the processing with loop unrolling. Compute 3 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 2 samples. */
+ do
+ {
+ /* Read y[0] sample */
+ c0 = *(py);
+
+ /* Read x[2] sample */
+ x2 = *(px);
+
+ /* Perform the multiply-accumulate */
+ /* acc0 += x[0] * y[0] */
+ acc0 += ((q63_t) x0 * c0);
+ /* acc1 += x[1] * y[0] */
+ acc1 += ((q63_t) x1 * c0);
+ /* acc2 += x[2] * y[0] */
+ acc2 += ((q63_t) x2 * c0);
+
+ /* Read y[1] sample */
+ c0 = *(py + 1u);
+
+ /* Read x[3] sample */
+ x0 = *(px + 1u);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[1] * y[1] */
+ acc0 += ((q63_t) x1 * c0);
+ /* acc1 += x[2] * y[1] */
+ acc1 += ((q63_t) x2 * c0);
+ /* acc2 += x[3] * y[1] */
+ acc2 += ((q63_t) x0 * c0);
+
+ /* Read y[2] sample */
+ c0 = *(py + 2u);
+
+ /* Read x[4] sample */
+ x1 = *(px + 2u);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[2] * y[2] */
+ acc0 += ((q63_t) x2 * c0);
+ /* acc1 += x[3] * y[2] */
+ acc1 += ((q63_t) x0 * c0);
+ /* acc2 += x[4] * y[2] */
+ acc2 += ((q63_t) x1 * c0);
+
+ /* update scratch pointers */
+ px += 3u;
+ py += 3u;
+
+ } while(--k);
+
+ /* If the srcBLen is not a multiple of 3, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen - (3 * (srcBLen / 3));
+
+ while(k > 0u)
+ {
+ /* Read y[4] sample */
+ c0 = *(py++);
+
+ /* Read x[7] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[4] */
+ acc0 += ((q63_t) x0 * c0);
+ /* acc1 += x[5] * y[4] */
+ acc1 += ((q63_t) x1 * c0);
+ /* acc2 += x[6] * y[4] */
+ acc2 += ((q63_t) x2 * c0);
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q31_t) (acc0 >> 31);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ *pOut = (q31_t) (acc1 >> 31);
+ pOut += inc;
+
+ *pOut = (q31_t) (acc2 >> 31);
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 3 */
+ count += 3u;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 3, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 - 3 * (blockSize2 / 3);
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) * px++ * (*py++);
+ sum += (q63_t) * px++ * (*py++);
+ sum += (q63_t) * px++ * (*py++);
+ sum += (q63_t) * px++ * (*py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) * px++ * (*py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q31_t) (sum >> 31);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over srcBLen */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) * px++ * (*py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q31_t) (sum >> 31);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * ....
+ * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+ * sum += x[srcALen-1] * y[0]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1u;
+
+ /* Working pointer of inputA */
+ pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while(blockSize3 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* sum += x[srcALen - srcBLen + 4] * y[3] */
+ sum += (q63_t) * px++ * (*py++);
+ /* sum += x[srcALen - srcBLen + 3] * y[2] */
+ sum += (q63_t) * px++ * (*py++);
+ /* sum += x[srcALen - srcBLen + 2] * y[1] */
+ sum += (q63_t) * px++ * (*py++);
+ /* sum += x[srcALen - srcBLen + 1] * y[0] */
+ sum += (q63_t) * px++ * (*py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) * px++ * (*py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q31_t) (sum >> 31);
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ q31_t *pIn1 = pSrcA; /* inputA pointer */
+ q31_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */
+ q63_t sum; /* Accumulators */
+ uint32_t i = 0u, j; /* loop counters */
+ uint32_t inv = 0u; /* Reverse order flag */
+ uint32_t tot = 0u; /* Length */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and a varaible, inv is set to 1 */
+ /* If lengths are not equal then zero pad has to be done to make the two
+ * inputs of same length. But to improve the performance, we include zeroes
+ * in the output instead of zero padding either of the the inputs*/
+ /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
+ * starting of the output buffer */
+ /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
+ * ending of the output buffer */
+ /* Once the zero padding is done the remaining of the output is calcualted
+ * using correlation but with the shorter signal time shifted. */
+
+ /* Calculate the length of the remaining sequence */
+ tot = ((srcALen + srcBLen) - 2u);
+
+ if(srcALen > srcBLen)
+ {
+ /* Calculating the number of zeros to be padded to the output */
+ j = srcALen - srcBLen;
+
+ /* Initialise the pointer after zero padding */
+ pDst += j;
+ }
+
+ else if(srcALen < srcBLen)
+ {
+ /* Initialization to inputB pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization to the end of inputA pointer */
+ pIn2 = pSrcA + (srcALen - 1u);
+
+ /* Initialisation of the pointer after zero padding */
+ pDst = pDst + tot;
+
+ /* Swapping the lengths */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+
+ /* Setting the reverse flag */
+ inv = 1;
+
+ }
+
+ /* Loop to calculate correlation for output length number of times */
+ for (i = 0u; i <= tot; i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to correlation equation */
+ for (j = 0u; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if((((i - j) < srcBLen) && (j < srcALen)))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q63_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
+ }
+ }
+ /* Store the output in the destination buffer */
+ if(inv == 1)
+ *pDst-- = (q31_t) (sum >> 31u);
+ else
+ *pDst++ = (q31_t) (sum >> 31u);
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of Corr group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q7.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q7.c
new file mode 100644
index 0000000..eba885a
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_correlate_q7.c
@@ -0,0 +1,788 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_correlate_q7.c
+*
+* Description: Correlation of Q7 sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.11 2011/10/18
+* Bug Fix in conv, correlation, partial convolution.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup Corr
+ * @{
+ */
+
+/**
+ * @brief Correlation of Q7 sequences.
+ * @param[in] *pSrcA points to the first input sequence.
+ * @param[in] srcALen length of the first input sequence.
+ * @param[in] *pSrcB points to the second input sequence.
+ * @param[in] srcBLen length of the second input sequence.
+ * @param[out] *pDst points to the location where the output result is written. Length 2 * max(srcALen, srcBLen) - 1.
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 32-bit internal accumulator.
+ * Both the inputs are represented in 1.7 format and multiplications yield a 2.14 result.
+ * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ * This approach provides 17 guard bits and there is no risk of overflow as long as max(srcALen, srcBLen)<131072.
+ * The 18.14 result is then truncated to 18.7 format by discarding the low 7 bits and saturated to 1.7 format.
+ *
+ * \par
+ * Refer the function arm_correlate_opt_q7() for a faster implementation of this function.
+ *
+ */
+
+void arm_correlate_q7(
+ q7_t * pSrcA,
+ uint32_t srcALen,
+ q7_t * pSrcB,
+ uint32_t srcBLen,
+ q7_t * pDst)
+{
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q7_t *pIn1; /* inputA pointer */
+ q7_t *pIn2; /* inputB pointer */
+ q7_t *pOut = pDst; /* output pointer */
+ q7_t *px; /* Intermediate inputA pointer */
+ q7_t *py; /* Intermediate inputB pointer */
+ q7_t *pSrc1; /* Intermediate pointers */
+ q31_t sum, acc0, acc1, acc2, acc3; /* Accumulators */
+ q31_t input1, input2; /* temporary variables */
+ q15_t in1, in2; /* temporary variables */
+ q7_t x0, x1, x2, x3, c0, c1; /* temporary variables for holding input and coefficient values */
+ uint32_t j, k = 0u, count, blkCnt, outBlockSize, blockSize1, blockSize2, blockSize3; /* loop counter */
+ int32_t inc = 1;
+
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and the destination pointer modifier, inc is set to -1 */
+ /* If srcALen > srcBLen, zero pad has to be done to srcB to make the two inputs of same length */
+ /* But to improve the performance,
+ * we include zeroes in the output instead of zero padding either of the the inputs*/
+ /* If srcALen > srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the starting of the output buffer */
+ /* If srcALen < srcBLen,
+ * (srcALen - srcBLen) zeroes has to included in the ending of the output buffer */
+ if(srcALen >= srcBLen)
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcA);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcB);
+
+ /* Number of output samples is calculated */
+ outBlockSize = (2u * srcALen) - 1u;
+
+ /* When srcALen > srcBLen, zero padding is done to srcB
+ * to make their lengths equal.
+ * Instead, (outBlockSize - (srcALen + srcBLen - 1))
+ * number of output samples are made zero */
+ j = outBlockSize - (srcALen + (srcBLen - 1u));
+
+ /* Updating the pointer position to non zero value */
+ pOut += j;
+
+ }
+ else
+ {
+ /* Initialization of inputA pointer */
+ pIn1 = (pSrcB);
+
+ /* Initialization of inputB pointer */
+ pIn2 = (pSrcA);
+
+ /* srcBLen is always considered as shorter or equal to srcALen */
+ j = srcBLen;
+ srcBLen = srcALen;
+ srcALen = j;
+
+ /* CORR(x, y) = Reverse order(CORR(y, x)) */
+ /* Hence set the destination pointer to point to the last output sample */
+ pOut = pDst + ((srcALen + srcBLen) - 2u);
+
+ /* Destination address modifier is set to -1 */
+ inc = -1;
+
+ }
+
+ /* The function is internally
+ * divided into three parts according to the number of multiplications that has to be
+ * taken place between inputA samples and inputB samples. In the first part of the
+ * algorithm, the multiplications increase by one for every iteration.
+ * In the second part of the algorithm, srcBLen number of multiplications are done.
+ * In the third part of the algorithm, the multiplications decrease by one
+ * for every iteration.*/
+ /* The algorithm is implemented in three stages.
+ * The loop counters of each stage is initiated here. */
+ blockSize1 = srcBLen - 1u;
+ blockSize2 = srcALen - (srcBLen - 1u);
+ blockSize3 = blockSize1;
+
+ /* --------------------------
+ * Initializations of stage1
+ * -------------------------*/
+
+ /* sum = x[0] * y[srcBlen - 1]
+ * sum = x[0] * y[srcBlen - 2] + x[1] * y[srcBlen - 1]
+ * ....
+ * sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen - 1] * y[srcBLen - 1]
+ */
+
+ /* In this stage the MAC operations are increased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = 1u;
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ pSrc1 = pIn2 + (srcBLen - 1u);
+ py = pSrc1;
+
+ /* ------------------------
+ * Stage1 process
+ * ----------------------*/
+
+ /* The first stage starts here */
+ while(blockSize1 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[0] , x[1] */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[srcBLen - 4] , y[srcBLen - 3] */
+ in1 = (q15_t) * py++;
+ in2 = (q15_t) * py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* x[0] * y[srcBLen - 4] */
+ /* x[1] * y[srcBLen - 3] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* x[2] , x[3] */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[srcBLen - 2] , y[srcBLen - 1] */
+ in1 = (q15_t) * py++;
+ in2 = (q15_t) * py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* x[2] * y[srcBLen - 2] */
+ /* x[3] * y[srcBLen - 1] */
+ sum = __SMLAD(input1, input2, sum);
+
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ /* x[0] * y[srcBLen - 1] */
+ sum += (q31_t) ((q15_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q7_t) (__SSAT(sum >> 7, 8));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ py = pSrc1 - count;
+ px = pIn1;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Decrement the loop counter */
+ blockSize1--;
+ }
+
+ /* --------------------------
+ * Initializations of stage2
+ * ------------------------*/
+
+ /* sum = x[0] * y[0] + x[1] * y[1] +...+ x[srcBLen-1] * y[srcBLen-1]
+ * sum = x[1] * y[0] + x[2] * y[1] +...+ x[srcBLen] * y[srcBLen-1]
+ * ....
+ * sum = x[srcALen-srcBLen-2] * y[0] + x[srcALen-srcBLen-1] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ */
+
+ /* Working pointer of inputA */
+ px = pIn1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* count is index by which the pointer pIn1 to be incremented */
+ count = 0u;
+
+ /* -------------------
+ * Stage2 process
+ * ------------------*/
+
+ /* Stage2 depends on srcBLen as in this stage srcBLen number of MACS are performed.
+ * So, to loop unroll over blockSize2,
+ * srcBLen should be greater than or equal to 4 */
+ if(srcBLen >= 4u)
+ {
+ /* Loop unroll over blockSize2, by 4 */
+ blkCnt = blockSize2 >> 2u;
+
+ while(blkCnt > 0u)
+ {
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* read x[0], x[1], x[2] samples */
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ do
+ {
+ /* Read y[0] sample */
+ c0 = *py++;
+ /* Read y[1] sample */
+ c1 = *py++;
+
+ /* Read x[3] sample */
+ x3 = *px++;
+
+ /* x[0] and x[1] are packed */
+ in1 = (q15_t) x0;
+ in2 = (q15_t) x1;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[0] and y[1] are packed */
+ in1 = (q15_t) c0;
+ in2 = (q15_t) c1;
+
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc0 += x[0] * y[0] + x[1] * y[1] */
+ acc0 = __SMLAD(input1, input2, acc0);
+
+ /* x[1] and x[2] are packed */
+ in1 = (q15_t) x1;
+ in2 = (q15_t) x2;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc1 += x[1] * y[0] + x[2] * y[1] */
+ acc1 = __SMLAD(input1, input2, acc1);
+
+ /* x[2] and x[3] are packed */
+ in1 = (q15_t) x2;
+ in2 = (q15_t) x3;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc2 += x[2] * y[0] + x[3] * y[1] */
+ acc2 = __SMLAD(input1, input2, acc2);
+
+ /* Read x[4] sample */
+ x0 = *(px++);
+
+ /* x[3] and x[4] are packed */
+ in1 = (q15_t) x3;
+ in2 = (q15_t) x0;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc3 += x[3] * y[0] + x[4] * y[1] */
+ acc3 = __SMLAD(input1, input2, acc3);
+
+ /* Read y[2] sample */
+ c0 = *py++;
+ /* Read y[3] sample */
+ c1 = *py++;
+
+ /* Read x[5] sample */
+ x1 = *px++;
+
+ /* x[2] and x[3] are packed */
+ in1 = (q15_t) x2;
+ in2 = (q15_t) x3;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[2] and y[3] are packed */
+ in1 = (q15_t) c0;
+ in2 = (q15_t) c1;
+
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc0 += x[2] * y[2] + x[3] * y[3] */
+ acc0 = __SMLAD(input1, input2, acc0);
+
+ /* x[3] and x[4] are packed */
+ in1 = (q15_t) x3;
+ in2 = (q15_t) x0;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc1 += x[3] * y[2] + x[4] * y[3] */
+ acc1 = __SMLAD(input1, input2, acc1);
+
+ /* x[4] and x[5] are packed */
+ in1 = (q15_t) x0;
+ in2 = (q15_t) x1;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc2 += x[4] * y[2] + x[5] * y[3] */
+ acc2 = __SMLAD(input1, input2, acc2);
+
+ /* Read x[6] sample */
+ x2 = *px++;
+
+ /* x[5] and x[6] are packed */
+ in1 = (q15_t) x1;
+ in2 = (q15_t) x2;
+
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* acc3 += x[5] * y[2] + x[6] * y[3] */
+ acc3 = __SMLAD(input1, input2, acc3);
+
+ } while(--k);
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Read y[4] sample */
+ c0 = *py++;
+
+ /* Read x[7] sample */
+ x3 = *px++;
+
+ /* Perform the multiply-accumulates */
+ /* acc0 += x[4] * y[4] */
+ acc0 += ((q15_t) x0 * c0);
+ /* acc1 += x[5] * y[4] */
+ acc1 += ((q15_t) x1 * c0);
+ /* acc2 += x[6] * y[4] */
+ acc2 += ((q15_t) x2 * c0);
+ /* acc3 += x[7] * y[4] */
+ acc3 += ((q15_t) x3 * c0);
+
+ /* Reuse the present samples for the next MAC */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q7_t) (__SSAT(acc0 >> 7, 8));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ *pOut = (q7_t) (__SSAT(acc1 >> 7, 8));
+ pOut += inc;
+
+ *pOut = (q7_t) (__SSAT(acc2 >> 7, 8));
+ pOut += inc;
+
+ *pOut = (q7_t) (__SSAT(acc3 >> 7, 8));
+ pOut += inc;
+
+ count += 4u;
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize2 is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize2 % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = srcBLen >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* Reading two inputs of SrcA buffer and packing */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Reading two inputs of SrcB buffer and packing */
+ in1 = (q15_t) * py++;
+ in2 = (q15_t) * py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Reading two inputs of SrcA buffer and packing */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Reading two inputs of SrcB buffer and packing */
+ in1 = (q15_t) * py++;
+ in2 = (q15_t) * py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* Perform the multiply-accumulates */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the srcBLen is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = srcBLen % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q15_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q7_t) (__SSAT(sum >> 7, 8));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the pointer pIn1 index, count by 1 */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+ else
+ {
+ /* If the srcBLen is not a multiple of 4,
+ * the blockSize2 loop cannot be unrolled by 4 */
+ blkCnt = blockSize2;
+
+ while(blkCnt > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Loop over srcBLen */
+ k = srcBLen;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += ((q15_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q7_t) (__SSAT(sum >> 7, 8));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Increment the MAC count */
+ count++;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = pIn1 + count;
+ py = pIn2;
+
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+ }
+
+ /* --------------------------
+ * Initializations of stage3
+ * -------------------------*/
+
+ /* sum += x[srcALen-srcBLen+1] * y[0] + x[srcALen-srcBLen+2] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * sum += x[srcALen-srcBLen+2] * y[0] + x[srcALen-srcBLen+3] * y[1] +...+ x[srcALen-1] * y[srcBLen-1]
+ * ....
+ * sum += x[srcALen-2] * y[0] + x[srcALen-1] * y[1]
+ * sum += x[srcALen-1] * y[0]
+ */
+
+ /* In this stage the MAC operations are decreased by 1 for every iteration.
+ The count variable holds the number of MAC operations performed */
+ count = srcBLen - 1u;
+
+ /* Working pointer of inputA */
+ pSrc1 = pIn1 + (srcALen - (srcBLen - 1u));
+ px = pSrc1;
+
+ /* Working pointer of inputB */
+ py = pIn2;
+
+ /* -------------------
+ * Stage3 process
+ * ------------------*/
+
+ while(blockSize3 > 0u)
+ {
+ /* Accumulator is made zero for every iteration */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ k = count >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 MACs at a time.
+ ** a second loop below computes MACs for the remaining 1 to 3 samples. */
+ while(k > 0u)
+ {
+ /* x[srcALen - srcBLen + 1] , x[srcALen - srcBLen + 2] */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[0] , y[1] */
+ in1 = (q15_t) * py++;
+ in2 = (q15_t) * py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* sum += x[srcALen - srcBLen + 1] * y[0] */
+ /* sum += x[srcALen - srcBLen + 2] * y[1] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* x[srcALen - srcBLen + 3] , x[srcALen - srcBLen + 4] */
+ in1 = (q15_t) * px++;
+ in2 = (q15_t) * px++;
+ input1 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* y[2] , y[3] */
+ in1 = (q15_t) * py++;
+ in2 = (q15_t) * py++;
+ input2 = ((q31_t) in1 & 0x0000FFFF) | ((q31_t) in2 << 16);
+
+ /* sum += x[srcALen - srcBLen + 3] * y[2] */
+ /* sum += x[srcALen - srcBLen + 4] * y[3] */
+ sum = __SMLAD(input1, input2, sum);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* If the count is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ k = count % 0x4u;
+
+ while(k > 0u)
+ {
+ /* Perform the multiply-accumulates */
+ sum += ((q15_t) * px++ * *py++);
+
+ /* Decrement the loop counter */
+ k--;
+ }
+
+ /* Store the result in the accumulator in the destination buffer. */
+ *pOut = (q7_t) (__SSAT(sum >> 7, 8));
+ /* Destination pointer is updated according to the address modifier, inc */
+ pOut += inc;
+
+ /* Update the inputA and inputB pointers for next MAC calculation */
+ px = ++pSrc1;
+ py = pIn2;
+
+ /* Decrement the MAC count */
+ count--;
+
+ /* Decrement the loop counter */
+ blockSize3--;
+ }
+
+#else
+
+/* Run the below code for Cortex-M0 */
+
+ q7_t *pIn1 = pSrcA; /* inputA pointer */
+ q7_t *pIn2 = pSrcB + (srcBLen - 1u); /* inputB pointer */
+ q31_t sum; /* Accumulator */
+ uint32_t i = 0u, j; /* loop counters */
+ uint32_t inv = 0u; /* Reverse order flag */
+ uint32_t tot = 0u; /* Length */
+
+ /* The algorithm implementation is based on the lengths of the inputs. */
+ /* srcB is always made to slide across srcA. */
+ /* So srcBLen is always considered as shorter or equal to srcALen */
+ /* But CORR(x, y) is reverse of CORR(y, x) */
+ /* So, when srcBLen > srcALen, output pointer is made to point to the end of the output buffer */
+ /* and a varaible, inv is set to 1 */
+ /* If lengths are not equal then zero pad has to be done to make the two
+ * inputs of same length. But to improve the performance, we include zeroes
+ * in the output instead of zero padding either of the the inputs*/
+ /* If srcALen > srcBLen, (srcALen - srcBLen) zeroes has to included in the
+ * starting of the output buffer */
+ /* If srcALen < srcBLen, (srcALen - srcBLen) zeroes has to included in the
+ * ending of the output buffer */
+ /* Once the zero padding is done the remaining of the output is calcualted
+ * using convolution but with the shorter signal time shifted. */
+
+ /* Calculate the length of the remaining sequence */
+ tot = ((srcALen + srcBLen) - 2u);
+
+ if(srcALen > srcBLen)
+ {
+ /* Calculating the number of zeros to be padded to the output */
+ j = srcALen - srcBLen;
+
+ /* Initialise the pointer after zero padding */
+ pDst += j;
+ }
+
+ else if(srcALen < srcBLen)
+ {
+ /* Initialization to inputB pointer */
+ pIn1 = pSrcB;
+
+ /* Initialization to the end of inputA pointer */
+ pIn2 = pSrcA + (srcALen - 1u);
+
+ /* Initialisation of the pointer after zero padding */
+ pDst = pDst + tot;
+
+ /* Swapping the lengths */
+ j = srcALen;
+ srcALen = srcBLen;
+ srcBLen = j;
+
+ /* Setting the reverse flag */
+ inv = 1;
+
+ }
+
+ /* Loop to calculate convolution for output length number of times */
+ for (i = 0u; i <= tot; i++)
+ {
+ /* Initialize sum with zero to carry on MAC operations */
+ sum = 0;
+
+ /* Loop to perform MAC operations according to convolution equation */
+ for (j = 0u; j <= i; j++)
+ {
+ /* Check the array limitations */
+ if((((i - j) < srcBLen) && (j < srcALen)))
+ {
+ /* z[i] += x[i-j] * y[j] */
+ sum += ((q15_t) pIn1[j] * pIn2[-((int32_t) i - j)]);
+ }
+ }
+ /* Store the output in the destination buffer */
+ if(inv == 1)
+ *pDst-- = (q7_t) __SSAT((sum >> 7u), 8u);
+ else
+ *pDst++ = (q7_t) __SSAT((sum >> 7u), 8u);
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of Corr group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_f32.c
new file mode 100644
index 0000000..4d1d709
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_f32.c
@@ -0,0 +1,517 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_decimate_f32.c
+*
+* Description: FIR decimation for floating-point sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup FIR_decimate Finite Impulse Response (FIR) Decimator
+ *
+ * These functions combine an FIR filter together with a decimator.
+ * They are used in multirate systems for reducing the sample rate of a signal without introducing aliasing distortion.
+ * Conceptually, the functions are equivalent to the block diagram below:
+ * \image html FIRDecimator.gif "Components included in the FIR Decimator functions"
+ * When decimating by a factor of M, the signal should be prefiltered by a lowpass filter with a normalized
+ * cutoff frequency of 1/M in order to prevent aliasing distortion.
+ * The user of the function is responsible for providing the filter coefficients.
+ *
+ * The FIR decimator functions provided in the CMSIS DSP Library combine the FIR filter and the decimator in an efficient manner.
+ * Instead of calculating all of the FIR filter outputs and discarding M-1 out of every M, only the
+ * samples output by the decimator are computed.
+ * The functions operate on blocks of input and output data.
+ * pSrc points to an array of blockSize input values and
+ * pDst points to an array of blockSize/M output values.
+ * In order to have an integer number of output samples blockSize
+ * must always be a multiple of the decimation factor M.
+ *
+ * The library provides separate functions for Q15, Q31 and floating-point data types.
+ *
+ * \par Algorithm:
+ * The FIR portion of the algorithm uses the standard form filter:
+ *
+ * where, b[n] are the filter coefficients.
+ * \par
+ * The pCoeffs points to a coefficient array of size numTaps.
+ * Coefficients are stored in time reversed order.
+ * \par
+ *
+ * The state variables are updated after each block of data is processed, the coefficients are untouched.
+ *
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient arrays may be shared among several instances while state variable array should be allocated separately.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ * - Checks to make sure that the size of the input is a multiple of the decimation factor.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * The code below statically initializes each of the 3 different data type filter instance structures
+ *
+ *arm_fir_decimate_instance_f32 S = {M, numTaps, pCoeffs, pState};
+ *arm_fir_decimate_instance_q31 S = {M, numTaps, pCoeffs, pState};
+ *arm_fir_decimate_instance_q15 S = {M, numTaps, pCoeffs, pState};
+ *
+ * where M is the decimation factor; numTaps is the number of filter coefficients in the filter;
+ * pCoeffs is the address of the coefficient buffer;
+ * pState is the address of the state buffer.
+ * Be sure to set the values in the state buffer to zeros when doing static initialization.
+ *
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the fixed-point versions of the FIR decimate filter functions.
+ * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ * @addtogroup FIR_decimate
+ * @{
+ */
+
+ /**
+ * @brief Processing function for the floating-point FIR decimator.
+ * @param[in] *S points to an instance of the floating-point FIR decimator structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return none.
+ */
+
+void arm_fir_decimate_f32(
+ const arm_fir_decimate_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ float32_t sum0; /* Accumulator */
+ float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+
+#ifndef ARM_MATH_CM0
+
+ uint32_t blkCntN4;
+ float32_t *px0, *px1, *px2, *px3;
+ float32_t acc0, acc1, acc2, acc3;
+ float32_t x1, x2, x3;
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* S->pState buffer contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + (numTaps - 1u);
+
+ /* Total number of output samples to be computed */
+ blkCnt = outBlockSize / 4;
+ blkCntN4 = outBlockSize - (4 * blkCnt);
+
+ while(blkCnt > 0u)
+ {
+ /* Copy 4 * decimation factor number of new input samples into the state buffer */
+ i = 4 * S->M;
+
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+
+ } while(--i);
+
+ /* Set accumulators to zero */
+ acc0 = 0.0f;
+ acc1 = 0.0f;
+ acc2 = 0.0f;
+ acc3 = 0.0f;
+
+ /* Initialize state pointer for all the samples */
+ px0 = pState;
+ px1 = pState + S->M;
+ px2 = pState + 2 * S->M;
+ px3 = pState + 3 * S->M;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+
+ while(tapCnt > 0u)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-1] sample for acc0 */
+ x0 = *(px0++);
+ /* Read x[n-numTaps-1] sample for acc1 */
+ x1 = *(px1++);
+ /* Read x[n-numTaps-1] sample for acc2 */
+ x2 = *(px2++);
+ /* Read x[n-numTaps-1] sample for acc3 */
+ x3 = *(px3++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+ acc2 += x2 * c0;
+ acc3 += x3 * c0;
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-2] sample for acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+ acc2 += x2 * c0;
+ acc3 += x3 * c0;
+
+ /* Read the b[numTaps-3] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-3] sample acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+ acc2 += x2 * c0;
+ acc3 += x3 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-4] sample acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+ acc2 += x2 * c0;
+ acc3 += x3 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch state variables for acc0, acc1, acc2, acc3 */
+ x0 = *(px0++);
+ x1 = *(px1++);
+ x2 = *(px2++);
+ x3 = *(px3++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+ acc2 += x2 * c0;
+ acc3 += x3 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + 4 * S->M;
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = acc0;
+ *pDst++ = acc1;
+ *pDst++ = acc2;
+ *pDst++ = acc3;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ while(blkCntN4 > 0u)
+ {
+ /* Copy decimation factor number of new input samples into the state buffer */
+ i = S->M;
+
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+
+ } while(--i);
+
+ /* Set accumulator to zero */
+ sum0 = 0.0f;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+ while(tapCnt > 0u)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-1] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-2] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the b[numTaps-3] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-3] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-4] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch 1 state variable */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M;
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = sum0;
+
+ /* Decrement the loop counter */
+ blkCntN4--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ i = (numTaps - 1u) >> 2;
+
+ /* copy data */
+ while(i > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ i = (numTaps - 1u) % 0x04u;
+
+ /* copy data */
+ while(i > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+#else
+
+/* Run the below code for Cortex-M0 */
+
+ /* S->pState buffer contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + (numTaps - 1u);
+
+ /* Total number of output samples to be computed */
+ blkCnt = outBlockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* Copy decimation factor number of new input samples into the state buffer */
+ i = S->M;
+
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+
+ } while(--i);
+
+ /* Set accumulator to zero */
+ sum0 = 0.0f;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+ tapCnt = numTaps;
+
+ while(tapCnt > 0u)
+ {
+ /* Read coefficients */
+ c0 = *pb++;
+
+ /* Fetch 1 state variable */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M;
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = sum0;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the start of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ /* Copy numTaps number of values */
+ i = (numTaps - 1u);
+
+ /* copy data */
+ while(i > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of FIR_decimate group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c
new file mode 100644
index 0000000..81cc7df
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_fast_q15.c
@@ -0,0 +1,589 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_decimate_fast_q15.c
+*
+* Description: Fast Q15 FIR Decimator.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_decimate
+ * @{
+ */
+
+/**
+ * @brief Processing function for the Q15 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
+ * @param[in] *S points to an instance of the Q15 FIR decimator structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data
+ * @param[in] blockSize number of input samples to process per call.
+ * @return none
+ *
+ * \par Restrictions
+ * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
+ * In this case input, output, state buffers should be aligned by 32-bit
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * This fast version uses a 32-bit accumulator with 2.30 format.
+ * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ * Thus, if the accumulator result overflows it wraps around and distorts the result.
+ * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (log2 is read as log to the base 2).
+ * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
+ *
+ * \par
+ * Refer to the function arm_fir_decimate_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion.
+ * Both the slow and the fast versions use the same instance structure.
+ * Use the function arm_fir_decimate_init_q15() to initialize the filter structure.
+ */
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+void arm_fir_decimate_fast_q15(
+ const arm_fir_decimate_instance_q15 * S,
+ q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q15_t *px; /* Temporary pointer for state buffer */
+ q15_t *pb; /* Temporary pointer coefficient buffer */
+ q31_t x0, x1, c0, c1; /* Temporary variables to hold state and coefficient values */
+ q31_t sum0; /* Accumulators */
+ q31_t acc0, acc1;
+ q15_t *px0, *px1;
+ uint32_t blkCntN3;
+ uint32_t numTaps = S->numTaps; /* Number of taps */
+ uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+
+
+ /* S->pState buffer contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + (numTaps - 1u);
+
+
+ /* Total number of output samples to be computed */
+ blkCnt = outBlockSize / 2;
+ blkCntN3 = outBlockSize - (2 * blkCnt);
+
+
+ while(blkCnt > 0u)
+ {
+ /* Copy decimation factor number of new input samples into the state buffer */
+ i = 2 * S->M;
+
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+
+ } while(--i);
+
+ /* Set accumulator to zero */
+ acc0 = 0;
+ acc1 = 0;
+
+ /* Initialize state pointer */
+ px0 = pState;
+
+ px1 = pState + S->M;
+
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+ while(tapCnt > 0u)
+ {
+ /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */
+ c0 = *__SIMD32(pb)++;
+
+ /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
+ x0 = *__SIMD32(px0)++;
+
+ x1 = *__SIMD32(px1)++;
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLAD(x0, c0, acc0);
+
+ acc1 = __SMLAD(x1, c0, acc1);
+
+ /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
+ c0 = *__SIMD32(pb)++;
+
+ /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
+ x0 = *__SIMD32(px0)++;
+
+ x1 = *__SIMD32(px1)++;
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLAD(x0, c0, acc0);
+
+ acc1 = __SMLAD(x1, c0, acc1);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Read coefficients */
+ c0 = *pb++;
+
+ /* Fetch 1 state variable */
+ x0 = *px0++;
+
+ x1 = *px1++;
+
+ /* Perform the multiply-accumulate */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc1 = __SMLAD(x1, c0, acc1);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M * 2;
+
+ /* Store filter output, smlad returns the values in 2.14 format */
+ /* so downsacle by 15 to get output in 1.15 */
+ *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+ *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+
+
+ while(blkCntN3 > 0u)
+ {
+ /* Copy decimation factor number of new input samples into the state buffer */
+ i = S->M;
+
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+
+ } while(--i);
+
+ /*Set sum to zero */
+ sum0 = 0;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+ while(tapCnt > 0u)
+ {
+ /* Read the Read b[numTaps-1] and b[numTaps-2] coefficients */
+ c0 = *__SIMD32(pb)++;
+
+ /* Read x[n-numTaps-1] and x[n-numTaps-2]sample */
+ x0 = *__SIMD32(px)++;
+
+ /* Read the b[numTaps-3] and b[numTaps-4] coefficient */
+ c1 = *__SIMD32(pb)++;
+
+ /* Perform the multiply-accumulate */
+ sum0 = __SMLAD(x0, c0, sum0);
+
+ /* Read x[n-numTaps-2] and x[n-numTaps-3] sample */
+ x0 = *__SIMD32(px)++;
+
+ /* Perform the multiply-accumulate */
+ sum0 = __SMLAD(x0, c1, sum0);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Read coefficients */
+ c0 = *pb++;
+
+ /* Fetch 1 state variable */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ sum0 = __SMLAD(x0, c0, sum0);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M;
+
+ /* Store filter output, smlad returns the values in 2.14 format */
+ /* so downsacle by 15 to get output in 1.15 */
+ *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
+
+ /* Decrement the loop counter */
+ blkCntN3--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ i = (numTaps - 1u) >> 2u;
+
+ /* copy data */
+ while(i > 0u)
+ {
+ *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
+ *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ i = (numTaps - 1u) % 0x04u;
+
+ /* copy data */
+ while(i > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+}
+
+#else
+
+
+void arm_fir_decimate_fast_q15(
+ const arm_fir_decimate_instance_q15 * S,
+ q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q15_t *px; /* Temporary pointer for state buffer */
+ q15_t *pb; /* Temporary pointer coefficient buffer */
+ q15_t x0, x1, c0; /* Temporary variables to hold state and coefficient values */
+ q31_t sum0; /* Accumulators */
+ q31_t acc0, acc1;
+ q15_t *px0, *px1;
+ uint32_t blkCntN3;
+ uint32_t numTaps = S->numTaps; /* Number of taps */
+ uint32_t i, blkCnt, tapCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+
+
+ /* S->pState buffer contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + (numTaps - 1u);
+
+
+ /* Total number of output samples to be computed */
+ blkCnt = outBlockSize / 2;
+ blkCntN3 = outBlockSize - (2 * blkCnt);
+
+ while(blkCnt > 0u)
+ {
+ /* Copy decimation factor number of new input samples into the state buffer */
+ i = 2 * S->M;
+
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+
+ } while(--i);
+
+ /* Set accumulator to zero */
+ acc0 = 0;
+ acc1 = 0;
+
+ /* Initialize state pointer */
+ px0 = pState;
+
+ px1 = pState + S->M;
+
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+ while(tapCnt > 0u)
+ {
+ /* Read the Read b[numTaps-1] coefficients */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-1] for sample 0 and for sample 1 */
+ x0 = *px0++;
+ x1 = *px1++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-2] for sample 0 and sample 1 */
+ x0 = *px0++;
+ x1 = *px1++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+
+ /* Read the b[numTaps-3] coefficients */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-3] for sample 0 and sample 1 */
+ x0 = *px0++;
+ x1 = *px1++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-4] for sample 0 and sample 1 */
+ x0 = *px0++;
+ x1 = *px1++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Read coefficients */
+ c0 = *pb++;
+
+ /* Fetch 1 state variable */
+ x0 = *px0++;
+ x1 = *px1++;
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M * 2;
+
+ /* Store filter output, smlad returns the values in 2.14 format */
+ /* so downsacle by 15 to get output in 1.15 */
+
+ *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+ *pDst++ = (q15_t) (__SSAT((acc1 >> 15), 16));
+
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ while(blkCntN3 > 0u)
+ {
+ /* Copy decimation factor number of new input samples into the state buffer */
+ i = S->M;
+
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+
+ } while(--i);
+
+ /*Set sum to zero */
+ sum0 = 0;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+ while(tapCnt > 0u)
+ {
+ /* Read the Read b[numTaps-1] coefficients */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-1] and sample */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-2] and sample */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the b[numTaps-3] coefficients */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-3] sample */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *pb++;
+
+ /* Read x[n-numTaps-4] sample */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Read coefficients */
+ c0 = *pb++;
+
+ /* Fetch 1 state variable */
+ x0 = *px++;
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M;
+
+ /* Store filter output, smlad returns the values in 2.14 format */
+ /* so downsacle by 15 to get output in 1.15 */
+ *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
+
+ /* Decrement the loop counter */
+ blkCntN3--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ i = (numTaps - 1u) >> 2u;
+
+ /* copy data */
+ while(i > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ i = (numTaps - 1u) % 0x04u;
+
+ /* copy data */
+ while(i > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+}
+
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+/**
+ * @} end of FIR_decimate group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c
new file mode 100644
index 0000000..63587c7
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_fast_q31.c
@@ -0,0 +1,342 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_decimate_fast_q31.c
+*
+* Description: Fast Q31 FIR Decimator.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_decimate
+ * @{
+ */
+
+/**
+ * @brief Processing function for the Q31 FIR decimator (fast variant) for Cortex-M3 and Cortex-M4.
+ * @param[in] *S points to an instance of the Q31 FIR decimator structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data
+ * @param[in] blockSize number of input samples to process per call.
+ * @return none
+ *
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
+ * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
+ * These intermediate results are added to a 2.30 accumulator.
+ * Finally, the accumulator is saturated and converted to a 1.31 result.
+ * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
+ * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits (where log2 is read as log to the base 2).
+ *
+ * \par
+ * Refer to the function arm_fir_decimate_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision.
+ * Both the slow and the fast versions use the same instance structure.
+ * Use the function arm_fir_decimate_init_q31() to initialize the filter structure.
+ */
+
+void arm_fir_decimate_fast_q31(
+ arm_fir_decimate_instance_q31 * S,
+ q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCurnt; /* Points to the current sample of the state */
+ q31_t x0, c0; /* Temporary variables to hold state and coefficient values */
+ q31_t *px; /* Temporary pointers for state buffer */
+ q31_t *pb; /* Temporary pointers for coefficient buffer */
+ q31_t sum0; /* Accumulator */
+ uint32_t numTaps = S->numTaps; /* Number of taps */
+ uint32_t i, tapCnt, blkCnt, outBlockSize = blockSize / S->M; /* Loop counters */
+ uint32_t blkCntN2;
+ q31_t x1;
+ q31_t acc0, acc1;
+ q31_t *px0, *px1;
+
+ /* S->pState buffer contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + (numTaps - 1u);
+
+ /* Total number of output samples to be computed */
+
+ blkCnt = outBlockSize / 2;
+ blkCntN2 = outBlockSize - (2 * blkCnt);
+
+ while(blkCnt > 0u)
+ {
+ /* Copy decimation factor number of new input samples into the state buffer */
+ i = 2 * S->M;
+
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+
+ } while(--i);
+
+ /* Set accumulator to zero */
+ acc0 = 0;
+ acc1 = 0;
+
+ /* Initialize state pointer */
+ px0 = pState;
+ px1 = pState + S->M;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+ while(tapCnt > 0u)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0 = *(pb);
+
+ /* Read x[n-numTaps-1] for sample 0 sample 1 */
+ x0 = *(px0);
+ x1 = *(px1);
+
+ /* Perform the multiply-accumulate */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *(pb + 1u);
+
+ /* Read x[n-numTaps-2] for sample 0 sample 1 */
+ x0 = *(px0 + 1u);
+ x1 = *(px1 + 1u);
+
+ /* Perform the multiply-accumulate */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+
+ /* Read the b[numTaps-3] coefficient */
+ c0 = *(pb + 2u);
+
+ /* Read x[n-numTaps-3] for sample 0 sample 1 */
+ x0 = *(px0 + 2u);
+ x1 = *(px1 + 2u);
+ pb += 4u;
+
+ /* Perform the multiply-accumulate */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb - 1u);
+
+ /* Read x[n-numTaps-4] for sample 0 sample 1 */
+ x0 = *(px0 + 3u);
+ x1 = *(px1 + 3u);
+
+
+ /* Perform the multiply-accumulate */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+
+ /* update state pointers */
+ px0 += 4u;
+ px1 += 4u;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch 1 state variable */
+ x0 = *(px0++);
+ x1 = *(px1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M * 2;
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = (q31_t) (acc0 << 1);
+ *pDst++ = (q31_t) (acc1 << 1);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ while(blkCntN2 > 0u)
+ {
+ /* Copy decimation factor number of new input samples into the state buffer */
+ i = S->M;
+
+ do
+ {
+ *pStateCurnt++ = *pSrc++;
+
+ } while(--i);
+
+ /* Set accumulator to zero */
+ sum0 = 0;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = pCoeffs;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+ while(tapCnt > 0u)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-1] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-2] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+ /* Read the b[numTaps-3] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-3] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-4] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch 1 state variable */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ sum0 = (q31_t) ((((q63_t) sum0 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by the decimation factor
+ * to process the next group of decimation factor number samples */
+ pState = pState + S->M;
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = (q31_t) (sum0 << 1);
+
+ /* Decrement the loop counter */
+ blkCntN2--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ i = (numTaps - 1u) >> 2u;
+
+ /* copy data */
+ while(i > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ i = (numTaps - 1u) % 0x04u;
+
+ /* copy data */
+ while(i > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+}
+
+/**
+ * @} end of FIR_decimate group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_f32.c
new file mode 100644
index 0000000..41e2eb3
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_f32.c
@@ -0,0 +1,111 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_decimate_init_f32.c
+*
+* Description: Floating-point FIR Decimator initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_decimate
+ * @{
+ */
+
+/**
+ * @brief Initialization function for the floating-point FIR decimator.
+ * @param[in,out] *S points to an instance of the floating-point FIR decimator structure.
+ * @param[in] numTaps number of coefficients in the filter.
+ * @param[in] M decimation factor.
+ * @param[in] *pCoeffs points to the filter coefficients.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if
+ * blockSize is not a multiple of M.
+ *
+ * Description:
+ * \par
+ * pCoeffs points to the array of filter coefficients stored in time reversed order:
+ *
+ * \par
+ * pState points to the array of state variables.
+ * pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples passed to arm_fir_decimate_f32().
+ * M is the decimation factor.
+ */
+
+arm_status arm_fir_decimate_init_f32(
+ arm_fir_decimate_instance_f32 * S,
+ uint16_t numTaps,
+ uint8_t M,
+ float32_t * pCoeffs,
+ float32_t * pState,
+ uint32_t blockSize)
+{
+ arm_status status;
+
+ /* The size of the input block must be a multiple of the decimation factor */
+ if((blockSize % M) != 0u)
+ {
+ /* Set status as ARM_MATH_LENGTH_ERROR */
+ status = ARM_MATH_LENGTH_ERROR;
+ }
+ else
+ {
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and size is always (blockSize + numTaps - 1) */
+ memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ /* Assign Decimation Factor */
+ S->M = M;
+
+ status = ARM_MATH_SUCCESS;
+ }
+
+ return (status);
+
+}
+
+/**
+ * @} end of FIR_decimate group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_q15.c
new file mode 100644
index 0000000..7b168cc
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_q15.c
@@ -0,0 +1,113 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_decimate_init_q15.c
+*
+* Description: Initialization function for the Q15 FIR Decimator.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_decimate
+ * @{
+ */
+
+/**
+ * @brief Initialization function for the Q15 FIR decimator.
+ * @param[in,out] *S points to an instance of the Q15 FIR decimator structure.
+ * @param[in] numTaps number of coefficients in the filter.
+ * @param[in] M decimation factor.
+ * @param[in] *pCoeffs points to the filter coefficients.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if
+ * blockSize is not a multiple of M.
+ *
+ * Description:
+ * \par
+ * pCoeffs points to the array of filter coefficients stored in time reversed order:
+ *
+ * \par
+ * pState points to the array of state variables.
+ * pState is of length numTaps+blockSize-1 words where blockSize is the number of input samples
+ * to the call arm_fir_decimate_q15().
+ * M is the decimation factor.
+ */
+
+arm_status arm_fir_decimate_init_q15(
+ arm_fir_decimate_instance_q15 * S,
+ uint16_t numTaps,
+ uint8_t M,
+ q15_t * pCoeffs,
+ q15_t * pState,
+ uint32_t blockSize)
+{
+
+ arm_status status;
+
+ /* The size of the input block must be a multiple of the decimation factor */
+ if((blockSize % M) != 0u)
+ {
+ /* Set status as ARM_MATH_LENGTH_ERROR */
+ status = ARM_MATH_LENGTH_ERROR;
+ }
+ else
+ {
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear the state buffer. The size of buffer is always (blockSize + numTaps - 1) */
+ memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(q15_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ /* Assign Decimation factor */
+ S->M = M;
+
+ status = ARM_MATH_SUCCESS;
+ }
+
+ return (status);
+
+}
+
+/**
+ * @} end of FIR_decimate group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_q31.c
new file mode 100644
index 0000000..494b744
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_decimate_init_q31.c
@@ -0,0 +1,111 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_decimate_init_q31.c
+*
+* Description: Initialization function for Q31 FIR Decimation filter.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_decimate
+ * @{
+ */
+
+/**
+ * @brief Initialization function for the Q31 FIR decimator.
+ * @param[in,out] *S points to an instance of the Q31 FIR decimator structure.
+ * @param[in] numTaps number of coefficients in the filter.
+ * @param[in] M decimation factor.
+ * @param[in] *pCoeffs points to the filter coefficients.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if
+ * blockSize is not a multiple of M.
+ *
+ * Description:
+ * \par
+ * pCoeffs points to the array of filter coefficients stored in time reversed order:
+ *
+ * \par
+ * pState points to a state array of size numTaps + blockSize - 1.
+ * Samples in the state buffer are stored in the following order.
+ * \par
+ *
+ * \par
+ * Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1.
+ * The increased state buffer length allows circular addressing, which is traditionally used in the FIR filters,
+ * to be avoided and yields a significant speed improvement.
+ * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ * There are separate instance structure declarations for each of the 4 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Set the values in the state buffer to zeros before static initialization.
+ * The code below statically initializes each of the 4 different data type filter instance structures
+ *
+ *arm_fir_instance_f32 S = {numTaps, pState, pCoeffs};
+ *arm_fir_instance_q31 S = {numTaps, pState, pCoeffs};
+ *arm_fir_instance_q15 S = {numTaps, pState, pCoeffs};
+ *arm_fir_instance_q7 S = {numTaps, pState, pCoeffs};
+ *
+ *
+ * where numTaps is the number of filter coefficients in the filter; pState is the address of the state buffer;
+ * pCoeffs is the address of the coefficient buffer.
+ *
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the fixed-point versions of the FIR filter functions.
+ * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ * @addtogroup FIR
+ * @{
+ */
+
+/**
+ *
+ * @param[in] *S points to an instance of the floating-point FIR filter structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process per call.
+ * @return none.
+ *
+ */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+void arm_fir_f32(
+ const arm_fir_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ float32_t acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7; /* Accumulators */
+ float32_t x0, x1, x2, x3, x4, x5, x6, x7, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt; /* Loop counters */
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Apply loop unrolling and compute 4 output values simultaneously.
+ * The variables acc0 ... acc3 hold output values that are being computed:
+ *
+ * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
+ */
+ blkCnt = blockSize >> 3;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* Copy four new input samples into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set all accumulators to zero */
+ acc0 = 0.0f;
+ acc1 = 0.0f;
+ acc2 = 0.0f;
+ acc3 = 0.0f;
+ acc4 = 0.0f;
+ acc5 = 0.0f;
+ acc6 = 0.0f;
+ acc7 = 0.0f;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = (pCoeffs);
+
+ /* Read the first three samples from the state buffer: x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
+ x0 = *px++;
+ x1 = *px++;
+ x2 = *px++;
+ x3 = *px++;
+ x4 = *px++;
+ x5 = *px++;
+ x6 = *px++;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 3u;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+ while(tapCnt > 0u)
+ {
+ /* Read the b[numTaps-1] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-3] sample */
+ x7 = *(px++);
+
+ /* acc0 += b[numTaps-1] * x[n-numTaps] */
+ acc0 += x0 * c0;
+
+ /* acc1 += b[numTaps-1] * x[n-numTaps-1] */
+ acc1 += x1 * c0;
+
+ /* acc2 += b[numTaps-1] * x[n-numTaps-2] */
+ acc2 += x2 * c0;
+
+ /* acc3 += b[numTaps-1] * x[n-numTaps-3] */
+ acc3 += x3 * c0;
+
+ /* acc4 += b[numTaps-1] * x[n-numTaps-4] */
+ acc4 += x4 * c0;
+
+ /* acc1 += b[numTaps-1] * x[n-numTaps-5] */
+ acc5 += x5 * c0;
+
+ /* acc2 += b[numTaps-1] * x[n-numTaps-6] */
+ acc6 += x6 * c0;
+
+ /* acc3 += b[numTaps-1] * x[n-numTaps-7] */
+ acc7 += x7 * c0;
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-4] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x1 * c0;
+ acc1 += x2 * c0;
+ acc2 += x3 * c0;
+ acc3 += x4 * c0;
+ acc4 += x5 * c0;
+ acc5 += x6 * c0;
+ acc6 += x7 * c0;
+ acc7 += x0 * c0;
+
+ /* Read the b[numTaps-3] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-5] sample */
+ x1 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x2 * c0;
+ acc1 += x3 * c0;
+ acc2 += x4 * c0;
+ acc3 += x5 * c0;
+ acc4 += x6 * c0;
+ acc5 += x7 * c0;
+ acc6 += x0 * c0;
+ acc7 += x1 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-6] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x3 * c0;
+ acc1 += x4 * c0;
+ acc2 += x5 * c0;
+ acc3 += x6 * c0;
+ acc4 += x7 * c0;
+ acc5 += x0 * c0;
+ acc6 += x1 * c0;
+ acc7 += x2 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-6] sample */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x4 * c0;
+ acc1 += x5 * c0;
+ acc2 += x6 * c0;
+ acc3 += x7 * c0;
+ acc4 += x0 * c0;
+ acc5 += x1 * c0;
+ acc6 += x2 * c0;
+ acc7 += x3 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-6] sample */
+ x4 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x5 * c0;
+ acc1 += x6 * c0;
+ acc2 += x7 * c0;
+ acc3 += x0 * c0;
+ acc4 += x1 * c0;
+ acc5 += x2 * c0;
+ acc6 += x3 * c0;
+ acc7 += x4 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-6] sample */
+ x5 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x6 * c0;
+ acc1 += x7 * c0;
+ acc2 += x0 * c0;
+ acc3 += x1 * c0;
+ acc4 += x2 * c0;
+ acc5 += x3 * c0;
+ acc6 += x4 * c0;
+ acc7 += x5 * c0;
+
+ /* Read the b[numTaps-4] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-6] sample */
+ x6 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x7 * c0;
+ acc1 += x0 * c0;
+ acc2 += x1 * c0;
+ acc3 += x2 * c0;
+ acc4 += x3 * c0;
+ acc5 += x4 * c0;
+ acc6 += x5 * c0;
+ acc7 += x6 * c0;
+
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x8u;
+
+ while(tapCnt > 0u)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch 1 state variable */
+ x7 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+ acc2 += x2 * c0;
+ acc3 += x3 * c0;
+ acc4 += x4 * c0;
+ acc5 += x5 * c0;
+ acc6 += x6 * c0;
+ acc7 += x7 * c0;
+
+ /* Reuse the present sample states for next sample */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+ x3 = x4;
+ x4 = x5;
+ x5 = x6;
+ x6 = x7;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 8;
+
+ /* The results in the 4 accumulators, store in the destination buffer. */
+ *pDst++ = acc0;
+ *pDst++ = acc1;
+ *pDst++ = acc2;
+ *pDst++ = acc3;
+ *pDst++ = acc4;
+ *pDst++ = acc5;
+ *pDst++ = acc6;
+ *pDst++ = acc7;
+
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x8u;
+
+ while(blkCnt > 0u)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc0 = 0.0f;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize Coefficient pointer */
+ pb = (pCoeffs);
+
+ i = numTaps;
+
+ /* Perform the multiply-accumulates */
+ do
+ {
+ acc0 += *px++ * *pb++;
+ i--;
+
+ } while(i > 0u);
+
+ /* The result is store in the destination buffer. */
+ *pDst++ = acc0;
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ tapCnt = (numTaps - 1u) >> 2u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1u) % 0x4u;
+
+ /* Copy the remaining q31_t data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+}
+
+#else
+
+void arm_fir_f32(
+ const arm_fir_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt; /* Loop counters */
+
+ /* Run the below code for Cortex-M0 */
+
+ float32_t acc;
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Initialize blkCnt with blockSize */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc = 0.0f;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize Coefficient pointer */
+ pb = pCoeffs;
+
+ i = numTaps;
+
+ /* Perform the multiply-accumulates */
+ do
+ {
+ /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
+ acc += *px++ * *pb++;
+ i--;
+
+ } while(i > 0u);
+
+ /* The result is store in the destination buffer. */
+ *pDst++ = acc;
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the starting of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ /* Copy numTaps number of values */
+ tapCnt = numTaps - 1u;
+
+ /* Copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+}
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+/**
+ * @} end of FIR group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_fast_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_fast_q15.c
new file mode 100644
index 0000000..2ce83a8
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_fast_q15.c
@@ -0,0 +1,340 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_fast_q15.c
+*
+* Description: Q15 Fast FIR filter processing function.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.9 2010/08/16
+* Initial version
+*
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR
+ * @{
+ */
+
+/**
+ * @param[in] *S points to an instance of the Q15 FIR filter structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process per call.
+ * @return none.
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * This fast version uses a 32-bit accumulator with 2.30 format.
+ * The accumulator maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ * Thus, if the accumulator result overflows it wraps around and distorts the result.
+ * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
+ * The 2.30 accumulator is then truncated to 2.15 format and saturated to yield the 1.15 result.
+ *
+ * \par
+ * Refer to the function arm_fir_q15() for a slower implementation of this function which uses 64-bit accumulation to avoid wrap around distortion. Both the slow and the fast versions use the same instance structure.
+ * Use the function arm_fir_init_q15() to initialize the filter structure.
+ */
+
+void arm_fir_fast_q15(
+ const arm_fir_instance_q15 * S,
+ q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ q15_t *pb; /* Temporary pointer for coefficient buffer */
+ q15_t *px; /* Temporary q31 pointer for SIMD state buffer accesses */
+ q31_t x0, x1, x2, c0; /* Temporary variables to hold SIMD state and coefficient values */
+ uint32_t numTaps = S->numTaps; /* Number of taps in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Apply loop unrolling and compute 4 output values simultaneously.
+ * The variables acc0 ... acc3 hold output values that are being computed:
+ *
+ * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
+ */
+
+ blkCnt = blockSize >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* Copy four new input samples into the state buffer.
+ ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */
+ px = pState;
+
+ /* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */
+ pb = pCoeffs;
+
+ /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */
+ x0 = *__SIMD32(px)++;
+
+ /* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */
+ x2 = *__SIMD32(px)++;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */
+ tapCnt = numTaps >> 2;
+
+ while(tapCnt > 0)
+ {
+ /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */
+ c0 = *__SIMD32(pb)++;
+
+ /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
+ acc0 = __SMLAD(x0, c0, acc0);
+
+ /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
+ acc2 = __SMLAD(x2, c0, acc2);
+
+ /* pack x[n-N-1] and x[n-N-2] */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x2, x0, 0);
+#else
+ x1 = __PKHBT(x0, x2, 0);
+#endif
+
+ /* Read state x[n-N-4], x[n-N-5] */
+ x0 = _SIMD32_OFFSET(px);
+
+ /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
+ acc1 = __SMLADX(x1, c0, acc1);
+
+ /* pack x[n-N-3] and x[n-N-4] */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x0, x2, 0);
+#else
+ x1 = __PKHBT(x2, x0, 0);
+#endif
+
+ /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
+ acc3 = __SMLADX(x1, c0, acc3);
+
+ /* Read coefficients b[N-2], b[N-3] */
+ c0 = *__SIMD32(pb)++;
+
+ /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
+ acc0 = __SMLAD(x2, c0, acc0);
+
+ /* Read state x[n-N-6], x[n-N-7] with offset */
+ x2 = _SIMD32_OFFSET(px + 2u);
+
+ /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
+ acc2 = __SMLAD(x0, c0, acc2);
+
+ /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
+ acc1 = __SMLADX(x1, c0, acc1);
+
+ /* pack x[n-N-5] and x[n-N-6] */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x2, x0, 0);
+#else
+ x1 = __PKHBT(x0, x2, 0);
+#endif
+
+ /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
+ acc3 = __SMLADX(x1, c0, acc3);
+
+ /* Update state pointer for next state reading */
+ px += 4u;
+
+ /* Decrement tap count */
+ tapCnt--;
+
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps.
+ ** This is always be 2 taps since the filter length is even. */
+ if((numTaps & 0x3u) != 0u)
+ {
+
+ /* Read last two coefficients */
+ c0 = *__SIMD32(pb)++;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLAD(x0, c0, acc0);
+ acc2 = __SMLAD(x2, c0, acc2);
+
+ /* pack state variables */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x2, x0, 0);
+#else
+ x1 = __PKHBT(x0, x2, 0);
+#endif
+
+ /* Read last state variables */
+ x0 = *__SIMD32(px);
+
+ /* Perform the multiply-accumulates */
+ acc1 = __SMLADX(x1, c0, acc1);
+
+ /* pack state variables */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x0, x2, 0);
+#else
+ x1 = __PKHBT(x2, x0, 0);
+#endif
+
+ /* Perform the multiply-accumulates */
+ acc3 = __SMLADX(x1, c0, acc3);
+ }
+
+ /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
+ ** Then store the 4 outputs in the destination buffer. */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pDst)++ =
+ __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
+
+ *__SIMD32(pDst)++ =
+ __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
+
+#else
+
+ *__SIMD32(pDst)++ =
+ __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
+
+ *__SIMD32(pDst)++ =
+ __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
+
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 4u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+ while(blkCnt > 0u)
+ {
+ /* Copy two samples into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc0 = 0;
+
+ /* Use SIMD to hold states and coefficients */
+ px = pState;
+ pb = pCoeffs;
+
+ tapCnt = numTaps >> 1u;
+
+ do
+ {
+
+ acc0 += (q31_t) * px++ * *pb++;
+ acc0 += (q31_t) * px++ * *pb++;
+
+ tapCnt--;
+ }
+ while(tapCnt > 0u);
+
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ ** Then store the output in the destination buffer. */
+ *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ /* Calculation of count for copying integer writes */
+ tapCnt = (numTaps - 1u) >> 2;
+
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ tapCnt--;
+
+ }
+
+ /* Calculation of count for remaining q15_t data */
+ tapCnt = (numTaps - 1u) % 0x4u;
+
+ /* copy remaining data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+}
+
+/**
+ * @} end of FIR group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_fast_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_fast_q31.c
new file mode 100644
index 0000000..d2e6e37
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_fast_q31.c
@@ -0,0 +1,308 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_fast_q31.c
+*
+* Description: Processing function for the Q31 Fast FIR filter.
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.9 2010/08/27
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR
+ * @{
+ */
+
+/**
+ * @param[in] *S points to an instance of the Q31 structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block output data.
+ * @param[in] blockSize number of samples to process per call.
+ * @return none.
+ *
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * This function is optimized for speed at the expense of fixed-point precision and overflow protection.
+ * The result of each 1.31 x 1.31 multiplication is truncated to 2.30 format.
+ * These intermediate results are added to a 2.30 accumulator.
+ * Finally, the accumulator is saturated and converted to a 1.31 result.
+ * The fast version has the same overflow behavior as the standard version and provides less precision since it discards the low 32 bits of each multiplication result.
+ * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
+ *
+ * \par
+ * Refer to the function arm_fir_q31() for a slower implementation of this function which uses a 64-bit accumulator to provide higher precision. Both the slow and the fast versions use the same instance structure.
+ * Use the function arm_fir_init_q31() to initialize the filter structure.
+ */
+
+void arm_fir_fast_q31(
+ const arm_fir_instance_q31 * S,
+ q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCurnt; /* Points to the current sample of the state */
+ q31_t x0, x1, x2, x3; /* Temporary variables to hold state */
+ q31_t c0; /* Temporary variable to hold coefficient value */
+ q31_t *px; /* Temporary pointer for state */
+ q31_t *pb; /* Temporary pointer for coefficient buffer */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt; /* Loop counters */
+
+ /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Apply loop unrolling and compute 4 output values simultaneously.
+ * The variables acc0 ... acc3 hold output values that are being computed:
+ *
+ * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
+ */
+ blkCnt = blockSize >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* Copy four new input samples into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Read the first three samples from the state buffer:
+ * x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
+ x0 = *(px++);
+ x1 = *(px++);
+ x2 = *(px++);
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+ i = tapCnt;
+
+ while(i > 0u)
+ {
+ /* Read the b[numTaps] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-3] sample */
+ x3 = *(px++);
+
+ /* acc0 += b[numTaps] * x[n-numTaps] */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+ /* acc1 += b[numTaps] * x[n-numTaps-1] */
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+
+ /* acc2 += b[numTaps] * x[n-numTaps-2] */
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+
+ /* acc3 += b[numTaps] * x[n-numTaps-3] */
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+ /* Read the b[numTaps-1] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-4] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x1 * c0)) >> 32);
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x2 * c0)) >> 32);
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x3 * c0)) >> 32);
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x0 * c0)) >> 32);
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-5] sample */
+ x1 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x2 * c0)) >> 32);
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x3 * c0)) >> 32);
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x0 * c0)) >> 32);
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x1 * c0)) >> 32);
+
+ /* Read the b[numTaps-3] coefficients */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-6] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x3 * c0)) >> 32);
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x0 * c0)) >> 32);
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x1 * c0)) >> 32);
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x2 * c0)) >> 32);
+ i--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+
+ i = numTaps - (tapCnt * 4u);
+ while(i > 0u)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch 1 state variable */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 = (q31_t) ((((q63_t) acc0 << 32) + ((q63_t) x0 * c0)) >> 32);
+ acc1 = (q31_t) ((((q63_t) acc1 << 32) + ((q63_t) x1 * c0)) >> 32);
+ acc2 = (q31_t) ((((q63_t) acc2 << 32) + ((q63_t) x2 * c0)) >> 32);
+ acc3 = (q31_t) ((((q63_t) acc3 << 32) + ((q63_t) x3 * c0)) >> 32);
+
+ /* Reuse the present sample states for next sample */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 4;
+
+ /* The results in the 4 accumulators are in 2.30 format. Convert to 1.31
+ ** Then store the 4 outputs in the destination buffer. */
+ *pDst++ = (q31_t) (acc0 << 1);
+ *pDst++ = (q31_t) (acc1 << 1);
+ *pDst++ = (q31_t) (acc2 << 1);
+ *pDst++ = (q31_t) (acc3 << 1);
+
+ /* Decrement the samples loop counter */
+ blkCnt--;
+ }
+
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc0 = 0;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize Coefficient pointer */
+ pb = (pCoeffs);
+
+ i = numTaps;
+
+ /* Perform the multiply-accumulates */
+ do
+ {
+ acc0 =
+ (q31_t) ((((q63_t) acc0 << 32) +
+ ((q63_t) (*px++) * (*(pb++)))) >> 32);
+ i--;
+ } while(i > 0u);
+
+ /* The result is in 2.30 format. Convert to 1.31
+ ** Then store the output in the destination buffer. */
+ *pDst++ = (q31_t) (acc0 << 1);
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement the samples loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ tapCnt = (numTaps - 1u) >> 2u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1u) % 0x4u;
+
+ /* Copy the remaining q31_t data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+
+}
+
+/**
+ * @} end of FIR group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_f32.c
new file mode 100644
index 0000000..de6c133
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_f32.c
@@ -0,0 +1,93 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_init_f32.c
+*
+* Description: Floating-point FIR filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR
+ * @{
+ */
+
+/**
+ * @details
+ *
+ * @param[in,out] *S points to an instance of the floating-point FIR filter structure.
+ * @param[in] numTaps Number of filter coefficients in the filter.
+ * @param[in] *pCoeffs points to the filter coefficients buffer.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] blockSize number of samples that are processed per call.
+ * @return none.
+ *
+ * Description:
+ * \par
+ * pCoeffs points to the array of filter coefficients stored in time reversed order:
+ *
+ * \par
+ * pState points to the array of state variables.
+ * pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_f32().
+ */
+
+void arm_fir_init_f32(
+ arm_fir_instance_f32 * S,
+ uint16_t numTaps,
+ float32_t * pCoeffs,
+ float32_t * pState,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and the size of state buffer is (blockSize + numTaps - 1) */
+ memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+}
+
+/**
+ * @} end of FIR group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q15.c
new file mode 100644
index 0000000..6eb71b6
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q15.c
@@ -0,0 +1,151 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_init_q15.c
+*
+* Description: Q15 FIR filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* ------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR
+ * @{
+ */
+
+/**
+ * @param[in,out] *S points to an instance of the Q15 FIR filter structure.
+ * @param[in] numTaps Number of filter coefficients in the filter. Must be even and greater than or equal to 4.
+ * @param[in] *pCoeffs points to the filter coefficients buffer.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] blockSize is number of samples processed per call.
+ * @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if
+ * numTaps is not greater than or equal to 4 and even.
+ *
+ * Description:
+ * \par
+ * pCoeffs points to the array of filter coefficients stored in time reversed order:
+ *
+ * Note that numTaps must be even and greater than or equal to 4.
+ * To implement an odd length filter simply increase numTaps by 1 and set the last coefficient to zero.
+ * For example, to implement a filter with numTaps=3 and coefficients
+ *
+ * {0.3, -0.8, 0.3}
+ *
+ * set numTaps=4 and use the coefficients:
+ *
+ * {0.3, -0.8, 0.3, 0}.
+ *
+ * Similarly, to implement a two point filter
+ *
+ * {0.3, -0.3}
+ *
+ * set numTaps=4 and use the coefficients:
+ *
+ * {0.3, -0.3, 0, 0}.
+ *
+ * \par
+ * pState points to the array of state variables.
+ * pState is of length numTaps+blockSize, when running on Cortex-M4 and Cortex-M3 and is of length numTaps+blockSize-1, when running on Cortex-M0 where blockSize is the number of input samples processed by each call to arm_fir_q15().
+ */
+
+arm_status arm_fir_init_q15(
+ arm_fir_instance_q15 * S,
+ uint16_t numTaps,
+ q15_t * pCoeffs,
+ q15_t * pState,
+ uint32_t blockSize)
+{
+ arm_status status;
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* The Number of filter coefficients in the filter must be even and at least 4 */
+ if(numTaps & 0x1u)
+ {
+ status = ARM_MATH_ARGUMENT_ERROR;
+ }
+ else
+ {
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear the state buffer. The size is always (blockSize + numTaps ) */
+ memset(pState, 0, (numTaps + (blockSize)) * sizeof(q15_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ status = ARM_MATH_SUCCESS;
+ }
+
+ return (status);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */
+ memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(q15_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ status = ARM_MATH_SUCCESS;
+
+ return (status);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of FIR group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q31.c
new file mode 100644
index 0000000..01b2cc4
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_init_q31.c
@@ -0,0 +1,93 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_init_q31.c
+*
+* Description: Q31 FIR filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR
+ * @{
+ */
+
+/**
+ * @details
+ *
+ * @param[in,out] *S points to an instance of the Q31 FIR filter structure.
+ * @param[in] numTaps Number of filter coefficients in the filter.
+ * @param[in] *pCoeffs points to the filter coefficients buffer.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] blockSize number of samples that are processed per call.
+ * @return none.
+ *
+ * Description:
+ * \par
+ * pCoeffs points to the array of filter coefficients stored in time reversed order:
+ *
+ * \par
+ * pState points to the array of state variables.
+ * pState is of length numTaps+blockSize-1 samples, where blockSize is the number of input samples processed by each call to arm_fir_q7().
+ */
+
+void arm_fir_init_q7(
+ arm_fir_instance_q7 * S,
+ uint16_t numTaps,
+ q7_t * pCoeffs,
+ q7_t * pState,
+ uint32_t blockSize)
+{
+
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear the state buffer. The size is always (blockSize + numTaps - 1) */
+ memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(q7_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+}
+
+/**
+ * @} end of FIR group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_f32.c
new file mode 100644
index 0000000..936dca2
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_f32.c
@@ -0,0 +1,573 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_interpolate_f32.c
+*
+* Description: FIR interpolation for floating-point sequences.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @defgroup FIR_Interpolate Finite Impulse Response (FIR) Interpolator
+ *
+ * These functions combine an upsampler (zero stuffer) and an FIR filter.
+ * They are used in multirate systems for increasing the sample rate of a signal without introducing high frequency images.
+ * Conceptually, the functions are equivalent to the block diagram below:
+ * \image html FIRInterpolator.gif "Components included in the FIR Interpolator functions"
+ * After upsampling by a factor of L, the signal should be filtered by a lowpass filter with a normalized
+ * cutoff frequency of 1/L in order to eliminate high frequency copies of the spectrum.
+ * The user of the function is responsible for providing the filter coefficients.
+ *
+ * The FIR interpolator functions provided in the CMSIS DSP Library combine the upsampler and FIR filter in an efficient manner.
+ * The upsampler inserts L-1 zeros between each sample.
+ * Instead of multiplying by these zero values, the FIR filter is designed to skip them.
+ * This leads to an efficient implementation without any wasted effort.
+ * The functions operate on blocks of input and output data.
+ * pSrc points to an array of blockSize input values and
+ * pDst points to an array of blockSize*L output values.
+ *
+ * The library provides separate functions for Q15, Q31, and floating-point data types.
+ *
+ * \par Algorithm:
+ * The functions use a polyphase filter structure:
+ *
+ * This approach is more efficient than straightforward upsample-then-filter algorithms.
+ * With this method the computation is reduced by a factor of 1/L when compared to using a standard FIR filter.
+ * \par
+ * pCoeffs points to a coefficient array of size numTaps.
+ * numTaps must be a multiple of the interpolation factor L and this is checked by the
+ * initialization functions.
+ * Internally, the function divides the FIR filter's impulse response into shorter filters of length
+ * phaseLength=numTaps/L.
+ * Coefficients are stored in time reversed order.
+ * \par
+ *
+ * The state variables are updated after each block of data is processed, the coefficients are untouched.
+ *
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient arrays may be shared among several instances while state variable array should be allocated separately.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ * - Checks to make sure that the length of the filter is a multiple of the interpolation factor.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * The code below statically initializes each of the 3 different data type filter instance structures
+ *
+ * arm_fir_interpolate_instance_f32 S = {L, phaseLength, pCoeffs, pState};
+ * arm_fir_interpolate_instance_q31 S = {L, phaseLength, pCoeffs, pState};
+ * arm_fir_interpolate_instance_q15 S = {L, phaseLength, pCoeffs, pState};
+ *
+ * where L is the interpolation factor; phaseLength=numTaps/L is the
+ * length of each of the shorter FIR filters used internally,
+ * pCoeffs is the address of the coefficient buffer;
+ * pState is the address of the state buffer.
+ * Be sure to set the values in the state buffer to zeros when doing static initialization.
+ *
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the fixed-point versions of the FIR interpolate filter functions.
+ * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ * @addtogroup FIR_Interpolate
+ * @{
+ */
+
+/**
+ * @brief Processing function for the floating-point FIR interpolator.
+ * @param[in] *S points to an instance of the floating-point FIR interpolator structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return none.
+ */
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+void arm_fir_interpolate_f32(
+ const arm_fir_interpolate_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
+ float32_t sum0; /* Accumulators */
+ float32_t x0, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t i, blkCnt, j; /* Loop counters */
+ uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */
+ float32_t acc0, acc1, acc2, acc3;
+ float32_t x1, x2, x3;
+ uint32_t blkCntN4;
+ float32_t c1, c2, c3;
+
+ /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + (phaseLen - 1u);
+
+ /* Initialise blkCnt */
+ blkCnt = blockSize / 4;
+ blkCntN4 = blockSize - (4 * blkCnt);
+
+ /* Samples loop unrolled by 4 */
+ while(blkCnt > 0u)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+ /* Address modifier index of coefficient buffer */
+ j = 1u;
+
+ /* Loop over the Interpolation factor. */
+ i = (S->L);
+
+ while(i > 0u)
+ {
+ /* Set accumulator to zero */
+ acc0 = 0.0f;
+ acc1 = 0.0f;
+ acc2 = 0.0f;
+ acc3 = 0.0f;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (S->L - j);
+
+ /* Loop over the polyPhase length. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ tapCnt = phaseLen >> 2u;
+
+ x0 = *(ptr1++);
+ x1 = *(ptr1++);
+ x2 = *(ptr1++);
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read the input sample */
+ x3 = *(ptr1++);
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+ acc2 += x2 * c0;
+ acc3 += x3 * c0;
+
+ /* Read the coefficient */
+ c1 = *(ptr2 + S->L);
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x1 * c1;
+ acc1 += x2 * c1;
+ acc2 += x3 * c1;
+ acc3 += x0 * c1;
+
+ /* Read the coefficient */
+ c2 = *(ptr2 + S->L * 2);
+
+ /* Read the input sample */
+ x1 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x2 * c2;
+ acc1 += x3 * c2;
+ acc2 += x0 * c2;
+ acc3 += x1 * c2;
+
+ /* Read the coefficient */
+ c3 = *(ptr2 + S->L * 3);
+
+ /* Read the input sample */
+ x2 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x3 * c3;
+ acc1 += x0 * c3;
+ acc2 += x1 * c3;
+ acc3 += x2 * c3;
+
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += 4 * S->L;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = phaseLen % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read the input sample */
+ x3 = *(ptr1++);
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Perform the multiply-accumulate */
+ acc0 += x0 * c0;
+ acc1 += x1 * c0;
+ acc2 += x2 * c0;
+ acc3 += x3 * c0;
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* update states for next sample processing */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst = acc0;
+ *(pDst + S->L) = acc1;
+ *(pDst + 2 * S->L) = acc2;
+ *(pDst + 3 * S->L) = acc3;
+
+ pDst++;
+
+ /* Increment the address modifier index of coefficient buffer */
+ j++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 4;
+
+ pDst += S->L * 3;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+
+ while(blkCntN4 > 0u)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Address modifier index of coefficient buffer */
+ j = 1u;
+
+ /* Loop over the Interpolation factor. */
+ i = S->L;
+ while(i > 0u)
+ {
+ /* Set accumulator to zero */
+ sum0 = 0.0f;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (S->L - j);
+
+ /* Loop over the polyPhase length. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ tapCnt = phaseLen >> 2u;
+ while(tapCnt > 0u)
+ {
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += x0 * c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = phaseLen % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum0 += *(ptr1++) * (*ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = sum0;
+
+ /* Increment the address modifier index of coefficient buffer */
+ j++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCntN4--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ tapCnt = (phaseLen - 1u) >> 2u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (phaseLen - 1u) % 0x04u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+}
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+void arm_fir_interpolate_f32(
+ const arm_fir_interpolate_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
+
+
+ float32_t sum; /* Accumulator */
+ uint32_t i, blkCnt; /* Loop counters */
+ uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */
+
+
+ /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + (phaseLen - 1u);
+
+ /* Total number of intput samples */
+ blkCnt = blockSize;
+
+ /* Loop over the blockSize. */
+ while(blkCnt > 0u)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Loop over the Interpolation factor. */
+ i = S->L;
+
+ while(i > 0u)
+ {
+ /* Set accumulator to zero */
+ sum = 0.0f;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (i - 1u);
+
+ /* Loop over the polyPhase length */
+ tapCnt = phaseLen;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += *ptr1++ * *ptr2;
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = sum;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ tapCnt = phaseLen - 1u;
+
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+}
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+
+
+ /**
+ * @} end of FIR_Interpolate group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c
new file mode 100644
index 0000000..4b93d4b
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_f32.c
@@ -0,0 +1,115 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_interpolate_init_f32.c
+*
+* Description: Floating-point FIR interpolator initialization function
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_Interpolate
+ * @{
+ */
+
+/**
+ * @brief Initialization function for the floating-point FIR interpolator.
+ * @param[in,out] *S points to an instance of the floating-point FIR interpolator structure.
+ * @param[in] L upsample factor.
+ * @param[in] numTaps number of filter coefficients in the filter.
+ * @param[in] *pCoeffs points to the filter coefficient buffer.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if
+ * the filter length numTaps is not a multiple of the interpolation factor L.
+ *
+ * Description:
+ * \par
+ * pCoeffs points to the array of filter coefficients stored in time reversed order:
+ *
+ * The length of the filter numTaps must be a multiple of the interpolation factor L.
+ * \par
+ * pState points to the array of state variables.
+ * pState is of length (numTaps/L)+blockSize-1 words
+ * where blockSize is the number of input samples processed by each call to arm_fir_interpolate_f32().
+ */
+
+arm_status arm_fir_interpolate_init_f32(
+ arm_fir_interpolate_instance_f32 * S,
+ uint8_t L,
+ uint16_t numTaps,
+ float32_t * pCoeffs,
+ float32_t * pState,
+ uint32_t blockSize)
+{
+ arm_status status;
+
+ /* The filter length must be a multiple of the interpolation factor */
+ if((numTaps % L) != 0u)
+ {
+ /* Set status as ARM_MATH_LENGTH_ERROR */
+ status = ARM_MATH_LENGTH_ERROR;
+ }
+ else
+ {
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign Interpolation factor */
+ S->L = L;
+
+ /* Assign polyPhaseLength */
+ S->phaseLength = numTaps / L;
+
+ /* Clear state buffer and size of state array is always phaseLength + blockSize - 1 */
+ memset(pState, 0,
+ (blockSize +
+ ((uint32_t) S->phaseLength - 1u)) * sizeof(float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ status = ARM_MATH_SUCCESS;
+ }
+
+ return (status);
+
+}
+
+ /**
+ * @} end of FIR_Interpolate group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c
new file mode 100644
index 0000000..9e8fd34
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_q15.c
@@ -0,0 +1,114 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_interpolate_init_q15.c
+*
+* Description: Q15 FIR interpolator initialization function
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_Interpolate
+ * @{
+ */
+
+/**
+ * @brief Initialization function for the Q15 FIR interpolator.
+ * @param[in,out] *S points to an instance of the Q15 FIR interpolator structure.
+ * @param[in] L upsample factor.
+ * @param[in] numTaps number of filter coefficients in the filter.
+ * @param[in] *pCoeffs points to the filter coefficient buffer.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if
+ * the filter length numTaps is not a multiple of the interpolation factor L.
+ *
+ * Description:
+ * \par
+ * pCoeffs points to the array of filter coefficients stored in time reversed order:
+ *
+ * The length of the filter numTaps must be a multiple of the interpolation factor L.
+ * \par
+ * pState points to the array of state variables.
+ * pState is of length (numTaps/L)+blockSize-1 words
+ * where blockSize is the number of input samples processed by each call to arm_fir_interpolate_q15().
+ */
+
+arm_status arm_fir_interpolate_init_q15(
+ arm_fir_interpolate_instance_q15 * S,
+ uint8_t L,
+ uint16_t numTaps,
+ q15_t * pCoeffs,
+ q15_t * pState,
+ uint32_t blockSize)
+{
+ arm_status status;
+
+ /* The filter length must be a multiple of the interpolation factor */
+ if((numTaps % L) != 0u)
+ {
+ /* Set status as ARM_MATH_LENGTH_ERROR */
+ status = ARM_MATH_LENGTH_ERROR;
+ }
+ else
+ {
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign Interpolation factor */
+ S->L = L;
+
+ /* Assign polyPhaseLength */
+ S->phaseLength = numTaps / L;
+
+ /* Clear state buffer and size of buffer is always phaseLength + blockSize - 1 */
+ memset(pState, 0,
+ (blockSize + ((uint32_t) S->phaseLength - 1u)) * sizeof(q15_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ status = ARM_MATH_SUCCESS;
+ }
+
+ return (status);
+
+}
+
+ /**
+ * @} end of FIR_Interpolate group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c
new file mode 100644
index 0000000..3e446ce
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_init_q31.c
@@ -0,0 +1,115 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_interpolate_init_q31.c
+*
+* Description: Q31 FIR interpolator initialization function
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_Interpolate
+ * @{
+ */
+
+
+/**
+ * @brief Initialization function for the Q31 FIR interpolator.
+ * @param[in,out] *S points to an instance of the Q31 FIR interpolator structure.
+ * @param[in] L upsample factor.
+ * @param[in] numTaps number of filter coefficients in the filter.
+ * @param[in] *pCoeffs points to the filter coefficient buffer.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return The function returns ARM_MATH_SUCCESS if initialization was successful or ARM_MATH_LENGTH_ERROR if
+ * the filter length numTaps is not a multiple of the interpolation factor L.
+ *
+ * Description:
+ * \par
+ * pCoeffs points to the array of filter coefficients stored in time reversed order:
+ *
+ * The length of the filter numTaps must be a multiple of the interpolation factor L.
+ * \par
+ * pState points to the array of state variables.
+ * pState is of length (numTaps/L)+blockSize-1 words
+ * where blockSize is the number of input samples processed by each call to arm_fir_interpolate_q31().
+ */
+
+arm_status arm_fir_interpolate_init_q31(
+ arm_fir_interpolate_instance_q31 * S,
+ uint8_t L,
+ uint16_t numTaps,
+ q31_t * pCoeffs,
+ q31_t * pState,
+ uint32_t blockSize)
+{
+ arm_status status;
+
+ /* The filter length must be a multiple of the interpolation factor */
+ if((numTaps % L) != 0u)
+ {
+ /* Set status as ARM_MATH_LENGTH_ERROR */
+ status = ARM_MATH_LENGTH_ERROR;
+ }
+ else
+ {
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign Interpolation factor */
+ S->L = L;
+
+ /* Assign polyPhaseLength */
+ S->phaseLength = numTaps / L;
+
+ /* Clear state buffer and size of buffer is always phaseLength + blockSize - 1 */
+ memset(pState, 0,
+ (blockSize + ((uint32_t) S->phaseLength - 1u)) * sizeof(q31_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ status = ARM_MATH_SUCCESS;
+ }
+
+ return (status);
+
+}
+
+ /**
+ * @} end of FIR_Interpolate group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_q15.c
new file mode 100644
index 0000000..3acfa8e
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_q15.c
@@ -0,0 +1,502 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_interpolate_q15.c
+*
+* Description: Q15 FIR interpolation.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_Interpolate
+ * @{
+ */
+
+/**
+ * @brief Processing function for the Q15 FIR interpolator.
+ * @param[in] *S points to an instance of the Q15 FIR interpolator structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return none.
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+ * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+ * Lastly, the accumulator is saturated to yield a result in 1.15 format.
+ */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+void arm_fir_interpolate_q15(
+ const arm_fir_interpolate_instance_q15 * S,
+ q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q15_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
+ q63_t sum0; /* Accumulators */
+ q15_t x0, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t i, blkCnt, j, tapCnt; /* Loop counters */
+ uint16_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
+ uint32_t blkCntN2;
+ q63_t acc0, acc1;
+ q15_t x1;
+
+ /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + ((q31_t) phaseLen - 1);
+
+ /* Initialise blkCnt */
+ blkCnt = blockSize / 2;
+ blkCntN2 = blockSize - (2 * blkCnt);
+
+ /* Samples loop unrolled by 2 */
+ while(blkCnt > 0u)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+ /* Address modifier index of coefficient buffer */
+ j = 1u;
+
+ /* Loop over the Interpolation factor. */
+ i = (S->L);
+
+ while(i > 0u)
+ {
+ /* Set accumulator to zero */
+ acc0 = 0;
+ acc1 = 0;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (S->L - j);
+
+ /* Loop over the polyPhase length. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ tapCnt = phaseLen >> 2u;
+
+ x0 = *(ptr1++);
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read the input sample */
+ x1 = *(ptr1++);
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 *c0;
+ acc1 += (q63_t) x1 *c0;
+
+
+ /* Read the coefficient */
+ c0 = *(ptr2 + S->L);
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x1 *c0;
+ acc1 += (q63_t) x0 *c0;
+
+
+ /* Read the coefficient */
+ c0 = *(ptr2 + S->L * 2);
+
+ /* Read the input sample */
+ x1 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 *c0;
+ acc1 += (q63_t) x1 *c0;
+
+ /* Read the coefficient */
+ c0 = *(ptr2 + S->L * 3);
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x1 *c0;
+ acc1 += (q63_t) x0 *c0;
+
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += 4 * S->L;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = phaseLen % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read the input sample */
+ x1 = *(ptr1++);
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 *c0;
+ acc1 += (q63_t) x1 *c0;
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* update states for next sample processing */
+ x0 = x1;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst = (q15_t) (__SSAT((acc0 >> 15), 16));
+ *(pDst + S->L) = (q15_t) (__SSAT((acc1 >> 15), 16));
+
+ pDst++;
+
+ /* Increment the address modifier index of coefficient buffer */
+ j++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 2;
+
+ pDst += S->L;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blkCntN2;
+
+ /* Loop over the blockSize. */
+ while(blkCnt > 0u)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Address modifier index of coefficient buffer */
+ j = 1u;
+
+ /* Loop over the Interpolation factor. */
+ i = S->L;
+ while(i > 0u)
+ {
+ /* Set accumulator to zero */
+ sum0 = 0;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (S->L - j);
+
+ /* Loop over the polyPhase length. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ tapCnt = phaseLen >> 2;
+ while(tapCnt > 0u)
+ {
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) x0 *c0;
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) x0 *c0;
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) x0 *c0;
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) x0 *c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = phaseLen & 0x3u;
+
+ while(tapCnt > 0u)
+ {
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) x0 *c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = (q15_t) (__SSAT((sum0 >> 15), 16));
+
+ j++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+
+ /* Processing is complete.
+ ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ i = ((uint32_t) phaseLen - 1u) >> 2u;
+
+ /* copy data */
+ while(i > 0u)
+ {
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
+ *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
+
+#else
+
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ i = ((uint32_t) phaseLen - 1u) % 0x04u;
+
+ while(i > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+}
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+void arm_fir_interpolate_q15(
+ const arm_fir_interpolate_instance_q15 * S,
+ q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q15_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
+ q63_t sum; /* Accumulator */
+ q15_t x0, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t i, blkCnt, tapCnt; /* Loop counters */
+ uint16_t phaseLen = S->phaseLength; /* Length of each polyphase filter component */
+
+
+ /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + (phaseLen - 1u);
+
+ /* Total number of intput samples */
+ blkCnt = blockSize;
+
+ /* Loop over the blockSize. */
+ while(blkCnt > 0u)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Loop over the Interpolation factor. */
+ i = S->L;
+
+ while(i > 0u)
+ {
+ /* Set accumulator to zero */
+ sum = 0;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (i - 1u);
+
+ /* Loop over the polyPhase length */
+ tapCnt = (uint32_t) phaseLen;
+
+ while(tapCnt > 0u)
+ {
+ /* Read the coefficient */
+ c0 = *ptr2;
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *ptr1++;
+
+ /* Perform the multiply-accumulate */
+ sum += ((q31_t) x0 * c0);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Store the result after converting to 1.15 format in the destination buffer */
+ *pDst++ = (q15_t) (__SSAT((sum >> 15), 16));
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last phaseLen - 1 samples to the start of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ i = (uint32_t) phaseLen - 1u;
+
+ while(i > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+}
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+
+ /**
+ * @} end of FIR_Interpolate group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_q31.c
new file mode 100644
index 0000000..ed67f50
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_interpolate_q31.c
@@ -0,0 +1,498 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_interpolate_q31.c
+*
+* Description: Q31 FIR interpolation.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_Interpolate
+ * @{
+ */
+
+/**
+ * @brief Processing function for the Q31 FIR interpolator.
+ * @param[in] *S points to an instance of the Q31 FIR interpolator structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return none.
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function is implemented using an internal 64-bit accumulator.
+ * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ * Thus, if the accumulator result overflows it wraps around rather than clip.
+ * In order to avoid overflows completely the input signal must be scaled down by 1/(numTaps/L).
+ * since numTaps/L additions occur per output sample.
+ * After all multiply-accumulates are performed, the 2.62 accumulator is truncated to 1.32 format and then saturated to 1.31 format.
+ */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+void arm_fir_interpolate_q31(
+ const arm_fir_interpolate_instance_q31 * S,
+ q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCurnt; /* Points to the current sample of the state */
+ q31_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
+ q63_t sum0; /* Accumulators */
+ q31_t x0, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t i, blkCnt, j; /* Loop counters */
+ uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */
+
+ uint32_t blkCntN2;
+ q63_t acc0, acc1;
+ q31_t x1;
+
+ /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + ((q31_t) phaseLen - 1);
+
+ /* Initialise blkCnt */
+ blkCnt = blockSize / 2;
+ blkCntN2 = blockSize - (2 * blkCnt);
+
+ /* Samples loop unrolled by 2 */
+ while(blkCnt > 0u)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+ /* Address modifier index of coefficient buffer */
+ j = 1u;
+
+ /* Loop over the Interpolation factor. */
+ i = (S->L);
+
+ while(i > 0u)
+ {
+ /* Set accumulator to zero */
+ acc0 = 0;
+ acc1 = 0;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (S->L - j);
+
+ /* Loop over the polyPhase length. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ tapCnt = phaseLen >> 2u;
+
+ x0 = *(ptr1++);
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read the input sample */
+ x1 = *(ptr1++);
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 *c0;
+ acc1 += (q63_t) x1 *c0;
+
+
+ /* Read the coefficient */
+ c0 = *(ptr2 + S->L);
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x1 *c0;
+ acc1 += (q63_t) x0 *c0;
+
+
+ /* Read the coefficient */
+ c0 = *(ptr2 + S->L * 2);
+
+ /* Read the input sample */
+ x1 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 *c0;
+ acc1 += (q63_t) x1 *c0;
+
+ /* Read the coefficient */
+ c0 = *(ptr2 + S->L * 3);
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x1 *c0;
+ acc1 += (q63_t) x0 *c0;
+
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += 4 * S->L;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = phaseLen % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Read the input sample */
+ x1 = *(ptr1++);
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Perform the multiply-accumulate */
+ acc0 += (q63_t) x0 *c0;
+ acc1 += (q63_t) x1 *c0;
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* update states for next sample processing */
+ x0 = x1;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst = (q31_t) (acc0 >> 31);
+ *(pDst + S->L) = (q31_t) (acc1 >> 31);
+
+
+ pDst++;
+
+ /* Increment the address modifier index of coefficient buffer */
+ j++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 2;
+
+ pDst += S->L;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 2, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blkCntN2;
+
+ /* Loop over the blockSize. */
+ while(blkCnt > 0u)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Address modifier index of coefficient buffer */
+ j = 1u;
+
+ /* Loop over the Interpolation factor. */
+ i = S->L;
+ while(i > 0u)
+ {
+ /* Set accumulator to zero */
+ sum0 = 0;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (S->L - j);
+
+ /* Loop over the polyPhase length. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-(4*S->L) coefficients. */
+ tapCnt = phaseLen >> 2;
+ while(tapCnt > 0u)
+ {
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Upsampling is done by stuffing L-1 zeros between each sample.
+ * So instead of multiplying zeros with coefficients,
+ * Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) x0 *c0;
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) x0 *c0;
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) x0 *c0;
+
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) x0 *c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the polyPhase length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = phaseLen & 0x3u;
+
+ while(tapCnt > 0u)
+ {
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *(ptr1++);
+
+ /* Perform the multiply-accumulate */
+ sum0 += (q63_t) x0 *c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = (q31_t) (sum0 >> 31);
+
+ /* Increment the address modifier index of coefficient buffer */
+ j++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ tapCnt = (phaseLen - 1u) >> 2u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ tapCnt = (phaseLen - 1u) % 0x04u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+}
+
+
+#else
+
+void arm_fir_interpolate_q31(
+ const arm_fir_interpolate_instance_q31 * S,
+ q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCurnt; /* Points to the current sample of the state */
+ q31_t *ptr1, *ptr2; /* Temporary pointers for state and coefficient buffers */
+
+ /* Run the below code for Cortex-M0 */
+
+ q63_t sum; /* Accumulator */
+ q31_t x0, c0; /* Temporary variables to hold state and coefficient values */
+ uint32_t i, blkCnt; /* Loop counters */
+ uint16_t phaseLen = S->phaseLength, tapCnt; /* Length of each polyphase filter component */
+
+
+ /* S->pState buffer contains previous frame (phaseLen - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + ((q31_t) phaseLen - 1);
+
+ /* Total number of intput samples */
+ blkCnt = blockSize;
+
+ /* Loop over the blockSize. */
+ while(blkCnt > 0u)
+ {
+ /* Copy new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Loop over the Interpolation factor. */
+ i = S->L;
+
+ while(i > 0u)
+ {
+ /* Set accumulator to zero */
+ sum = 0;
+
+ /* Initialize state pointer */
+ ptr1 = pState;
+
+ /* Initialize coefficient pointer */
+ ptr2 = pCoeffs + (i - 1u);
+
+ tapCnt = phaseLen;
+
+ while(tapCnt > 0u)
+ {
+ /* Read the coefficient */
+ c0 = *(ptr2);
+
+ /* Increment the coefficient pointer by interpolation factor times. */
+ ptr2 += S->L;
+
+ /* Read the input sample */
+ x0 = *ptr1++;
+
+ /* Perform the multiply-accumulate */
+ sum += (q63_t) x0 *c0;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result is in the accumulator, store in the destination buffer. */
+ *pDst++ = (q31_t) (sum >> 31);
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 1
+ * to process the next group of interpolation factor number samples */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last phaseLen - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ tapCnt = phaseLen - 1u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+}
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ /**
+ * @} end of FIR_Interpolate group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_f32.c
new file mode 100644
index 0000000..77d0a98
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_f32.c
@@ -0,0 +1,498 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_lattice_f32.c
+*
+* Description: Processing function for the floating-point FIR Lattice filter.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup FIR_Lattice Finite Impulse Response (FIR) Lattice Filters
+ *
+ * This set of functions implements Finite Impulse Response (FIR) lattice filters
+ * for Q15, Q31 and floating-point data types. Lattice filters are used in a
+ * variety of adaptive filter applications. The filter structure is feedforward and
+ * the net impulse response is finite length.
+ * The functions operate on blocks
+ * of input and output data and each call to the function processes
+ * blockSize samples through the filter. pSrc and
+ * pDst point to input and output arrays containing blockSize values.
+ *
+ * \par Algorithm:
+ * \image html FIRLattice.gif "Finite Impulse Response Lattice filter"
+ * The following difference equation is implemented:
+ *
+ * f0[n] = g0[n] = x[n]
+ * fm[n] = fm-1[n] + km * gm-1[n-1] for m = 1, 2, ...M
+ * gm[n] = km * fm-1[n] + gm-1[n-1] for m = 1, 2, ...M
+ * y[n] = fM[n]
+ *
+ * \par
+ * pCoeffs points to tha array of reflection coefficients of size numStages.
+ * Reflection Coefficients are stored in the following order.
+ * \par
+ *
+ * {k1, k2, ..., kM}
+ *
+ * where M is number of stages
+ * \par
+ * pState points to a state array of size numStages.
+ * The state variables (g values) hold previous inputs and are stored in the following order.
+ *
+ * {g0[n], g1[n], g2[n] ...gM-1[n]}
+ *
+ * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Set the values in the state buffer to zeros and then manually initialize the instance structure as follows:
+ *
+ *arm_fir_lattice_instance_f32 S = {numStages, pState, pCoeffs};
+ *arm_fir_lattice_instance_q31 S = {numStages, pState, pCoeffs};
+ *arm_fir_lattice_instance_q15 S = {numStages, pState, pCoeffs};
+ *
+ * \par
+ * where numStages is the number of stages in the filter; pState is the address of the state buffer;
+ * pCoeffs is the address of the coefficient buffer.
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the fixed-point versions of the FIR Lattice filter functions.
+ * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ * @addtogroup FIR_Lattice
+ * @{
+ */
+
+
+ /**
+ * @brief Processing function for the floating-point FIR lattice filter.
+ * @param[in] *S points to an instance of the floating-point FIR lattice structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ */
+
+void arm_fir_lattice_f32(
+ const arm_fir_lattice_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pState; /* State pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *px; /* temporary state pointer */
+ float32_t *pk; /* temporary coefficient pointer */
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ float32_t fcurr1, fnext1, gcurr1, gnext1; /* temporary variables for first sample in loop unrolling */
+ float32_t fcurr2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */
+ float32_t fcurr3, fnext3, gnext3; /* temporary variables for third sample in loop unrolling */
+ float32_t fcurr4, fnext4, gnext4; /* temporary variables for fourth sample in loop unrolling */
+ uint32_t numStages = S->numStages; /* Number of stages in the filter */
+ uint32_t blkCnt, stageCnt; /* temporary variables for counts */
+
+ gcurr1 = 0.0f;
+ pState = &S->pState[0];
+
+ blkCnt = blockSize >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+
+ /* Read two samples from input buffer */
+ /* f0(n) = x(n) */
+ fcurr1 = *pSrc++;
+ fcurr2 = *pSrc++;
+
+ /* Initialize coeff pointer */
+ pk = (pCoeffs);
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Read g0(n-1) from state */
+ gcurr1 = *px;
+
+ /* Process first sample for first tap */
+ /* f1(n) = f0(n) + K1 * g0(n-1) */
+ fnext1 = fcurr1 + ((*pk) * gcurr1);
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext1 = (fcurr1 * (*pk)) + gcurr1;
+
+ /* Process second sample for first tap */
+ /* for sample 2 processing */
+ fnext2 = fcurr2 + ((*pk) * fcurr1);
+ gnext2 = (fcurr2 * (*pk)) + fcurr1;
+
+ /* Read next two samples from input buffer */
+ /* f0(n+2) = x(n+2) */
+ fcurr3 = *pSrc++;
+ fcurr4 = *pSrc++;
+
+ /* Copy only last input samples into the state buffer
+ which will be used for next four samples processing */
+ *px++ = fcurr4;
+
+ /* Process third sample for first tap */
+ fnext3 = fcurr3 + ((*pk) * fcurr2);
+ gnext3 = (fcurr3 * (*pk)) + fcurr2;
+
+ /* Process fourth sample for first tap */
+ fnext4 = fcurr4 + ((*pk) * fcurr3);
+ gnext4 = (fcurr4 * (*pk++)) + fcurr3;
+
+ /* Update of f values for next coefficient set processing */
+ fcurr1 = fnext1;
+ fcurr2 = fnext2;
+ fcurr3 = fnext3;
+ fcurr4 = fnext4;
+
+ /* Loop unrolling. Process 4 taps at a time . */
+ stageCnt = (numStages - 1u) >> 2u;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numStages-3 coefficients. */
+
+ /* Process 2nd, 3rd, 4th and 5th taps ... here */
+ while(stageCnt > 0u)
+ {
+ /* Read g1(n-1), g3(n-1) .... from state */
+ gcurr1 = *px;
+
+ /* save g1(n) in state buffer */
+ *px++ = gnext4;
+
+ /* Process first sample for 2nd, 6th .. tap */
+ /* Sample processing for K2, K6.... */
+ /* f2(n) = f1(n) + K2 * g1(n-1) */
+ fnext1 = fcurr1 + ((*pk) * gcurr1);
+ /* Process second sample for 2nd, 6th .. tap */
+ /* for sample 2 processing */
+ fnext2 = fcurr2 + ((*pk) * gnext1);
+ /* Process third sample for 2nd, 6th .. tap */
+ fnext3 = fcurr3 + ((*pk) * gnext2);
+ /* Process fourth sample for 2nd, 6th .. tap */
+ fnext4 = fcurr4 + ((*pk) * gnext3);
+
+ /* g2(n) = f1(n) * K2 + g1(n-1) */
+ /* Calculation of state values for next stage */
+ gnext4 = (fcurr4 * (*pk)) + gnext3;
+ gnext3 = (fcurr3 * (*pk)) + gnext2;
+ gnext2 = (fcurr2 * (*pk)) + gnext1;
+ gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+
+ /* Read g2(n-1), g4(n-1) .... from state */
+ gcurr1 = *px;
+
+ /* save g2(n) in state buffer */
+ *px++ = gnext4;
+
+ /* Sample processing for K3, K7.... */
+ /* Process first sample for 3rd, 7th .. tap */
+ /* f3(n) = f2(n) + K3 * g2(n-1) */
+ fcurr1 = fnext1 + ((*pk) * gcurr1);
+ /* Process second sample for 3rd, 7th .. tap */
+ fcurr2 = fnext2 + ((*pk) * gnext1);
+ /* Process third sample for 3rd, 7th .. tap */
+ fcurr3 = fnext3 + ((*pk) * gnext2);
+ /* Process fourth sample for 3rd, 7th .. tap */
+ fcurr4 = fnext4 + ((*pk) * gnext3);
+
+ /* Calculation of state values for next stage */
+ /* g3(n) = f2(n) * K3 + g2(n-1) */
+ gnext4 = (fnext4 * (*pk)) + gnext3;
+ gnext3 = (fnext3 * (*pk)) + gnext2;
+ gnext2 = (fnext2 * (*pk)) + gnext1;
+ gnext1 = (fnext1 * (*pk++)) + gcurr1;
+
+
+ /* Read g1(n-1), g3(n-1) .... from state */
+ gcurr1 = *px;
+
+ /* save g3(n) in state buffer */
+ *px++ = gnext4;
+
+ /* Sample processing for K4, K8.... */
+ /* Process first sample for 4th, 8th .. tap */
+ /* f4(n) = f3(n) + K4 * g3(n-1) */
+ fnext1 = fcurr1 + ((*pk) * gcurr1);
+ /* Process second sample for 4th, 8th .. tap */
+ /* for sample 2 processing */
+ fnext2 = fcurr2 + ((*pk) * gnext1);
+ /* Process third sample for 4th, 8th .. tap */
+ fnext3 = fcurr3 + ((*pk) * gnext2);
+ /* Process fourth sample for 4th, 8th .. tap */
+ fnext4 = fcurr4 + ((*pk) * gnext3);
+
+ /* g4(n) = f3(n) * K4 + g3(n-1) */
+ /* Calculation of state values for next stage */
+ gnext4 = (fcurr4 * (*pk)) + gnext3;
+ gnext3 = (fcurr3 * (*pk)) + gnext2;
+ gnext2 = (fcurr2 * (*pk)) + gnext1;
+ gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+ /* Read g2(n-1), g4(n-1) .... from state */
+ gcurr1 = *px;
+
+ /* save g4(n) in state buffer */
+ *px++ = gnext4;
+
+ /* Sample processing for K5, K9.... */
+ /* Process first sample for 5th, 9th .. tap */
+ /* f5(n) = f4(n) + K5 * g4(n-1) */
+ fcurr1 = fnext1 + ((*pk) * gcurr1);
+ /* Process second sample for 5th, 9th .. tap */
+ fcurr2 = fnext2 + ((*pk) * gnext1);
+ /* Process third sample for 5th, 9th .. tap */
+ fcurr3 = fnext3 + ((*pk) * gnext2);
+ /* Process fourth sample for 5th, 9th .. tap */
+ fcurr4 = fnext4 + ((*pk) * gnext3);
+
+ /* Calculation of state values for next stage */
+ /* g5(n) = f4(n) * K5 + g4(n-1) */
+ gnext4 = (fnext4 * (*pk)) + gnext3;
+ gnext3 = (fnext3 * (*pk)) + gnext2;
+ gnext2 = (fnext2 * (*pk)) + gnext1;
+ gnext1 = (fnext1 * (*pk++)) + gcurr1;
+
+ stageCnt--;
+ }
+
+ /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */
+ stageCnt = (numStages - 1u) % 0x4u;
+
+ while(stageCnt > 0u)
+ {
+ gcurr1 = *px;
+
+ /* save g value in state buffer */
+ *px++ = gnext4;
+
+ /* Process four samples for last three taps here */
+ fnext1 = fcurr1 + ((*pk) * gcurr1);
+ fnext2 = fcurr2 + ((*pk) * gnext1);
+ fnext3 = fcurr3 + ((*pk) * gnext2);
+ fnext4 = fcurr4 + ((*pk) * gnext3);
+
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext4 = (fcurr4 * (*pk)) + gnext3;
+ gnext3 = (fcurr3 * (*pk)) + gnext2;
+ gnext2 = (fcurr2 * (*pk)) + gnext1;
+ gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+ /* Update of f values for next coefficient set processing */
+ fcurr1 = fnext1;
+ fcurr2 = fnext2;
+ fcurr3 = fnext3;
+ fcurr4 = fnext4;
+
+ stageCnt--;
+
+ }
+
+ /* The results in the 4 accumulators, store in the destination buffer. */
+ /* y(n) = fN(n) */
+ *pDst++ = fcurr1;
+ *pDst++ = fcurr2;
+ *pDst++ = fcurr3;
+ *pDst++ = fcurr4;
+
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* f0(n) = x(n) */
+ fcurr1 = *pSrc++;
+
+ /* Initialize coeff pointer */
+ pk = (pCoeffs);
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* read g2(n) from state buffer */
+ gcurr1 = *px;
+
+ /* for sample 1 processing */
+ /* f1(n) = f0(n) + K1 * g0(n-1) */
+ fnext1 = fcurr1 + ((*pk) * gcurr1);
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+ /* save g1(n) in state buffer */
+ *px++ = fcurr1;
+
+ /* f1(n) is saved in fcurr1
+ for next stage processing */
+ fcurr1 = fnext1;
+
+ stageCnt = (numStages - 1u);
+
+ /* stage loop */
+ while(stageCnt > 0u)
+ {
+ /* read g2(n) from state buffer */
+ gcurr1 = *px;
+
+ /* save g1(n) in state buffer */
+ *px++ = gnext1;
+
+ /* Sample processing for K2, K3.... */
+ /* f2(n) = f1(n) + K2 * g1(n-1) */
+ fnext1 = fcurr1 + ((*pk) * gcurr1);
+ /* g2(n) = f1(n) * K2 + g1(n-1) */
+ gnext1 = (fcurr1 * (*pk++)) + gcurr1;
+
+ /* f1(n) is saved in fcurr1
+ for next stage processing */
+ fcurr1 = fnext1;
+
+ stageCnt--;
+
+ }
+
+ /* y(n) = fN(n) */
+ *pDst++ = fcurr1;
+
+ blkCnt--;
+
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ float32_t fcurr, fnext, gcurr, gnext; /* temporary variables */
+ uint32_t numStages = S->numStages; /* Length of the filter */
+ uint32_t blkCnt, stageCnt; /* temporary variables for counts */
+
+ pState = &S->pState[0];
+
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* f0(n) = x(n) */
+ fcurr = *pSrc++;
+
+ /* Initialize coeff pointer */
+ pk = pCoeffs;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* read g0(n-1) from state buffer */
+ gcurr = *px;
+
+ /* for sample 1 processing */
+ /* f1(n) = f0(n) + K1 * g0(n-1) */
+ fnext = fcurr + ((*pk) * gcurr);
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext = (fcurr * (*pk++)) + gcurr;
+
+ /* save f0(n) in state buffer */
+ *px++ = fcurr;
+
+ /* f1(n) is saved in fcurr
+ for next stage processing */
+ fcurr = fnext;
+
+ stageCnt = (numStages - 1u);
+
+ /* stage loop */
+ while(stageCnt > 0u)
+ {
+ /* read g2(n) from state buffer */
+ gcurr = *px;
+
+ /* save g1(n) in state buffer */
+ *px++ = gnext;
+
+ /* Sample processing for K2, K3.... */
+ /* f2(n) = f1(n) + K2 * g1(n-1) */
+ fnext = fcurr + ((*pk) * gcurr);
+ /* g2(n) = f1(n) * K2 + g1(n-1) */
+ gnext = (fcurr * (*pk++)) + gcurr;
+
+ /* f1(n) is saved in fcurr1
+ for next stage processing */
+ fcurr = fnext;
+
+ stageCnt--;
+
+ }
+
+ /* y(n) = fN(n) */
+ *pDst++ = fcurr;
+
+ blkCnt--;
+
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of FIR_Lattice group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_f32.c
new file mode 100644
index 0000000..c6f4d7e
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_f32.c
@@ -0,0 +1,77 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_lattice_init_f32.c
+*
+* Description: Floating-point FIR Lattice filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_Lattice
+ * @{
+ */
+
+/**
+ * @brief Initialization function for the floating-point FIR lattice filter.
+ * @param[in] *S points to an instance of the floating-point FIR lattice structure.
+ * @param[in] numStages number of filter stages.
+ * @param[in] *pCoeffs points to the coefficient buffer. The array is of length numStages.
+ * @param[in] *pState points to the state buffer. The array is of length numStages.
+ * @return none.
+ */
+
+void arm_fir_lattice_init_f32(
+ arm_fir_lattice_instance_f32 * S,
+ uint16_t numStages,
+ float32_t * pCoeffs,
+ float32_t * pState)
+{
+ /* Assign filter taps */
+ S->numStages = numStages;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and size is always numStages */
+ memset(pState, 0, (numStages) * sizeof(float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+}
+
+/**
+ * @} end of FIR_Lattice group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_q15.c
new file mode 100644
index 0000000..f79a082
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_q15.c
@@ -0,0 +1,77 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_lattice_init_q15.c
+*
+* Description: Q15 FIR Lattice filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_Lattice
+ * @{
+ */
+
+ /**
+ * @brief Initialization function for the Q15 FIR lattice filter.
+ * @param[in] *S points to an instance of the Q15 FIR lattice structure.
+ * @param[in] numStages number of filter stages.
+ * @param[in] *pCoeffs points to the coefficient buffer. The array is of length numStages.
+ * @param[in] *pState points to the state buffer. The array is of length numStages.
+ * @return none.
+ */
+
+void arm_fir_lattice_init_q15(
+ arm_fir_lattice_instance_q15 * S,
+ uint16_t numStages,
+ q15_t * pCoeffs,
+ q15_t * pState)
+{
+ /* Assign filter taps */
+ S->numStages = numStages;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and size is always numStages */
+ memset(pState, 0, (numStages) * sizeof(q15_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+}
+
+/**
+ * @} end of FIR_Lattice group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_q31.c
new file mode 100644
index 0000000..89116e3
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_init_q31.c
@@ -0,0 +1,77 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_lattice_init_q31.c
+*
+* Description: Q31 FIR lattice filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_Lattice
+ * @{
+ */
+
+ /**
+ * @brief Initialization function for the Q31 FIR lattice filter.
+ * @param[in] *S points to an instance of the Q31 FIR lattice structure.
+ * @param[in] numStages number of filter stages.
+ * @param[in] *pCoeffs points to the coefficient buffer. The array is of length numStages.
+ * @param[in] *pState points to the state buffer. The array is of length numStages.
+ * @return none.
+ */
+
+void arm_fir_lattice_init_q31(
+ arm_fir_lattice_instance_q31 * S,
+ uint16_t numStages,
+ q31_t * pCoeffs,
+ q31_t * pState)
+{
+ /* Assign filter taps */
+ S->numStages = numStages;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and size is always numStages */
+ memset(pState, 0, (numStages) * sizeof(q31_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+}
+
+/**
+ * @} end of FIR_Lattice group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_q15.c
new file mode 100644
index 0000000..8693c37
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_q15.c
@@ -0,0 +1,530 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_lattice_q15.c
+*
+* Description: Q15 FIR lattice filter processing function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_Lattice
+ * @{
+ */
+
+
+/**
+ * @brief Processing function for the Q15 FIR lattice filter.
+ * @param[in] *S points to an instance of the Q15 FIR lattice structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ */
+
+void arm_fir_lattice_q15(
+ const arm_fir_lattice_instance_q15 * S,
+ q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *px; /* temporary state pointer */
+ q15_t *pk; /* temporary coefficient pointer */
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t fcurnt1, fnext1, gcurnt1 = 0, gnext1; /* temporary variables for first sample in loop unrolling */
+ q31_t fcurnt2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */
+ q31_t fcurnt3, fnext3, gnext3; /* temporary variables for third sample in loop unrolling */
+ q31_t fcurnt4, fnext4, gnext4; /* temporary variables for fourth sample in loop unrolling */
+ uint32_t numStages = S->numStages; /* Number of stages in the filter */
+ uint32_t blkCnt, stageCnt; /* temporary variables for counts */
+
+ pState = &S->pState[0];
+
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+
+ /* Read two samples from input buffer */
+ /* f0(n) = x(n) */
+ fcurnt1 = *pSrc++;
+ fcurnt2 = *pSrc++;
+
+ /* Initialize coeff pointer */
+ pk = (pCoeffs);
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Read g0(n-1) from state */
+ gcurnt1 = *px;
+
+ /* Process first sample for first tap */
+ /* f1(n) = f0(n) + K1 * g0(n-1) */
+ fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15u) + fcurnt1;
+ fnext1 = __SSAT(fnext1, 16);
+
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext1 = (q31_t) ((fcurnt1 * (*pk)) >> 15u) + gcurnt1;
+ gnext1 = __SSAT(gnext1, 16);
+
+ /* Process second sample for first tap */
+ /* for sample 2 processing */
+ fnext2 = (q31_t) ((fcurnt1 * (*pk)) >> 15u) + fcurnt2;
+ fnext2 = __SSAT(fnext2, 16);
+
+ gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15u) + fcurnt1;
+ gnext2 = __SSAT(gnext2, 16);
+
+
+ /* Read next two samples from input buffer */
+ /* f0(n+2) = x(n+2) */
+ fcurnt3 = *pSrc++;
+ fcurnt4 = *pSrc++;
+
+ /* Copy only last input samples into the state buffer
+ which is used for next four samples processing */
+ *px++ = (q15_t) fcurnt4;
+
+ /* Process third sample for first tap */
+ fnext3 = (q31_t) ((fcurnt2 * (*pk)) >> 15u) + fcurnt3;
+ fnext3 = __SSAT(fnext3, 16);
+ gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15u) + fcurnt2;
+ gnext3 = __SSAT(gnext3, 16);
+
+ /* Process fourth sample for first tap */
+ fnext4 = (q31_t) ((fcurnt3 * (*pk)) >> 15u) + fcurnt4;
+ fnext4 = __SSAT(fnext4, 16);
+ gnext4 = (q31_t) ((fcurnt4 * (*pk++)) >> 15u) + fcurnt3;
+ gnext4 = __SSAT(gnext4, 16);
+
+ /* Update of f values for next coefficient set processing */
+ fcurnt1 = fnext1;
+ fcurnt2 = fnext2;
+ fcurnt3 = fnext3;
+ fcurnt4 = fnext4;
+
+
+ /* Loop unrolling. Process 4 taps at a time . */
+ stageCnt = (numStages - 1u) >> 2;
+
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numStages-3 coefficients. */
+
+ /* Process 2nd, 3rd, 4th and 5th taps ... here */
+ while(stageCnt > 0u)
+ {
+ /* Read g1(n-1), g3(n-1) .... from state */
+ gcurnt1 = *px;
+
+ /* save g1(n) in state buffer */
+ *px++ = (q15_t) gnext4;
+
+ /* Process first sample for 2nd, 6th .. tap */
+ /* Sample processing for K2, K6.... */
+ /* f1(n) = f0(n) + K1 * g0(n-1) */
+ fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15u) + fcurnt1;
+ fnext1 = __SSAT(fnext1, 16);
+
+
+ /* Process second sample for 2nd, 6th .. tap */
+ /* for sample 2 processing */
+ fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15u) + fcurnt2;
+ fnext2 = __SSAT(fnext2, 16);
+ /* Process third sample for 2nd, 6th .. tap */
+ fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15u) + fcurnt3;
+ fnext3 = __SSAT(fnext3, 16);
+ /* Process fourth sample for 2nd, 6th .. tap */
+ /* fnext4 = fcurnt4 + (*pk) * gnext3; */
+ fnext4 = (q31_t) ((gnext3 * (*pk)) >> 15u) + fcurnt4;
+ fnext4 = __SSAT(fnext4, 16);
+
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ /* Calculation of state values for next stage */
+ gnext4 = (q31_t) ((fcurnt4 * (*pk)) >> 15u) + gnext3;
+ gnext4 = __SSAT(gnext4, 16);
+ gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15u) + gnext2;
+ gnext3 = __SSAT(gnext3, 16);
+
+ gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15u) + gnext1;
+ gnext2 = __SSAT(gnext2, 16);
+
+ gnext1 = (q31_t) ((fcurnt1 * (*pk++)) >> 15u) + gcurnt1;
+ gnext1 = __SSAT(gnext1, 16);
+
+
+ /* Read g2(n-1), g4(n-1) .... from state */
+ gcurnt1 = *px;
+
+ /* save g1(n) in state buffer */
+ *px++ = (q15_t) gnext4;
+
+ /* Sample processing for K3, K7.... */
+ /* Process first sample for 3rd, 7th .. tap */
+ /* f3(n) = f2(n) + K3 * g2(n-1) */
+ fcurnt1 = (q31_t) ((gcurnt1 * (*pk)) >> 15u) + fnext1;
+ fcurnt1 = __SSAT(fcurnt1, 16);
+
+ /* Process second sample for 3rd, 7th .. tap */
+ fcurnt2 = (q31_t) ((gnext1 * (*pk)) >> 15u) + fnext2;
+ fcurnt2 = __SSAT(fcurnt2, 16);
+
+ /* Process third sample for 3rd, 7th .. tap */
+ fcurnt3 = (q31_t) ((gnext2 * (*pk)) >> 15u) + fnext3;
+ fcurnt3 = __SSAT(fcurnt3, 16);
+
+ /* Process fourth sample for 3rd, 7th .. tap */
+ fcurnt4 = (q31_t) ((gnext3 * (*pk)) >> 15u) + fnext4;
+ fcurnt4 = __SSAT(fcurnt4, 16);
+
+ /* Calculation of state values for next stage */
+ /* g3(n) = f2(n) * K3 + g2(n-1) */
+ gnext4 = (q31_t) ((fnext4 * (*pk)) >> 15u) + gnext3;
+ gnext4 = __SSAT(gnext4, 16);
+
+ gnext3 = (q31_t) ((fnext3 * (*pk)) >> 15u) + gnext2;
+ gnext3 = __SSAT(gnext3, 16);
+
+ gnext2 = (q31_t) ((fnext2 * (*pk)) >> 15u) + gnext1;
+ gnext2 = __SSAT(gnext2, 16);
+
+ gnext1 = (q31_t) ((fnext1 * (*pk++)) >> 15u) + gcurnt1;
+ gnext1 = __SSAT(gnext1, 16);
+
+ /* Read g1(n-1), g3(n-1) .... from state */
+ gcurnt1 = *px;
+
+ /* save g1(n) in state buffer */
+ *px++ = (q15_t) gnext4;
+
+ /* Sample processing for K4, K8.... */
+ /* Process first sample for 4th, 8th .. tap */
+ /* f4(n) = f3(n) + K4 * g3(n-1) */
+ fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15u) + fcurnt1;
+ fnext1 = __SSAT(fnext1, 16);
+
+ /* Process second sample for 4th, 8th .. tap */
+ /* for sample 2 processing */
+ fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15u) + fcurnt2;
+ fnext2 = __SSAT(fnext2, 16);
+
+ /* Process third sample for 4th, 8th .. tap */
+ fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15u) + fcurnt3;
+ fnext3 = __SSAT(fnext3, 16);
+
+ /* Process fourth sample for 4th, 8th .. tap */
+ fnext4 = (q31_t) ((gnext3 * (*pk)) >> 15u) + fcurnt4;
+ fnext4 = __SSAT(fnext4, 16);
+
+ /* g4(n) = f3(n) * K4 + g3(n-1) */
+ /* Calculation of state values for next stage */
+ gnext4 = (q31_t) ((fcurnt4 * (*pk)) >> 15u) + gnext3;
+ gnext4 = __SSAT(gnext4, 16);
+
+ gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15u) + gnext2;
+ gnext3 = __SSAT(gnext3, 16);
+
+ gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15u) + gnext1;
+ gnext2 = __SSAT(gnext2, 16);
+ gnext1 = (q31_t) ((fcurnt1 * (*pk++)) >> 15u) + gcurnt1;
+ gnext1 = __SSAT(gnext1, 16);
+
+
+ /* Read g2(n-1), g4(n-1) .... from state */
+ gcurnt1 = *px;
+
+ /* save g4(n) in state buffer */
+ *px++ = (q15_t) gnext4;
+
+ /* Sample processing for K5, K9.... */
+ /* Process first sample for 5th, 9th .. tap */
+ /* f5(n) = f4(n) + K5 * g4(n-1) */
+ fcurnt1 = (q31_t) ((gcurnt1 * (*pk)) >> 15u) + fnext1;
+ fcurnt1 = __SSAT(fcurnt1, 16);
+
+ /* Process second sample for 5th, 9th .. tap */
+ fcurnt2 = (q31_t) ((gnext1 * (*pk)) >> 15u) + fnext2;
+ fcurnt2 = __SSAT(fcurnt2, 16);
+
+ /* Process third sample for 5th, 9th .. tap */
+ fcurnt3 = (q31_t) ((gnext2 * (*pk)) >> 15u) + fnext3;
+ fcurnt3 = __SSAT(fcurnt3, 16);
+
+ /* Process fourth sample for 5th, 9th .. tap */
+ fcurnt4 = (q31_t) ((gnext3 * (*pk)) >> 15u) + fnext4;
+ fcurnt4 = __SSAT(fcurnt4, 16);
+
+ /* Calculation of state values for next stage */
+ /* g5(n) = f4(n) * K5 + g4(n-1) */
+ gnext4 = (q31_t) ((fnext4 * (*pk)) >> 15u) + gnext3;
+ gnext4 = __SSAT(gnext4, 16);
+ gnext3 = (q31_t) ((fnext3 * (*pk)) >> 15u) + gnext2;
+ gnext3 = __SSAT(gnext3, 16);
+ gnext2 = (q31_t) ((fnext2 * (*pk)) >> 15u) + gnext1;
+ gnext2 = __SSAT(gnext2, 16);
+ gnext1 = (q31_t) ((fnext1 * (*pk++)) >> 15u) + gcurnt1;
+ gnext1 = __SSAT(gnext1, 16);
+
+ stageCnt--;
+ }
+
+ /* If the (filter length -1) is not a multiple of 4, compute the remaining filter taps */
+ stageCnt = (numStages - 1u) % 0x4u;
+
+ while(stageCnt > 0u)
+ {
+ gcurnt1 = *px;
+
+ /* save g value in state buffer */
+ *px++ = (q15_t) gnext4;
+
+ /* Process four samples for last three taps here */
+ fnext1 = (q31_t) ((gcurnt1 * (*pk)) >> 15u) + fcurnt1;
+ fnext1 = __SSAT(fnext1, 16);
+ fnext2 = (q31_t) ((gnext1 * (*pk)) >> 15u) + fcurnt2;
+ fnext2 = __SSAT(fnext2, 16);
+
+ fnext3 = (q31_t) ((gnext2 * (*pk)) >> 15u) + fcurnt3;
+ fnext3 = __SSAT(fnext3, 16);
+
+ fnext4 = (q31_t) ((gnext3 * (*pk)) >> 15u) + fcurnt4;
+ fnext4 = __SSAT(fnext4, 16);
+
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext4 = (q31_t) ((fcurnt4 * (*pk)) >> 15u) + gnext3;
+ gnext4 = __SSAT(gnext4, 16);
+ gnext3 = (q31_t) ((fcurnt3 * (*pk)) >> 15u) + gnext2;
+ gnext3 = __SSAT(gnext3, 16);
+ gnext2 = (q31_t) ((fcurnt2 * (*pk)) >> 15u) + gnext1;
+ gnext2 = __SSAT(gnext2, 16);
+ gnext1 = (q31_t) ((fcurnt1 * (*pk++)) >> 15u) + gcurnt1;
+ gnext1 = __SSAT(gnext1, 16);
+
+ /* Update of f values for next coefficient set processing */
+ fcurnt1 = fnext1;
+ fcurnt2 = fnext2;
+ fcurnt3 = fnext3;
+ fcurnt4 = fnext4;
+
+ stageCnt--;
+
+ }
+
+ /* The results in the 4 accumulators, store in the destination buffer. */
+ /* y(n) = fN(n) */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pDst)++ = __PKHBT(fcurnt1, fcurnt2, 16);
+ *__SIMD32(pDst)++ = __PKHBT(fcurnt3, fcurnt4, 16);
+
+#else
+
+ *__SIMD32(pDst)++ = __PKHBT(fcurnt2, fcurnt1, 16);
+ *__SIMD32(pDst)++ = __PKHBT(fcurnt4, fcurnt3, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* f0(n) = x(n) */
+ fcurnt1 = *pSrc++;
+
+ /* Initialize coeff pointer */
+ pk = (pCoeffs);
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* read g2(n) from state buffer */
+ gcurnt1 = *px;
+
+ /* for sample 1 processing */
+ /* f1(n) = f0(n) + K1 * g0(n-1) */
+ fnext1 = (((q31_t) gcurnt1 * (*pk)) >> 15u) + fcurnt1;
+ fnext1 = __SSAT(fnext1, 16);
+
+
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext1 = (((q31_t) fcurnt1 * (*pk++)) >> 15u) + gcurnt1;
+ gnext1 = __SSAT(gnext1, 16);
+
+ /* save g1(n) in state buffer */
+ *px++ = (q15_t) fcurnt1;
+
+ /* f1(n) is saved in fcurnt1
+ for next stage processing */
+ fcurnt1 = fnext1;
+
+ stageCnt = (numStages - 1u);
+
+ /* stage loop */
+ while(stageCnt > 0u)
+ {
+ /* read g2(n) from state buffer */
+ gcurnt1 = *px;
+
+ /* save g1(n) in state buffer */
+ *px++ = (q15_t) gnext1;
+
+ /* Sample processing for K2, K3.... */
+ /* f2(n) = f1(n) + K2 * g1(n-1) */
+ fnext1 = (((q31_t) gcurnt1 * (*pk)) >> 15u) + fcurnt1;
+ fnext1 = __SSAT(fnext1, 16);
+
+ /* g2(n) = f1(n) * K2 + g1(n-1) */
+ gnext1 = (((q31_t) fcurnt1 * (*pk++)) >> 15u) + gcurnt1;
+ gnext1 = __SSAT(gnext1, 16);
+
+
+ /* f1(n) is saved in fcurnt1
+ for next stage processing */
+ fcurnt1 = fnext1;
+
+ stageCnt--;
+
+ }
+
+ /* y(n) = fN(n) */
+ *pDst++ = __SSAT(fcurnt1, 16);
+
+
+ blkCnt--;
+
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ q31_t fcurnt, fnext, gcurnt, gnext; /* temporary variables */
+ uint32_t numStages = S->numStages; /* Length of the filter */
+ uint32_t blkCnt, stageCnt; /* temporary variables for counts */
+
+ pState = &S->pState[0];
+
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* f0(n) = x(n) */
+ fcurnt = *pSrc++;
+
+ /* Initialize coeff pointer */
+ pk = (pCoeffs);
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* read g0(n-1) from state buffer */
+ gcurnt = *px;
+
+ /* for sample 1 processing */
+ /* f1(n) = f0(n) + K1 * g0(n-1) */
+ fnext = ((gcurnt * (*pk)) >> 15u) + fcurnt;
+ fnext = __SSAT(fnext, 16);
+
+
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext = ((fcurnt * (*pk++)) >> 15u) + gcurnt;
+ gnext = __SSAT(gnext, 16);
+
+ /* save f0(n) in state buffer */
+ *px++ = (q15_t) fcurnt;
+
+ /* f1(n) is saved in fcurnt
+ for next stage processing */
+ fcurnt = fnext;
+
+ stageCnt = (numStages - 1u);
+
+ /* stage loop */
+ while(stageCnt > 0u)
+ {
+ /* read g1(n-1) from state buffer */
+ gcurnt = *px;
+
+ /* save g0(n-1) in state buffer */
+ *px++ = (q15_t) gnext;
+
+ /* Sample processing for K2, K3.... */
+ /* f2(n) = f1(n) + K2 * g1(n-1) */
+ fnext = ((gcurnt * (*pk)) >> 15u) + fcurnt;
+ fnext = __SSAT(fnext, 16);
+
+ /* g2(n) = f1(n) * K2 + g1(n-1) */
+ gnext = ((fcurnt * (*pk++)) >> 15u) + gcurnt;
+ gnext = __SSAT(gnext, 16);
+
+
+ /* f1(n) is saved in fcurnt
+ for next stage processing */
+ fcurnt = fnext;
+
+ stageCnt--;
+
+ }
+
+ /* y(n) = fN(n) */
+ *pDst++ = __SSAT(fcurnt, 16);
+
+
+ blkCnt--;
+
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of FIR_Lattice group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_q31.c
new file mode 100644
index 0000000..7bd8c19
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_lattice_q31.c
@@ -0,0 +1,347 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_lattice_q31.c
+*
+* Description: Q31 FIR lattice filter processing function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_Lattice
+ * @{
+ */
+
+
+/**
+ * @brief Processing function for the Q31 FIR lattice filter.
+ * @param[in] *S points to an instance of the Q31 FIR lattice structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ * In order to avoid overflows the input signal must be scaled down by 2*log2(numStages) bits.
+ */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+void arm_fir_lattice_q31(
+ const arm_fir_lattice_instance_q31 * S,
+ q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t *pState; /* State pointer */
+ q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *px; /* temporary state pointer */
+ q31_t *pk; /* temporary coefficient pointer */
+ q31_t fcurr1, fnext1, gcurr1 = 0, gnext1; /* temporary variables for first sample in loop unrolling */
+ q31_t fcurr2, fnext2, gnext2; /* temporary variables for second sample in loop unrolling */
+ uint32_t numStages = S->numStages; /* Length of the filter */
+ uint32_t blkCnt, stageCnt; /* temporary variables for counts */
+ q31_t k;
+
+ pState = &S->pState[0];
+
+ blkCnt = blockSize >> 1u;
+
+ /* First part of the processing with loop unrolling. Compute 2 outputs at a time.
+ a second loop below computes the remaining 1 sample. */
+ while(blkCnt > 0u)
+ {
+ /* f0(n) = x(n) */
+ fcurr1 = *pSrc++;
+
+ /* f0(n) = x(n) */
+ fcurr2 = *pSrc++;
+
+ /* Initialize coeff pointer */
+ pk = (pCoeffs);
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* read g0(n - 1) from state buffer */
+ gcurr1 = *px;
+
+ /* Read the reflection coefficient */
+ k = *pk++;
+
+ /* for sample 1 processing */
+ /* f1(n) = f0(n) + K1 * g0(n-1) */
+ fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32);
+
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32);
+ fnext1 = fcurr1 + (fnext1 << 1u);
+ gnext1 = gcurr1 + (gnext1 << 1u);
+
+ /* for sample 1 processing */
+ /* f1(n) = f0(n) + K1 * g0(n-1) */
+ fnext2 = (q31_t) (((q63_t) fcurr1 * k) >> 32);
+
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext2 = (q31_t) (((q63_t) fcurr2 * (k)) >> 32);
+ fnext2 = fcurr2 + (fnext2 << 1u);
+ gnext2 = fcurr1 + (gnext2 << 1u);
+
+ /* save g1(n) in state buffer */
+ *px++ = fcurr2;
+
+ /* f1(n) is saved in fcurr1
+ for next stage processing */
+ fcurr1 = fnext1;
+ fcurr2 = fnext2;
+
+ stageCnt = (numStages - 1u);
+
+ /* stage loop */
+ while(stageCnt > 0u)
+ {
+
+ /* Read the reflection coefficient */
+ k = *pk++;
+
+ /* read g2(n) from state buffer */
+ gcurr1 = *px;
+
+ /* save g1(n) in state buffer */
+ *px++ = gnext2;
+
+ /* Sample processing for K2, K3.... */
+ /* f2(n) = f1(n) + K2 * g1(n-1) */
+ fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32);
+ fnext2 = (q31_t) (((q63_t) gnext1 * k) >> 32);
+
+ fnext1 = fcurr1 + (fnext1 << 1u);
+ fnext2 = fcurr2 + (fnext2 << 1u);
+
+ /* g2(n) = f1(n) * K2 + g1(n-1) */
+ gnext2 = (q31_t) (((q63_t) fcurr2 * (k)) >> 32);
+ gnext2 = gnext1 + (gnext2 << 1u);
+
+ /* g2(n) = f1(n) * K2 + g1(n-1) */
+ gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32);
+ gnext1 = gcurr1 + (gnext1 << 1u);
+
+ /* f1(n) is saved in fcurr1
+ for next stage processing */
+ fcurr1 = fnext1;
+ fcurr2 = fnext2;
+
+ stageCnt--;
+
+ }
+
+ /* y(n) = fN(n) */
+ *pDst++ = fcurr1;
+ *pDst++ = fcurr2;
+
+ blkCnt--;
+
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x2u;
+
+ while(blkCnt > 0u)
+ {
+ /* f0(n) = x(n) */
+ fcurr1 = *pSrc++;
+
+ /* Initialize coeff pointer */
+ pk = (pCoeffs);
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* read g0(n - 1) from state buffer */
+ gcurr1 = *px;
+
+ /* Read the reflection coefficient */
+ k = *pk++;
+
+ /* for sample 1 processing */
+ /* f1(n) = f0(n) + K1 * g0(n-1) */
+ fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32);
+ fnext1 = fcurr1 + (fnext1 << 1u);
+
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32);
+ gnext1 = gcurr1 + (gnext1 << 1u);
+
+ /* save g1(n) in state buffer */
+ *px++ = fcurr1;
+
+ /* f1(n) is saved in fcurr1
+ for next stage processing */
+ fcurr1 = fnext1;
+
+ stageCnt = (numStages - 1u);
+
+ /* stage loop */
+ while(stageCnt > 0u)
+ {
+ /* Read the reflection coefficient */
+ k = *pk++;
+
+ /* read g2(n) from state buffer */
+ gcurr1 = *px;
+
+ /* save g1(n) in state buffer */
+ *px++ = gnext1;
+
+ /* Sample processing for K2, K3.... */
+ /* f2(n) = f1(n) + K2 * g1(n-1) */
+ fnext1 = (q31_t) (((q63_t) gcurr1 * k) >> 32);
+ fnext1 = fcurr1 + (fnext1 << 1u);
+
+ /* g2(n) = f1(n) * K2 + g1(n-1) */
+ gnext1 = (q31_t) (((q63_t) fcurr1 * (k)) >> 32);
+ gnext1 = gcurr1 + (gnext1 << 1u);
+
+ /* f1(n) is saved in fcurr1
+ for next stage processing */
+ fcurr1 = fnext1;
+
+ stageCnt--;
+
+ }
+
+
+ /* y(n) = fN(n) */
+ *pDst++ = fcurr1;
+
+ blkCnt--;
+
+ }
+
+
+}
+
+
+#else
+
+/* Run the below code for Cortex-M0 */
+
+void arm_fir_lattice_q31(
+ const arm_fir_lattice_instance_q31 * S,
+ q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t *pState; /* State pointer */
+ q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *px; /* temporary state pointer */
+ q31_t *pk; /* temporary coefficient pointer */
+ q31_t fcurr, fnext, gcurr, gnext; /* temporary variables */
+ uint32_t numStages = S->numStages; /* Length of the filter */
+ uint32_t blkCnt, stageCnt; /* temporary variables for counts */
+
+ pState = &S->pState[0];
+
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* f0(n) = x(n) */
+ fcurr = *pSrc++;
+
+ /* Initialize coeff pointer */
+ pk = (pCoeffs);
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* read g0(n-1) from state buffer */
+ gcurr = *px;
+
+ /* for sample 1 processing */
+ /* f1(n) = f0(n) + K1 * g0(n-1) */
+ fnext = (q31_t) (((q63_t) gcurr * (*pk)) >> 31) + fcurr;
+ /* g1(n) = f0(n) * K1 + g0(n-1) */
+ gnext = (q31_t) (((q63_t) fcurr * (*pk++)) >> 31) + gcurr;
+ /* save g1(n) in state buffer */
+ *px++ = fcurr;
+
+ /* f1(n) is saved in fcurr1
+ for next stage processing */
+ fcurr = fnext;
+
+ stageCnt = (numStages - 1u);
+
+ /* stage loop */
+ while(stageCnt > 0u)
+ {
+ /* read g2(n) from state buffer */
+ gcurr = *px;
+
+ /* save g1(n) in state buffer */
+ *px++ = gnext;
+
+ /* Sample processing for K2, K3.... */
+ /* f2(n) = f1(n) + K2 * g1(n-1) */
+ fnext = (q31_t) (((q63_t) gcurr * (*pk)) >> 31) + fcurr;
+ /* g2(n) = f1(n) * K2 + g1(n-1) */
+ gnext = (q31_t) (((q63_t) fcurr * (*pk++)) >> 31) + gcurr;
+
+ /* f1(n) is saved in fcurr1
+ for next stage processing */
+ fcurr = fnext;
+
+ stageCnt--;
+
+ }
+
+ /* y(n) = fN(n) */
+ *pDst++ = fcurr;
+
+ blkCnt--;
+
+ }
+
+}
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+
+/**
+ * @} end of FIR_Lattice group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q15.c
new file mode 100644
index 0000000..1e5873a
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q15.c
@@ -0,0 +1,688 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_q15.c
+*
+* Description: Q15 FIR filter processing function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR
+ * @{
+ */
+
+/**
+ * @brief Processing function for the Q15 FIR filter.
+ * @param[in] *S points to an instance of the Q15 FIR structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process per call.
+ * @return none.
+ *
+ *
+ * \par Restrictions
+ * If the silicon does not support unaligned memory access enable the macro UNALIGNED_SUPPORT_DISABLE
+ * In this case input, output, state buffers should be aligned by 32-bit
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+ * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+ * Lastly, the accumulator is saturated to yield a result in 1.15 format.
+ *
+ * \par
+ * Refer to the function arm_fir_fast_q15() for a faster but less precise implementation of this function.
+ */
+
+#ifndef ARM_MATH_CM0
+
+/* Run the below code for Cortex-M4 and Cortex-M3 */
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+
+void arm_fir_q15(
+ const arm_fir_instance_q15 * S,
+ q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q15_t *px1; /* Temporary q15 pointer for state buffer */
+ q15_t *pb; /* Temporary pointer for coefficient buffer */
+ q31_t x0, x1, x2, x3, c0; /* Temporary variables to hold SIMD state and coefficient values */
+ q63_t acc0, acc1, acc2, acc3; /* Accumulators */
+ uint32_t numTaps = S->numTaps; /* Number of taps in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Apply loop unrolling and compute 4 output values simultaneously.
+ * The variables acc0 ... acc3 hold output values that are being computed:
+ *
+ * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
+ */
+
+ blkCnt = blockSize >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* Copy four new input samples into the state buffer.
+ ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */
+ *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
+ *__SIMD32(pStateCurnt)++ = *__SIMD32(pSrc)++;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Initialize state pointer of type q15 */
+ px1 = pState;
+
+ /* Initialize coeff pointer of type q31 */
+ pb = pCoeffs;
+
+ /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */
+ x0 = _SIMD32_OFFSET(px1);
+
+ /* Read the third and forth samples from the state buffer: x[n-N-1], x[n-N-2] */
+ x1 = _SIMD32_OFFSET(px1 + 1u);
+
+ px1 += 2u;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-4 coefficients. */
+ tapCnt = numTaps >> 2;
+
+ while(tapCnt > 0u)
+ {
+ /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */
+ c0 = *__SIMD32(pb)++;
+
+ /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
+ acc0 = __SMLALD(x0, c0, acc0);
+
+ /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
+ acc1 = __SMLALD(x1, c0, acc1);
+
+ /* Read state x[n-N-2], x[n-N-3] */
+ x2 = _SIMD32_OFFSET(px1);
+
+ /* Read state x[n-N-3], x[n-N-4] */
+ x3 = _SIMD32_OFFSET(px1 + 1u);
+
+ /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
+ acc2 = __SMLALD(x2, c0, acc2);
+
+ /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
+ acc3 = __SMLALD(x3, c0, acc3);
+
+ /* Read coefficients b[N-2], b[N-3] */
+ c0 = *__SIMD32(pb)++;
+
+ /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
+ acc0 = __SMLALD(x2, c0, acc0);
+
+ /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
+ acc1 = __SMLALD(x3, c0, acc1);
+
+ /* Read state x[n-N-4], x[n-N-5] */
+ x0 = _SIMD32_OFFSET(px1 + 2u);
+
+ /* Read state x[n-N-5], x[n-N-6] */
+ x1 = _SIMD32_OFFSET(px1 + 3u);
+
+ /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
+ acc2 = __SMLALD(x0, c0, acc2);
+
+ /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
+ acc3 = __SMLALD(x1, c0, acc3);
+
+ px1 += 4u;
+
+ tapCnt--;
+
+ }
+
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps.
+ ** This is always be 2 taps since the filter length is even. */
+ if((numTaps & 0x3u) != 0u)
+ {
+ /* Read 2 coefficients */
+ c0 = *__SIMD32(pb)++;
+
+ /* Fetch 4 state variables */
+ x2 = _SIMD32_OFFSET(px1);
+
+ x3 = _SIMD32_OFFSET(px1 + 1u);
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALD(x0, c0, acc0);
+
+ px1 += 2u;
+
+ acc1 = __SMLALD(x1, c0, acc1);
+ acc2 = __SMLALD(x2, c0, acc2);
+ acc3 = __SMLALD(x3, c0, acc3);
+ }
+
+ /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
+ ** Then store the 4 outputs in the destination buffer. */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pDst)++ =
+ __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
+ *__SIMD32(pDst)++ =
+ __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
+
+#else
+
+ *__SIMD32(pDst)++ =
+ __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
+ *__SIMD32(pDst)++ =
+ __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+
+
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+ while(blkCnt > 0u)
+ {
+ /* Copy two samples into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc0 = 0;
+
+ /* Initialize state pointer of type q15 */
+ px1 = pState;
+
+ /* Initialize coeff pointer of type q31 */
+ pb = pCoeffs;
+
+ tapCnt = numTaps >> 1;
+
+ do
+ {
+
+ c0 = *__SIMD32(pb)++;
+ x0 = *__SIMD32(px1)++;
+
+ acc0 = __SMLALD(x0, c0, acc0);
+ tapCnt--;
+ }
+ while(tapCnt > 0u);
+
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ ** Then store the output in the destination buffer. */
+ *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ /* Calculation of count for copying integer writes */
+ tapCnt = (numTaps - 1u) >> 2;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Copy state values to start of state buffer */
+ *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
+ *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
+
+ tapCnt--;
+
+ }
+
+ /* Calculation of count for remaining q15_t data */
+ tapCnt = (numTaps - 1u) % 0x4u;
+
+ /* copy remaining data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+}
+
+#else /* UNALIGNED_SUPPORT_DISABLE */
+
+void arm_fir_q15(
+ const arm_fir_instance_q15 * S,
+ q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q63_t acc0, acc1, acc2, acc3; /* Accumulators */
+ q15_t *pb; /* Temporary pointer for coefficient buffer */
+ q15_t *px; /* Temporary q31 pointer for SIMD state buffer accesses */
+ q31_t x0, x1, x2, c0; /* Temporary variables to hold SIMD state and coefficient values */
+ uint32_t numTaps = S->numTaps; /* Number of taps in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Apply loop unrolling and compute 4 output values simultaneously.
+ * The variables acc0 ... acc3 hold output values that are being computed:
+ *
+ * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
+ */
+
+ blkCnt = blockSize >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* Copy four new input samples into the state buffer.
+ ** Use 32-bit SIMD to move the 16-bit data. Only requires two copies. */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Typecast q15_t pointer to q31_t pointer for state reading in q31_t */
+ px = pState;
+
+ /* Typecast q15_t pointer to q31_t pointer for coefficient reading in q31_t */
+ pb = pCoeffs;
+
+ /* Read the first two samples from the state buffer: x[n-N], x[n-N-1] */
+ x0 = *__SIMD32(px)++;
+
+ /* Read the third and forth samples from the state buffer: x[n-N-2], x[n-N-3] */
+ x2 = *__SIMD32(px)++;
+
+ /* Loop over the number of taps. Unroll by a factor of 4.
+ ** Repeat until we've computed numTaps-(numTaps%4) coefficients. */
+ tapCnt = numTaps >> 2;
+
+ while(tapCnt > 0)
+ {
+ /* Read the first two coefficients using SIMD: b[N] and b[N-1] coefficients */
+ c0 = *__SIMD32(pb)++;
+
+ /* acc0 += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
+ acc0 = __SMLALD(x0, c0, acc0);
+
+ /* acc2 += b[N] * x[n-N-2] + b[N-1] * x[n-N-3] */
+ acc2 = __SMLALD(x2, c0, acc2);
+
+ /* pack x[n-N-1] and x[n-N-2] */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x2, x0, 0);
+#else
+ x1 = __PKHBT(x0, x2, 0);
+#endif
+
+ /* Read state x[n-N-4], x[n-N-5] */
+ x0 = _SIMD32_OFFSET(px);
+
+ /* acc1 += b[N] * x[n-N-1] + b[N-1] * x[n-N-2] */
+ acc1 = __SMLALDX(x1, c0, acc1);
+
+ /* pack x[n-N-3] and x[n-N-4] */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x0, x2, 0);
+#else
+ x1 = __PKHBT(x2, x0, 0);
+#endif
+
+ /* acc3 += b[N] * x[n-N-3] + b[N-1] * x[n-N-4] */
+ acc3 = __SMLALDX(x1, c0, acc3);
+
+ /* Read coefficients b[N-2], b[N-3] */
+ c0 = *__SIMD32(pb)++;
+
+ /* acc0 += b[N-2] * x[n-N-2] + b[N-3] * x[n-N-3] */
+ acc0 = __SMLALD(x2, c0, acc0);
+
+ /* Read state x[n-N-6], x[n-N-7] with offset */
+ x2 = _SIMD32_OFFSET(px + 2u);
+
+ /* acc2 += b[N-2] * x[n-N-4] + b[N-3] * x[n-N-5] */
+ acc2 = __SMLALD(x0, c0, acc2);
+
+ /* acc1 += b[N-2] * x[n-N-3] + b[N-3] * x[n-N-4] */
+ acc1 = __SMLALDX(x1, c0, acc1);
+
+ /* pack x[n-N-5] and x[n-N-6] */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x2, x0, 0);
+#else
+ x1 = __PKHBT(x0, x2, 0);
+#endif
+
+ /* acc3 += b[N-2] * x[n-N-5] + b[N-3] * x[n-N-6] */
+ acc3 = __SMLALDX(x1, c0, acc3);
+
+ /* Update state pointer for next state reading */
+ px += 4u;
+
+ /* Decrement tap count */
+ tapCnt--;
+
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps.
+ ** This is always be 2 taps since the filter length is even. */
+ if((numTaps & 0x3u) != 0u)
+ {
+
+ /* Read last two coefficients */
+ c0 = *__SIMD32(pb)++;
+
+ /* Perform the multiply-accumulates */
+ acc0 = __SMLALD(x0, c0, acc0);
+ acc2 = __SMLALD(x2, c0, acc2);
+
+ /* pack state variables */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x2, x0, 0);
+#else
+ x1 = __PKHBT(x0, x2, 0);
+#endif
+
+ /* Read last state variables */
+ x0 = *__SIMD32(px);
+
+ /* Perform the multiply-accumulates */
+ acc1 = __SMLALDX(x1, c0, acc1);
+
+ /* pack state variables */
+#ifndef ARM_MATH_BIG_ENDIAN
+ x1 = __PKHBT(x0, x2, 0);
+#else
+ x1 = __PKHBT(x2, x0, 0);
+#endif
+
+ /* Perform the multiply-accumulates */
+ acc3 = __SMLALDX(x1, c0, acc3);
+ }
+
+ /* The results in the 4 accumulators are in 2.30 format. Convert to 1.15 with saturation.
+ ** Then store the 4 outputs in the destination buffer. */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pDst)++ =
+ __PKHBT(__SSAT((acc0 >> 15), 16), __SSAT((acc1 >> 15), 16), 16);
+
+ *__SIMD32(pDst)++ =
+ __PKHBT(__SSAT((acc2 >> 15), 16), __SSAT((acc3 >> 15), 16), 16);
+
+#else
+
+ *__SIMD32(pDst)++ =
+ __PKHBT(__SSAT((acc1 >> 15), 16), __SSAT((acc0 >> 15), 16), 16);
+
+ *__SIMD32(pDst)++ =
+ __PKHBT(__SSAT((acc3 >> 15), 16), __SSAT((acc2 >> 15), 16), 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+ while(blkCnt > 0u)
+ {
+ /* Copy two samples into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc0 = 0;
+
+ /* Use SIMD to hold states and coefficients */
+ px = pState;
+ pb = pCoeffs;
+
+ tapCnt = numTaps >> 1u;
+
+ do
+ {
+ acc0 += (q31_t) * px++ * *pb++;
+ acc0 += (q31_t) * px++ * *pb++;
+ tapCnt--;
+ }
+ while(tapCnt > 0u);
+
+ /* The result is in 2.30 format. Convert to 1.15 with saturation.
+ ** Then store the output in the destination buffer. */
+ *pDst++ = (q15_t) (__SSAT((acc0 >> 15), 16));
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ /* Calculation of count for copying integer writes */
+ tapCnt = (numTaps - 1u) >> 2;
+
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ tapCnt--;
+
+ }
+
+ /* Calculation of count for remaining q15_t data */
+ tapCnt = (numTaps - 1u) % 0x4u;
+
+ /* copy remaining data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+}
+
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+#else /* ARM_MATH_CM0 */
+
+
+/* Run the below code for Cortex-M0 */
+
+void arm_fir_q15(
+ const arm_fir_instance_q15 * S,
+ q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+
+
+
+ q15_t *px; /* Temporary pointer for state buffer */
+ q15_t *pb; /* Temporary pointer for coefficient buffer */
+ q63_t acc; /* Accumulator */
+ uint32_t numTaps = S->numTaps; /* Number of nTaps in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+
+ /* S->pState buffer contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Initialize blkCnt with blockSize */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc = 0;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize Coefficient pointer */
+ pb = pCoeffs;
+
+ tapCnt = numTaps;
+
+ /* Perform the multiply-accumulates */
+ do
+ {
+ /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
+ acc += (q31_t) * px++ * *pb++;
+ tapCnt--;
+ } while(tapCnt > 0u);
+
+ /* The result is in 2.30 format. Convert to 1.15
+ ** Then store the output in the destination buffer. */
+ *pDst++ = (q15_t) __SSAT((acc >> 15u), 16);
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement the samples loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ /* Copy numTaps number of values */
+ tapCnt = (numTaps - 1u);
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+}
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+
+
+
+/**
+ * @} end of FIR group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q31.c
new file mode 100644
index 0000000..77ac680
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q31.c
@@ -0,0 +1,362 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_q31.c
+*
+* Description: Q31 FIR filter processing function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR
+ * @{
+ */
+
+/**
+ * @param[in] *S points to an instance of the Q31 FIR filter structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process per call.
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function is implemented using an internal 64-bit accumulator.
+ * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ * Thus, if the accumulator result overflows it wraps around rather than clip.
+ * In order to avoid overflows completely the input signal must be scaled down by log2(numTaps) bits.
+ * After all multiply-accumulates are performed, the 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+ *
+ * \par
+ * Refer to the function arm_fir_fast_q31() for a faster but less precise implementation of this filter for Cortex-M3 and Cortex-M4.
+ */
+
+void arm_fir_q31(
+ const arm_fir_instance_q31 * S,
+ q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCurnt; /* Points to the current sample of the state */
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t x0, x1, x2; /* Temporary variables to hold state */
+ q31_t c0; /* Temporary variable to hold coefficient value */
+ q31_t *px; /* Temporary pointer for state */
+ q31_t *pb; /* Temporary pointer for coefficient buffer */
+ q63_t acc0, acc1, acc2; /* Accumulators */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt, tapCntN3; /* Loop counters */
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Apply loop unrolling and compute 4 output values simultaneously.
+ * The variables acc0 ... acc3 hold output values that are being computed:
+ *
+ * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
+ */
+ blkCnt = blockSize / 3;
+ blockSize = blockSize - (3 * blkCnt);
+
+ tapCnt = numTaps / 3;
+ tapCntN3 = numTaps - (3 * tapCnt);
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* Copy three new input samples into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Read the first two samples from the state buffer:
+ * x[n-numTaps], x[n-numTaps-1] */
+ x0 = *(px++);
+ x1 = *(px++);
+
+ /* Loop unrolling. Process 3 taps at a time. */
+ i = tapCnt;
+
+ while(i > 0u)
+ {
+ /* Read the b[numTaps] coefficient */
+ c0 = *pb;
+
+ /* Read x[n-numTaps-2] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += ((q63_t) x0 * c0);
+ acc1 += ((q63_t) x1 * c0);
+ acc2 += ((q63_t) x2 * c0);
+
+ /* Read the coefficient and state */
+ c0 = *(pb + 1u);
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += ((q63_t) x1 * c0);
+ acc1 += ((q63_t) x2 * c0);
+ acc2 += ((q63_t) x0 * c0);
+
+ /* Read the coefficient and state */
+ c0 = *(pb + 2u);
+ x1 = *(px++);
+
+ /* update coefficient pointer */
+ pb += 3u;
+
+ /* Perform the multiply-accumulates */
+ acc0 += ((q63_t) x2 * c0);
+ acc1 += ((q63_t) x0 * c0);
+ acc2 += ((q63_t) x1 * c0);
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* If the filter length is not a multiple of 3, compute the remaining filter taps */
+
+ i = tapCntN3;
+
+ while(i > 0u)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch 1 state variable */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += ((q63_t) x0 * c0);
+ acc1 += ((q63_t) x1 * c0);
+ acc2 += ((q63_t) x2 * c0);
+
+ /* Reuse the present sample states for next sample */
+ x0 = x1;
+ x1 = x2;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 3 to process the next group of 3 samples */
+ pState = pState + 3;
+
+ /* The results in the 3 accumulators are in 2.30 format. Convert to 1.31
+ ** Then store the 3 outputs in the destination buffer. */
+ *pDst++ = (q31_t) (acc0 >> 31u);
+ *pDst++ = (q31_t) (acc1 >> 31u);
+ *pDst++ = (q31_t) (acc2 >> 31u);
+
+ /* Decrement the samples loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 3, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+
+ while(blockSize > 0u)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc0 = 0;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize Coefficient pointer */
+ pb = (pCoeffs);
+
+ i = numTaps;
+
+ /* Perform the multiply-accumulates */
+ do
+ {
+ acc0 += (q63_t) * (px++) * (*(pb++));
+ i--;
+ } while(i > 0u);
+
+ /* The result is in 2.62 format. Convert to 1.31
+ ** Then store the output in the destination buffer. */
+ *pDst++ = (q31_t) (acc0 >> 31u);
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement the samples loop counter */
+ blockSize--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ tapCnt = (numTaps - 1u) >> 2u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1u) % 0x4u;
+
+ /* Copy the remaining q31_t data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+#else
+
+/* Run the below code for Cortex-M0 */
+
+ q31_t *px; /* Temporary pointer for state */
+ q31_t *pb; /* Temporary pointer for coefficient buffer */
+ q63_t acc; /* Accumulator */
+ uint32_t numTaps = S->numTaps; /* Length of the filter */
+ uint32_t i, tapCnt, blkCnt; /* Loop counters */
+
+ /* S->pState buffer contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Initialize blkCnt with blockSize */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc = 0;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize Coefficient pointer */
+ pb = pCoeffs;
+
+ i = numTaps;
+
+ /* Perform the multiply-accumulates */
+ do
+ {
+ /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
+ acc += (q63_t) * px++ * *pb++;
+ i--;
+ } while(i > 0u);
+
+ /* The result is in 2.62 format. Convert to 1.31
+ ** Then store the output in the destination buffer. */
+ *pDst++ = (q31_t) (acc >> 31u);
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement the samples loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the starting of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ /* Copy numTaps number of values */
+ tapCnt = numTaps - 1u;
+
+ /* Copy the data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of FIR group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q7.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q7.c
new file mode 100644
index 0000000..ba99e61
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_q7.c
@@ -0,0 +1,387 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_q7.c
+*
+* Description: Q7 FIR filter processing function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR
+ * @{
+ */
+
+/**
+ * @param[in] *S points to an instance of the Q7 FIR filter structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process per call.
+ * @return none.
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function is implemented using a 32-bit internal accumulator.
+ * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
+ * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ * The accumulator is converted to 18.7 format by discarding the low 7 bits.
+ * Finally, the result is truncated to 1.7 format.
+ */
+
+void arm_fir_q7(
+ const arm_fir_instance_q7 * S,
+ q7_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q7_t *pState = S->pState; /* State pointer */
+ q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q7_t *pStateCurnt; /* Points to the current sample of the state */
+ q7_t x0, x1, x2, x3; /* Temporary variables to hold state */
+ q7_t c0; /* Temporary variable to hold coefficient value */
+ q7_t *px; /* Temporary pointer for state */
+ q7_t *pb; /* Temporary pointer for coefficient buffer */
+ q31_t acc0, acc1, acc2, acc3; /* Accumulators */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t i, tapCnt, blkCnt; /* Loop counters */
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Apply loop unrolling and compute 4 output values simultaneously.
+ * The variables acc0 ... acc3 hold output values that are being computed:
+ *
+ * acc0 = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0]
+ * acc1 = b[numTaps-1] * x[n-numTaps] + b[numTaps-2] * x[n-numTaps-1] + b[numTaps-3] * x[n-numTaps-2] +...+ b[0] * x[1]
+ * acc2 = b[numTaps-1] * x[n-numTaps+1] + b[numTaps-2] * x[n-numTaps] + b[numTaps-3] * x[n-numTaps-1] +...+ b[0] * x[2]
+ * acc3 = b[numTaps-1] * x[n-numTaps+2] + b[numTaps-2] * x[n-numTaps+1] + b[numTaps-3] * x[n-numTaps] +...+ b[0] * x[3]
+ */
+ blkCnt = blockSize >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* Copy four new input samples into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set all accumulators to zero */
+ acc0 = 0;
+ acc1 = 0;
+ acc2 = 0;
+ acc3 = 0;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Read the first three samples from the state buffer:
+ * x[n-numTaps], x[n-numTaps-1], x[n-numTaps-2] */
+ x0 = *(px++);
+ x1 = *(px++);
+ x2 = *(px++);
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+ i = tapCnt;
+
+ while(i > 0u)
+ {
+ /* Read the b[numTaps] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-3] sample */
+ x3 = *(px++);
+
+ /* acc0 += b[numTaps] * x[n-numTaps] */
+ acc0 += ((q15_t) x0 * c0);
+
+ /* acc1 += b[numTaps] * x[n-numTaps-1] */
+ acc1 += ((q15_t) x1 * c0);
+
+ /* acc2 += b[numTaps] * x[n-numTaps-2] */
+ acc2 += ((q15_t) x2 * c0);
+
+ /* acc3 += b[numTaps] * x[n-numTaps-3] */
+ acc3 += ((q15_t) x3 * c0);
+
+ /* Read the b[numTaps-1] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-4] sample */
+ x0 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += ((q15_t) x1 * c0);
+ acc1 += ((q15_t) x2 * c0);
+ acc2 += ((q15_t) x3 * c0);
+ acc3 += ((q15_t) x0 * c0);
+
+ /* Read the b[numTaps-2] coefficient */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-5] sample */
+ x1 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += ((q15_t) x2 * c0);
+ acc1 += ((q15_t) x3 * c0);
+ acc2 += ((q15_t) x0 * c0);
+ acc3 += ((q15_t) x1 * c0);
+ /* Read the b[numTaps-3] coefficients */
+ c0 = *(pb++);
+
+ /* Read x[n-numTaps-6] sample */
+ x2 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += ((q15_t) x3 * c0);
+ acc1 += ((q15_t) x0 * c0);
+ acc2 += ((q15_t) x1 * c0);
+ acc3 += ((q15_t) x2 * c0);
+ i--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+
+ i = numTaps - (tapCnt * 4u);
+ while(i > 0u)
+ {
+ /* Read coefficients */
+ c0 = *(pb++);
+
+ /* Fetch 1 state variable */
+ x3 = *(px++);
+
+ /* Perform the multiply-accumulates */
+ acc0 += ((q15_t) x0 * c0);
+ acc1 += ((q15_t) x1 * c0);
+ acc2 += ((q15_t) x2 * c0);
+ acc3 += ((q15_t) x3 * c0);
+
+ /* Reuse the present sample states for next sample */
+ x0 = x1;
+ x1 = x2;
+ x2 = x3;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 4;
+
+ /* The results in the 4 accumulators are in 2.62 format. Convert to 1.31
+ ** Then store the 4 outputs in the destination buffer. */
+ acc0 = __SSAT((acc0 >> 7u), 8);
+ *pDst++ = acc0;
+ acc1 = __SSAT((acc1 >> 7u), 8);
+ *pDst++ = acc1;
+ acc2 = __SSAT((acc2 >> 7u), 8);
+ *pDst++ = acc2;
+ acc3 = __SSAT((acc3 >> 7u), 8);
+ *pDst++ = acc3;
+
+ /* Decrement the samples loop counter */
+ blkCnt--;
+ }
+
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set the accumulator to zero */
+ acc0 = 0;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize Coefficient pointer */
+ pb = (pCoeffs);
+
+ i = numTaps;
+
+ /* Perform the multiply-accumulates */
+ do
+ {
+ acc0 += (q15_t) * (px++) * (*(pb++));
+ i--;
+ } while(i > 0u);
+
+ /* The result is in 2.14 format. Convert to 1.7
+ ** Then store the output in the destination buffer. */
+ *pDst++ = __SSAT((acc0 >> 7u), 8);
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement the samples loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+ tapCnt = (numTaps - 1u) >> 2u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1u) % 0x4u;
+
+ /* Copy the remaining q31_t data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+#else
+
+/* Run the below code for Cortex-M0 */
+
+ uint32_t numTaps = S->numTaps; /* Number of taps in the filter */
+ uint32_t i, blkCnt; /* Loop counters */
+ q7_t *pState = S->pState; /* State pointer */
+ q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q7_t *px, *pb; /* Temporary pointers to state and coeff */
+ q31_t acc = 0; /* Accumlator */
+ q7_t *pStateCurnt; /* Points to the current sample of the state */
+
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = S->pState + (numTaps - 1u);
+
+ /* Initialize blkCnt with blockSize */
+ blkCnt = blockSize;
+
+ /* Perform filtering upto BlockSize - BlockSize%4 */
+ while(blkCnt > 0u)
+ {
+ /* Copy one sample at a time into state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Set accumulator to zero */
+ acc = 0;
+
+ /* Initialize state pointer of type q7 */
+ px = pState;
+
+ /* Initialize coeff pointer of type q7 */
+ pb = pCoeffs;
+
+
+ i = numTaps;
+
+ while(i > 0u)
+ {
+ /* acc = b[numTaps-1] * x[n-numTaps-1] + b[numTaps-2] * x[n-numTaps-2] + b[numTaps-3] * x[n-numTaps-3] +...+ b[0] * x[0] */
+ acc += (q15_t) * px++ * *pb++;
+ i--;
+ }
+
+ /* Store the 1.7 format filter output in destination buffer */
+ *pDst++ = (q7_t) __SSAT((acc >> 7), 8);
+
+ /* Advance the state pointer by 1 to process the next sample */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete.
+ ** Now copy the last numTaps - 1 samples to the satrt of the state buffer.
+ ** This prepares the state buffer for the next function call. */
+
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = S->pState;
+
+
+ /* Copy numTaps number of values */
+ i = (numTaps - 1u);
+
+ /* Copy q7_t data */
+ while(i > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ i--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of FIR group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_f32.c
new file mode 100644
index 0000000..d66a20e
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_f32.c
@@ -0,0 +1,364 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_sparse_f32.c
+*
+* Description: Floating-point sparse FIR filter processing function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ------------------------------------------------------------------- */
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup FIR_Sparse Finite Impulse Response (FIR) Sparse Filters
+ *
+ * This group of functions implements sparse FIR filters.
+ * Sparse FIR filters are equivalent to standard FIR filters except that most of the coefficients are equal to zero.
+ * Sparse filters are used for simulating reflections in communications and audio applications.
+ *
+ * There are separate functions for Q7, Q15, Q31, and floating-point data types.
+ * The functions operate on blocks of input and output data and each call to the function processes
+ * blockSize samples through the filter. pSrc and
+ * pDst points to input and output arrays respectively containing blockSize values.
+ *
+ * \par Algorithm:
+ * The sparse filter instant structure contains an array of tap indices pTapDelay which specifies the locations of the non-zero coefficients.
+ * This is in addition to the coefficient array b.
+ * The implementation essentially skips the multiplications by zero and leads to an efficient realization.
+ *
+ * \par
+ * \image html FIRSparse.gif "Sparse FIR filter. b[n] represents the filter coefficients"
+ * \par
+ * pCoeffs points to a coefficient array of size numTaps;
+ * pTapDelay points to an array of nonzero indices and is also of size numTaps;
+ * pState points to a state array of size maxDelay + blockSize, where
+ * maxDelay is the largest offset value that is ever used in the pTapDelay array.
+ * Some of the processing functions also require temporary working buffers.
+ *
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient and offset arrays may be shared among several instances while state variable arrays cannot be shared.
+ * There are separate instance structure declarations for each of the 4 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Set the values in the state buffer to zeros before static initialization.
+ * The code below statically initializes each of the 4 different data type filter instance structures
+ *
+ * \par
+ *
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the fixed-point versions of the sparse FIR filter functions.
+ * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ * @addtogroup FIR_Sparse
+ * @{
+ */
+
+/**
+ * @brief Processing function for the floating-point sparse FIR filter.
+ * @param[in] *S points to an instance of the floating-point sparse FIR structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data
+ * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return none.
+ */
+
+void arm_fir_sparse_f32(
+ arm_fir_sparse_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ float32_t * pScratchIn,
+ uint32_t blockSize)
+{
+
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *px; /* Scratch buffer pointer */
+ float32_t *py = pState; /* Temporary pointers for state buffer */
+ float32_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
+ float32_t *pOut; /* Destination pointer */
+ int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
+ uint32_t delaySize = S->maxDelay + blockSize; /* state length */
+ uint16_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ int32_t readIndex; /* Read index of the state buffer */
+ uint32_t tapCnt, blkCnt; /* loop counters */
+ float32_t coeff = *pCoeffs++; /* Read the first coefficient value */
+
+
+
+ /* BlockSize of Input samples are copied into the state buffer */
+ /* StateIndex points to the starting position to write in the state buffer */
+ arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1,
+ (int32_t *) pSrc, 1, blockSize);
+
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1,
+ blockSize);
+
+ /* Working pointer for the scratch buffer */
+ px = pb;
+
+ /* Working pointer for destination buffer */
+ pOut = pDst;
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* Loop over the blockSize. Unroll by a factor of 4.
+ * Compute 4 Multiplications at a time. */
+ blkCnt = blockSize >> 2u;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiplications and store in destination buffer */
+ *pOut++ = *px++ * coeff;
+ *pOut++ = *px++ * coeff;
+ *pOut++ = *px++ * coeff;
+ *pOut++ = *px++ * coeff;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4,
+ * compute the remaining samples */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiplications and store in destination buffer */
+ *pOut++ = *px++ * coeff;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Loop over the number of taps. */
+ tapCnt = (uint32_t) numTaps - 1u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1,
+ blockSize);
+
+ /* Working pointer for the scratch buffer */
+ px = pb;
+
+ /* Working pointer for destination buffer */
+ pOut = pDst;
+
+ /* Loop over the blockSize. Unroll by a factor of 4.
+ * Compute 4 MACS at a time. */
+ blkCnt = blockSize >> 2u;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiply-Accumulate */
+ *pOut++ += *px++ * coeff;
+ *pOut++ += *px++ * coeff;
+ *pOut++ += *px++ * coeff;
+ *pOut++ += *px++ * coeff;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4,
+ * compute the remaining samples */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiply-Accumulate */
+ *pOut++ += *px++ * coeff;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = ((int32_t) S->stateIndex -
+ (int32_t) blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Decrement the tap loop counter */
+ tapCnt--;
+ }
+
+#else
+
+/* Run the below code for Cortex-M0 */
+
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiplications and store in destination buffer */
+ *pOut++ = *px++ * coeff;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Loop over the number of taps. */
+ tapCnt = (uint32_t) numTaps - 1u;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1,
+ blockSize);
+
+ /* Working pointer for the scratch buffer */
+ px = pb;
+
+ /* Working pointer for destination buffer */
+ pOut = pDst;
+
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiply-Accumulate */
+ *pOut++ += *px++ * coeff;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex =
+ ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Decrement the tap loop counter */
+ tapCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of FIR_Sparse group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_f32.c
new file mode 100644
index 0000000..d99d0ad
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_f32.c
@@ -0,0 +1,101 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_sparse_init_f32.c
+*
+* Description: Floating-point sparse FIR filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_Sparse
+ * @{
+ */
+
+/**
+ * @brief Initialization function for the floating-point sparse FIR filter.
+ * @param[in,out] *S points to an instance of the floating-point sparse FIR structure.
+ * @param[in] numTaps number of nonzero coefficients in the filter.
+ * @param[in] *pCoeffs points to the array of filter coefficients.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] *pTapDelay points to the array of offset times.
+ * @param[in] maxDelay maximum offset time supported.
+ * @param[in] blockSize number of samples that will be processed per block.
+ * @return none
+ *
+ * Description:
+ * \par
+ * pCoeffs holds the filter coefficients and has length numTaps.
+ * pState holds the filter's state variables and must be of length
+ * maxDelay + blockSize, where maxDelay
+ * is the maximum number of delay line values.
+ * blockSize is the
+ * number of samples processed by the arm_fir_sparse_f32() function.
+ */
+
+void arm_fir_sparse_init_f32(
+ arm_fir_sparse_instance_f32 * S,
+ uint16_t numTaps,
+ float32_t * pCoeffs,
+ float32_t * pState,
+ int32_t * pTapDelay,
+ uint16_t maxDelay,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign TapDelay pointer */
+ S->pTapDelay = pTapDelay;
+
+ /* Assign MaxDelay */
+ S->maxDelay = maxDelay;
+
+ /* reset the stateIndex to 0 */
+ S->stateIndex = 0u;
+
+ /* Clear state buffer and size is always maxDelay + blockSize */
+ memset(pState, 0, (maxDelay + blockSize) * sizeof(float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+}
+
+/**
+ * @} end of FIR_Sparse group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q15.c
new file mode 100644
index 0000000..2390862
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q15.c
@@ -0,0 +1,101 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_sparse_init_q15.c
+*
+* Description: Q15 sparse FIR filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_Sparse
+ * @{
+ */
+
+/**
+ * @brief Initialization function for the Q15 sparse FIR filter.
+ * @param[in,out] *S points to an instance of the Q15 sparse FIR structure.
+ * @param[in] numTaps number of nonzero coefficients in the filter.
+ * @param[in] *pCoeffs points to the array of filter coefficients.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] *pTapDelay points to the array of offset times.
+ * @param[in] maxDelay maximum offset time supported.
+ * @param[in] blockSize number of samples that will be processed per block.
+ * @return none
+ *
+ * Description:
+ * \par
+ * pCoeffs holds the filter coefficients and has length numTaps.
+ * pState holds the filter's state variables and must be of length
+ * maxDelay + blockSize, where maxDelay
+ * is the maximum number of delay line values.
+ * blockSize is the
+ * number of words processed by arm_fir_sparse_q15() function.
+ */
+
+void arm_fir_sparse_init_q15(
+ arm_fir_sparse_instance_q15 * S,
+ uint16_t numTaps,
+ q15_t * pCoeffs,
+ q15_t * pState,
+ int32_t * pTapDelay,
+ uint16_t maxDelay,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign TapDelay pointer */
+ S->pTapDelay = pTapDelay;
+
+ /* Assign MaxDelay */
+ S->maxDelay = maxDelay;
+
+ /* reset the stateIndex to 0 */
+ S->stateIndex = 0u;
+
+ /* Clear state buffer and size is always maxDelay + blockSize */
+ memset(pState, 0, (maxDelay + blockSize) * sizeof(q15_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+}
+
+/**
+ * @} end of FIR_Sparse group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q31.c
new file mode 100644
index 0000000..6bee756
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q31.c
@@ -0,0 +1,100 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_sparse_init_q31.c
+*
+* Description: Q31 sparse FIR filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_Sparse
+ * @{
+ */
+
+/**
+ * @brief Initialization function for the Q31 sparse FIR filter.
+ * @param[in,out] *S points to an instance of the Q31 sparse FIR structure.
+ * @param[in] numTaps number of nonzero coefficients in the filter.
+ * @param[in] *pCoeffs points to the array of filter coefficients.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] *pTapDelay points to the array of offset times.
+ * @param[in] maxDelay maximum offset time supported.
+ * @param[in] blockSize number of samples that will be processed per block.
+ * @return none
+ *
+ * Description:
+ * \par
+ * pCoeffs holds the filter coefficients and has length numTaps.
+ * pState holds the filter's state variables and must be of length
+ * maxDelay + blockSize, where maxDelay
+ * is the maximum number of delay line values.
+ * blockSize is the number of words processed by arm_fir_sparse_q31() function.
+ */
+
+void arm_fir_sparse_init_q31(
+ arm_fir_sparse_instance_q31 * S,
+ uint16_t numTaps,
+ q31_t * pCoeffs,
+ q31_t * pState,
+ int32_t * pTapDelay,
+ uint16_t maxDelay,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign TapDelay pointer */
+ S->pTapDelay = pTapDelay;
+
+ /* Assign MaxDelay */
+ S->maxDelay = maxDelay;
+
+ /* reset the stateIndex to 0 */
+ S->stateIndex = 0u;
+
+ /* Clear state buffer and size is always maxDelay + blockSize */
+ memset(pState, 0, (maxDelay + blockSize) * sizeof(q31_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+}
+
+/**
+ * @} end of FIR_Sparse group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q7.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q7.c
new file mode 100644
index 0000000..af417d2
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_init_q7.c
@@ -0,0 +1,101 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_sparse_init_q7.c
+*
+* Description: Q7 sparse FIR filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_Sparse
+ * @{
+ */
+
+/**
+ * @brief Initialization function for the Q7 sparse FIR filter.
+ * @param[in,out] *S points to an instance of the Q7 sparse FIR structure.
+ * @param[in] numTaps number of nonzero coefficients in the filter.
+ * @param[in] *pCoeffs points to the array of filter coefficients.
+ * @param[in] *pState points to the state buffer.
+ * @param[in] *pTapDelay points to the array of offset times.
+ * @param[in] maxDelay maximum offset time supported.
+ * @param[in] blockSize number of samples that will be processed per block.
+ * @return none
+ *
+ * Description:
+ * \par
+ * pCoeffs holds the filter coefficients and has length numTaps.
+ * pState holds the filter's state variables and must be of length
+ * maxDelay + blockSize, where maxDelay
+ * is the maximum number of delay line values.
+ * blockSize is the
+ * number of samples processed by the arm_fir_sparse_q7() function.
+ */
+
+void arm_fir_sparse_init_q7(
+ arm_fir_sparse_instance_q7 * S,
+ uint16_t numTaps,
+ q7_t * pCoeffs,
+ q7_t * pState,
+ int32_t * pTapDelay,
+ uint16_t maxDelay,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Assign TapDelay pointer */
+ S->pTapDelay = pTapDelay;
+
+ /* Assign MaxDelay */
+ S->maxDelay = maxDelay;
+
+ /* reset the stateIndex to 0 */
+ S->stateIndex = 0u;
+
+ /* Clear state buffer and size is always maxDelay + blockSize */
+ memset(pState, 0, (maxDelay + blockSize) * sizeof(q7_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+}
+
+/**
+ * @} end of FIR_Sparse group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q15.c
new file mode 100644
index 0000000..c9fb4d2
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q15.c
@@ -0,0 +1,405 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_sparse_q15.c
+*
+* Description: Q15 sparse FIR filter processing function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ------------------------------------------------------------------- */
+#include "arm_math.h"
+
+/**
+ * @addtogroup FIR_Sparse
+ * @{
+ */
+
+/**
+ * @brief Processing function for the Q15 sparse FIR filter.
+ * @param[in] *S points to an instance of the Q15 sparse FIR structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data
+ * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
+ * @param[in] *pScratchOut points to a temporary buffer of size blockSize.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return none.
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function is implemented using an internal 32-bit accumulator.
+ * The 1.15 x 1.15 multiplications yield a 2.30 result and these are added to a 2.30 accumulator.
+ * Thus the full precision of the multiplications is maintained but there is only a single guard bit in the accumulator.
+ * If the accumulator result overflows it will wrap around rather than saturate.
+ * After all multiply-accumulates are performed, the 2.30 accumulator is truncated to 2.15 format and then saturated to 1.15 format.
+ * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
+ */
+
+
+void arm_fir_sparse_q15(
+ arm_fir_sparse_instance_q15 * S,
+ q15_t * pSrc,
+ q15_t * pDst,
+ q15_t * pScratchIn,
+ q31_t * pScratchOut,
+ uint32_t blockSize)
+{
+
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pIn = pSrc; /* Working pointer for input */
+ q15_t *pOut = pDst; /* Working pointer for output */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *px; /* Temporary pointers for scratch buffer */
+ q15_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
+ q15_t *py = pState; /* Temporary pointers for state buffer */
+ int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
+ uint32_t delaySize = S->maxDelay + blockSize; /* state length */
+ uint16_t numTaps = S->numTaps; /* Filter order */
+ int32_t readIndex; /* Read index of the state buffer */
+ uint32_t tapCnt, blkCnt; /* loop counters */
+ q15_t coeff = *pCoeffs++; /* Read the first coefficient value */
+ q31_t *pScr2 = pScratchOut; /* Working pointer for pScratchOut */
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t in1, in2; /* Temporary variables */
+
+
+ /* BlockSize of Input samples are copied into the state buffer */
+ /* StateIndex points to the starting position to write in the state buffer */
+ arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize);
+
+ /* Loop over the number of taps. */
+ tapCnt = numTaps;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_q15(py, delaySize, &readIndex, 1,
+ pb, pb, blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pScratchOut = pScr2;
+
+ /* Loop over the blockSize. Unroll by a factor of 4.
+ * Compute 4 multiplications at a time. */
+ blkCnt = blockSize >> 2;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform multiplication and store in the scratch buffer */
+ *pScratchOut++ = ((q31_t) * px++ * coeff);
+ *pScratchOut++ = ((q31_t) * px++ * coeff);
+ *pScratchOut++ = ((q31_t) * px++ * coeff);
+ *pScratchOut++ = ((q31_t) * px++ * coeff);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4,
+ * compute the remaining samples */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform multiplication and store in the scratch buffer */
+ *pScratchOut++ = ((q31_t) * px++ * coeff);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Loop over the number of taps. */
+ tapCnt = (uint32_t) numTaps - 1u;
+
+ while(tapCnt > 0u)
+ {
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_q15(py, delaySize, &readIndex, 1,
+ pb, pb, blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pScratchOut = pScr2;
+
+ /* Loop over the blockSize. Unroll by a factor of 4.
+ * Compute 4 MACS at a time. */
+ blkCnt = blockSize >> 2;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiply-Accumulate */
+ *pScratchOut++ += (q31_t) * px++ * coeff;
+ *pScratchOut++ += (q31_t) * px++ * coeff;
+ *pScratchOut++ += (q31_t) * px++ * coeff;
+ *pScratchOut++ += (q31_t) * px++ * coeff;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4,
+ * compute the remaining samples */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiply-Accumulate */
+ *pScratchOut++ += (q31_t) * px++ * coeff;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Decrement the tap loop counter */
+ tapCnt--;
+ }
+
+ /* All the output values are in pScratchOut buffer.
+ Convert them into 1.15 format, saturate and store in the destination buffer. */
+ /* Loop over the blockSize. */
+ blkCnt = blockSize >> 2;
+
+ while(blkCnt > 0u)
+ {
+ in1 = *pScr2++;
+ in2 = *pScr2++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pOut)++ =
+ __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16),
+ 16);
+
+#else
+ *__SIMD32(pOut)++ =
+ __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16),
+ 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ in1 = *pScr2++;
+
+ in2 = *pScr2++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *__SIMD32(pOut)++ =
+ __PKHBT((q15_t) __SSAT(in1 >> 15, 16), (q15_t) __SSAT(in2 >> 15, 16),
+ 16);
+
+#else
+
+ *__SIMD32(pOut)++ =
+ __PKHBT((q15_t) __SSAT(in2 >> 15, 16), (q15_t) __SSAT(in1 >> 15, 16),
+ 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+
+ blkCnt--;
+
+ }
+
+ /* If the blockSize is not a multiple of 4,
+ remaining samples are processed in the below loop */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16);
+ blkCnt--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* BlockSize of Input samples are copied into the state buffer */
+ /* StateIndex points to the starting position to write in the state buffer */
+ arm_circularWrite_q15(py, delaySize, &S->stateIndex, 1, pIn, 1, blockSize);
+
+ /* Loop over the number of taps. */
+ tapCnt = numTaps;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_q15(py, delaySize, &readIndex, 1,
+ pb, pb, blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pScratchOut = pScr2;
+
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform multiplication and store in the scratch buffer */
+ *pScratchOut++ = ((q31_t) * px++ * coeff);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Loop over the number of taps. */
+ tapCnt = (uint32_t) numTaps - 1u;
+
+ while(tapCnt > 0u)
+ {
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_q15(py, delaySize, &readIndex, 1,
+ pb, pb, blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pScratchOut = pScr2;
+
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiply-Accumulate */
+ *pScratchOut++ += (q31_t) * px++ * coeff;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Decrement the tap loop counter */
+ tapCnt--;
+ }
+
+ /* All the output values are in pScratchOut buffer.
+ Convert them into 1.15 format, saturate and store in the destination buffer. */
+ /* Loop over the blockSize. */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ *pOut++ = (q15_t) __SSAT(*pScr2++ >> 15, 16);
+ blkCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of FIR_Sparse group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q31.c
new file mode 100644
index 0000000..771736b
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q31.c
@@ -0,0 +1,369 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_sparse_q31.c
+*
+* Description: Q31 sparse FIR filter processing function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ------------------------------------------------------------------- */
+#include "arm_math.h"
+
+
+/**
+ * @addtogroup FIR_Sparse
+ * @{
+ */
+
+/**
+ * @brief Processing function for the Q31 sparse FIR filter.
+ * @param[in] *S points to an instance of the Q31 sparse FIR structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data
+ * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return none.
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function is implemented using an internal 32-bit accumulator.
+ * The 1.31 x 1.31 multiplications are truncated to 2.30 format.
+ * This leads to loss of precision on the intermediate multiplications and provides only a single guard bit.
+ * If the accumulator result overflows, it wraps around rather than saturate.
+ * In order to avoid overflows the input signal or coefficients must be scaled down by log2(numTaps) bits.
+ */
+
+void arm_fir_sparse_q31(
+ arm_fir_sparse_instance_q31 * S,
+ q31_t * pSrc,
+ q31_t * pDst,
+ q31_t * pScratchIn,
+ uint32_t blockSize)
+{
+
+ q31_t *pState = S->pState; /* State pointer */
+ q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *px; /* Scratch buffer pointer */
+ q31_t *py = pState; /* Temporary pointers for state buffer */
+ q31_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
+ q31_t *pOut; /* Destination pointer */
+ q63_t out; /* Temporary output variable */
+ int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
+ uint32_t delaySize = S->maxDelay + blockSize; /* state length */
+ uint16_t numTaps = S->numTaps; /* Filter order */
+ int32_t readIndex; /* Read index of the state buffer */
+ uint32_t tapCnt, blkCnt; /* loop counters */
+ q31_t coeff = *pCoeffs++; /* Read the first coefficient value */
+ q31_t in;
+
+
+ /* BlockSize of Input samples are copied into the state buffer */
+ /* StateIndex points to the starting position to write in the state buffer */
+ arm_circularWrite_f32((int32_t *) py, delaySize, &S->stateIndex, 1,
+ (int32_t *) pSrc, 1, blockSize);
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1,
+ blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pOut = pDst;
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* Loop over the blockSize. Unroll by a factor of 4.
+ * Compute 4 Multiplications at a time. */
+ blkCnt = blockSize >> 2;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiplications and store in the destination buffer */
+ *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
+ *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
+ *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
+ *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4,
+ * compute the remaining samples */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiplications and store in the destination buffer */
+ *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Loop over the number of taps. */
+ tapCnt = (uint32_t) numTaps - 1u;
+
+ while(tapCnt > 0u)
+ {
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1,
+ blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pOut = pDst;
+
+ /* Loop over the blockSize. Unroll by a factor of 4.
+ * Compute 4 MACS at a time. */
+ blkCnt = blockSize >> 2;
+
+ while(blkCnt > 0u)
+ {
+ out = *pOut;
+ out += ((q63_t) * px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ out = *pOut;
+ out += ((q63_t) * px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ out = *pOut;
+ out += ((q63_t) * px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ out = *pOut;
+ out += ((q63_t) * px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4,
+ * compute the remaining samples */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiply-Accumulate */
+ out = *pOut;
+ out += ((q63_t) * px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Decrement the tap loop counter */
+ tapCnt--;
+ }
+
+ /* Working output pointer is updated */
+ pOut = pDst;
+
+ /* Output is converted into 1.31 format. */
+ /* Loop over the blockSize. Unroll by a factor of 4.
+ * process 4 output samples at a time. */
+ blkCnt = blockSize >> 2;
+
+ while(blkCnt > 0u)
+ {
+ in = *pOut << 1;
+ *pOut++ = in;
+ in = *pOut << 1;
+ *pOut++ = in;
+ in = *pOut << 1;
+ *pOut++ = in;
+ in = *pOut << 1;
+ *pOut++ = in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4,
+ * process the remaining output samples */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ in = *pOut << 1;
+ *pOut++ = in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiplications and store in the destination buffer */
+ *pOut++ = (q31_t) (((q63_t) * px++ * coeff) >> 32);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Loop over the number of taps. */
+ tapCnt = (uint32_t) numTaps - 1u;
+
+ while(tapCnt > 0u)
+ {
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_f32((int32_t *) py, delaySize, &readIndex, 1,
+ (int32_t *) pb, (int32_t *) pb, blockSize, 1,
+ blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pOut = pDst;
+
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiply-Accumulate */
+ out = *pOut;
+ out += ((q63_t) * px++ * coeff) >> 32;
+ *pOut++ = (q31_t) (out);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = (int32_t) (S->stateIndex - blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Decrement the tap loop counter */
+ tapCnt--;
+ }
+
+ /* Working output pointer is updated */
+ pOut = pDst;
+
+ /* Output is converted into 1.31 format. */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ in = *pOut << 1;
+ *pOut++ = in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of FIR_Sparse group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q7.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q7.c
new file mode 100644
index 0000000..22cba03
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_fir_sparse_q7.c
@@ -0,0 +1,397 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fir_sparse_q7.c
+*
+* Description: Q7 sparse FIR filter processing function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ------------------------------------------------------------------- */
+#include "arm_math.h"
+
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup FIR_Sparse
+ * @{
+ */
+
+
+/**
+ * @brief Processing function for the Q7 sparse FIR filter.
+ * @param[in] *S points to an instance of the Q7 sparse FIR structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data
+ * @param[in] *pScratchIn points to a temporary buffer of size blockSize.
+ * @param[in] *pScratchOut points to a temporary buffer of size blockSize.
+ * @param[in] blockSize number of input samples to process per call.
+ * @return none.
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function is implemented using a 32-bit internal accumulator.
+ * Both coefficients and state variables are represented in 1.7 format and multiplications yield a 2.14 result.
+ * The 2.14 intermediate results are accumulated in a 32-bit accumulator in 18.14 format.
+ * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ * The accumulator is then converted to 18.7 format by discarding the low 7 bits.
+ * Finally, the result is truncated to 1.7 format.
+ */
+
+void arm_fir_sparse_q7(
+ arm_fir_sparse_instance_q7 * S,
+ q7_t * pSrc,
+ q7_t * pDst,
+ q7_t * pScratchIn,
+ q31_t * pScratchOut,
+ uint32_t blockSize)
+{
+
+ q7_t *pState = S->pState; /* State pointer */
+ q7_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q7_t *px; /* Scratch buffer pointer */
+ q7_t *py = pState; /* Temporary pointers for state buffer */
+ q7_t *pb = pScratchIn; /* Temporary pointers for scratch buffer */
+ q7_t *pOut = pDst; /* Destination pointer */
+ int32_t *pTapDelay = S->pTapDelay; /* Pointer to the array containing offset of the non-zero tap values. */
+ uint32_t delaySize = S->maxDelay + blockSize; /* state length */
+ uint16_t numTaps = S->numTaps; /* Filter order */
+ int32_t readIndex; /* Read index of the state buffer */
+ uint32_t tapCnt, blkCnt; /* loop counters */
+ q7_t coeff = *pCoeffs++; /* Read the coefficient value */
+ q31_t *pScr2 = pScratchOut; /* Working pointer for scratch buffer of output values */
+ q31_t in;
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q7_t in1, in2, in3, in4;
+
+ /* BlockSize of Input samples are copied into the state buffer */
+ /* StateIndex points to the starting position to write in the state buffer */
+ arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
+ blockSize);
+
+ /* Loop over the number of taps. */
+ tapCnt = numTaps;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
+ (int32_t) blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pScratchOut = pScr2;
+
+ /* Loop over the blockSize. Unroll by a factor of 4.
+ * Compute 4 multiplications at a time. */
+ blkCnt = blockSize >> 2;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform multiplication and store in the scratch buffer */
+ *pScratchOut++ = ((q31_t) * px++ * coeff);
+ *pScratchOut++ = ((q31_t) * px++ * coeff);
+ *pScratchOut++ = ((q31_t) * px++ * coeff);
+ *pScratchOut++ = ((q31_t) * px++ * coeff);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4,
+ * compute the remaining samples */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform multiplication and store in the scratch buffer */
+ *pScratchOut++ = ((q31_t) * px++ * coeff);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Loop over the number of taps. */
+ tapCnt = (uint32_t) numTaps - 1u;
+
+ while(tapCnt > 0u)
+ {
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
+ (int32_t) blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pScratchOut = pScr2;
+
+ /* Loop over the blockSize. Unroll by a factor of 4.
+ * Compute 4 MACS at a time. */
+ blkCnt = blockSize >> 2;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiply-Accumulate */
+ in = *pScratchOut + ((q31_t) * px++ * coeff);
+ *pScratchOut++ = in;
+ in = *pScratchOut + ((q31_t) * px++ * coeff);
+ *pScratchOut++ = in;
+ in = *pScratchOut + ((q31_t) * px++ * coeff);
+ *pScratchOut++ = in;
+ in = *pScratchOut + ((q31_t) * px++ * coeff);
+ *pScratchOut++ = in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4,
+ * compute the remaining samples */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiply-Accumulate */
+ in = *pScratchOut + ((q31_t) * px++ * coeff);
+ *pScratchOut++ = in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = ((int32_t) S->stateIndex -
+ (int32_t) blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Decrement the tap loop counter */
+ tapCnt--;
+ }
+
+ /* All the output values are in pScratchOut buffer.
+ Convert them into 1.15 format, saturate and store in the destination buffer. */
+ /* Loop over the blockSize. */
+ blkCnt = blockSize >> 2;
+
+ while(blkCnt > 0u)
+ {
+ in1 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
+ in2 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
+ in3 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
+ in4 = (q7_t) __SSAT(*pScr2++ >> 7, 8);
+
+ *__SIMD32(pOut)++ = __PACKq7(in1, in2, in3, in4);
+
+ /* Decrement the blockSize loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4,
+ remaining samples are processed in the below loop */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
+
+ /* Decrement the blockSize loop counter */
+ blkCnt--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* BlockSize of Input samples are copied into the state buffer */
+ /* StateIndex points to the starting position to write in the state buffer */
+ arm_circularWrite_q7(py, (int32_t) delaySize, &S->stateIndex, 1, pSrc, 1,
+ blockSize);
+
+ /* Loop over the number of taps. */
+ tapCnt = numTaps;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
+ (int32_t) blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pScratchOut = pScr2;
+
+ /* Loop over the blockSize */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform multiplication and store in the scratch buffer */
+ *pScratchOut++ = ((q31_t) * px++ * coeff);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex = ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Loop over the number of taps. */
+ tapCnt = (uint32_t) numTaps - 1u;
+
+ while(tapCnt > 0u)
+ {
+ /* Working pointer for state buffer is updated */
+ py = pState;
+
+ /* blockSize samples are read from the state buffer */
+ arm_circularRead_q7(py, (int32_t) delaySize, &readIndex, 1, pb, pb,
+ (int32_t) blockSize, 1, blockSize);
+
+ /* Working pointer for the scratch buffer of state values */
+ px = pb;
+
+ /* Working pointer for scratch buffer of output values */
+ pScratchOut = pScr2;
+
+ /* Loop over the blockSize */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* Perform Multiply-Accumulate */
+ in = *pScratchOut + ((q31_t) * px++ * coeff);
+ *pScratchOut++ = in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Load the coefficient value and
+ * increment the coefficient buffer for the next set of state values */
+ coeff = *pCoeffs++;
+
+ /* Read Index, from where the state buffer should be read, is calculated. */
+ readIndex =
+ ((int32_t) S->stateIndex - (int32_t) blockSize) - *pTapDelay++;
+
+ /* Wraparound of readIndex */
+ if(readIndex < 0)
+ {
+ readIndex += (int32_t) delaySize;
+ }
+
+ /* Decrement the tap loop counter */
+ tapCnt--;
+ }
+
+ /* All the output values are in pScratchOut buffer.
+ Convert them into 1.15 format, saturate and store in the destination buffer. */
+ /* Loop over the blockSize. */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ *pOut++ = (q7_t) __SSAT(*pScr2++ >> 7, 8);
+
+ /* Decrement the blockSize loop counter */
+ blkCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of FIR_Sparse group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_f32.c
new file mode 100644
index 0000000..1f4c0e4
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_f32.c
@@ -0,0 +1,439 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_iir_lattice_f32.c
+*
+* Description: Floating-point IIR Lattice filter processing function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup IIR_Lattice Infinite Impulse Response (IIR) Lattice Filters
+ *
+ * This set of functions implements lattice filters
+ * for Q15, Q31 and floating-point data types. Lattice filters are used in a
+ * variety of adaptive filter applications. The filter structure has feedforward and
+ * feedback components and the net impulse response is infinite length.
+ * The functions operate on blocks
+ * of input and output data and each call to the function processes
+ * blockSize samples through the filter. pSrc and
+ * pDst point to input and output arrays containing blockSize values.
+
+ * \par Algorithm:
+ * \image html IIRLattice.gif "Infinite Impulse Response Lattice filter"
+ *
+ * fN(n) = x(n)
+ * fm-1(n) = fm(n) - km * gm-1(n-1) for m = N, N-1, ...1
+ * gm(n) = km * fm-1(n) + gm-1(n-1) for m = N, N-1, ...1
+ * y(n) = vN * gN(n) + vN-1 * gN-1(n) + ...+ v0 * g0(n)
+ *
+ * \par
+ * pkCoeffs points to array of reflection coefficients of size numStages.
+ * Reflection coefficients are stored in time-reversed order.
+ * \par
+ *
+ * {kN, kN-1, ....k1}
+ *
+ * pvCoeffs points to the array of ladder coefficients of size (numStages+1).
+ * Ladder coefficients are stored in time-reversed order.
+ * \par
+ *
+ * {vN, vN-1, ...v0}
+ *
+ * pState points to a state array of size numStages + blockSize.
+ * The state variables shown in the figure above (the g values) are stored in the pState array.
+ * The state variables are updated after each block of data is processed; the coefficients are untouched.
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter.
+ * Coefficient arrays may be shared among several instances while state variable arrays cannot be shared.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ *
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Set the values in the state buffer to zeros and then manually initialize the instance structure as follows:
+ *
+ *arm_iir_lattice_instance_f32 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ *arm_iir_lattice_instance_q31 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ *arm_iir_lattice_instance_q15 S = {numStages, pState, pkCoeffs, pvCoeffs};
+ *
+ * \par
+ * where numStages is the number of stages in the filter; pState points to the state buffer array;
+ * pkCoeffs points to array of the reflection coefficients; pvCoeffs points to the array of ladder coefficients.
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the fixed-point versions of the IIR lattice filter functions.
+ * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+/**
+ * @addtogroup IIR_Lattice
+ * @{
+ */
+
+/**
+ * @brief Processing function for the floating-point IIR lattice filter.
+ * @param[in] *S points to an instance of the floating-point IIR lattice structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+void arm_iir_lattice_f32(
+ const arm_iir_lattice_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t fnext1, gcurr1, gnext; /* Temporary variables for lattice stages */
+ float32_t acc; /* Accumlator */
+ uint32_t blkCnt, tapCnt; /* temporary variables for counts */
+ float32_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */
+ uint32_t numStages = S->numStages; /* number of stages */
+ float32_t *pState; /* State pointer */
+ float32_t *pStateCurnt; /* State current pointer */
+ float32_t k1, k2;
+ float32_t v1, v2, v3, v4;
+ float32_t gcurr2;
+ float32_t fnext2;
+
+ /* initialise loop count */
+ blkCnt = blockSize;
+
+ /* initialise state pointer */
+ pState = &S->pState[0];
+
+ /* Sample processing */
+ while(blkCnt > 0u)
+ {
+ /* Read Sample from input buffer */
+ /* fN(n) = x(n) */
+ fnext2 = *pSrc++;
+
+ /* Initialize Ladder coeff pointer */
+ pv = &S->pvCoeffs[0];
+ /* Initialize Reflection coeff pointer */
+ pk = &S->pkCoeffs[0];
+
+ /* Initialize state read pointer */
+ px1 = pState;
+ /* Initialize state write pointer */
+ px2 = pState;
+
+ /* Set accumulator to zero */
+ acc = 0.0;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = (numStages) >> 2;
+
+ while(tapCnt > 0u)
+ {
+ /* Read gN-1(n-1) from state buffer */
+ gcurr1 = *px1;
+
+ /* read reflection coefficient kN */
+ k1 = *pk;
+
+ /* fN-1(n) = fN(n) - kN * gN-1(n-1) */
+ fnext1 = fnext2 - (k1 * gcurr1);
+
+ /* read ladder coefficient vN */
+ v1 = *pv;
+
+ /* read next reflection coefficient kN-1 */
+ k2 = *(pk + 1u);
+
+ /* Read gN-2(n-1) from state buffer */
+ gcurr2 = *(px1 + 1u);
+
+ /* read next ladder coefficient vN-1 */
+ v2 = *(pv + 1u);
+
+ /* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */
+ fnext2 = fnext1 - (k2 * gcurr2);
+
+ /* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+ gnext = gcurr1 + (k1 * fnext1);
+
+ /* read reflection coefficient kN-2 */
+ k1 = *(pk + 2u);
+
+ /* write gN(n) into state for next sample processing */
+ *px2++ = gnext;
+
+ /* Read gN-3(n-1) from state buffer */
+ gcurr1 = *(px1 + 2u);
+
+ /* y(n) += gN(n) * vN */
+ acc += (gnext * v1);
+
+ /* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */
+ fnext1 = fnext2 - (k1 * gcurr1);
+
+ /* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */
+ gnext = gcurr2 + (k2 * fnext2);
+
+ /* Read gN-4(n-1) from state buffer */
+ gcurr2 = *(px1 + 3u);
+
+ /* y(n) += gN-1(n) * vN-1 */
+ acc += (gnext * v2);
+
+ /* read reflection coefficient kN-3 */
+ k2 = *(pk + 3u);
+
+ /* write gN-1(n) into state for next sample processing */
+ *px2++ = gnext;
+
+ /* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */
+ fnext2 = fnext1 - (k2 * gcurr2);
+
+ /* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */
+ gnext = gcurr1 + (k1 * fnext1);
+
+ /* read ladder coefficient vN-2 */
+ v3 = *(pv + 2u);
+
+ /* y(n) += gN-2(n) * vN-2 */
+ acc += (gnext * v3);
+
+ /* write gN-2(n) into state for next sample processing */
+ *px2++ = gnext;
+
+ /* update pointer */
+ pk += 4u;
+
+ /* gN-3(n) = kN-3 * fN-4(n) + gN-4(n-1) */
+ gnext = (fnext2 * k2) + gcurr2;
+
+ /* read next ladder coefficient vN-3 */
+ v4 = *(pv + 3u);
+
+ /* y(n) += gN-4(n) * vN-4 */
+ acc += (gnext * v4);
+
+ /* write gN-3(n) into state for next sample processing */
+ *px2++ = gnext;
+
+ /* update pointers */
+ px1 += 4u;
+ pv += 4u;
+
+ tapCnt--;
+
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = (numStages) % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ gcurr1 = *px1++;
+ /* Process sample for last taps */
+ fnext1 = fnext2 - ((*pk) * gcurr1);
+ gnext = (fnext1 * (*pk++)) + gcurr1;
+ /* Output samples for last taps */
+ acc += (gnext * (*pv++));
+ *px2++ = gnext;
+ fnext2 = fnext1;
+
+ tapCnt--;
+
+ }
+
+ /* y(n) += g0(n) * v0 */
+ acc += (fnext2 * (*pv));
+
+ *px2++ = fnext2;
+
+ /* write out into pDst */
+ *pDst++ = acc;
+
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 1u;
+
+ blkCnt--;
+
+ }
+
+ /* Processing is complete. Now copy last S->numStages samples to start of the buffer
+ for the preperation of next frame process */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = &S->pState[0];
+ pState = &S->pState[blockSize];
+
+ tapCnt = numStages >> 2u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numStages) % 0x4u;
+
+ /* Copy the remaining q31_t data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+}
+
+#else
+
+void arm_iir_lattice_f32(
+ const arm_iir_lattice_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t fcurr, fnext = 0, gcurr, gnext; /* Temporary variables for lattice stages */
+ float32_t acc; /* Accumlator */
+ uint32_t blkCnt, tapCnt; /* temporary variables for counts */
+ float32_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */
+ uint32_t numStages = S->numStages; /* number of stages */
+ float32_t *pState; /* State pointer */
+ float32_t *pStateCurnt; /* State current pointer */
+
+
+ /* Run the below code for Cortex-M0 */
+
+ blkCnt = blockSize;
+
+ pState = &S->pState[0];
+
+ /* Sample processing */
+ while(blkCnt > 0u)
+ {
+ /* Read Sample from input buffer */
+ /* fN(n) = x(n) */
+ fcurr = *pSrc++;
+
+ /* Initialize state read pointer */
+ px1 = pState;
+ /* Initialize state write pointer */
+ px2 = pState;
+ /* Set accumulator to zero */
+ acc = 0.0f;
+ /* Initialize Ladder coeff pointer */
+ pv = &S->pvCoeffs[0];
+ /* Initialize Reflection coeff pointer */
+ pk = &S->pkCoeffs[0];
+
+
+ /* Process sample for numStages */
+ tapCnt = numStages;
+
+ while(tapCnt > 0u)
+ {
+ gcurr = *px1++;
+ /* Process sample for last taps */
+ fnext = fcurr - ((*pk) * gcurr);
+ gnext = (fnext * (*pk++)) + gcurr;
+
+ /* Output samples for last taps */
+ acc += (gnext * (*pv++));
+ *px2++ = gnext;
+ fcurr = fnext;
+
+ /* Decrementing loop counter */
+ tapCnt--;
+
+ }
+
+ /* y(n) += g0(n) * v0 */
+ acc += (fnext * (*pv));
+
+ *px2++ = fnext;
+
+ /* write out into pDst */
+ *pDst++ = acc;
+
+ /* Advance the state pointer by 1 to process the next group of samples */
+ pState = pState + 1u;
+ blkCnt--;
+
+ }
+
+ /* Processing is complete. Now copy last S->numStages samples to start of the buffer
+ for the preperation of next frame process */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = &S->pState[0];
+ pState = &S->pState[blockSize];
+
+ tapCnt = numStages;
+
+ /* Copy the data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+}
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+
+/**
+ * @} end of IIR_Lattice group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_f32.c
new file mode 100644
index 0000000..edbdb8c
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_f32.c
@@ -0,0 +1,85 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_iir_lattice_init_f32.c
+*
+* Description: Floating-point IIR lattice filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup IIR_Lattice
+ * @{
+ */
+
+/**
+ * @brief Initialization function for the floating-point IIR lattice filter.
+ * @param[in] *S points to an instance of the floating-point IIR lattice structure.
+ * @param[in] numStages number of stages in the filter.
+ * @param[in] *pkCoeffs points to the reflection coefficient buffer. The array is of length numStages.
+ * @param[in] *pvCoeffs points to the ladder coefficient buffer. The array is of length numStages+1.
+ * @param[in] *pState points to the state buffer. The array is of length numStages+blockSize.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ */
+
+void arm_iir_lattice_init_f32(
+ arm_iir_lattice_instance_f32 * S,
+ uint16_t numStages,
+ float32_t * pkCoeffs,
+ float32_t * pvCoeffs,
+ float32_t * pState,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numStages = numStages;
+
+ /* Assign reflection coefficient pointer */
+ S->pkCoeffs = pkCoeffs;
+
+ /* Assign ladder coefficient pointer */
+ S->pvCoeffs = pvCoeffs;
+
+ /* Clear state buffer and size is always blockSize + numStages */
+ memset(pState, 0, (numStages + blockSize) * sizeof(float32_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+
+}
+
+ /**
+ * @} end of IIR_Lattice group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_q15.c
new file mode 100644
index 0000000..72d98e1
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_q15.c
@@ -0,0 +1,85 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_iir_lattice_init_q15.c
+*
+* Description: Q15 IIR lattice filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup IIR_Lattice
+ * @{
+ */
+
+ /**
+ * @brief Initialization function for the Q15 IIR lattice filter.
+ * @param[in] *S points to an instance of the Q15 IIR lattice structure.
+ * @param[in] numStages number of stages in the filter.
+ * @param[in] *pkCoeffs points to reflection coefficient buffer. The array is of length numStages.
+ * @param[in] *pvCoeffs points to ladder coefficient buffer. The array is of length numStages+1.
+ * @param[in] *pState points to state buffer. The array is of length numStages+blockSize.
+ * @param[in] blockSize number of samples to process per call.
+ * @return none.
+ */
+
+void arm_iir_lattice_init_q15(
+ arm_iir_lattice_instance_q15 * S,
+ uint16_t numStages,
+ q15_t * pkCoeffs,
+ q15_t * pvCoeffs,
+ q15_t * pState,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numStages = numStages;
+
+ /* Assign reflection coefficient pointer */
+ S->pkCoeffs = pkCoeffs;
+
+ /* Assign ladder coefficient pointer */
+ S->pvCoeffs = pvCoeffs;
+
+ /* Clear state buffer and size is always blockSize + numStages */
+ memset(pState, 0, (numStages + blockSize) * sizeof(q15_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+
+}
+
+/**
+ * @} end of IIR_Lattice group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_q31.c
new file mode 100644
index 0000000..af18b2c
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_init_q31.c
@@ -0,0 +1,85 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_iir_lattice_init_q31.c
+*
+* Description: Initialization function for the Q31 IIR lattice filter.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup IIR_Lattice
+ * @{
+ */
+
+ /**
+ * @brief Initialization function for the Q31 IIR lattice filter.
+ * @param[in] *S points to an instance of the Q31 IIR lattice structure.
+ * @param[in] numStages number of stages in the filter.
+ * @param[in] *pkCoeffs points to the reflection coefficient buffer. The array is of length numStages.
+ * @param[in] *pvCoeffs points to the ladder coefficient buffer. The array is of length numStages+1.
+ * @param[in] *pState points to the state buffer. The array is of length numStages+blockSize.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ */
+
+void arm_iir_lattice_init_q31(
+ arm_iir_lattice_instance_q31 * S,
+ uint16_t numStages,
+ q31_t * pkCoeffs,
+ q31_t * pvCoeffs,
+ q31_t * pState,
+ uint32_t blockSize)
+{
+ /* Assign filter taps */
+ S->numStages = numStages;
+
+ /* Assign reflection coefficient pointer */
+ S->pkCoeffs = pkCoeffs;
+
+ /* Assign ladder coefficient pointer */
+ S->pvCoeffs = pvCoeffs;
+
+ /* Clear state buffer and size is always blockSize + numStages */
+ memset(pState, 0, (numStages + blockSize) * sizeof(q31_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+
+}
+
+/**
+ * @} end of IIR_Lattice group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_q15.c
new file mode 100644
index 0000000..44e0da9
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_q15.c
@@ -0,0 +1,456 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_iir_lattice_q15.c
+*
+* Description: Q15 IIR lattice filter processing function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup IIR_Lattice
+ * @{
+ */
+
+/**
+ * @brief Processing function for the Q15 IIR lattice filter.
+ * @param[in] *S points to an instance of the Q15 IIR lattice structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+ * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+ * Lastly, the accumulator is saturated to yield a result in 1.15 format.
+ */
+
+void arm_iir_lattice_q15(
+ const arm_iir_lattice_instance_q15 * S,
+ q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t fcurr, fnext, gcurr = 0, gnext; /* Temporary variables for lattice stages */
+ q15_t gnext1, gnext2; /* Temporary variables for lattice stages */
+ uint32_t stgCnt; /* Temporary variables for counts */
+ q63_t acc; /* Accumlator */
+ uint32_t blkCnt, tapCnt; /* Temporary variables for counts */
+ q15_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */
+ uint32_t numStages = S->numStages; /* number of stages */
+ q15_t *pState; /* State pointer */
+ q15_t *pStateCurnt; /* State current pointer */
+ q15_t out; /* Temporary variable for output */
+ q15_t v1, v2;
+ q31_t v; /* Temporary variable for ladder coefficient */
+
+
+ blkCnt = blockSize;
+
+ pState = &S->pState[0];
+
+ /* Sample processing */
+ while(blkCnt > 0u)
+ {
+ /* Read Sample from input buffer */
+ /* fN(n) = x(n) */
+ fcurr = *pSrc++;
+
+ /* Initialize state read pointer */
+ px1 = pState;
+ /* Initialize state write pointer */
+ px2 = pState;
+ /* Set accumulator to zero */
+ acc = 0;
+ /* Initialize Ladder coeff pointer */
+ pv = &S->pvCoeffs[0];
+ /* Initialize Reflection coeff pointer */
+ pk = &S->pkCoeffs[0];
+
+
+ /* Process sample for first tap */
+ gcurr = *px1++;
+ /* fN-1(n) = fN(n) - kN * gN-1(n-1) */
+ fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15);
+ fnext = __SSAT(fnext, 16);
+ /* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+ gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr;
+ gnext = __SSAT(gnext, 16);
+ /* write gN(n) into state for next sample processing */
+ *px2++ = (q15_t) gnext;
+ /* y(n) += gN(n) * vN */
+ acc += (q31_t) ((gnext * (*pv++)));
+
+
+ /* Update f values for next coefficient processing */
+ fcurr = fnext;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = (numStages - 1u) >> 2;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Process sample for 2nd, 6th ...taps */
+ /* Read gN-2(n-1) from state buffer */
+ gcurr = *px1++;
+ /* Process sample for 2nd, 6th .. taps */
+ /* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */
+ fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15);
+ fnext = __SSAT(fnext, 16);
+ /* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */
+ gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr;
+ gnext1 = (q15_t) __SSAT(gnext, 16);
+ /* write gN-1(n) into state */
+ *px2++ = (q15_t) gnext1;
+
+
+ /* Process sample for 3nd, 7th ...taps */
+ /* Read gN-3(n-1) from state */
+ gcurr = *px1++;
+ /* Process sample for 3rd, 7th .. taps */
+ /* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */
+ fcurr = fnext - (((q31_t) gcurr * (*pk)) >> 15);
+ fcurr = __SSAT(fcurr, 16);
+ /* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */
+ gnext = (((q31_t) fcurr * (*pk++)) >> 15) + gcurr;
+ gnext2 = (q15_t) __SSAT(gnext, 16);
+ /* write gN-2(n) into state */
+ *px2++ = (q15_t) gnext2;
+
+ /* Read vN-1 and vN-2 at a time */
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ v = *__SIMD32(pv)++;
+
+#else
+
+ v1 = *pv++;
+ v2 = *pv++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ v = __PKHBT(v1, v2, 16);
+
+#else
+
+ v = __PKHBT(v2, v1, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+
+ /* Pack gN-1(n) and gN-2(n) */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ gnext = __PKHBT(gnext1, gnext2, 16);
+
+#else
+
+ gnext = __PKHBT(gnext2, gnext1, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* y(n) += gN-1(n) * vN-1 */
+ /* process for gN-5(n) * vN-5, gN-9(n) * vN-9 ... */
+ /* y(n) += gN-2(n) * vN-2 */
+ /* process for gN-6(n) * vN-6, gN-10(n) * vN-10 ... */
+ acc = __SMLALD(gnext, v, acc);
+
+
+ /* Process sample for 4th, 8th ...taps */
+ /* Read gN-4(n-1) from state */
+ gcurr = *px1++;
+ /* Process sample for 4th, 8th .. taps */
+ /* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */
+ fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15);
+ fnext = __SSAT(fnext, 16);
+ /* gN-3(n) = kN-3 * fN-1(n) + gN-1(n-1) */
+ gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr;
+ gnext1 = (q15_t) __SSAT(gnext, 16);
+ /* write gN-3(n) for the next sample process */
+ *px2++ = (q15_t) gnext1;
+
+
+ /* Process sample for 5th, 9th ...taps */
+ /* Read gN-5(n-1) from state */
+ gcurr = *px1++;
+ /* Process sample for 5th, 9th .. taps */
+ /* fN-5(n) = fN-4(n) - kN-4 * gN-5(n-1) */
+ fcurr = fnext - (((q31_t) gcurr * (*pk)) >> 15);
+ fcurr = __SSAT(fcurr, 16);
+ /* gN-4(n) = kN-4 * fN-5(n) + gN-5(n-1) */
+ gnext = (((q31_t) fcurr * (*pk++)) >> 15) + gcurr;
+ gnext2 = (q15_t) __SSAT(gnext, 16);
+ /* write gN-4(n) for the next sample process */
+ *px2++ = (q15_t) gnext2;
+
+ /* Read vN-3 and vN-4 at a time */
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ v = *__SIMD32(pv)++;
+
+#else
+
+ v1 = *pv++;
+ v2 = *pv++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ v = __PKHBT(v1, v2, 16);
+
+#else
+
+ v = __PKHBT(v2, v1, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+
+ /* Pack gN-3(n) and gN-4(n) */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ gnext = __PKHBT(gnext1, gnext2, 16);
+
+#else
+
+ gnext = __PKHBT(gnext2, gnext1, 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* y(n) += gN-4(n) * vN-4 */
+ /* process for gN-8(n) * vN-8, gN-12(n) * vN-12 ... */
+ /* y(n) += gN-3(n) * vN-3 */
+ /* process for gN-7(n) * vN-7, gN-11(n) * vN-11 ... */
+ acc = __SMLALD(gnext, v, acc);
+
+ tapCnt--;
+
+ }
+
+ fnext = fcurr;
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = (numStages - 1u) % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ gcurr = *px1++;
+ /* Process sample for last taps */
+ fnext = fcurr - (((q31_t) gcurr * (*pk)) >> 15);
+ fnext = __SSAT(fnext, 16);
+ gnext = (((q31_t) fnext * (*pk++)) >> 15) + gcurr;
+ gnext = __SSAT(gnext, 16);
+ /* Output samples for last taps */
+ acc += (q31_t) (((q31_t) gnext * (*pv++)));
+ *px2++ = (q15_t) gnext;
+ fcurr = fnext;
+
+ tapCnt--;
+ }
+
+ /* y(n) += g0(n) * v0 */
+ acc += (q31_t) (((q31_t) fnext * (*pv++)));
+
+ out = (q15_t) __SSAT(acc >> 15, 16);
+ *px2++ = (q15_t) fnext;
+
+ /* write out into pDst */
+ *pDst++ = out;
+
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 1u;
+ blkCnt--;
+
+ }
+
+ /* Processing is complete. Now copy last S->numStages samples to start of the buffer
+ for the preperation of next frame process */
+ /* Points to the start of the state buffer */
+ pStateCurnt = &S->pState[0];
+ pState = &S->pState[blockSize];
+
+ stgCnt = (numStages >> 2u);
+
+ /* copy data */
+ while(stgCnt > 0u)
+ {
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
+ *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
+
+#else
+
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* Decrement the loop counter */
+ stgCnt--;
+
+ }
+
+ /* Calculation of count for remaining q15_t data */
+ stgCnt = (numStages) % 0x4u;
+
+ /* copy data */
+ while(stgCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ stgCnt--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ q31_t fcurr, fnext = 0, gcurr = 0, gnext; /* Temporary variables for lattice stages */
+ uint32_t stgCnt; /* Temporary variables for counts */
+ q63_t acc; /* Accumlator */
+ uint32_t blkCnt, tapCnt; /* Temporary variables for counts */
+ q15_t *px1, *px2, *pk, *pv; /* temporary pointers for state and coef */
+ uint32_t numStages = S->numStages; /* number of stages */
+ q15_t *pState; /* State pointer */
+ q15_t *pStateCurnt; /* State current pointer */
+ q15_t out; /* Temporary variable for output */
+
+
+ blkCnt = blockSize;
+
+ pState = &S->pState[0];
+
+ /* Sample processing */
+ while(blkCnt > 0u)
+ {
+ /* Read Sample from input buffer */
+ /* fN(n) = x(n) */
+ fcurr = *pSrc++;
+
+ /* Initialize state read pointer */
+ px1 = pState;
+ /* Initialize state write pointer */
+ px2 = pState;
+ /* Set accumulator to zero */
+ acc = 0;
+ /* Initialize Ladder coeff pointer */
+ pv = &S->pvCoeffs[0];
+ /* Initialize Reflection coeff pointer */
+ pk = &S->pkCoeffs[0];
+
+ tapCnt = numStages;
+
+ while(tapCnt > 0u)
+ {
+ gcurr = *px1++;
+ /* Process sample */
+ /* fN-1(n) = fN(n) - kN * gN-1(n-1) */
+ fnext = fcurr - ((gcurr * (*pk)) >> 15);
+ fnext = __SSAT(fnext, 16);
+ /* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+ gnext = ((fnext * (*pk++)) >> 15) + gcurr;
+ gnext = __SSAT(gnext, 16);
+ /* Output samples */
+ /* y(n) += gN(n) * vN */
+ acc += (q31_t) ((gnext * (*pv++)));
+ /* write gN(n) into state for next sample processing */
+ *px2++ = (q15_t) gnext;
+ /* Update f values for next coefficient processing */
+ fcurr = fnext;
+
+ tapCnt--;
+ }
+
+ /* y(n) += g0(n) * v0 */
+ acc += (q31_t) ((fnext * (*pv++)));
+
+ out = (q15_t) __SSAT(acc >> 15, 16);
+ *px2++ = (q15_t) fnext;
+
+ /* write out into pDst */
+ *pDst++ = out;
+
+ /* Advance the state pointer by 1 to process the next group of samples */
+ pState = pState + 1u;
+ blkCnt--;
+
+ }
+
+ /* Processing is complete. Now copy last S->numStages samples to start of the buffer
+ for the preperation of next frame process */
+ /* Points to the start of the state buffer */
+ pStateCurnt = &S->pState[0];
+ pState = &S->pState[blockSize];
+
+ stgCnt = numStages;
+
+ /* copy data */
+ while(stgCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ stgCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+
+
+
+/**
+ * @} end of IIR_Lattice group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_q31.c
new file mode 100644
index 0000000..c67491d
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_iir_lattice_q31.c
@@ -0,0 +1,344 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_iir_lattice_q31.c
+*
+* Description: Q31 IIR lattice filter processing function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup IIR_Lattice
+ * @{
+ */
+
+/**
+ * @brief Processing function for the Q31 IIR lattice filter.
+ * @param[in] *S points to an instance of the Q31 IIR lattice structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[out] *pDst points to the block of output data.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function is implemented using an internal 64-bit accumulator.
+ * The accumulator has a 2.62 format and maintains full precision of the intermediate multiplication results but provides only a single guard bit.
+ * Thus, if the accumulator result overflows it wraps around rather than clip.
+ * In order to avoid overflows completely the input signal must be scaled down by 2*log2(numStages) bits.
+ * After all multiply-accumulates are performed, the 2.62 accumulator is saturated to 1.32 format and then truncated to 1.31 format.
+ */
+
+void arm_iir_lattice_q31(
+ const arm_iir_lattice_instance_q31 * S,
+ q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q31_t fcurr, fnext = 0, gcurr = 0, gnext; /* Temporary variables for lattice stages */
+ q63_t acc; /* Accumlator */
+ uint32_t blkCnt, tapCnt; /* Temporary variables for counts */
+ q31_t *px1, *px2, *pk, *pv; /* Temporary pointers for state and coef */
+ uint32_t numStages = S->numStages; /* number of stages */
+ q31_t *pState; /* State pointer */
+ q31_t *pStateCurnt; /* State current pointer */
+
+ blkCnt = blockSize;
+
+ pState = &S->pState[0];
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* Sample processing */
+ while(blkCnt > 0u)
+ {
+ /* Read Sample from input buffer */
+ /* fN(n) = x(n) */
+ fcurr = *pSrc++;
+
+ /* Initialize state read pointer */
+ px1 = pState;
+ /* Initialize state write pointer */
+ px2 = pState;
+ /* Set accumulator to zero */
+ acc = 0;
+ /* Initialize Ladder coeff pointer */
+ pv = &S->pvCoeffs[0];
+ /* Initialize Reflection coeff pointer */
+ pk = &S->pkCoeffs[0];
+
+
+ /* Process sample for first tap */
+ gcurr = *px1++;
+ /* fN-1(n) = fN(n) - kN * gN-1(n-1) */
+ fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31));
+ /* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+ gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31));
+ /* write gN-1(n-1) into state for next sample processing */
+ *px2++ = gnext;
+ /* y(n) += gN(n) * vN */
+ acc += ((q63_t) gnext * *pv++);
+
+ /* Update f values for next coefficient processing */
+ fcurr = fnext;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = (numStages - 1u) >> 2;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Process sample for 2nd, 6th .. taps */
+ /* Read gN-2(n-1) from state buffer */
+ gcurr = *px1++;
+ /* fN-2(n) = fN-1(n) - kN-1 * gN-2(n-1) */
+ fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31));
+ /* gN-1(n) = kN-1 * fN-2(n) + gN-2(n-1) */
+ gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31));
+ /* y(n) += gN-1(n) * vN-1 */
+ /* process for gN-5(n) * vN-5, gN-9(n) * vN-9 ... */
+ acc += ((q63_t) gnext * *pv++);
+ /* write gN-1(n) into state for next sample processing */
+ *px2++ = gnext;
+
+ /* Process sample for 3nd, 7th ...taps */
+ /* Read gN-3(n-1) from state buffer */
+ gcurr = *px1++;
+ /* Process sample for 3rd, 7th .. taps */
+ /* fN-3(n) = fN-2(n) - kN-2 * gN-3(n-1) */
+ fcurr = __QSUB(fnext, (q31_t) (((q63_t) gcurr * (*pk)) >> 31));
+ /* gN-2(n) = kN-2 * fN-3(n) + gN-3(n-1) */
+ gnext = __QADD(gcurr, (q31_t) (((q63_t) fcurr * (*pk++)) >> 31));
+ /* y(n) += gN-2(n) * vN-2 */
+ /* process for gN-6(n) * vN-6, gN-10(n) * vN-10 ... */
+ acc += ((q63_t) gnext * *pv++);
+ /* write gN-2(n) into state for next sample processing */
+ *px2++ = gnext;
+
+
+ /* Process sample for 4th, 8th ...taps */
+ /* Read gN-4(n-1) from state buffer */
+ gcurr = *px1++;
+ /* Process sample for 4th, 8th .. taps */
+ /* fN-4(n) = fN-3(n) - kN-3 * gN-4(n-1) */
+ fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31));
+ /* gN-3(n) = kN-3 * fN-4(n) + gN-4(n-1) */
+ gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31));
+ /* y(n) += gN-3(n) * vN-3 */
+ /* process for gN-7(n) * vN-7, gN-11(n) * vN-11 ... */
+ acc += ((q63_t) gnext * *pv++);
+ /* write gN-3(n) into state for next sample processing */
+ *px2++ = gnext;
+
+
+ /* Process sample for 5th, 9th ...taps */
+ /* Read gN-5(n-1) from state buffer */
+ gcurr = *px1++;
+ /* Process sample for 5th, 9th .. taps */
+ /* fN-5(n) = fN-4(n) - kN-4 * gN-1(n-1) */
+ fcurr = __QSUB(fnext, (q31_t) (((q63_t) gcurr * (*pk)) >> 31));
+ /* gN-4(n) = kN-4 * fN-5(n) + gN-5(n-1) */
+ gnext = __QADD(gcurr, (q31_t) (((q63_t) fcurr * (*pk++)) >> 31));
+ /* y(n) += gN-4(n) * vN-4 */
+ /* process for gN-8(n) * vN-8, gN-12(n) * vN-12 ... */
+ acc += ((q63_t) gnext * *pv++);
+ /* write gN-4(n) into state for next sample processing */
+ *px2++ = gnext;
+
+ tapCnt--;
+
+ }
+
+ fnext = fcurr;
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = (numStages - 1u) % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ gcurr = *px1++;
+ /* Process sample for last taps */
+ fnext = __QSUB(fcurr, (q31_t) (((q63_t) gcurr * (*pk)) >> 31));
+ gnext = __QADD(gcurr, (q31_t) (((q63_t) fnext * (*pk++)) >> 31));
+ /* Output samples for last taps */
+ acc += ((q63_t) gnext * *pv++);
+ *px2++ = gnext;
+ fcurr = fnext;
+
+ tapCnt--;
+
+ }
+
+ /* y(n) += g0(n) * v0 */
+ acc += (q63_t) fnext *(
+ *pv++);
+
+ *px2++ = fnext;
+
+ /* write out into pDst */
+ *pDst++ = (q31_t) (acc >> 31u);
+
+ /* Advance the state pointer by 4 to process the next group of 4 samples */
+ pState = pState + 1u;
+ blkCnt--;
+
+ }
+
+ /* Processing is complete. Now copy last S->numStages samples to start of the buffer
+ for the preperation of next frame process */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = &S->pState[0];
+ pState = &S->pState[blockSize];
+
+ tapCnt = numStages >> 2u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numStages) % 0x4u;
+
+ /* Copy the remaining q31_t data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ };
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+ /* Sample processing */
+ while(blkCnt > 0u)
+ {
+ /* Read Sample from input buffer */
+ /* fN(n) = x(n) */
+ fcurr = *pSrc++;
+
+ /* Initialize state read pointer */
+ px1 = pState;
+ /* Initialize state write pointer */
+ px2 = pState;
+ /* Set accumulator to zero */
+ acc = 0;
+ /* Initialize Ladder coeff pointer */
+ pv = &S->pvCoeffs[0];
+ /* Initialize Reflection coeff pointer */
+ pk = &S->pkCoeffs[0];
+
+ tapCnt = numStages;
+
+ while(tapCnt > 0u)
+ {
+ gcurr = *px1++;
+ /* Process sample */
+ /* fN-1(n) = fN(n) - kN * gN-1(n-1) */
+ fnext =
+ clip_q63_to_q31(((q63_t) fcurr -
+ ((q31_t) (((q63_t) gcurr * (*pk)) >> 31))));
+ /* gN(n) = kN * fN-1(n) + gN-1(n-1) */
+ gnext =
+ clip_q63_to_q31(((q63_t) gcurr +
+ ((q31_t) (((q63_t) fnext * (*pk++)) >> 31))));
+ /* Output samples */
+ /* y(n) += gN(n) * vN */
+ acc += ((q63_t) gnext * *pv++);
+ /* write gN-1(n-1) into state for next sample processing */
+ *px2++ = gnext;
+ /* Update f values for next coefficient processing */
+ fcurr = fnext;
+
+ tapCnt--;
+ }
+
+ /* y(n) += g0(n) * v0 */
+ acc += (q63_t) fnext *(
+ *pv++);
+
+ *px2++ = fnext;
+
+ /* write out into pDst */
+ *pDst++ = (q31_t) (acc >> 31u);
+
+ /* Advance the state pointer by 1 to process the next group of samples */
+ pState = pState + 1u;
+ blkCnt--;
+
+ }
+
+ /* Processing is complete. Now copy last S->numStages samples to start of the buffer
+ for the preperation of next frame process */
+
+ /* Points to the start of the state buffer */
+ pStateCurnt = &S->pState[0];
+ pState = &S->pState[blockSize];
+
+ tapCnt = numStages;
+
+ /* Copy the remaining q31_t data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+
+
+
+/**
+ * @} end of IIR_Lattice group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_f32.c
new file mode 100644
index 0000000..c6d1579
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_f32.c
@@ -0,0 +1,433 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_lms_f32.c
+*
+* Description: Processing function for the floating-point LMS filter.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup LMS Least Mean Square (LMS) Filters
+ *
+ * LMS filters are a class of adaptive filters that are able to "learn" an unknown transfer functions.
+ * LMS filters use a gradient descent method in which the filter coefficients are updated based on the instantaneous error signal.
+ * Adaptive filters are often used in communication systems, equalizers, and noise removal.
+ * The CMSIS DSP Library contains LMS filter functions that operate on Q15, Q31, and floating-point data types.
+ * The library also contains normalized LMS filters in which the filter coefficient adaptation is indepedent of the level of the input signal.
+ *
+ * An LMS filter consists of two components as shown below.
+ * The first component is a standard transversal or FIR filter.
+ * The second component is a coefficient update mechanism.
+ * The LMS filter has two input signals.
+ * The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter.
+ * That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input.
+ * The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input.
+ * This "error signal" tends towards zero as the filter adapts.
+ * The LMS processing functions accept the input and reference input signals and generate the filter output and error signal.
+ * \image html LMS.gif "Internal structure of the Least Mean Square filter"
+ *
+ * The functions operate on blocks of data and each call to the function processes
+ * blockSize samples through the filter.
+ * pSrc points to input signal, pRef points to reference signal,
+ * pOut points to output signal and pErr points to error signal.
+ * All arrays contain blockSize values.
+ *
+ * The functions operate on a block-by-block basis.
+ * Internally, the filter coefficients b[n] are updated on a sample-by-sample basis.
+ * The convergence of the LMS filter is slower compared to the normalized LMS algorithm.
+ *
+ * \par Algorithm:
+ * The output signal y[n] is computed by a standard FIR filter:
+ *
+ *
+ * \par
+ * The error signal equals the difference between the reference signal d[n] and the filter output:
+ *
+ * e[n] = d[n] - y[n].
+ *
+ *
+ * \par
+ * After each sample of the error signal is computed, the filter coefficients b[k] are updated on a sample-by-sample basis:
+ *
+ * b[k] = b[k] + e[n] * mu * x[n-k], for k=0, 1, ..., numTaps-1
+ *
+ * where mu is the step size and controls the rate of coefficient convergence.
+ *\par
+ * In the APIs, pCoeffs points to a coefficient array of size numTaps.
+ * Coefficients are stored in time reversed order.
+ * \par
+ *
+ * \par
+ * Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1 samples.
+ * The increased state buffer length allows circular addressing, which is traditionally used in FIR filters,
+ * to be avoided and yields a significant speed improvement.
+ * The state variables are updated after each block of data is processed.
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter and
+ * coefficient and state arrays cannot be shared among instances.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Set the values in the state buffer to zeros before static initialization.
+ * The code below statically initializes each of the 3 different data type filter instance structures
+ *
+ * where numTaps is the number of filter coefficients in the filter; pState is the address of the state buffer;
+ * pCoeffs is the address of the coefficient buffer; mu is the step size parameter; and postShift is the shift applied to coefficients.
+ *
+ * \par Fixed-Point Behavior:
+ * Care must be taken when using the Q15 and Q31 versions of the LMS filter.
+ * The following issues must be considered:
+ * - Scaling of coefficients
+ * - Overflow and saturation
+ *
+ * \par Scaling of Coefficients:
+ * Filter coefficients are represented as fractional values and
+ * coefficients are restricted to lie in the range [-1 +1).
+ * The fixed-point functions have an additional scaling parameter postShift.
+ * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits.
+ * This essentially scales the filter coefficients by 2^postShift and
+ * allows the filter coefficients to exceed the range [+1 -1).
+ * The value of postShift is set by the user based on the expected gain through the system being modeled.
+ *
+ * \par Overflow and Saturation:
+ * Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are
+ * described separately as part of the function specific documentation below.
+ */
+
+/**
+ * @addtogroup LMS
+ * @{
+ */
+
+/**
+ * @details
+ * This function operates on floating-point data types.
+ *
+ * @brief Processing function for floating-point LMS filter.
+ * @param[in] *S points to an instance of the floating-point LMS filter structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[in] *pRef points to the block of reference data.
+ * @param[out] *pOut points to the block of output data.
+ * @param[out] *pErr points to the block of error data.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ */
+
+void arm_lms_f32(
+ const arm_lms_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pRef,
+ float32_t * pOut,
+ float32_t * pErr,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ float32_t mu = S->mu; /* Adaptive factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ float32_t sum, e, d; /* accumulator, error, reference data sample */
+ float32_t w = 0.0f; /* weight factor */
+
+ e = 0.0f;
+ d = 0.0f;
+
+ /* S->pState points to state array which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ blkCnt = blockSize;
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ while(blkCnt > 0u)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = (pCoeffs);
+
+ /* Set the accumulator to zero */
+ sum = 0.0f;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (*px++) * (*pb++);
+ sum += (*px++) * (*pb++);
+ sum += (*px++) * (*pb++);
+ sum += (*px++) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (*px++) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result in the accumulator, store in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Compute and store error */
+ d = (float32_t) (*pRef++);
+ e = d - sum;
+ *pErr++ = e;
+
+ /* Calculation of Weighting factor for the updating filter coefficients */
+ w = e * mu;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = (pCoeffs);
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Update filter coefficients */
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ *pb = *pb + (w * (*px++));
+ pb++;
+
+ *pb = *pb + (w * (*px++));
+ pb++;
+
+ *pb = *pb + (w * (*px++));
+ pb++;
+
+ *pb = *pb + (w * (*px++));
+ pb++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ *pb = *pb + (w * (*px++));
+ pb++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+
+ /* Processing is complete. Now copy the last numTaps - 1 samples to the
+ satrt of the state buffer. This prepares the state buffer for the
+ next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* Loop unrolling for (numTaps - 1u) samples copy */
+ tapCnt = (numTaps - 1u) >> 2u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1u) % 0x4u;
+
+ /* Copy the remaining q31_t data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ while(blkCnt > 0u)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize pCoeffs pointer */
+ pb = pCoeffs;
+
+ /* Set the accumulator to zero */
+ sum = 0.0f;
+
+ /* Loop over numTaps number of values */
+ tapCnt = numTaps;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (*px++) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result is stored in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Compute and store error */
+ d = (float32_t) (*pRef++);
+ e = d - sum;
+ *pErr++ = e;
+
+ /* Weighting factor for the LMS version */
+ w = e * mu;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize pCoeffs pointer */
+ pb = pCoeffs;
+
+ /* Loop over numTaps number of values */
+ tapCnt = numTaps;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ *pb = *pb + (w * (*px++));
+ pb++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+
+ /* Processing is complete. Now copy the last numTaps - 1 samples to the
+ * start of the state buffer. This prepares the state buffer for the
+ * next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* Copy (numTaps - 1u) samples */
+ tapCnt = (numTaps - 1u);
+
+ /* Copy the data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of LMS group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_init_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_init_f32.c
new file mode 100644
index 0000000..69c0239
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_init_f32.c
@@ -0,0 +1,89 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_lms_init_f32.c
+*
+* Description: Floating-point LMS filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @addtogroup LMS
+ * @{
+ */
+
+ /**
+ * @brief Initialization function for floating-point LMS filter.
+ * @param[in] *S points to an instance of the floating-point LMS filter structure.
+ * @param[in] numTaps number of filter coefficients.
+ * @param[in] *pCoeffs points to the coefficient buffer.
+ * @param[in] *pState points to state buffer.
+ * @param[in] mu step size that controls filter coefficient updates.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ */
+
+/**
+ * \par Description:
+ * pCoeffs points to the array of filter coefficients stored in time reversed order:
+ *
+ * The initial filter coefficients serve as a starting point for the adaptive filter.
+ * pState points to an array of length numTaps+blockSize-1 samples,
+ * where blockSize is the number of input samples processed by each call to
+ * arm_lms_q31().
+ */
+
+void arm_lms_init_q31(
+ arm_lms_instance_q31 * S,
+ uint16_t numTaps,
+ q31_t * pCoeffs,
+ q31_t * pState,
+ q31_t mu,
+ uint32_t blockSize,
+ uint32_t postShift)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and size is always blockSize + numTaps - 1 */
+ memset(pState, 0, ((uint32_t) numTaps + (blockSize - 1u)) * sizeof(q31_t));
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ /* Assign Step size value */
+ S->mu = mu;
+
+ /* Assign postShift value to be applied */
+ S->postShift = postShift;
+
+}
+
+/**
+ * @} end of LMS group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_f32.c
new file mode 100644
index 0000000..830ee57
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_f32.c
@@ -0,0 +1,455 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_lms_norm_f32.c
+*
+* Description: Processing function for the floating-point Normalised LMS.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @defgroup LMS_NORM Normalized LMS Filters
+ *
+ * This set of functions implements a commonly used adaptive filter.
+ * It is related to the Least Mean Square (LMS) adaptive filter and includes an additional normalization
+ * factor which increases the adaptation rate of the filter.
+ * The CMSIS DSP Library contains normalized LMS filter functions that operate on Q15, Q31, and floating-point data types.
+ *
+ * A normalized least mean square (NLMS) filter consists of two components as shown below.
+ * The first component is a standard transversal or FIR filter.
+ * The second component is a coefficient update mechanism.
+ * The NLMS filter has two input signals.
+ * The "input" feeds the FIR filter while the "reference input" corresponds to the desired output of the FIR filter.
+ * That is, the FIR filter coefficients are updated so that the output of the FIR filter matches the reference input.
+ * The filter coefficient update mechanism is based on the difference between the FIR filter output and the reference input.
+ * This "error signal" tends towards zero as the filter adapts.
+ * The NLMS processing functions accept the input and reference input signals and generate the filter output and error signal.
+ * \image html LMS.gif "Internal structure of the NLMS adaptive filter"
+ *
+ * The functions operate on blocks of data and each call to the function processes
+ * blockSize samples through the filter.
+ * pSrc points to input signal, pRef points to reference signal,
+ * pOut points to output signal and pErr points to error signal.
+ * All arrays contain blockSize values.
+ *
+ * The functions operate on a block-by-block basis.
+ * Internally, the filter coefficients b[n] are updated on a sample-by-sample basis.
+ * The convergence of the LMS filter is slower compared to the normalized LMS algorithm.
+ *
+ * \par Algorithm:
+ * The output signal y[n] is computed by a standard FIR filter:
+ *
+ * where mu is the step size and controls the rate of coefficient convergence.
+ *\par
+ * In the APIs, pCoeffs points to a coefficient array of size numTaps.
+ * Coefficients are stored in time reversed order.
+ * \par
+ *
+ * \par
+ * Note that the length of the state buffer exceeds the length of the coefficient array by blockSize-1 samples.
+ * The increased state buffer length allows circular addressing, which is traditionally used in FIR filters,
+ * to be avoided and yields a significant speed improvement.
+ * The state variables are updated after each block of data is processed.
+ * \par Instance Structure
+ * The coefficients and state variables for a filter are stored together in an instance data structure.
+ * A separate instance structure must be defined for each filter and
+ * coefficient and state arrays cannot be shared among instances.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Zeros out the values in the state buffer.
+ * \par
+ * Instance structure cannot be placed into a const data section and it is recommended to use the initialization function.
+ * \par Fixed-Point Behavior:
+ * Care must be taken when using the Q15 and Q31 versions of the normalised LMS filter.
+ * The following issues must be considered:
+ * - Scaling of coefficients
+ * - Overflow and saturation
+ *
+ * \par Scaling of Coefficients:
+ * Filter coefficients are represented as fractional values and
+ * coefficients are restricted to lie in the range [-1 +1).
+ * The fixed-point functions have an additional scaling parameter postShift.
+ * At the output of the filter's accumulator is a shift register which shifts the result by postShift bits.
+ * This essentially scales the filter coefficients by 2^postShift and
+ * allows the filter coefficients to exceed the range [+1 -1).
+ * The value of postShift is set by the user based on the expected gain through the system being modeled.
+ *
+ * \par Overflow and Saturation:
+ * Overflow and saturation behavior of the fixed-point Q15 and Q31 versions are
+ * described separately as part of the function specific documentation below.
+ */
+
+
+/**
+ * @addtogroup LMS_NORM
+ * @{
+ */
+
+
+ /**
+ * @brief Processing function for floating-point normalized LMS filter.
+ * @param[in] *S points to an instance of the floating-point normalized LMS filter structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[in] *pRef points to the block of reference data.
+ * @param[out] *pOut points to the block of output data.
+ * @param[out] *pErr points to the block of error data.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ */
+
+void arm_lms_norm_f32(
+ arm_lms_norm_instance_f32 * S,
+ float32_t * pSrc,
+ float32_t * pRef,
+ float32_t * pOut,
+ float32_t * pErr,
+ uint32_t blockSize)
+{
+ float32_t *pState = S->pState; /* State pointer */
+ float32_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ float32_t *pStateCurnt; /* Points to the current sample of the state */
+ float32_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ float32_t mu = S->mu; /* Adaptive factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ float32_t energy; /* Energy of the input */
+ float32_t sum, e, d; /* accumulator, error, reference data sample */
+ float32_t w, x0, in; /* weight factor, temporary variable to hold input sample and state */
+
+ /* Initializations of error, difference, Coefficient update */
+ e = 0.0f;
+ d = 0.0f;
+ w = 0.0f;
+
+ energy = S->energy;
+ x0 = S->x0;
+
+ /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ while(blkCnt > 0u)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = (pCoeffs);
+
+ /* Read the sample from input buffer */
+ in = *pSrc++;
+
+ /* Update the energy calculation */
+ energy -= x0 * x0;
+ energy += in * in;
+
+ /* Set the accumulator to zero */
+ sum = 0.0f;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (*px++) * (*pb++);
+ sum += (*px++) * (*pb++);
+ sum += (*px++) * (*pb++);
+ sum += (*px++) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (*px++) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result in the accumulator, store in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Compute and store error */
+ d = (float32_t) (*pRef++);
+ e = d - sum;
+ *pErr++ = e;
+
+ /* Calculation of Weighting factor for updating filter coefficients */
+ /* epsilon value 0.000000119209289f */
+ w = (e * mu) / (energy + 0.000000119209289f);
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = (pCoeffs);
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Update filter coefficients */
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ *pb += w * (*px++);
+ pb++;
+
+ *pb += w * (*px++);
+ pb++;
+
+ *pb += w * (*px++);
+ pb++;
+
+ *pb += w * (*px++);
+ pb++;
+
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ *pb += w * (*px++);
+ pb++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ x0 = *pState;
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ S->energy = energy;
+ S->x0 = x0;
+
+ /* Processing is complete. Now copy the last numTaps - 1 samples to the
+ satrt of the state buffer. This prepares the state buffer for the
+ next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* Loop unrolling for (numTaps - 1u)/4 samples copy */
+ tapCnt = (numTaps - 1u) >> 2u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1u) % 0x4u;
+
+ /* Copy the remaining q31_t data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ while(blkCnt > 0u)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize pCoeffs pointer */
+ pb = pCoeffs;
+
+ /* Read the sample from input buffer */
+ in = *pSrc++;
+
+ /* Update the energy calculation */
+ energy -= x0 * x0;
+ energy += in * in;
+
+ /* Set the accumulator to zero */
+ sum = 0.0f;
+
+ /* Loop over numTaps number of values */
+ tapCnt = numTaps;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ sum += (*px++) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* The result in the accumulator is stored in the destination buffer. */
+ *pOut++ = sum;
+
+ /* Compute and store error */
+ d = (float32_t) (*pRef++);
+ e = d - sum;
+ *pErr++ = e;
+
+ /* Calculation of Weighting factor for updating filter coefficients */
+ /* epsilon value 0.000000119209289f */
+ w = (e * mu) / (energy + 0.000000119209289f);
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize pCcoeffs pointer */
+ pb = pCoeffs;
+
+ /* Loop over numTaps number of values */
+ tapCnt = numTaps;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ *pb += w * (*px++);
+ pb++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ x0 = *pState;
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ S->energy = energy;
+ S->x0 = x0;
+
+ /* Processing is complete. Now copy the last numTaps - 1 samples to the
+ satrt of the state buffer. This prepares the state buffer for the
+ next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* Copy (numTaps - 1u) samples */
+ tapCnt = (numTaps - 1u);
+
+ /* Copy the remaining q31_t data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of LMS_NORM group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_init_f32.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_init_f32.c
new file mode 100644
index 0000000..583fd11
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_init_f32.c
@@ -0,0 +1,99 @@
+/*-----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_lms_norm_init_f32.c
+*
+* Description: Floating-point NLMS filter initialization function.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------*/
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup LMS_NORM
+ * @{
+ */
+
+ /**
+ * @brief Initialization function for floating-point normalized LMS filter.
+ * @param[in] *S points to an instance of the floating-point LMS filter structure.
+ * @param[in] numTaps number of filter coefficients.
+ * @param[in] *pCoeffs points to coefficient buffer.
+ * @param[in] *pState points to state buffer.
+ * @param[in] mu step size that controls filter coefficient updates.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ *
+ * \par Description:
+ * pCoeffs points to the array of filter coefficients stored in time reversed order:
+ *
+ * The initial filter coefficients serve as a starting point for the adaptive filter.
+ * pState points to an array of length numTaps+blockSize-1 samples,
+ * where blockSize is the number of input samples processed by each call to arm_lms_norm_q31().
+ */
+
+void arm_lms_norm_init_q31(
+ arm_lms_norm_instance_q31 * S,
+ uint16_t numTaps,
+ q31_t * pCoeffs,
+ q31_t * pState,
+ q31_t mu,
+ uint32_t blockSize,
+ uint8_t postShift)
+{
+ /* Assign filter taps */
+ S->numTaps = numTaps;
+
+ /* Assign coefficient pointer */
+ S->pCoeffs = pCoeffs;
+
+ /* Clear state buffer and size is always blockSize + numTaps - 1 */
+ memset(pState, 0, (numTaps + (blockSize - 1u)) * sizeof(q31_t));
+
+ /* Assign post Shift value applied to coefficients */
+ S->postShift = postShift;
+
+ /* Assign state pointer */
+ S->pState = pState;
+
+ /* Assign Step size value */
+ S->mu = mu;
+
+ /* Initialize reciprocal pointer table */
+ S->recipTable = (q31_t *) armRecipTableQ31;
+
+ /* Initialise Energy to zero */
+ S->energy = 0;
+
+ /* Initialise x0 to zero */
+ S->x0 = 0;
+
+}
+
+/**
+ * @} end of LMS_NORM group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_q15.c
new file mode 100644
index 0000000..b7757e9
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_q15.c
@@ -0,0 +1,434 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_lms_norm_q15.c
+*
+* Description: Q15 NLMS filter.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup LMS_NORM
+ * @{
+ */
+
+/**
+* @brief Processing function for Q15 normalized LMS filter.
+* @param[in] *S points to an instance of the Q15 normalized LMS filter structure.
+* @param[in] *pSrc points to the block of input data.
+* @param[in] *pRef points to the block of reference data.
+* @param[out] *pOut points to the block of output data.
+* @param[out] *pErr points to the block of error data.
+* @param[in] blockSize number of samples to process.
+* @return none.
+*
+* Scaling and Overflow Behavior:
+* \par
+* The function is implemented using a 64-bit internal accumulator.
+* Both coefficients and state variables are represented in 1.15 format and
+* multiplications yield a 2.30 result. The 2.30 intermediate results are
+* accumulated in a 64-bit accumulator in 34.30 format.
+* There is no risk of internal overflow with this approach and the full
+* precision of intermediate multiplications is preserved. After all additions
+* have been performed, the accumulator is truncated to 34.15 format by
+* discarding low 15 bits. Lastly, the accumulator is saturated to yield a
+* result in 1.15 format.
+*
+* \par
+* In this filter, filter coefficients are updated for each sample and the updation of filter cofficients are saturted.
+*
+ */
+
+void arm_lms_norm_q15(
+ arm_lms_norm_instance_q15 * S,
+ q15_t * pSrc,
+ q15_t * pRef,
+ q15_t * pOut,
+ q15_t * pErr,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q15_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ q15_t mu = S->mu; /* Adaptive factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ q31_t energy; /* Energy of the input */
+ q63_t acc; /* Accumulator */
+ q15_t e = 0, d = 0; /* error, reference data sample */
+ q15_t w = 0, in; /* weight factor and state */
+ q15_t x0; /* temporary variable to hold input sample */
+ //uint32_t shift = (uint32_t) S->postShift + 1u; /* Shift to be applied to the output */
+ q15_t errorXmu, oneByEnergy; /* Temporary variables to store error and mu product and reciprocal of energy */
+ q15_t postShift; /* Post shift to be applied to weight after reciprocal calculation */
+ q31_t coef; /* Teporary variable for coefficient */
+ q31_t acc_l, acc_h;
+ int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */
+ int32_t uShift = (32 - lShift);
+
+ energy = S->energy;
+ x0 = S->x0;
+
+ /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ while(blkCnt > 0u)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = (pCoeffs);
+
+ /* Read the sample from input buffer */
+ in = *pSrc++;
+
+ /* Update the energy calculation */
+ energy -= (((q31_t) x0 * (x0)) >> 15);
+ energy += (((q31_t) in * (in)) >> 15);
+
+ /* Set the accumulator to zero */
+ acc = 0;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ while(tapCnt > 0u)
+ {
+
+ /* Perform the multiply-accumulate */
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc);
+ acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc);
+
+#else
+
+ acc += (((q31_t) * px++ * (*pb++)));
+ acc += (((q31_t) * px++ * (*pb++)));
+ acc += (((q31_t) * px++ * (*pb++)));
+ acc += (((q31_t) * px++ * (*pb++)));
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ acc += (((q31_t) * px++ * (*pb++)));
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Converting the result to 1.15 format and saturate the output */
+ acc = __SSAT(acc, 16u);
+
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = (q15_t) acc;
+
+ /* Compute and store error */
+ d = *pRef++;
+ e = d - (q15_t) acc;
+ *pErr++ = e;
+
+ /* Calculation of 1/energy */
+ postShift = arm_recip_q15((q15_t) energy + DELTA_Q15,
+ &oneByEnergy, S->recipTable);
+
+ /* Calculation of e * mu value */
+ errorXmu = (q15_t) (((q31_t) e * mu) >> 15);
+
+ /* Calculation of (e * mu) * (1/energy) value */
+ acc = (((q31_t) errorXmu * oneByEnergy) >> (15 - postShift));
+
+ /* Weighting factor for the normalized version */
+ w = (q15_t) __SSAT((q31_t) acc, 16);
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = (pCoeffs);
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Update filter coefficients */
+ while(tapCnt > 0u)
+ {
+ coef = *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+ coef = *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+ coef = *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+ coef = *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ coef = *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Read the sample from state buffer */
+ x0 = *pState;
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Save energy and x0 values for the next frame */
+ S->energy = (q15_t) energy;
+ S->x0 = x0;
+
+ /* Processing is complete. Now copy the last numTaps - 1 samples to the
+ satrt of the state buffer. This prepares the state buffer for the
+ next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* Calculation of count for copying integer writes */
+ tapCnt = (numTaps - 1u) >> 2;
+
+ while(tapCnt > 0u)
+ {
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
+ *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
+
+#else
+
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+#endif
+
+ tapCnt--;
+
+ }
+
+ /* Calculation of count for remaining q15_t data */
+ tapCnt = (numTaps - 1u) % 0x4u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ while(blkCnt > 0u)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize pCoeffs pointer */
+ pb = pCoeffs;
+
+ /* Read the sample from input buffer */
+ in = *pSrc++;
+
+ /* Update the energy calculation */
+ energy -= (((q31_t) x0 * (x0)) >> 15);
+ energy += (((q31_t) in * (in)) >> 15);
+
+ /* Set the accumulator to zero */
+ acc = 0;
+
+ /* Loop over numTaps number of values */
+ tapCnt = numTaps;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ acc += (((q31_t) * px++ * (*pb++)));
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Converting the result to 1.15 format and saturate the output */
+ acc = __SSAT(acc, 16u);
+
+ /* Converting the result to 1.15 format */
+ //acc = __SSAT((acc >> (16u - shift)), 16u);
+
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = (q15_t) acc;
+
+ /* Compute and store error */
+ d = *pRef++;
+ e = d - (q15_t) acc;
+ *pErr++ = e;
+
+ /* Calculation of 1/energy */
+ postShift = arm_recip_q15((q15_t) energy + DELTA_Q15,
+ &oneByEnergy, S->recipTable);
+
+ /* Calculation of e * mu value */
+ errorXmu = (q15_t) (((q31_t) e * mu) >> 15);
+
+ /* Calculation of (e * mu) * (1/energy) value */
+ acc = (((q31_t) errorXmu * oneByEnergy) >> (15 - postShift));
+
+ /* Weighting factor for the normalized version */
+ w = (q15_t) __SSAT((q31_t) acc, 16);
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = (pCoeffs);
+
+ /* Loop over numTaps number of values */
+ tapCnt = numTaps;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ coef = *pb + (((q31_t) w * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Read the sample from state buffer */
+ x0 = *pState;
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Save energy and x0 values for the next frame */
+ S->energy = (q15_t) energy;
+ S->x0 = x0;
+
+ /* Processing is complete. Now copy the last numTaps - 1 samples to the
+ satrt of the state buffer. This prepares the state buffer for the
+ next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* copy (numTaps - 1u) data */
+ tapCnt = (numTaps - 1u);
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+
+/**
+ * @} end of LMS_NORM group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_q31.c
new file mode 100644
index 0000000..adb42a5
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_norm_q31.c
@@ -0,0 +1,425 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_lms_norm_q31.c
+*
+* Description: Processing function for the Q31 NLMS filter.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup LMS_NORM
+ * @{
+ */
+
+/**
+* @brief Processing function for Q31 normalized LMS filter.
+* @param[in] *S points to an instance of the Q31 normalized LMS filter structure.
+* @param[in] *pSrc points to the block of input data.
+* @param[in] *pRef points to the block of reference data.
+* @param[out] *pOut points to the block of output data.
+* @param[out] *pErr points to the block of error data.
+* @param[in] blockSize number of samples to process.
+* @return none.
+*
+* Scaling and Overflow Behavior:
+* \par
+* The function is implemented using an internal 64-bit accumulator.
+* The accumulator has a 2.62 format and maintains full precision of the intermediate
+* multiplication results but provides only a single guard bit.
+* Thus, if the accumulator result overflows it wraps around rather than clip.
+* In order to avoid overflows completely the input signal must be scaled down by
+* log2(numTaps) bits. The reference signal should not be scaled down.
+* After all multiply-accumulates are performed, the 2.62 accumulator is shifted
+* and saturated to 1.31 format to yield the final result.
+* The output signal and error signal are in 1.31 format.
+*
+* \par
+* In this filter, filter coefficients are updated for each sample and the
+* updation of filter cofficients are saturted.
+*
+*/
+
+void arm_lms_norm_q31(
+ arm_lms_norm_instance_q31 * S,
+ q31_t * pSrc,
+ q31_t * pRef,
+ q31_t * pOut,
+ q31_t * pErr,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCurnt; /* Points to the current sample of the state */
+ q31_t *px, *pb; /* Temporary pointers for state and coefficient buffers */
+ q31_t mu = S->mu; /* Adaptive factor */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ q63_t energy; /* Energy of the input */
+ q63_t acc; /* Accumulator */
+ q31_t e = 0, d = 0; /* error, reference data sample */
+ q31_t w = 0, in; /* weight factor and state */
+ q31_t x0; /* temporary variable to hold input sample */
+// uint32_t shift = 32u - ((uint32_t) S->postShift + 1u); /* Shift to be applied to the output */
+ q31_t errorXmu, oneByEnergy; /* Temporary variables to store error and mu product and reciprocal of energy */
+ q31_t postShift; /* Post shift to be applied to weight after reciprocal calculation */
+ q31_t coef; /* Temporary variable for coef */
+ q31_t acc_l, acc_h; /* temporary input */
+ uint32_t uShift = ((uint32_t) S->postShift + 1u);
+ uint32_t lShift = 32u - uShift; /* Shift to be applied to the output */
+
+ energy = S->energy;
+ x0 = S->x0;
+
+ /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ while(blkCnt > 0u)
+ {
+
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = (pCoeffs);
+
+ /* Read the sample from input buffer */
+ in = *pSrc++;
+
+ /* Update the energy calculation */
+ energy = (q31_t) ((((q63_t) energy << 32) -
+ (((q63_t) x0 * x0) << 1)) >> 32);
+ energy = (q31_t) (((((q63_t) in * in) << 1) + (energy << 32)) >> 32);
+
+ /* Set the accumulator to zero */
+ acc = 0;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ acc += ((q63_t) (*px++)) * (*pb++);
+ acc += ((q63_t) (*px++)) * (*pb++);
+ acc += ((q63_t) (*px++)) * (*pb++);
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Converting the result to 1.31 format */
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = (q31_t) acc;
+
+ /* Compute and store error */
+ d = *pRef++;
+ e = d - (q31_t) acc;
+ *pErr++ = e;
+
+ /* Calculates the reciprocal of energy */
+ postShift = arm_recip_q31(energy + DELTA_Q31,
+ &oneByEnergy, &S->recipTable[0]);
+
+ /* Calculation of product of (e * mu) */
+ errorXmu = (q31_t) (((q63_t) e * mu) >> 31);
+
+ /* Weighting factor for the normalized version */
+ w = clip_q63_to_q31(((q63_t) errorXmu * oneByEnergy) >> (31 - postShift));
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = (pCoeffs);
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Update filter coefficients */
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+
+ /* coef is in 2.30 format */
+ coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
+ /* get coef in 1.31 format by left shifting */
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
+ /* update coefficient buffer to next coefficient */
+ pb++;
+
+ coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
+ pb++;
+
+ coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
+ pb++;
+
+ coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
+ pb++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
+ pb++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Read the sample from state buffer */
+ x0 = *pState;
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Save energy and x0 values for the next frame */
+ S->energy = (q31_t) energy;
+ S->x0 = x0;
+
+ /* Processing is complete. Now copy the last numTaps - 1 samples to the
+ satrt of the state buffer. This prepares the state buffer for the
+ next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* Loop unrolling for (numTaps - 1u) samples copy */
+ tapCnt = (numTaps - 1u) >> 2u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1u) % 0x4u;
+
+ /* Copy the remaining q31_t data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ while(blkCnt > 0u)
+ {
+
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize pCoeffs pointer */
+ pb = pCoeffs;
+
+ /* Read the sample from input buffer */
+ in = *pSrc++;
+
+ /* Update the energy calculation */
+ energy =
+ (q31_t) ((((q63_t) energy << 32) - (((q63_t) x0 * x0) << 1)) >> 32);
+ energy = (q31_t) (((((q63_t) in * in) << 1) + (energy << 32)) >> 32);
+
+ /* Set the accumulator to zero */
+ acc = 0;
+
+ /* Loop over numTaps number of values */
+ tapCnt = numTaps;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Converting the result to 1.31 format */
+ /* Converting the result to 1.31 format */
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+
+ //acc = (q31_t) (acc >> shift);
+
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = (q31_t) acc;
+
+ /* Compute and store error */
+ d = *pRef++;
+ e = d - (q31_t) acc;
+ *pErr++ = e;
+
+ /* Calculates the reciprocal of energy */
+ postShift =
+ arm_recip_q31(energy + DELTA_Q31, &oneByEnergy, &S->recipTable[0]);
+
+ /* Calculation of product of (e * mu) */
+ errorXmu = (q31_t) (((q63_t) e * mu) >> 31);
+
+ /* Weighting factor for the normalized version */
+ w = clip_q63_to_q31(((q63_t) errorXmu * oneByEnergy) >> (31 - postShift));
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize coeff pointer */
+ pb = (pCoeffs);
+
+ /* Loop over numTaps number of values */
+ tapCnt = numTaps;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ /* coef is in 2.30 format */
+ coef = (q31_t) (((q63_t) w * (*px++)) >> (32));
+ /* get coef in 1.31 format by left shifting */
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
+ /* update coefficient buffer to next coefficient */
+ pb++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Read the sample from state buffer */
+ x0 = *pState;
+
+ /* Advance state pointer by 1 for the next sample */
+ pState = pState + 1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Save energy and x0 values for the next frame */
+ S->energy = (q31_t) energy;
+ S->x0 = x0;
+
+ /* Processing is complete. Now copy the last numTaps - 1 samples to the
+ start of the state buffer. This prepares the state buffer for the
+ next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* Loop for (numTaps - 1u) samples copy */
+ tapCnt = (numTaps - 1u);
+
+ /* Copy the remaining q31_t data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of LMS_NORM group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_q15.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_q15.c
new file mode 100644
index 0000000..5b87cb1
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_q15.c
@@ -0,0 +1,373 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_lms_q15.c
+*
+* Description: Processing function for the Q15 LMS filter.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup LMS
+ * @{
+ */
+
+ /**
+ * @brief Processing function for Q15 LMS filter.
+ * @param[in] *S points to an instance of the Q15 LMS filter structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[in] *pRef points to the block of reference data.
+ * @param[out] *pOut points to the block of output data.
+ * @param[out] *pErr points to the block of error data.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ *
+ * \par Scaling and Overflow Behavior:
+ * The function is implemented using a 64-bit internal accumulator.
+ * Both coefficients and state variables are represented in 1.15 format and multiplications yield a 2.30 result.
+ * The 2.30 intermediate results are accumulated in a 64-bit accumulator in 34.30 format.
+ * There is no risk of internal overflow with this approach and the full precision of intermediate multiplications is preserved.
+ * After all additions have been performed, the accumulator is truncated to 34.15 format by discarding low 15 bits.
+ * Lastly, the accumulator is saturated to yield a result in 1.15 format.
+ *
+ * \par
+ * In this filter, filter coefficients are updated for each sample and the updation of filter cofficients are saturted.
+ *
+ */
+
+void arm_lms_q15(
+ const arm_lms_instance_q15 * S,
+ q15_t * pSrc,
+ q15_t * pRef,
+ q15_t * pOut,
+ q15_t * pErr,
+ uint32_t blockSize)
+{
+ q15_t *pState = S->pState; /* State pointer */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ q15_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q15_t *pStateCurnt; /* Points to the current sample of the state */
+ q15_t mu = S->mu; /* Adaptive factor */
+ q15_t *px; /* Temporary pointer for state */
+ q15_t *pb; /* Temporary pointer for coefficient buffer */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ q63_t acc; /* Accumulator */
+ q15_t e = 0; /* error of data sample */
+ q15_t alpha; /* Intermediate constant for taps update */
+ q31_t acc_l, acc_h;
+ int32_t lShift = (15 - (int32_t) S->postShift); /* Post shift */
+ int32_t uShift = (32 - lShift);
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t coef; /* Teporary variable for coefficient */
+
+ /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Initializing blkCnt with blockSize */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Set the accumulator to zero */
+ acc = 0;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2u;
+
+ while(tapCnt > 0u)
+ {
+ /* acc += b[N] * x[n-N] + b[N-1] * x[n-N-1] */
+ /* Perform the multiply-accumulate */
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc);
+ acc = __SMLALD(*__SIMD32(px)++, (*__SIMD32(pb)++), acc);
+
+#else
+
+ acc += (q63_t) (((q31_t) (*px++) * (*pb++)));
+ acc += (q63_t) (((q31_t) (*px++) * (*pb++)));
+ acc += (q63_t) (((q31_t) (*px++) * (*pb++)));
+ acc += (q63_t) (((q31_t) (*px++) * (*pb++)));
+
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ acc += (q63_t) (((q31_t) (*px++) * (*pb++)));
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Converting the result to 1.15 format and saturate the output */
+ acc = __SSAT(acc, 16);
+
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = (q15_t) acc;
+
+ /* Compute and store error */
+ e = *pRef++ - (q15_t) acc;
+
+ *pErr++ = (q15_t) e;
+
+ /* Compute alpha i.e. intermediate constant for taps update */
+ alpha = (q15_t) (((q31_t) e * (mu)) >> 15);
+
+ /* Initialize state pointer */
+ /* Advance state pointer by 1 for the next sample */
+ px = pState++;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2u;
+
+ /* Update filter coefficients */
+ while(tapCnt > 0u)
+ {
+ coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+ coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+ coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+ coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ coef = (q31_t) * pb + (((q31_t) alpha * (*px++)) >> 15);
+ *pb++ = (q15_t) __SSAT((coef), 16);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Decrement the loop counter */
+ blkCnt--;
+
+ }
+
+ /* Processing is complete. Now copy the last numTaps - 1 samples to the
+ satrt of the state buffer. This prepares the state buffer for the
+ next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* Calculation of count for copying integer writes */
+ tapCnt = (numTaps - 1u) >> 2;
+
+ while(tapCnt > 0u)
+ {
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
+ *__SIMD32(pStateCurnt)++ = *__SIMD32(pState)++;
+#else
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+#endif
+
+ tapCnt--;
+
+ }
+
+ /* Calculation of count for remaining q15_t data */
+ tapCnt = (numTaps - 1u) % 0x4u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize pCoeffs pointer */
+ pb = pCoeffs;
+
+ /* Set the accumulator to zero */
+ acc = 0;
+
+ /* Loop over numTaps number of values */
+ tapCnt = numTaps;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ acc += (q63_t) ((q31_t) (*px++) * (*pb++));
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ /* Apply shift for lower part of acc and upper part of acc */
+ acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Converting the result to 1.15 format and saturate the output */
+ acc = __SSAT(acc, 16);
+
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = (q15_t) acc;
+
+ /* Compute and store error */
+ e = *pRef++ - (q15_t) acc;
+
+ *pErr++ = (q15_t) e;
+
+ /* Compute alpha i.e. intermediate constant for taps update */
+ alpha = (q15_t) (((q31_t) e * (mu)) >> 15);
+
+ /* Initialize pState pointer */
+ /* Advance state pointer by 1 for the next sample */
+ px = pState++;
+
+ /* Initialize pCoeffs pointer */
+ pb = pCoeffs;
+
+ /* Loop over numTaps number of values */
+ tapCnt = numTaps;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ *pb++ += (q15_t) (((q31_t) alpha * (*px++)) >> 15);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Decrement the loop counter */
+ blkCnt--;
+
+ }
+
+ /* Processing is complete. Now copy the last numTaps - 1 samples to the
+ start of the state buffer. This prepares the state buffer for the
+ next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* Copy (numTaps - 1u) samples */
+ tapCnt = (numTaps - 1u);
+
+ /* Copy the data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of LMS group
+ */
diff --git a/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_q31.c b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_q31.c
new file mode 100644
index 0000000..8e90a6f
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/FilteringFunctions/arm_lms_q31.c
@@ -0,0 +1,363 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_lms_q31.c
+*
+* Description: Processing function for the Q31 LMS filter.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+/**
+ * @ingroup groupFilters
+ */
+
+/**
+ * @addtogroup LMS
+ * @{
+ */
+
+ /**
+ * @brief Processing function for Q31 LMS filter.
+ * @param[in] *S points to an instance of the Q15 LMS filter structure.
+ * @param[in] *pSrc points to the block of input data.
+ * @param[in] *pRef points to the block of reference data.
+ * @param[out] *pOut points to the block of output data.
+ * @param[out] *pErr points to the block of error data.
+ * @param[in] blockSize number of samples to process.
+ * @return none.
+ *
+ * \par Scaling and Overflow Behavior:
+ * The function is implemented using an internal 64-bit accumulator.
+ * The accumulator has a 2.62 format and maintains full precision of the intermediate
+ * multiplication results but provides only a single guard bit.
+ * Thus, if the accumulator result overflows it wraps around rather than clips.
+ * In order to avoid overflows completely the input signal must be scaled down by
+ * log2(numTaps) bits.
+ * The reference signal should not be scaled down.
+ * After all multiply-accumulates are performed, the 2.62 accumulator is shifted
+ * and saturated to 1.31 format to yield the final result.
+ * The output signal and error signal are in 1.31 format.
+ *
+ * \par
+ * In this filter, filter coefficients are updated for each sample and the updation of filter cofficients are saturted.
+ */
+
+void arm_lms_q31(
+ const arm_lms_instance_q31 * S,
+ q31_t * pSrc,
+ q31_t * pRef,
+ q31_t * pOut,
+ q31_t * pErr,
+ uint32_t blockSize)
+{
+ q31_t *pState = S->pState; /* State pointer */
+ uint32_t numTaps = S->numTaps; /* Number of filter coefficients in the filter */
+ q31_t *pCoeffs = S->pCoeffs; /* Coefficient pointer */
+ q31_t *pStateCurnt; /* Points to the current sample of the state */
+ q31_t mu = S->mu; /* Adaptive factor */
+ q31_t *px; /* Temporary pointer for state */
+ q31_t *pb; /* Temporary pointer for coefficient buffer */
+ uint32_t tapCnt, blkCnt; /* Loop counters */
+ q63_t acc; /* Accumulator */
+ q31_t e = 0; /* error of data sample */
+ q31_t alpha; /* Intermediate constant for taps update */
+ q31_t coef; /* Temporary variable for coef */
+ q31_t acc_l, acc_h; /* temporary input */
+ uint32_t uShift = ((uint32_t) S->postShift + 1u);
+ uint32_t lShift = 32u - uShift; /* Shift to be applied to the output */
+
+ /* S->pState points to buffer which contains previous frame (numTaps - 1) samples */
+ /* pStateCurnt points to the location where the new input data should be written */
+ pStateCurnt = &(S->pState[(numTaps - 1u)]);
+
+ /* Initializing blkCnt with blockSize */
+ blkCnt = blockSize;
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ while(blkCnt > 0u)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Initialize state pointer */
+ px = pState;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Set the accumulator to zero */
+ acc = 0;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ /* acc += b[N] * x[n-N] */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* acc += b[N-1] * x[n-N-1] */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* acc += b[N-2] * x[n-N-2] */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* acc += b[N-3] * x[n-N-3] */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Converting the result to 1.31 format */
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ /* Store the result from accumulator into the destination buffer. */
+ *pOut++ = (q31_t) acc;
+
+ /* Compute and store error */
+ e = *pRef++ - (q31_t) acc;
+
+ *pErr++ = (q31_t) e;
+
+ /* Compute alpha i.e. intermediate constant for taps update */
+ alpha = (q31_t) (((q63_t) e * mu) >> 31);
+
+ /* Initialize state pointer */
+ /* Advance state pointer by 1 for the next sample */
+ px = pState++;
+
+ /* Initialize coefficient pointer */
+ pb = pCoeffs;
+
+ /* Loop unrolling. Process 4 taps at a time. */
+ tapCnt = numTaps >> 2;
+
+ /* Update filter coefficients */
+ while(tapCnt > 0u)
+ {
+ /* coef is in 2.30 format */
+ coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32));
+ /* get coef in 1.31 format by left shifting */
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
+ /* update coefficient buffer to next coefficient */
+ pb++;
+
+ coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
+ pb++;
+
+ coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
+ pb++;
+
+ coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
+ pb++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* If the filter length is not a multiple of 4, compute the remaining filter taps */
+ tapCnt = numTaps % 0x4u;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32));
+ *pb = clip_q63_to_q31((q63_t) * pb + (coef << 1u));
+ pb++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete. Now copy the last numTaps - 1 samples to the
+ satrt of the state buffer. This prepares the state buffer for the
+ next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* Loop unrolling for (numTaps - 1u) samples copy */
+ tapCnt = (numTaps - 1u) >> 2u;
+
+ /* copy data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Calculate remaining number of copies */
+ tapCnt = (numTaps - 1u) % 0x4u;
+
+ /* Copy the remaining q31_t data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ while(blkCnt > 0u)
+ {
+ /* Copy the new input sample into the state buffer */
+ *pStateCurnt++ = *pSrc++;
+
+ /* Initialize pState pointer */
+ px = pState;
+
+ /* Initialize pCoeffs pointer */
+ pb = pCoeffs;
+
+ /* Set the accumulator to zero */
+ acc = 0;
+
+ /* Loop over numTaps number of values */
+ tapCnt = numTaps;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ acc += ((q63_t) (*px++)) * (*pb++);
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Converting the result to 1.31 format */
+ /* Store the result from accumulator into the destination buffer. */
+ /* Calc lower part of acc */
+ acc_l = acc & 0xffffffff;
+
+ /* Calc upper part of acc */
+ acc_h = (acc >> 32) & 0xffffffff;
+
+ acc = (uint32_t) acc_l >> lShift | acc_h << uShift;
+
+ *pOut++ = (q31_t) acc;
+
+ /* Compute and store error */
+ e = *pRef++ - (q31_t) acc;
+
+ *pErr++ = (q31_t) e;
+
+ /* Weighting factor for the LMS version */
+ alpha = (q31_t) (((q63_t) e * mu) >> 31);
+
+ /* Initialize pState pointer */
+ /* Advance state pointer by 1 for the next sample */
+ px = pState++;
+
+ /* Initialize pCoeffs pointer */
+ pb = pCoeffs;
+
+ /* Loop over numTaps number of values */
+ tapCnt = numTaps;
+
+ while(tapCnt > 0u)
+ {
+ /* Perform the multiply-accumulate */
+ coef = (q31_t) (((q63_t) alpha * (*px++)) >> (32));
+ *pb += (coef << 1u);
+ pb++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Processing is complete. Now copy the last numTaps - 1 samples to the
+ start of the state buffer. This prepares the state buffer for the
+ next function call. */
+
+ /* Points to the start of the pState buffer */
+ pStateCurnt = S->pState;
+
+ /* Copy (numTaps - 1u) samples */
+ tapCnt = (numTaps - 1u);
+
+ /* Copy the data */
+ while(tapCnt > 0u)
+ {
+ *pStateCurnt++ = *pState++;
+
+ /* Decrement the loop counter */
+ tapCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of LMS group
+ */
diff --git a/CMSIS/DSP_Lib/Source/G++/arm_cortexMx_math_Build.bat b/CMSIS/DSP_Lib/Source/G++/arm_cortexMx_math_Build.bat
new file mode 100644
index 0000000..6608a18
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/G++/arm_cortexMx_math_Build.bat
@@ -0,0 +1,17 @@
+
+SET TMP=C:\Temp
+SET TEMP=C:\Temp
+
+SET UVEXE=C:\Keil\UV4\UV4.EXE
+
+@echo Building DSP Library for Cortex-M0 Little Endian
+%UVEXE% -rb arm_cortexM0x_math.uvproj -t"DSP_Lib CM0 LE" -o"DSP_Lib CM0 LE.txt" -j0
+
+@echo Building DSP Library for Cortex-M3 Little Endian
+%UVEXE% -rb arm_cortexM3x_math.uvproj -t"DSP_Lib CM3 LE" -o"DSP_Lib CM3 LE.txt" -j0
+
+@echo Building DSP Library for Cortex-M4 Little Endian
+%UVEXE% -rb arm_cortexM4x_math.uvproj -t"DSP_Lib CM4 LE" -o"DSP_Lib CM4 LE.txt" -j0
+
+@echo Building DSP Library for Cortex-M4 with FPU Little Endian
+%UVEXE% -rb arm_cortexM4x_math.uvproj -t"DSP_Lib CM4 LE FPU" -o"DSP_Lib CM4 LE FPU.txt" -j0
diff --git a/CMSIS/DSP_Lib/Source/GCC/arm_cortexMx_math_Build.bat b/CMSIS/DSP_Lib/Source/GCC/arm_cortexMx_math_Build.bat
new file mode 100644
index 0000000..6608a18
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/GCC/arm_cortexMx_math_Build.bat
@@ -0,0 +1,17 @@
+
+SET TMP=C:\Temp
+SET TEMP=C:\Temp
+
+SET UVEXE=C:\Keil\UV4\UV4.EXE
+
+@echo Building DSP Library for Cortex-M0 Little Endian
+%UVEXE% -rb arm_cortexM0x_math.uvproj -t"DSP_Lib CM0 LE" -o"DSP_Lib CM0 LE.txt" -j0
+
+@echo Building DSP Library for Cortex-M3 Little Endian
+%UVEXE% -rb arm_cortexM3x_math.uvproj -t"DSP_Lib CM3 LE" -o"DSP_Lib CM3 LE.txt" -j0
+
+@echo Building DSP Library for Cortex-M4 Little Endian
+%UVEXE% -rb arm_cortexM4x_math.uvproj -t"DSP_Lib CM4 LE" -o"DSP_Lib CM4 LE.txt" -j0
+
+@echo Building DSP Library for Cortex-M4 with FPU Little Endian
+%UVEXE% -rb arm_cortexM4x_math.uvproj -t"DSP_Lib CM4 LE FPU" -o"DSP_Lib CM4 LE FPU.txt" -j0
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_add_f32.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_add_f32.c
new file mode 100644
index 0000000..b193e28
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_add_f32.c
@@ -0,0 +1,205 @@
+/* ----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_add_f32.c
+*
+* Description: Floating-point matrix addition
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @defgroup MatrixAdd Matrix Addition
+ *
+ * Adds two matrices.
+ * \image html MatrixAddition.gif "Addition of two 3 x 3 matrices"
+ *
+ * The functions check to make sure that
+ * pSrcA, pSrcB, and pDst have the same
+ * number of rows and columns.
+ */
+
+/**
+ * @addtogroup MatrixAdd
+ * @{
+ */
+
+
+/**
+ * @brief Floating-point matrix addition.
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ */
+
+arm_status arm_mat_add_f32(
+ const arm_matrix_instance_f32 * pSrcA,
+ const arm_matrix_instance_f32 * pSrcB,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
+ float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+
+#ifndef ARM_MATH_CM0
+
+ float32_t inA1, inA2, inB1, inB2, out1, out2; /* temporary variables */
+
+#endif // #ifndef ARM_MATH_CM0
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix addition */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif
+ {
+
+ /* Total number of samples in the input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#ifndef ARM_MATH_CM0
+
+ /* Loop unrolling */
+ blkCnt = numSamples >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+ /* Add and then store the results in the destination buffer. */
+ /* Read values from source A */
+ inA1 = pIn1[0];
+
+ /* Read values from source B */
+ inB1 = pIn2[0];
+
+ /* Read values from source A */
+ inA2 = pIn1[1];
+
+ /* out = sourceA + sourceB */
+ out1 = inA1 + inB1;
+
+ /* Read values from source B */
+ inB2 = pIn2[1];
+
+ /* Read values from source A */
+ inA1 = pIn1[2];
+
+ /* out = sourceA + sourceB */
+ out2 = inA2 + inB2;
+
+ /* Read values from source B */
+ inB1 = pIn2[2];
+
+ /* Store result in destination */
+ pOut[0] = out1;
+ pOut[1] = out2;
+
+ /* Read values from source A */
+ inA2 = pIn1[3];
+
+ /* Read values from source B */
+ inB2 = pIn2[3];
+
+ /* out = sourceA + sourceB */
+ out1 = inA1 + inB1;
+
+ /* out = sourceA + sourceB */
+ out2 = inA2 + inB2;
+
+ /* Store result in destination */
+ pOut[2] = out1;
+
+ /* Store result in destination */
+ pOut[3] = out2;
+
+
+ /* update pointers to process next sampels */
+ pIn1 += 4u;
+ pIn2 += 4u;
+ pOut += 4u;
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = numSamples % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+ /* Add and then store the results in the destination buffer. */
+ *pOut++ = (*pIn1++) + (*pIn2++);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixAdd group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_add_q15.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_add_q15.c
new file mode 100644
index 0000000..904273c
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_add_q15.c
@@ -0,0 +1,160 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_add_q15.c
+*
+* Description: Q15 matrix addition
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @addtogroup MatrixAdd
+ * @{
+ */
+
+/**
+ * @brief Q15 matrix addition.
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+ */
+
+arm_status arm_mat_add_q15(
+ const arm_matrix_instance_q15 * pSrcA,
+ const arm_matrix_instance_q15 * pSrcB,
+ arm_matrix_instance_q15 * pDst)
+{
+ q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint16_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix addition */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+
+ /* Check for matrix mismatch condition */
+ if((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Total number of samples in the input matrix */
+ numSamples = (uint16_t) (pSrcA->numRows * pSrcA->numCols);
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* Loop unrolling */
+ blkCnt = (uint32_t) numSamples >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+ /* Add, Saturate and then store the results in the destination buffer. */
+ *__SIMD32(pOut)++ = __QADD16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
+ *__SIMD32(pOut)++ = __QADD16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = (uint32_t) numSamples % 0x4u;
+
+ /* q15 pointers of input and output are initialized */
+
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+ /* Add, Saturate and then store the results in the destination buffer. */
+ *pOut++ = (q15_t) __QADD16(*pInA++, *pInB++);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = (uint32_t) numSamples;
+
+
+ /* q15 pointers of input and output are initialized */
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+ /* Add, Saturate and then store the results in the destination buffer. */
+ *pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ + *pInB++), 16);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixAdd group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_add_q31.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_add_q31.c
new file mode 100644
index 0000000..d4c1f4c
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_add_q31.c
@@ -0,0 +1,204 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_add_q31.c
+*
+* Description: Q31 matrix addition
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @addtogroup MatrixAdd
+ * @{
+ */
+
+/**
+ * @brief Q31 matrix addition.
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
+ */
+
+arm_status arm_mat_add_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ q31_t inA1, inB1; /* temporary variables */
+
+#ifndef ARM_MATH_CM0
+
+ q31_t inA2, inB2; /* temporary variables */
+ q31_t out1, out2; /* temporary variables */
+
+#endif // #ifndef ARM_MATH_CM0
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix addition */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif
+ {
+ /* Total number of samples in the input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* Loop Unrolling */
+ blkCnt = numSamples >> 2u;
+
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+ /* Add, saturate and then store the results in the destination buffer. */
+ /* Read values from source A */
+ inA1 = pIn1[0];
+
+ /* Read values from source B */
+ inB1 = pIn2[0];
+
+ /* Read values from source A */
+ inA2 = pIn1[1];
+
+ /* Add and saturate */
+ out1 = __QADD(inA1, inB1);
+
+ /* Read values from source B */
+ inB2 = pIn2[1];
+
+ /* Read values from source A */
+ inA1 = pIn1[2];
+
+ /* Add and saturate */
+ out2 = __QADD(inA2, inB2);
+
+ /* Read values from source B */
+ inB1 = pIn2[2];
+
+ /* Store result in destination */
+ pOut[0] = out1;
+ pOut[1] = out2;
+
+ /* Read values from source A */
+ inA2 = pIn1[3];
+
+ /* Read values from source B */
+ inB2 = pIn2[3];
+
+ /* Add and saturate */
+ out1 = __QADD(inA1, inB1);
+ out2 = __QADD(inA2, inB2);
+
+ /* Store result in destination */
+ pOut[2] = out1;
+ pOut[3] = out2;
+
+ /* update pointers to process next sampels */
+ pIn1 += 4u;
+ pIn2 += 4u;
+ pOut += 4u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = numSamples % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) + B(m,n) */
+ /* Add, saturate and then store the results in the destination buffer. */
+ inA1 = *pIn1++;
+ inB1 = *pIn2++;
+
+ inA1 = __QADD(inA1, inB1);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+
+ *pOut++ = inA1;
+
+ }
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixAdd group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_init_f32.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_init_f32.c
new file mode 100644
index 0000000..1e34ab7
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_init_f32.c
@@ -0,0 +1,85 @@
+/* ----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_init_f32.c
+*
+* Description: Floating-point matrix initialization.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @defgroup MatrixInit Matrix Initialization
+ *
+ * Initializes the underlying matrix data structure.
+ * The functions set the numRows,
+ * numCols, and pData fields
+ * of the matrix data structure.
+ */
+
+/**
+ * @addtogroup MatrixInit
+ * @{
+ */
+
+/**
+ * @brief Floating-point matrix initialization.
+ * @param[in,out] *S points to an instance of the floating-point matrix structure.
+ * @param[in] nRows number of rows in the matrix.
+ * @param[in] nColumns number of columns in the matrix.
+ * @param[in] *pData points to the matrix data array.
+ * @return none
+ */
+
+void arm_mat_init_f32(
+ arm_matrix_instance_f32 * S,
+ uint16_t nRows,
+ uint16_t nColumns,
+ float32_t * pData)
+{
+ /* Assign Number of Rows */
+ S->numRows = nRows;
+
+ /* Assign Number of Columns */
+ S->numCols = nColumns;
+
+ /* Assign Data pointer */
+ S->pData = pData;
+}
+
+/**
+ * @} end of MatrixInit group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_init_q15.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_init_q15.c
new file mode 100644
index 0000000..c6d893e
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_init_q15.c
@@ -0,0 +1,77 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_init_q15.c
+*
+* Description: Q15 matrix initialization.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------------- */
+
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @addtogroup MatrixInit
+ * @{
+ */
+
+ /**
+ * @brief Q15 matrix initialization.
+ * @param[in,out] *S points to an instance of the floating-point matrix structure.
+ * @param[in] nRows number of rows in the matrix.
+ * @param[in] nColumns number of columns in the matrix.
+ * @param[in] *pData points to the matrix data array.
+ * @return none
+ */
+
+void arm_mat_init_q15(
+ arm_matrix_instance_q15 * S,
+ uint16_t nRows,
+ uint16_t nColumns,
+ q15_t * pData)
+{
+ /* Assign Number of Rows */
+ S->numRows = nRows;
+
+ /* Assign Number of Columns */
+ S->numCols = nColumns;
+
+ /* Assign Data pointer */
+ S->pData = pData;
+}
+
+/**
+ * @} end of MatrixInit group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_init_q31.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_init_q31.c
new file mode 100644
index 0000000..9755aef
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_init_q31.c
@@ -0,0 +1,81 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_init_q31.c
+*
+* Description: Q31 matrix initialization.
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------------- */
+
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @defgroup MatrixInit Matrix Initialization
+ *
+ */
+
+/**
+ * @addtogroup MatrixInit
+ * @{
+ */
+
+ /**
+ * @brief Q31 matrix initialization.
+ * @param[in,out] *S points to an instance of the floating-point matrix structure.
+ * @param[in] nRows number of rows in the matrix.
+ * @param[in] nColumns number of columns in the matrix.
+ * @param[in] *pData points to the matrix data array.
+ * @return none
+ */
+
+void arm_mat_init_q31(
+ arm_matrix_instance_q31 * S,
+ uint16_t nRows,
+ uint16_t nColumns,
+ q31_t * pData)
+{
+ /* Assign Number of Rows */
+ S->numRows = nRows;
+
+ /* Assign Number of Columns */
+ S->numCols = nColumns;
+
+ /* Assign Data pointer */
+ S->pData = pData;
+}
+
+/**
+ * @} end of MatrixInit group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_inverse_f32.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_inverse_f32.c
new file mode 100644
index 0000000..f8d0f7b
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_inverse_f32.c
@@ -0,0 +1,667 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_inverse_f32.c
+*
+* Description: Floating-point matrix inverse.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @defgroup MatrixInv Matrix Inverse
+ *
+ * Computes the inverse of a matrix.
+ *
+ * The inverse is defined only if the input matrix is square and non-singular (the determinant
+ * is non-zero). The function checks that the input and output matrices are square and of the
+ * same size.
+ *
+ * Matrix inversion is numerically sensitive and the CMSIS DSP library only supports matrix
+ * inversion of floating-point matrices.
+ *
+ * \par Algorithm
+ * The Gauss-Jordan method is used to find the inverse.
+ * The algorithm performs a sequence of elementary row-operations till it
+ * reduces the input matrix to an identity matrix. Applying the same sequence
+ * of elementary row-operations to an identity matrix yields the inverse matrix.
+ * If the input matrix is singular, then the algorithm terminates and returns error status
+ * ARM_MATH_SINGULAR.
+ * \image html MatrixInverse.gif "Matrix Inverse of a 3 x 3 matrix using Gauss-Jordan Method"
+ */
+
+/**
+ * @addtogroup MatrixInv
+ * @{
+ */
+
+/**
+ * @brief Floating-point matrix inverse.
+ * @param[in] *pSrc points to input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns
+ * ARM_MATH_SIZE_MISMATCH if the input matrix is not square or if the size
+ * of the output matrix does not match the size of the input matrix.
+ * If the input matrix is found to be singular (non-invertible), then the function returns
+ * ARM_MATH_SINGULAR. Otherwise, the function returns ARM_MATH_SUCCESS.
+ */
+
+arm_status arm_mat_inverse_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ float32_t *pInT1, *pInT2; /* Temporary input data matrix pointer */
+ float32_t *pInT3, *pInT4; /* Temporary output data matrix pointer */
+ float32_t *pPivotRowIn, *pPRT_in, *pPivotRowDst, *pPRT_pDst; /* Temporary input and output data matrix pointer */
+ uint32_t numRows = pSrc->numRows; /* Number of rows in the matrix */
+ uint32_t numCols = pSrc->numCols; /* Number of Cols in the matrix */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ float32_t Xchg, in = 0.0f, in1; /* Temporary input values */
+ uint32_t i, rowCnt, flag = 0u, j, loopCnt, k, l; /* loop counters */
+ arm_status status; /* status of matrix inverse */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+
+ /* Check for matrix mismatch condition */
+ if((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
+ || (pSrc->numRows != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+
+ /*--------------------------------------------------------------------------------------------------------------
+ * Matrix Inverse can be solved using elementary row operations.
+ *
+ * Gauss-Jordan Method:
+ *
+ * 1. First combine the identity matrix and the input matrix separated by a bar to form an
+ * augmented matrix as follows:
+ * _ _ _ _
+ * | a11 a12 | 1 0 | | X11 X12 |
+ * | | | = | |
+ * |_ a21 a22 | 0 1 _| |_ X21 X21 _|
+ *
+ * 2. In our implementation, pDst Matrix is used as identity matrix.
+ *
+ * 3. Begin with the first row. Let i = 1.
+ *
+ * 4. Check to see if the pivot for row i is zero.
+ * The pivot is the element of the main diagonal that is on the current row.
+ * For instance, if working with row i, then the pivot element is aii.
+ * If the pivot is zero, exchange that row with a row below it that does not
+ * contain a zero in column i. If this is not possible, then an inverse
+ * to that matrix does not exist.
+ *
+ * 5. Divide every element of row i by the pivot.
+ *
+ * 6. For every row below and row i, replace that row with the sum of that row and
+ * a multiple of row i so that each new element in column i below row i is zero.
+ *
+ * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+ * for every element below and above the main diagonal.
+ *
+ * 8. Now an identical matrix is formed to the left of the bar(input matrix, pSrc).
+ * Therefore, the matrix to the right of the bar is our solution(pDst matrix, pDst).
+ *----------------------------------------------------------------------------------------------------------------*/
+
+ /* Working pointer for destination matrix */
+ pInT2 = pOut;
+
+ /* Loop over the number of rows */
+ rowCnt = numRows;
+
+ /* Making the destination matrix as identity matrix */
+ while(rowCnt > 0u)
+ {
+ /* Writing all zeroes in lower triangle of the destination matrix */
+ j = numRows - rowCnt;
+ while(j > 0u)
+ {
+ *pInT2++ = 0.0f;
+ j--;
+ }
+
+ /* Writing all ones in the diagonal of the destination matrix */
+ *pInT2++ = 1.0f;
+
+ /* Writing all zeroes in upper triangle of the destination matrix */
+ j = rowCnt - 1u;
+ while(j > 0u)
+ {
+ *pInT2++ = 0.0f;
+ j--;
+ }
+
+ /* Decrement the loop counter */
+ rowCnt--;
+ }
+
+ /* Loop over the number of columns of the input matrix.
+ All the elements in each column are processed by the row operations */
+ loopCnt = numCols;
+
+ /* Index modifier to navigate through the columns */
+ l = 0u;
+
+ while(loopCnt > 0u)
+ {
+ /* Check if the pivot element is zero..
+ * If it is zero then interchange the row with non zero row below.
+ * If there is no non zero element to replace in the rows below,
+ * then the matrix is Singular. */
+
+ /* Working pointer for the input matrix that points
+ * to the pivot element of the particular row */
+ pInT1 = pIn + (l * numCols);
+
+ /* Working pointer for the destination matrix that points
+ * to the pivot element of the particular row */
+ pInT3 = pOut + (l * numCols);
+
+ /* Temporary variable to hold the pivot value */
+ in = *pInT1;
+
+ /* Destination pointer modifier */
+ k = 1u;
+
+ /* Check if the pivot element is zero */
+ if(*pInT1 == 0.0f)
+ {
+ /* Loop over the number rows present below */
+ i = numRows - (l + 1u);
+
+ while(i > 0u)
+ {
+ /* Update the input and destination pointers */
+ pInT2 = pInT1 + (numCols * l);
+ pInT4 = pInT3 + (numCols * k);
+
+ /* Check if there is a non zero pivot element to
+ * replace in the rows below */
+ if(*pInT2 != 0.0f)
+ {
+ /* Loop over number of columns
+ * to the right of the pilot element */
+ j = numCols - l;
+
+ while(j > 0u)
+ {
+ /* Exchange the row elements of the input matrix */
+ Xchg = *pInT2;
+ *pInT2++ = *pInT1;
+ *pInT1++ = Xchg;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Loop over number of columns of the destination matrix */
+ j = numCols;
+
+ while(j > 0u)
+ {
+ /* Exchange the row elements of the destination matrix */
+ Xchg = *pInT4;
+ *pInT4++ = *pInT3;
+ *pInT3++ = Xchg;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Flag to indicate whether exchange is done or not */
+ flag = 1u;
+
+ /* Break after exchange is done */
+ break;
+ }
+
+ /* Update the destination pointer modifier */
+ k++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+ }
+
+ /* Update the status if the matrix is singular */
+ if((flag != 1u) && (in == 0.0f))
+ {
+ status = ARM_MATH_SINGULAR;
+
+ break;
+ }
+
+ /* Points to the pivot row of input and destination matrices */
+ pPivotRowIn = pIn + (l * numCols);
+ pPivotRowDst = pOut + (l * numCols);
+
+ /* Temporary pointers to the pivot row pointers */
+ pInT1 = pPivotRowIn;
+ pInT2 = pPivotRowDst;
+
+ /* Pivot element of the row */
+ in = *(pIn + (l * numCols));
+
+ /* Loop over number of columns
+ * to the right of the pilot element */
+ j = (numCols - l);
+
+ while(j > 0u)
+ {
+ /* Divide each element of the row of the input matrix
+ * by the pivot element */
+ in1 = *pInT1;
+ *pInT1++ = in1 / in;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Loop over number of columns of the destination matrix */
+ j = numCols;
+
+ while(j > 0u)
+ {
+ /* Divide each element of the row of the destination matrix
+ * by the pivot element */
+ in1 = *pInT2;
+ *pInT2++ = in1 / in;
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Replace the rows with the sum of that row and a multiple of row i
+ * so that each new element in column i above row i is zero.*/
+
+ /* Temporary pointers for input and destination matrices */
+ pInT1 = pIn;
+ pInT2 = pOut;
+
+ /* index used to check for pivot element */
+ i = 0u;
+
+ /* Loop over number of rows */
+ /* to be replaced by the sum of that row and a multiple of row i */
+ k = numRows;
+
+ while(k > 0u)
+ {
+ /* Check for the pivot element */
+ if(i == l)
+ {
+ /* If the processing element is the pivot element,
+ only the columns to the right are to be processed */
+ pInT1 += numCols - l;
+
+ pInT2 += numCols;
+ }
+ else
+ {
+ /* Element of the reference row */
+ in = *pInT1;
+
+ /* Working pointers for input and destination pivot rows */
+ pPRT_in = pPivotRowIn;
+ pPRT_pDst = pPivotRowDst;
+
+ /* Loop over the number of columns to the right of the pivot element,
+ to replace the elements in the input matrix */
+ j = (numCols - l);
+
+ while(j > 0u)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ in1 = *pInT1;
+ *pInT1++ = in1 - (in * *pPRT_in++);
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ /* Loop over the number of columns to
+ replace the elements in the destination matrix */
+ j = numCols;
+
+ while(j > 0u)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ in1 = *pInT2;
+ *pInT2++ = in1 - (in * *pPRT_pDst++);
+
+ /* Decrement the loop counter */
+ j--;
+ }
+
+ }
+
+ /* Increment the temporary input pointer */
+ pInT1 = pInT1 + l;
+
+ /* Decrement the loop counter */
+ k--;
+
+ /* Increment the pivot index */
+ i++;
+ }
+
+ /* Increment the input pointer */
+ pIn++;
+
+ /* Decrement the loop counter */
+ loopCnt--;
+
+ /* Increment the index modifier */
+ l++;
+ }
+
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ float32_t Xchg, in = 0.0f; /* Temporary input values */
+ uint32_t i, rowCnt, flag = 0u, j, loopCnt, k, l; /* loop counters */
+ arm_status status; /* status of matrix inverse */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if((pSrc->numRows != pSrc->numCols) || (pDst->numRows != pDst->numCols)
+ || (pSrc->numRows != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+
+ /*--------------------------------------------------------------------------------------------------------------
+ * Matrix Inverse can be solved using elementary row operations.
+ *
+ * Gauss-Jordan Method:
+ *
+ * 1. First combine the identity matrix and the input matrix separated by a bar to form an
+ * augmented matrix as follows:
+ * _ _ _ _ _ _ _ _
+ * | | a11 a12 | | | 1 0 | | | X11 X12 |
+ * | | | | | | | = | |
+ * |_ |_ a21 a22 _| | |_0 1 _| _| |_ X21 X21 _|
+ *
+ * 2. In our implementation, pDst Matrix is used as identity matrix.
+ *
+ * 3. Begin with the first row. Let i = 1.
+ *
+ * 4. Check to see if the pivot for row i is zero.
+ * The pivot is the element of the main diagonal that is on the current row.
+ * For instance, if working with row i, then the pivot element is aii.
+ * If the pivot is zero, exchange that row with a row below it that does not
+ * contain a zero in column i. If this is not possible, then an inverse
+ * to that matrix does not exist.
+ *
+ * 5. Divide every element of row i by the pivot.
+ *
+ * 6. For every row below and row i, replace that row with the sum of that row and
+ * a multiple of row i so that each new element in column i below row i is zero.
+ *
+ * 7. Move to the next row and column and repeat steps 2 through 5 until you have zeros
+ * for every element below and above the main diagonal.
+ *
+ * 8. Now an identical matrix is formed to the left of the bar(input matrix, src).
+ * Therefore, the matrix to the right of the bar is our solution(dst matrix, dst).
+ *----------------------------------------------------------------------------------------------------------------*/
+
+ /* Working pointer for destination matrix */
+ pInT2 = pOut;
+
+ /* Loop over the number of rows */
+ rowCnt = numRows;
+
+ /* Making the destination matrix as identity matrix */
+ while(rowCnt > 0u)
+ {
+ /* Writing all zeroes in lower triangle of the destination matrix */
+ j = numRows - rowCnt;
+ while(j > 0u)
+ {
+ *pInT2++ = 0.0f;
+ j--;
+ }
+
+ /* Writing all ones in the diagonal of the destination matrix */
+ *pInT2++ = 1.0f;
+
+ /* Writing all zeroes in upper triangle of the destination matrix */
+ j = rowCnt - 1u;
+ while(j > 0u)
+ {
+ *pInT2++ = 0.0f;
+ j--;
+ }
+
+ /* Decrement the loop counter */
+ rowCnt--;
+ }
+
+ /* Loop over the number of columns of the input matrix.
+ All the elements in each column are processed by the row operations */
+ loopCnt = numCols;
+
+ /* Index modifier to navigate through the columns */
+ l = 0u;
+ //for(loopCnt = 0u; loopCnt < numCols; loopCnt++)
+ while(loopCnt > 0u)
+ {
+ /* Check if the pivot element is zero..
+ * If it is zero then interchange the row with non zero row below.
+ * If there is no non zero element to replace in the rows below,
+ * then the matrix is Singular. */
+
+ /* Working pointer for the input matrix that points
+ * to the pivot element of the particular row */
+ pInT1 = pIn + (l * numCols);
+
+ /* Working pointer for the destination matrix that points
+ * to the pivot element of the particular row */
+ pInT3 = pOut + (l * numCols);
+
+ /* Temporary variable to hold the pivot value */
+ in = *pInT1;
+
+ /* Destination pointer modifier */
+ k = 1u;
+
+ /* Check if the pivot element is zero */
+ if(*pInT1 == 0.0f)
+ {
+ /* Loop over the number rows present below */
+ for (i = (l + 1u); i < numRows; i++)
+ {
+ /* Update the input and destination pointers */
+ pInT2 = pInT1 + (numCols * l);
+ pInT4 = pInT3 + (numCols * k);
+
+ /* Check if there is a non zero pivot element to
+ * replace in the rows below */
+ if(*pInT2 != 0.0f)
+ {
+ /* Loop over number of columns
+ * to the right of the pilot element */
+ for (j = 0u; j < (numCols - l); j++)
+ {
+ /* Exchange the row elements of the input matrix */
+ Xchg = *pInT2;
+ *pInT2++ = *pInT1;
+ *pInT1++ = Xchg;
+ }
+
+ for (j = 0u; j < numCols; j++)
+ {
+ Xchg = *pInT4;
+ *pInT4++ = *pInT3;
+ *pInT3++ = Xchg;
+ }
+
+ /* Flag to indicate whether exchange is done or not */
+ flag = 1u;
+
+ /* Break after exchange is done */
+ break;
+ }
+
+ /* Update the destination pointer modifier */
+ k++;
+ }
+ }
+
+ /* Update the status if the matrix is singular */
+ if((flag != 1u) && (in == 0.0f))
+ {
+ status = ARM_MATH_SINGULAR;
+
+ break;
+ }
+
+ /* Points to the pivot row of input and destination matrices */
+ pPivotRowIn = pIn + (l * numCols);
+ pPivotRowDst = pOut + (l * numCols);
+
+ /* Temporary pointers to the pivot row pointers */
+ pInT1 = pPivotRowIn;
+ pInT2 = pPivotRowDst;
+
+ /* Pivot element of the row */
+ in = *(pIn + (l * numCols));
+
+ /* Loop over number of columns
+ * to the right of the pilot element */
+ for (j = 0u; j < (numCols - l); j++)
+ {
+ /* Divide each element of the row of the input matrix
+ * by the pivot element */
+ *pInT1++ = *pInT1 / in;
+ }
+ for (j = 0u; j < numCols; j++)
+ {
+ /* Divide each element of the row of the destination matrix
+ * by the pivot element */
+ *pInT2++ = *pInT2 / in;
+ }
+
+ /* Replace the rows with the sum of that row and a multiple of row i
+ * so that each new element in column i above row i is zero.*/
+
+ /* Temporary pointers for input and destination matrices */
+ pInT1 = pIn;
+ pInT2 = pOut;
+
+ for (i = 0u; i < numRows; i++)
+ {
+ /* Check for the pivot element */
+ if(i == l)
+ {
+ /* If the processing element is the pivot element,
+ only the columns to the right are to be processed */
+ pInT1 += numCols - l;
+ pInT2 += numCols;
+ }
+ else
+ {
+ /* Element of the reference row */
+ in = *pInT1;
+
+ /* Working pointers for input and destination pivot rows */
+ pPRT_in = pPivotRowIn;
+ pPRT_pDst = pPivotRowDst;
+
+ /* Loop over the number of columns to the right of the pivot element,
+ to replace the elements in the input matrix */
+ for (j = 0u; j < (numCols - l); j++)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ *pInT1++ = *pInT1 - (in * *pPRT_in++);
+ }
+ /* Loop over the number of columns to
+ replace the elements in the destination matrix */
+ for (j = 0u; j < numCols; j++)
+ {
+ /* Replace the element by the sum of that row
+ and a multiple of the reference row */
+ *pInT2++ = *pInT2 - (in * *pPRT_pDst++);
+ }
+
+ }
+ /* Increment the temporary input pointer */
+ pInT1 = pInT1 + l;
+ }
+ /* Increment the input pointer */
+ pIn++;
+
+ /* Decrement the loop counter */
+ loopCnt--;
+ /* Increment the index modifier */
+ l++;
+ }
+
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+
+ if((flag != 1u) && (in == 0.0f))
+ {
+ status = ARM_MATH_SINGULAR;
+ }
+ }
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixInv group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_f32.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_f32.c
new file mode 100644
index 0000000..ea5ab89
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_f32.c
@@ -0,0 +1,283 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_mult_f32.c
+*
+* Description: Floating-point matrix multiplication.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @defgroup MatrixMult Matrix Multiplication
+ *
+ * Multiplies two matrices.
+ *
+ * \image html MatrixMultiplication.gif "Multiplication of two 3 x 3 matrices"
+
+ * Matrix multiplication is only defined if the number of columns of the
+ * first matrix equals the number of rows of the second matrix.
+ * Multiplying an M x N matrix with an N x P matrix results
+ * in an M x P matrix.
+ * When matrix size checking is enabled, the functions check: (1) that the inner dimensions of
+ * pSrcA and pSrcB are equal; and (2) that the size of the output
+ * matrix equals the outer dimensions of pSrcA and pSrcB.
+ */
+
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+/**
+ * @brief Floating-point matrix multiplication.
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ */
+
+arm_status arm_mat_mult_f32(
+ const arm_matrix_instance_f32 * pSrcA,
+ const arm_matrix_instance_f32 * pSrcB,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
+ float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
+ float32_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ float32_t *px; /* Temporary output data matrix pointer */
+ float32_t sum; /* Accumulator */
+ uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ float32_t in1, in2, in3, in4;
+ uint16_t col, i = 0u, j, row = numRowsA, colCnt; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+
+ /* Check for matrix mismatch condition */
+ if((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ {
+
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* Output pointer is set to starting address of the row being processed */
+ px = pOut + i;
+
+ /* For every row wise process, the column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, the pIn2 pointer is set
+ ** to the starting address of the pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ j = 0u;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum = 0.0f;
+
+ /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+ pIn1 = pInA;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ colCnt = numColsA >> 2u;
+
+ /* matrix multiplication */
+ while(colCnt > 0u)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ in3 = *pIn2;
+ pIn2 += numColsB;
+ in1 = pIn1[0];
+ in2 = pIn1[1];
+ sum += in1 * in3;
+ in4 = *pIn2;
+ pIn2 += numColsB;
+ sum += in2 * in4;
+
+ in3 = *pIn2;
+ pIn2 += numColsB;
+ in1 = pIn1[2];
+ in2 = pIn1[3];
+ sum += in1 * in3;
+ in4 = *pIn2;
+ pIn2 += numColsB;
+ sum += in2 * in4;
+ pIn1 += 4u;
+
+ /* Decrement the loop count */
+ colCnt--;
+ }
+
+ /* If the columns of pSrcA is not a multiple of 4, compute any remaining MACs here.
+ ** No loop unrolling is used. */
+ colCnt = numColsA % 0x4u;
+
+ while(colCnt > 0u)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ sum += *pIn1++ * (*pIn2);
+ pIn2 += numColsB;
+
+ /* Decrement the loop counter */
+ colCnt--;
+ }
+
+ /* Store the result in the destination buffer */
+ *px++ = sum;
+
+ /* Update the pointer pIn2 to point to the starting address of the next column */
+ j++;
+ pIn2 = pSrcB->pData + j;
+
+ /* Decrement the column loop counter */
+ col--;
+
+ } while(col > 0u);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ float32_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ uint16_t col, i = 0u, row = numRowsA, colCnt; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ {
+
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* The following loop performs the dot-product of each row in pInA with each column in pInB */
+ /* row loop */
+ do
+ {
+ /* Output pointer is set to starting address of the row being processed */
+ px = pOut + i;
+
+ /* For every row wise process, the column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, the pIn2 pointer is set
+ ** to the starting address of the pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum = 0.0f;
+
+ /* Initialize the pointer pIn1 to point to the starting address of the row being processed */
+ pIn1 = pInA;
+
+ /* Matrix A columns number of MAC operations are to be performed */
+ colCnt = numColsA;
+
+ while(colCnt > 0u)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ sum += *pIn1++ * (*pIn2);
+ pIn2 += numColsB;
+
+ /* Decrement the loop counter */
+ colCnt--;
+ }
+
+ /* Store the result in the destination buffer */
+ *px++ = sum;
+
+ /* Decrement the column loop counter */
+ col--;
+
+ /* Update the pointer pIn2 to point to the starting address of the next column */
+ pIn2 = pInB + (numColsB - col);
+
+ } while(col > 0u);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ /* Update the pointer pInA to point to the starting address of the next row */
+ i = i + numColsB;
+ pInA = pInA + numColsA;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ } while(row > 0u);
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixMult group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_fast_q15.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_fast_q15.c
new file mode 100644
index 0000000..a55f19c
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_fast_q15.c
@@ -0,0 +1,360 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_mult_fast_q15.c
+*
+* Description: Q15 matrix multiplication (fast variant)
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+
+/**
+ * @brief Q15 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @param[in] *pState points to the array for storing intermediate results
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The difference between the function arm_mat_mult_q15() and this fast variant is that
+ * the fast variant use a 32-bit rather than a 64-bit accumulator.
+ * The result of each 1.15 x 1.15 multiplication is truncated to
+ * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
+ * format. Finally, the accumulator is saturated and converted to a 1.15 result.
+ *
+ * \par
+ * The fast version has the same overflow behavior as the standard version but provides
+ * less precision since it discards the low 16 bits of each multiplication result.
+ * In order to avoid overflows completely the input signals must be scaled down.
+ * Scale down one of the input matrices by log2(numColsA) bits to
+ * avoid overflows, as a total of numColsA additions are computed internally for each
+ * output element.
+ *
+ * \par
+ * See arm_mat_mult_q15() for a slower implementation of this function
+ * which uses 64-bit accumulation to provide higher precision.
+ */
+
+arm_status arm_mat_mult_fast_q15(
+ const arm_matrix_instance_q15 * pSrcA,
+ const arm_matrix_instance_q15 * pSrcB,
+ arm_matrix_instance_q15 * pDst,
+ q15_t * pState)
+{
+ q31_t sum; /* accumulator */
+ q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */
+ q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
+ q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
+ q15_t *px; /* Temporary output data matrix pointer */
+ uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+ uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
+ uint16_t col, i = 0u, row = numRowsB, colCnt; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ q31_t in; /* Temporary variable to hold the input value */
+ q31_t inA1, inA2, inB1, inB2;
+
+#else
+
+ q15_t in; /* Temporary variable to hold the input value */
+ q15_t inA1, inA2, inB1, inB2;
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif
+ {
+ /* Matrix transpose */
+ do
+ {
+ /* Apply loop unrolling and exchange the columns with row elements */
+ col = numColsB >> 2;
+
+ /* The pointer px is set to starting address of the column being processed */
+ px = pSrcBT + i;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(col > 0u)
+ {
+#ifndef UNALIGNED_SUPPORT_DISABLE
+ /* Read two elements from the row */
+ in = *__SIMD32(pInB)++;
+
+ /* Unpack and store one element in the destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *px = (q15_t) in;
+
+#else
+
+ *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+ /* Unpack and store the second element in the destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+
+#else
+
+ *px = (q15_t) in;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+ /* Read two elements from the row */
+ in = *__SIMD32(pInB)++;
+
+ /* Unpack and store one element in the destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *px = (q15_t) in;
+
+#else
+
+ *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+ /* Unpack and store the second element in the destination */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+
+#else
+
+ *px = (q15_t) in;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+#else
+
+ /* Read one element from the row */
+ in = *pInB++;
+
+ /* Store one element in the destination */
+ *px = in;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+ /* Read one element from the row */
+ in = *pInB++;
+
+ /* Store one element in the destination */
+ *px = in;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+ /* Read one element from the row */
+ in = *pInB++;
+
+ /* Store one element in the destination */
+ *px = in;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+ /* Read one element from the row */
+ in = *pInB++;
+
+ /* Store one element in the destination */
+ *px = in;
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+ /* Decrement the column loop counter */
+ col--;
+ }
+
+ /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ col = numColsB % 0x4u;
+
+ while(col > 0u)
+ {
+ /* Read and store the input element in the destination */
+ *px = *pInB++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+ /* Decrement the column loop counter */
+ col--;
+ }
+
+ i++;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ } while(row > 0u);
+
+ /* Reset the variables for the usage in the following multiplication process */
+ row = numRowsA;
+ i = 0u;
+ px = pDst->pData;
+
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* For every row wise process, the column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, the pIn2 pointer is set
+ ** to the starting address of the transposed pSrcB data */
+ pInB = pSrcBT;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 2 MACs simultaneously. */
+ colCnt = numColsA >> 2;
+
+ /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+ pInA = pSrcA->pData + i;
+
+ /* matrix multiplication */
+ while(colCnt > 0u)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ inA1 = *__SIMD32(pInA)++;
+ inB1 = *__SIMD32(pInB)++;
+ inA2 = *__SIMD32(pInA)++;
+ inB2 = *__SIMD32(pInB)++;
+
+ sum = __SMLAD(inA1, inB1, sum);
+ sum = __SMLAD(inA2, inB2, sum);
+
+#else
+
+ inA1 = *pInA++;
+ inB1 = *pInB++;
+ inA2 = *pInA++;
+ sum += inA1 * inB1;
+ inB2 = *pInB++;
+
+ inA1 = *pInA++;
+ inB1 = *pInB++;
+ sum += inA2 * inB2;
+ inA2 = *pInA++;
+ inB2 = *pInB++;
+
+ sum += inA1 * inB1;
+ sum += inA2 * inB2;
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* Decrement the loop counter */
+ colCnt--;
+ }
+
+ /* process odd column samples */
+ colCnt = numColsA % 0x4u;
+
+ while(colCnt > 0u)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ sum += (q31_t) (*pInA++) * (*pInB++);
+
+ colCnt--;
+ }
+
+ /* Saturate and store the result in the destination buffer */
+ *px = (q15_t) (sum >> 15);
+ px++;
+
+ /* Decrement the column loop counter */
+ col--;
+
+ } while(col > 0u);
+
+ i = i + numColsA;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ } while(row > 0u);
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixMult group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_fast_q31.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_fast_q31.c
new file mode 100644
index 0000000..9f02738
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_fast_q31.c
@@ -0,0 +1,217 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_mult_fast_q31.c
+*
+* Description: Q31 matrix multiplication (fast variant).
+*
+* Target Processor: Cortex-M4/Cortex-M3
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+/**
+ * @brief Q31 matrix multiplication (fast variant) for Cortex-M3 and Cortex-M4
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The difference between the function arm_mat_mult_q31() and this fast variant is that
+ * the fast variant use a 32-bit rather than a 64-bit accumulator.
+ * The result of each 1.31 x 1.31 multiplication is truncated to
+ * 2.30 format. These intermediate results are accumulated in a 32-bit register in 2.30
+ * format. Finally, the accumulator is saturated and converted to a 1.31 result.
+ *
+ * \par
+ * The fast version has the same overflow behavior as the standard version but provides
+ * less precision since it discards the low 32 bits of each multiplication result.
+ * In order to avoid overflows completely the input signals must be scaled down.
+ * Scale down one of the input matrices by log2(numColsA) bits to
+ * avoid overflows, as a total of numColsA additions are computed internally for each
+ * output element.
+ *
+ * \par
+ * See arm_mat_mult_q31() for a slower implementation of this function
+ * which uses 64-bit accumulation to provide higher precision.
+ */
+
+arm_status arm_mat_mult_fast_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+// q31_t *pSrcB = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ q31_t *px; /* Temporary output data matrix pointer */
+ q31_t sum; /* Accumulator */
+ uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+ uint16_t col, i = 0u, j, row = numRowsA, colCnt; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+ q31_t inA1, inA2, inA3, inA4, inB1, inB2, inB3, inB4;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+
+ /* Check for matrix mismatch condition */
+ if((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* Output pointer is set to starting address of the row being processed */
+ px = pOut + i;
+
+ /* For every row wise process, the column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, the pIn2 pointer is set
+ ** to the starting address of the pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ j = 0u;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum = 0;
+
+ /* Initiate the pointer pIn1 to point to the starting address of pInA */
+ pIn1 = pInA;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ colCnt = numColsA >> 2;
+
+
+ /* matrix multiplication */
+ while(colCnt > 0u)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ /* Perform the multiply-accumulates */
+ inB1 = *pIn2;
+ pIn2 += numColsB;
+
+ inA1 = pIn1[0];
+ inA2 = pIn1[1];
+
+ inB2 = *pIn2;
+ pIn2 += numColsB;
+
+ inB3 = *pIn2;
+ pIn2 += numColsB;
+
+ sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA1 * inB1)) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA2 * inB2)) >> 32);
+
+ inA3 = pIn1[2];
+ inA4 = pIn1[3];
+
+ inB4 = *pIn2;
+ pIn2 += numColsB;
+
+ sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA3 * inB3)) >> 32);
+ sum = (q31_t) ((((q63_t) sum << 32) + ((q63_t) inA4 * inB4)) >> 32);
+
+ pIn1 += 4u;
+
+ /* Decrement the loop counter */
+ colCnt--;
+ }
+
+ /* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ colCnt = numColsA % 0x4u;
+
+ while(colCnt > 0u)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ /* Perform the multiply-accumulates */
+ sum = (q31_t) ((((q63_t) sum << 32) +
+ ((q63_t) * pIn1++ * (*pIn2))) >> 32);
+ pIn2 += numColsB;
+
+ /* Decrement the loop counter */
+ colCnt--;
+ }
+
+ /* Convert the result from 2.30 to 1.31 format and store in destination buffer */
+ *px++ = sum << 1;
+
+ /* Update the pointer pIn2 to point to the starting address of the next column */
+ j++;
+ pIn2 = pSrcB->pData + j;
+
+ /* Decrement the column loop counter */
+ col--;
+
+ } while(col > 0u);
+
+ /* Update the pointer pInA to point to the starting address of the next row */
+ i = i + numColsB;
+ pInA = pInA + numColsA;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ } while(row > 0u);
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixMult group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_q15.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_q15.c
new file mode 100644
index 0000000..ba9d66c
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_q15.c
@@ -0,0 +1,466 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_mult_q15.c
+*
+* Description: Q15 matrix multiplication.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+
+/**
+ * @brief Q15 matrix multiplication
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @param[in] *pState points to the array for storing intermediate results
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 64-bit internal accumulator. The inputs to the
+ * multiplications are in 1.15 format and multiplications yield a 2.30 result.
+ * The 2.30 intermediate
+ * results are accumulated in a 64-bit accumulator in 34.30 format. This approach
+ * provides 33 guard bits and there is no risk of overflow. The 34.30 result is then
+ * truncated to 34.15 format by discarding the low 15 bits and then saturated to
+ * 1.15 format.
+ *
+ * \par
+ * Refer to arm_mat_mult_fast_q15() for a faster but less precise version of this function for Cortex-M3 and Cortex-M4.
+ *
+ */
+
+arm_status arm_mat_mult_q15(
+ const arm_matrix_instance_q15 * pSrcA,
+ const arm_matrix_instance_q15 * pSrcB,
+ arm_matrix_instance_q15 * pDst,
+ q15_t * pState)
+{
+ q63_t sum; /* accumulator */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q15_t *pSrcBT = pState; /* input data matrix pointer for transpose */
+ q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
+ q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
+ q15_t *px; /* Temporary output data matrix pointer */
+ uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+ uint16_t numRowsB = pSrcB->numRows; /* number of rows of input matrix A */
+ uint16_t col, i = 0u, row = numRowsB, colCnt; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ q31_t in; /* Temporary variable to hold the input value */
+ q31_t pSourceA1, pSourceB1, pSourceA2, pSourceB2;
+
+#else
+
+ q15_t in; /* Temporary variable to hold the input value */
+ q15_t inA1, inB1, inA2, inB2;
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+ /* Matrix transpose */
+ do
+ {
+ /* Apply loop unrolling and exchange the columns with row elements */
+ col = numColsB >> 2;
+
+ /* The pointer px is set to starting address of the column being processed */
+ px = pSrcBT + i;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(col > 0u)
+ {
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Read two elements from the row */
+ in = *__SIMD32(pInB)++;
+
+ /* Unpack and store one element in the destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *px = (q15_t) in;
+
+#else
+
+ *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+ /* Unpack and store the second element in the destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+
+#else
+
+ *px = (q15_t) in;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+ /* Read two elements from the row */
+ in = *__SIMD32(pInB)++;
+
+ /* Unpack and store one element in the destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *px = (q15_t) in;
+
+#else
+
+ *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+ /* Unpack and store the second element in the destination */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *px = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+
+#else
+
+ *px = (q15_t) in;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+#else
+
+ /* Read one element from the row */
+ in = *pInB++;
+
+ /* Store one element in the destination */
+ *px = in;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+ /* Read one element from the row */
+ in = *pInB++;
+
+ /* Store one element in the destination */
+ *px = in;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+ /* Read one element from the row */
+ in = *pInB++;
+
+ /* Store one element in the destination */
+ *px = in;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+ /* Read one element from the row */
+ in = *pInB++;
+
+ /* Store one element in the destination */
+ *px = in;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* Decrement the column loop counter */
+ col--;
+ }
+
+ /* If the columns of pSrcB is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ col = numColsB % 0x4u;
+
+ while(col > 0u)
+ {
+ /* Read and store the input element in the destination */
+ *px = *pInB++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += numRowsB;
+
+ /* Decrement the column loop counter */
+ col--;
+ }
+
+ i++;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ } while(row > 0u);
+
+ /* Reset the variables for the usage in the following multiplication process */
+ row = numRowsA;
+ i = 0u;
+ px = pDst->pData;
+
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* For every row wise process, the column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, the pIn2 pointer is set
+ ** to the starting address of the transposed pSrcB data */
+ pInB = pSrcBT;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum = 0;
+
+ /* Apply loop unrolling and compute 2 MACs simultaneously. */
+ colCnt = numColsA >> 2;
+
+ /* Initiate the pointer pIn1 to point to the starting address of the column being processed */
+ pInA = pSrcA->pData + i;
+
+
+ /* matrix multiplication */
+ while(colCnt > 0u)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* read real and imag values from pSrcA and pSrcB buffer */
+ pSourceA1 = *__SIMD32(pInA)++;
+ pSourceB1 = *__SIMD32(pInB)++;
+
+ pSourceA2 = *__SIMD32(pInA)++;
+ pSourceB2 = *__SIMD32(pInB)++;
+
+ /* Multiply and Accumlates */
+ sum = __SMLALD(pSourceA1, pSourceB1, sum);
+ sum = __SMLALD(pSourceA2, pSourceB2, sum);
+
+#else
+ /* read real and imag values from pSrcA and pSrcB buffer */
+ inA1 = *pInA++;
+ inB1 = *pInB++;
+ inA2 = *pInA++;
+ /* Multiply and Accumlates */
+ sum += inA1 * inB1;
+ inB2 = *pInB++;
+
+ inA1 = *pInA++;
+ inB1 = *pInB++;
+ /* Multiply and Accumlates */
+ sum += inA2 * inB2;
+ inA2 = *pInA++;
+ inB2 = *pInB++;
+
+ /* Multiply and Accumlates */
+ sum += inA1 * inB1;
+ sum += inA2 * inB2;
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* Decrement the loop counter */
+ colCnt--;
+ }
+
+ /* process remaining column samples */
+ colCnt = numColsA & 3u;
+
+ while(colCnt > 0u)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ sum += *pInA++ * *pInB++;
+
+ /* Decrement the loop counter */
+ colCnt--;
+ }
+
+ /* Saturate and store the result in the destination buffer */
+ *px = (q15_t) (__SSAT((sum >> 15), 16));
+ px++;
+
+ /* Decrement the column loop counter */
+ col--;
+
+ } while(col > 0u);
+
+ i = i + numColsA;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ } while(row > 0u);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ q15_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
+ q15_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
+ q15_t *pInA = pSrcA->pData; /* input data matrix pointer A of Q15 type */
+ q15_t *pInB = pSrcB->pData; /* input data matrix pointer B of Q15 type */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+ q15_t *px; /* Temporary output data matrix pointer */
+ uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+ uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ uint16_t col, i = 0u, row = numRowsA, colCnt; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* Output pointer is set to starting address of the row being processed */
+ px = pOut + i;
+
+ /* For every row wise process, the column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, the pIn2 pointer is set
+ ** to the starting address of the pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum = 0;
+
+ /* Initiate the pointer pIn1 to point to the starting address of pSrcA */
+ pIn1 = pInA;
+
+ /* Matrix A columns number of MAC operations are to be performed */
+ colCnt = numColsA;
+
+ /* matrix multiplication */
+ while(colCnt > 0u)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ /* Perform the multiply-accumulates */
+ sum += (q31_t) * pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement the loop counter */
+ colCnt--;
+ }
+
+ /* Convert the result from 34.30 to 1.15 format and store the saturated value in destination buffer */
+ /* Saturate and store the result in the destination buffer */
+ *px++ = (q15_t) __SSAT((sum >> 15), 16);
+
+ /* Decrement the column loop counter */
+ col--;
+
+ /* Update the pointer pIn2 to point to the starting address of the next column */
+ pIn2 = pInB + (numColsB - col);
+
+ } while(col > 0u);
+
+ /* Update the pointer pSrcA to point to the starting address of the next row */
+ i = i + numColsB;
+ pInA = pInA + numColsA;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ } while(row > 0u);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixMult group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_q31.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_q31.c
new file mode 100644
index 0000000..315cf9a
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_mult_q31.c
@@ -0,0 +1,291 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_mult_q31.c
+*
+* Description: Q31 matrix multiplication.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @addtogroup MatrixMult
+ * @{
+ */
+
+/**
+ * @brief Q31 matrix multiplication
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using an internal 64-bit accumulator.
+ * The accumulator has a 2.62 format and maintains full precision of the intermediate
+ * multiplication results but provides only a single guard bit. There is no saturation
+ * on intermediate additions. Thus, if the accumulator overflows it wraps around and
+ * distorts the result. The input signals should be scaled down to avoid intermediate
+ * overflows. The input is thus scaled down by log2(numColsA) bits
+ * to avoid overflows, as a total of numColsA additions are performed internally.
+ * The 2.62 accumulator is right shifted by 31 bits and saturated to 1.31 format to yield the final result.
+ *
+ * \par
+ * See arm_mat_mult_fast_q31() for a faster but less precise implementation of this function for Cortex-M3 and Cortex-M4.
+ *
+ */
+
+arm_status arm_mat_mult_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ q31_t *px; /* Temporary output data matrix pointer */
+ q63_t sum; /* Accumulator */
+ uint16_t numRowsA = pSrcA->numRows; /* number of rows of input matrix A */
+ uint16_t numColsB = pSrcB->numCols; /* number of columns of input matrix B */
+ uint16_t numColsA = pSrcA->numCols; /* number of columns of input matrix A */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ uint16_t col, i = 0u, j, row = numRowsA, colCnt; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+ q31_t a0, a1, a2, a3, b0, b1, b2, b3;
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+
+ /* Check for matrix mismatch condition */
+ if((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* Output pointer is set to starting address of the row being processed */
+ px = pOut + i;
+
+ /* For every row wise process, the column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, the pIn2 pointer is set
+ ** to the starting address of the pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ j = 0u;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum = 0;
+
+ /* Initiate the pointer pIn1 to point to the starting address of pInA */
+ pIn1 = pInA;
+
+ /* Apply loop unrolling and compute 4 MACs simultaneously. */
+ colCnt = numColsA >> 2;
+
+
+ /* matrix multiplication */
+ while(colCnt > 0u)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ /* Perform the multiply-accumulates */
+ b0 = *pIn2;
+ pIn2 += numColsB;
+
+ a0 = *pIn1++;
+ a1 = *pIn1++;
+
+ b1 = *pIn2;
+ pIn2 += numColsB;
+ b2 = *pIn2;
+ pIn2 += numColsB;
+
+ sum += (q63_t) a0 *b0;
+ sum += (q63_t) a1 *b1;
+
+ a2 = *pIn1++;
+ a3 = *pIn1++;
+
+ b3 = *pIn2;
+ pIn2 += numColsB;
+
+ sum += (q63_t) a2 *b2;
+ sum += (q63_t) a3 *b3;
+
+ /* Decrement the loop counter */
+ colCnt--;
+ }
+
+ /* If the columns of pSrcA is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ colCnt = numColsA % 0x4u;
+
+ while(colCnt > 0u)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) * pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement the loop counter */
+ colCnt--;
+ }
+
+ /* Convert the result from 2.62 to 1.31 format and store in destination buffer */
+ *px++ = (q31_t) (sum >> 31);
+
+ /* Update the pointer pIn2 to point to the starting address of the next column */
+ j++;
+ pIn2 = (pSrcB->pData) + j;
+
+ /* Decrement the column loop counter */
+ col--;
+
+ } while(col > 0u);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ q31_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ uint16_t col, i = 0u, row = numRowsA, colCnt; /* loop counters */
+ arm_status status; /* status of matrix multiplication */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if((pSrcA->numCols != pSrcB->numRows) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcB->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* The following loop performs the dot-product of each row in pSrcA with each column in pSrcB */
+ /* row loop */
+ do
+ {
+ /* Output pointer is set to starting address of the row being processed */
+ px = pOut + i;
+
+ /* For every row wise process, the column loop counter is to be initiated */
+ col = numColsB;
+
+ /* For every row wise process, the pIn2 pointer is set
+ ** to the starting address of the pSrcB data */
+ pIn2 = pSrcB->pData;
+
+ /* column loop */
+ do
+ {
+ /* Set the variable sum, that acts as accumulator, to zero */
+ sum = 0;
+
+ /* Initiate the pointer pIn1 to point to the starting address of pInA */
+ pIn1 = pInA;
+
+ /* Matrix A columns number of MAC operations are to be performed */
+ colCnt = numColsA;
+
+ /* matrix multiplication */
+ while(colCnt > 0u)
+ {
+ /* c(m,n) = a(1,1)*b(1,1) + a(1,2) * b(2,1) + .... + a(m,p)*b(p,n) */
+ /* Perform the multiply-accumulates */
+ sum += (q63_t) * pIn1++ * *pIn2;
+ pIn2 += numColsB;
+
+ /* Decrement the loop counter */
+ colCnt--;
+ }
+
+ /* Convert the result from 2.62 to 1.31 format and store in destination buffer */
+ *px++ = (q31_t) (sum >> 31);
+
+ /* Decrement the column loop counter */
+ col--;
+
+ /* Update the pointer pIn2 to point to the starting address of the next column */
+ pIn2 = pInB + (numColsB - col);
+
+ } while(col > 0u);
+
+#endif
+
+ /* Update the pointer pInA to point to the starting address of the next row */
+ i = i + numColsB;
+ pInA = pInA + numColsA;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ } while(row > 0u);
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixMult group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_scale_f32.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_scale_f32.c
new file mode 100644
index 0000000..80de6b2
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_scale_f32.c
@@ -0,0 +1,178 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_scale_f32.c
+*
+* Description: Multiplies a floating-point matrix by a scalar.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @defgroup MatrixScale Matrix Scale
+ *
+ * Multiplies a matrix by a scalar. This is accomplished by multiplying each element in the
+ * matrix by the scalar. For example:
+ * \image html MatrixScale.gif "Matrix Scaling of a 3 x 3 matrix"
+ *
+ * The function checks to make sure that the input and output matrices are of the same size.
+ *
+ * In the fixed-point Q15 and Q31 functions, scale is represented by
+ * a fractional multiplication scaleFract and an arithmetic shift shift.
+ * The shift allows the gain of the scaling operation to exceed 1.0.
+ * The overall scale factor applied to the fixed-point data is
+ *
+ * scale = scaleFract * 2^shift.
+ *
+ */
+
+/**
+ * @addtogroup MatrixScale
+ * @{
+ */
+
+/**
+ * @brief Floating-point matrix scaling.
+ * @param[in] *pSrc points to input matrix structure
+ * @param[in] scale scale factor to be applied
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either ARM_MATH_SIZE_MISMATCH
+ * or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ */
+
+arm_status arm_mat_scale_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ float32_t scale,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix scaling */
+
+#ifndef ARM_MATH_CM0
+
+ float32_t in1, in2, in3, in4; /* temporary variables */
+ float32_t out1, out2, out3, out4; /* temporary variables */
+
+#endif // #ifndef ARM_MATH_CM0
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+ /* Total number of samples in the input matrix */
+ numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* Loop Unrolling */
+ blkCnt = numSamples >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) * scale */
+ /* Scaling and results are stored in the destination buffer. */
+ in1 = pIn[0];
+ in2 = pIn[1];
+ in3 = pIn[2];
+ in4 = pIn[3];
+
+ out1 = in1 * scale;
+ out2 = in2 * scale;
+ out3 = in3 * scale;
+ out4 = in4 * scale;
+
+
+ pOut[0] = out1;
+ pOut[1] = out2;
+ pOut[2] = out3;
+ pOut[3] = out4;
+
+ /* update pointers to process next sampels */
+ pIn += 4u;
+ pOut += 4u;
+
+ /* Decrement the numSamples loop counter */
+ blkCnt--;
+ }
+
+ /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = numSamples % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) * scale */
+ /* The results are stored in the destination buffer. */
+ *pOut++ = (*pIn++) * scale;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixScale group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_scale_q15.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_scale_q15.c
new file mode 100644
index 0000000..0978112
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_scale_q15.c
@@ -0,0 +1,180 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_scale_q15.c
+*
+* Description: Multiplies a Q15 matrix by a scalar.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @addtogroup MatrixScale
+ * @{
+ */
+
+/**
+ * @brief Q15 matrix scaling.
+ * @param[in] *pSrc points to input matrix
+ * @param[in] scaleFract fractional portion of the scale factor
+ * @param[in] shift number of bits to shift the result by
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ * \par
+ * The input data *pSrc and scaleFract are in 1.15 format.
+ * These are multiplied to yield a 2.30 intermediate result and this is shifted with saturation to 1.15 format.
+ */
+
+arm_status arm_mat_scale_q15(
+ const arm_matrix_instance_q15 * pSrc,
+ q15_t scaleFract,
+ int32_t shift,
+ arm_matrix_instance_q15 * pDst)
+{
+ q15_t *pIn = pSrc->pData; /* input data matrix pointer */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint32_t numSamples; /* total number of elements in the matrix */
+ int32_t totShift = 15 - shift; /* total shift to apply after scaling */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix scaling */
+
+#ifndef ARM_MATH_CM0
+
+ q15_t in1, in2, in3, in4;
+ q31_t out1, out2, out3, out4;
+ q31_t inA1, inA2;
+
+#endif // #ifndef ARM_MATH_CM0
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch */
+ if((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif // #ifdef ARM_MATH_MATRIX_CHECK
+ {
+ /* Total number of samples in the input matrix */
+ numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ /* Loop Unrolling */
+ blkCnt = numSamples >> 2;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) * k */
+ /* Scale, saturate and then store the results in the destination buffer. */
+ /* Reading 2 inputs from memory */
+ inA1 = _SIMD32_OFFSET(pIn);
+ inA2 = _SIMD32_OFFSET(pIn + 2);
+
+ /* C = A * scale */
+ /* Scale the inputs and then store the 2 results in the destination buffer
+ * in single cycle by packing the outputs */
+ out1 = (q31_t) ((q15_t) (inA1 >> 16) * scaleFract);
+ out2 = (q31_t) ((q15_t) inA1 * scaleFract);
+ out3 = (q31_t) ((q15_t) (inA2 >> 16) * scaleFract);
+ out4 = (q31_t) ((q15_t) inA2 * scaleFract);
+
+ out1 = out1 >> totShift;
+ inA1 = _SIMD32_OFFSET(pIn + 4);
+ out2 = out2 >> totShift;
+ inA2 = _SIMD32_OFFSET(pIn + 6);
+ out3 = out3 >> totShift;
+ out4 = out4 >> totShift;
+
+ in1 = (q15_t) (__SSAT(out1, 16));
+ in2 = (q15_t) (__SSAT(out2, 16));
+ in3 = (q15_t) (__SSAT(out3, 16));
+ in4 = (q15_t) (__SSAT(out4, 16));
+
+ _SIMD32_OFFSET(pOut) = __PKHBT(in2, in1, 16);
+ _SIMD32_OFFSET(pOut + 2) = __PKHBT(in4, in3, 16);
+
+ /* update pointers to process next sampels */
+ pIn += 4u;
+ pOut += 4u;
+
+
+ /* Decrement the numSamples loop counter */
+ blkCnt--;
+ }
+
+ /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = numSamples % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) * k */
+ /* Scale, saturate and then store the results in the destination buffer. */
+ *pOut++ =
+ (q15_t) (__SSAT(((q31_t) (*pIn++) * scaleFract) >> totShift, 16));
+
+ /* Decrement the numSamples loop counter */
+ blkCnt--;
+ }
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixScale group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_scale_q31.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_scale_q31.c
new file mode 100644
index 0000000..970be0e
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_scale_q31.c
@@ -0,0 +1,200 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_scale_q31.c
+*
+* Description: Multiplies a Q31 matrix by a scalar.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @addtogroup MatrixScale
+ * @{
+ */
+
+/**
+ * @brief Q31 matrix scaling.
+ * @param[in] *pSrc points to input matrix
+ * @param[in] scaleFract fractional portion of the scale factor
+ * @param[in] shift number of bits to shift the result by
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ * \par
+ * The input data *pSrc and scaleFract are in 1.31 format.
+ * These are multiplied to yield a 2.62 intermediate result and this is shifted with saturation to 1.31 format.
+ */
+
+arm_status arm_mat_scale_q31(
+ const arm_matrix_instance_q31 * pSrc,
+ q31_t scaleFract,
+ int32_t shift,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pIn = pSrc->pData; /* input data matrix pointer */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint32_t numSamples; /* total number of elements in the matrix */
+ int32_t totShift = shift + 1; /* shift to apply after scaling */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix scaling */
+ q31_t in1, in2, out1; /* temporary variabels */
+
+#ifndef ARM_MATH_CM0
+
+ q31_t in3, in4, out2, out3, out4; /* temporary variables */
+
+#endif // #ifndef ARM_MAT_CM0
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch */
+ if((pSrc->numRows != pDst->numRows) || (pSrc->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif // #ifdef ARM_MATH_MATRIX_CHECK
+ {
+ /* Total number of samples in the input matrix */
+ numSamples = (uint32_t) pSrc->numRows * pSrc->numCols;
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* Loop Unrolling */
+ blkCnt = numSamples >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) * k */
+ /* Read values from input */
+ in1 = *pIn;
+ in2 = *(pIn + 1);
+ in3 = *(pIn + 2);
+ in4 = *(pIn + 3);
+
+ /* multiply input with scaler value */
+ in1 = ((q63_t) in1 * scaleFract) >> 32;
+ in2 = ((q63_t) in2 * scaleFract) >> 32;
+ in3 = ((q63_t) in3 * scaleFract) >> 32;
+ in4 = ((q63_t) in4 * scaleFract) >> 32;
+
+ /* apply shifting */
+ out1 = in1 << totShift;
+ out2 = in2 << totShift;
+
+ /* saturate the results. */
+ if(in1 != (out1 >> totShift))
+ out1 = 0x7FFFFFFF ^ (in1 >> 31);
+
+ if(in2 != (out2 >> totShift))
+ out2 = 0x7FFFFFFF ^ (in2 >> 31);
+
+ out3 = in3 << totShift;
+ out4 = in4 << totShift;
+
+ *pOut = out1;
+ *(pOut + 1) = out2;
+
+ if(in3 != (out3 >> totShift))
+ out3 = 0x7FFFFFFF ^ (in3 >> 31);
+
+ if(in4 != (out4 >> totShift))
+ out4 = 0x7FFFFFFF ^ (in4 >> 31);
+
+
+ *(pOut + 2) = out3;
+ *(pOut + 3) = out4;
+
+ /* update pointers to process next sampels */
+ pIn += 4u;
+ pOut += 4u;
+
+
+ /* Decrement the numSamples loop counter */
+ blkCnt--;
+ }
+
+ /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = numSamples % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) * k */
+ /* Scale, saturate and then store the results in the destination buffer. */
+ in1 = *pIn++;
+
+ in2 = ((q63_t) in1 * scaleFract) >> 32;
+
+ out1 = in2 << totShift;
+
+ if(in2 != (out1 >> totShift))
+ out1 = 0x7FFFFFFF ^ (in2 >> 31);
+
+ *pOut++ = out1;
+
+ /* Decrement the numSamples loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixScale group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_sub_f32.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_sub_f32.c
new file mode 100644
index 0000000..7b27753
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_sub_f32.c
@@ -0,0 +1,206 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_sub_f32.c
+*
+* Description: Floating-point matrix subtraction.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @defgroup MatrixSub Matrix Subtraction
+ *
+ * Subtract two matrices.
+ * \image html MatrixSubtraction.gif "Subraction of two 3 x 3 matrices"
+ *
+ * The functions check to make sure that
+ * pSrcA, pSrcB, and pDst have the same
+ * number of rows and columns.
+ */
+
+/**
+ * @addtogroup MatrixSub
+ * @{
+ */
+
+/**
+ * @brief Floating-point matrix subtraction
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ */
+
+arm_status arm_mat_sub_f32(
+ const arm_matrix_instance_f32 * pSrcA,
+ const arm_matrix_instance_f32 * pSrcB,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
+ float32_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+
+#ifndef ARM_MATH_CM0
+
+ float32_t inA1, inA2, inB1, inB2, out1, out2; /* temporary variables */
+
+#endif // #ifndef ARM_MATH_CM0
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix subtraction */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+ {
+ /* Total number of samples in the input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* Loop Unrolling */
+ blkCnt = numSamples >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+ /* Subtract and then store the results in the destination buffer. */
+ /* Read values from source A */
+ inA1 = pIn1[0];
+
+ /* Read values from source B */
+ inB1 = pIn2[0];
+
+ /* Read values from source A */
+ inA2 = pIn1[1];
+
+ /* out = sourceA - sourceB */
+ out1 = inA1 - inB1;
+
+ /* Read values from source B */
+ inB2 = pIn2[1];
+
+ /* Read values from source A */
+ inA1 = pIn1[2];
+
+ /* out = sourceA - sourceB */
+ out2 = inA2 - inB2;
+
+ /* Read values from source B */
+ inB1 = pIn2[2];
+
+ /* Store result in destination */
+ pOut[0] = out1;
+ pOut[1] = out2;
+
+ /* Read values from source A */
+ inA2 = pIn1[3];
+
+ /* Read values from source B */
+ inB2 = pIn2[3];
+
+ /* out = sourceA - sourceB */
+ out1 = inA1 - inB1;
+
+
+ /* out = sourceA - sourceB */
+ out2 = inA2 - inB2;
+
+ /* Store result in destination */
+ pOut[2] = out1;
+
+ /* Store result in destination */
+ pOut[3] = out2;
+
+
+ /* update pointers to process next sampels */
+ pIn1 += 4u;
+ pIn2 += 4u;
+ pOut += 4u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = numSamples % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+ /* Subtract and then store the results in the destination buffer. */
+ *pOut++ = (*pIn1++) - (*pIn2++);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixSub group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_sub_q15.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_sub_q15.c
new file mode 100644
index 0000000..38e67f4
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_sub_q15.c
@@ -0,0 +1,157 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_sub_q15.c
+*
+* Description: Q15 Matrix subtraction
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @addtogroup MatrixSub
+ * @{
+ */
+
+/**
+ * @brief Q15 matrix subtraction.
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+ */
+
+arm_status arm_mat_sub_q15(
+ const arm_matrix_instance_q15 * pSrcA,
+ const arm_matrix_instance_q15 * pSrcB,
+ arm_matrix_instance_q15 * pDst)
+{
+ q15_t *pInA = pSrcA->pData; /* input data matrix pointer A */
+ q15_t *pInB = pSrcB->pData; /* input data matrix pointer B */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix subtraction */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+
+ /* Check for matrix mismatch condition */
+ if((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Total number of samples in the input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* Apply loop unrolling */
+ blkCnt = numSamples >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+ /* Subtract, Saturate and then store the results in the destination buffer. */
+ *__SIMD32(pOut)++ = __QSUB16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
+ *__SIMD32(pOut)++ = __QSUB16(*__SIMD32(pInA)++, *__SIMD32(pInB)++);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = numSamples % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+ /* Subtract and then store the results in the destination buffer. */
+ *pOut++ = (q15_t) __QSUB16(*pInA++, *pInB++);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+ /* Subtract and then store the results in the destination buffer. */
+ *pOut++ = (q15_t) __SSAT(((q31_t) * pInA++ - *pInB++), 16);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixSub group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_sub_q31.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_sub_q31.c
new file mode 100644
index 0000000..c4924cc
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_sub_q31.c
@@ -0,0 +1,205 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_sub_q31.c
+*
+* Description: Q31 matrix subtraction
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @addtogroup MatrixSub
+ * @{
+ */
+
+/**
+ * @brief Q31 matrix subtraction.
+ * @param[in] *pSrcA points to the first input matrix structure
+ * @param[in] *pSrcB points to the second input matrix structure
+ * @param[out] *pDst points to output matrix structure
+ * @return The function returns either
+ * ARM_MATH_SIZE_MISMATCH or ARM_MATH_SUCCESS based on the outcome of size checking.
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q31 range [0x80000000 0x7FFFFFFF] will be saturated.
+ */
+
+
+arm_status arm_mat_sub_q31(
+ const arm_matrix_instance_q31 * pSrcA,
+ const arm_matrix_instance_q31 * pSrcB,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pIn1 = pSrcA->pData; /* input data matrix pointer A */
+ q31_t *pIn2 = pSrcB->pData; /* input data matrix pointer B */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ q31_t inA1, inB1; /* temporary variables */
+
+#ifndef ARM_MATH_CM0
+
+ q31_t inA2, inB2; /* temporary variables */
+ q31_t out1, out2; /* temporary variables */
+
+#endif // #ifndef ARM_MATH_CM0
+
+ uint32_t numSamples; /* total number of elements in the matrix */
+ uint32_t blkCnt; /* loop counters */
+ arm_status status; /* status of matrix subtraction */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+ /* Check for matrix mismatch condition */
+ if((pSrcA->numRows != pSrcB->numRows) ||
+ (pSrcA->numCols != pSrcB->numCols) ||
+ (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif
+ {
+ /* Total number of samples in the input matrix */
+ numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* Loop Unrolling */
+ blkCnt = numSamples >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+ /* Subtract, saturate and then store the results in the destination buffer. */
+ /* Read values from source A */
+ inA1 = pIn1[0];
+
+ /* Read values from source B */
+ inB1 = pIn2[0];
+
+ /* Read values from source A */
+ inA2 = pIn1[1];
+
+ /* Subtract and saturate */
+ out1 = __QSUB(inA1, inB1);
+
+ /* Read values from source B */
+ inB2 = pIn2[1];
+
+ /* Read values from source A */
+ inA1 = pIn1[2];
+
+ /* Subtract and saturate */
+ out2 = __QSUB(inA2, inB2);
+
+ /* Read values from source B */
+ inB1 = pIn2[2];
+
+ /* Store result in destination */
+ pOut[0] = out1;
+ pOut[1] = out2;
+
+ /* Read values from source A */
+ inA2 = pIn1[3];
+
+ /* Read values from source B */
+ inB2 = pIn2[3];
+
+ /* Subtract and saturate */
+ out1 = __QSUB(inA1, inB1);
+
+ /* Subtract and saturate */
+ out2 = __QSUB(inA2, inB2);
+
+ /* Store result in destination */
+ pOut[2] = out1;
+ pOut[3] = out2;
+
+ /* update pointers to process next samples */
+ pIn1 += 4u;
+ pIn2 += 4u;
+ pOut += 4u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = numSamples % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Initialize blkCnt with number of samples */
+ blkCnt = numSamples;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C(m,n) = A(m,n) - B(m,n) */
+ /* Subtract, saturate and then store the results in the destination buffer. */
+ inA1 = *pIn1++;
+ inB1 = *pIn2++;
+
+ inA1 = __QSUB(inA1, inB1);
+
+ *pOut++ = inA1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixSub group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_trans_f32.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_trans_f32.c
new file mode 100644
index 0000000..833d28e
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_trans_f32.c
@@ -0,0 +1,215 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_trans_f32.c
+*
+* Description: Floating-point matrix transpose.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+/**
+ * @defgroup MatrixTrans Matrix Transpose
+ *
+ * Tranposes a matrix.
+ * Transposing an M x N matrix flips it around the center diagonal and results in an N x M matrix.
+ * \image html MatrixTranspose.gif "Transpose of a 3 x 3 matrix"
+ */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @addtogroup MatrixTrans
+ * @{
+ */
+
+/**
+ * @brief Floating-point matrix transpose.
+ * @param[in] *pSrc points to the input matrix
+ * @param[out] *pDst points to the output matrix
+ * @return The function returns either ARM_MATH_SIZE_MISMATCH
+ * or ARM_MATH_SUCCESS based on the outcome of size checking.
+ */
+
+
+arm_status arm_mat_trans_f32(
+ const arm_matrix_instance_f32 * pSrc,
+ arm_matrix_instance_f32 * pDst)
+{
+ float32_t *pIn = pSrc->pData; /* input data matrix pointer */
+ float32_t *pOut = pDst->pData; /* output data matrix pointer */
+ float32_t *px; /* Temporary output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of rows */
+ uint16_t nColumns = pSrc->numCols; /* number of columns */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ uint16_t blkCnt, i = 0u, row = nRows; /* loop counters */
+ arm_status status; /* status of matrix transpose */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+
+ /* Check for matrix mismatch condition */
+ if((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* Loop Unrolling */
+ blkCnt = nColumns >> 2;
+
+ /* The pointer px is set to starting address of the column being processed */
+ px = pOut + i;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u) /* column loop */
+ {
+ /* Read and store the input element in the destination */
+ *px = *pIn++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += nRows;
+
+ /* Read and store the input element in the destination */
+ *px = *pIn++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += nRows;
+
+ /* Read and store the input element in the destination */
+ *px = *pIn++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += nRows;
+
+ /* Read and store the input element in the destination */
+ *px = *pIn++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += nRows;
+
+ /* Decrement the column loop counter */
+ blkCnt--;
+ }
+
+ /* Perform matrix transpose for last 3 samples here. */
+ blkCnt = nColumns % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Read and store the input element in the destination */
+ *px = *pIn++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += nRows;
+
+ /* Decrement the column loop counter */
+ blkCnt--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ uint16_t col, i = 0u, row = nRows; /* loop counters */
+ arm_status status; /* status of matrix transpose */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* The pointer px is set to starting address of the column being processed */
+ px = pOut + i;
+
+ /* Initialize column loop counter */
+ col = nColumns;
+
+ while(col > 0u)
+ {
+ /* Read and store the input element in the destination */
+ *px = *pIn++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += nRows;
+
+ /* Decrement the column loop counter */
+ col--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ i++;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ } while(row > 0u); /* row loop end */
+
+ /* Set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixTrans group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_trans_q15.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_trans_q15.c
new file mode 100644
index 0000000..f936bb6
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_trans_q15.c
@@ -0,0 +1,281 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_trans_q15.c
+*
+* Description: Q15 matrix transpose.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @addtogroup MatrixTrans
+ * @{
+ */
+
+/*
+ * @brief Q15 matrix transpose.
+ * @param[in] *pSrc points to the input matrix
+ * @param[out] *pDst points to the output matrix
+ * @return The function returns either ARM_MATH_SIZE_MISMATCH
+ * or ARM_MATH_SUCCESS based on the outcome of size checking.
+ */
+
+arm_status arm_mat_trans_q15(
+ const arm_matrix_instance_q15 * pSrc,
+ arm_matrix_instance_q15 * pDst)
+{
+ q15_t *pSrcA = pSrc->pData; /* input data matrix pointer */
+ q15_t *pOut = pDst->pData; /* output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of nRows */
+ uint16_t nColumns = pSrc->numCols; /* number of nColumns */
+ uint16_t col, row = nRows, i = 0u; /* row and column loop counters */
+ arm_status status; /* status of matrix transpose */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ q31_t in; /* variable to hold temporary output */
+
+#else
+
+ q15_t in;
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+
+ /* Check for matrix mismatch condition */
+ if((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+
+ /* Apply loop unrolling and exchange the columns with row elements */
+ col = nColumns >> 2u;
+
+ /* The pointer pOut is set to starting address of the column being processed */
+ pOut = pDst->pData + i;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(col > 0u)
+ {
+#ifndef UNALIGNED_SUPPORT_DISABLE
+
+ /* Read two elements from the row */
+ in = *__SIMD32(pSrcA)++;
+
+ /* Unpack and store one element in the destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *pOut = (q15_t) in;
+
+#else
+
+ *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update the pointer pOut to point to the next row of the transposed matrix */
+ pOut += nRows;
+
+ /* Unpack and store the second element in the destination */
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+
+#else
+
+ *pOut = (q15_t) in;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update the pointer pOut to point to the next row of the transposed matrix */
+ pOut += nRows;
+
+ /* Read two elements from the row */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ in = *__SIMD32(pSrcA)++;
+
+#else
+
+ in = *__SIMD32(pSrcA)++;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Unpack and store one element in the destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *pOut = (q15_t) in;
+
+#else
+
+ *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+ /* Update the pointer pOut to point to the next row of the transposed matrix */
+ pOut += nRows;
+
+ /* Unpack and store the second element in the destination */
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *pOut = (q15_t) ((in & (q31_t) 0xffff0000) >> 16);
+
+#else
+
+ *pOut = (q15_t) in;
+
+#endif /* #ifndef ARM_MATH_BIG_ENDIAN */
+
+#else
+ /* Read one element from the row */
+ in = *pSrcA++;
+
+ /* Store one element in the destination */
+ *pOut = in;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ pOut += nRows;
+
+ /* Read one element from the row */
+ in = *pSrcA++;
+
+ /* Store one element in the destination */
+ *pOut = in;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ pOut += nRows;
+
+ /* Read one element from the row */
+ in = *pSrcA++;
+
+ /* Store one element in the destination */
+ *pOut = in;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ pOut += nRows;
+
+ /* Read one element from the row */
+ in = *pSrcA++;
+
+ /* Store one element in the destination */
+ *pOut = in;
+
+#endif /* #ifndef UNALIGNED_SUPPORT_DISABLE */
+
+ /* Update the pointer pOut to point to the next row of the transposed matrix */
+ pOut += nRows;
+
+ /* Decrement the column loop counter */
+ col--;
+ }
+
+ /* Perform matrix transpose for last 3 samples here. */
+ col = nColumns % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* The pointer pOut is set to starting address of the column being processed */
+ pOut = pDst->pData + i;
+
+ /* Initialize column loop counter */
+ col = nColumns;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(col > 0u)
+ {
+ /* Read and store the input element in the destination */
+ *pOut = *pSrcA++;
+
+ /* Update the pointer pOut to point to the next row of the transposed matrix */
+ pOut += nRows;
+
+ /* Decrement the column loop counter */
+ col--;
+ }
+
+ i++;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ } while(row > 0u);
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixTrans group
+ */
diff --git a/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_trans_q31.c b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_trans_q31.c
new file mode 100644
index 0000000..5ac2f1c
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/MatrixFunctions/arm_mat_trans_q31.c
@@ -0,0 +1,207 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mat_trans_q31.c
+*
+* Description: Q31 matrix transpose.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupMatrix
+ */
+
+/**
+ * @addtogroup MatrixTrans
+ * @{
+ */
+
+/*
+ * @brief Q31 matrix transpose.
+ * @param[in] *pSrc points to the input matrix
+ * @param[out] *pDst points to the output matrix
+ * @return The function returns either ARM_MATH_SIZE_MISMATCH
+ * or ARM_MATH_SUCCESS based on the outcome of size checking.
+ */
+
+arm_status arm_mat_trans_q31(
+ const arm_matrix_instance_q31 * pSrc,
+ arm_matrix_instance_q31 * pDst)
+{
+ q31_t *pIn = pSrc->pData; /* input data matrix pointer */
+ q31_t *pOut = pDst->pData; /* output data matrix pointer */
+ q31_t *px; /* Temporary output data matrix pointer */
+ uint16_t nRows = pSrc->numRows; /* number of nRows */
+ uint16_t nColumns = pSrc->numCols; /* number of nColumns */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ uint16_t blkCnt, i = 0u, row = nRows; /* loop counters */
+ arm_status status; /* status of matrix transpose */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+
+ /* Check for matrix mismatch condition */
+ if((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* Apply loop unrolling and exchange the columns with row elements */
+ blkCnt = nColumns >> 2u;
+
+ /* The pointer px is set to starting address of the column being processed */
+ px = pOut + i;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* Read and store the input element in the destination */
+ *px = *pIn++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += nRows;
+
+ /* Read and store the input element in the destination */
+ *px = *pIn++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += nRows;
+
+ /* Read and store the input element in the destination */
+ *px = *pIn++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += nRows;
+
+ /* Read and store the input element in the destination */
+ *px = *pIn++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += nRows;
+
+ /* Decrement the column loop counter */
+ blkCnt--;
+ }
+
+ /* Perform matrix transpose for last 3 samples here. */
+ blkCnt = nColumns % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* Read and store the input element in the destination */
+ *px = *pIn++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += nRows;
+
+ /* Decrement the column loop counter */
+ blkCnt--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ uint16_t col, i = 0u, row = nRows; /* loop counters */
+ arm_status status; /* status of matrix transpose */
+
+
+#ifdef ARM_MATH_MATRIX_CHECK
+
+ /* Check for matrix mismatch condition */
+ if((pSrc->numRows != pDst->numCols) || (pSrc->numCols != pDst->numRows))
+ {
+ /* Set status as ARM_MATH_SIZE_MISMATCH */
+ status = ARM_MATH_SIZE_MISMATCH;
+ }
+ else
+#endif /* #ifdef ARM_MATH_MATRIX_CHECK */
+
+ {
+ /* Matrix transpose by exchanging the rows with columns */
+ /* row loop */
+ do
+ {
+ /* The pointer px is set to starting address of the column being processed */
+ px = pOut + i;
+
+ /* Initialize column loop counter */
+ col = nColumns;
+
+ while(col > 0u)
+ {
+ /* Read and store the input element in the destination */
+ *px = *pIn++;
+
+ /* Update the pointer px to point to the next row of the transposed matrix */
+ px += nRows;
+
+ /* Decrement the column loop counter */
+ col--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ i++;
+
+ /* Decrement the row loop counter */
+ row--;
+
+ }
+ while(row > 0u); /* row loop end */
+
+ /* set status as ARM_MATH_SUCCESS */
+ status = ARM_MATH_SUCCESS;
+ }
+
+ /* Return to application */
+ return (status);
+}
+
+/**
+ * @} end of MatrixTrans group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_max_f32.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_max_f32.c
new file mode 100644
index 0000000..069a3b8
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_max_f32.c
@@ -0,0 +1,177 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_max_f32.c
+*
+* Description: Maximum value of a floating-point vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @defgroup Max Maximum
+ *
+ * Computes the maximum value of an array of data.
+ * The function returns both the maximum value and its position within the array.
+ * There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ * @addtogroup Max
+ * @{
+ */
+
+
+/**
+ * @brief Maximum value of a floating-point vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult maximum value returned here
+ * @param[out] *pIndex index of maximum value returned here
+ * @return none.
+ */
+
+void arm_max_f32(
+ float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult,
+ uint32_t * pIndex)
+{
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ float32_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex, count; /* loop counter */
+
+ /* Initialise the count value. */
+ count = 0u;
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+ /* Loop unrolling */
+ blkCnt = (blockSize - 1u) >> 2u;
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ while(blkCnt > 0u)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ maxVal2 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal1)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal1;
+ outIndex = count + 1u;
+ }
+
+ maxVal1 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal2)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal2;
+ outIndex = count + 2u;
+ }
+
+ maxVal2 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal1)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal1;
+ outIndex = count + 3u;
+ }
+
+ /* compare for the maximum value */
+ if(out < maxVal2)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal2;
+ outIndex = count + 4u;
+ }
+
+ count += 4u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* if (blockSize - 1u) is not multiple of 4 */
+ blkCnt = (blockSize - 1u) % 4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+ float32_t maxVal1, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* loop counter */
+
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+ blkCnt = (blockSize - 1u);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal1)
+ {
+ /* Update the maximum value and it's index */
+ out = maxVal1;
+ outIndex = blockSize - blkCnt;
+ }
+
+
+ /* Decrement the loop counter */
+ blkCnt--;
+
+ }
+
+ /* Store the maximum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+
+/**
+ * @} end of Max group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_max_q15.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_max_q15.c
new file mode 100644
index 0000000..424eee8
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_max_q15.c
@@ -0,0 +1,167 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_max_q15.c
+*
+* Description: Maximum value of a Q15 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup Max
+ * @{
+ */
+
+
+/**
+ * @brief Maximum value of a Q15 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult maximum value returned here
+ * @param[out] *pIndex index of maximum value returned here
+ * @return none.
+ */
+
+void arm_max_q15(
+ q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult,
+ uint32_t * pIndex)
+{
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q15_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex, count; /* loop counter */
+
+ /* Initialise the count value. */
+ count = 0u;
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+ /* Loop unrolling */
+ blkCnt = (blockSize - 1u) >> 2u;
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ while(blkCnt > 0u)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ maxVal2 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal1)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal1;
+ outIndex = count + 1u;
+ }
+
+ maxVal1 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal2)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal2;
+ outIndex = count + 2u;
+ }
+
+ maxVal2 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal1)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal1;
+ outIndex = count + 3u;
+ }
+
+ /* compare for the maximum value */
+ if(out < maxVal2)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal2;
+ outIndex = count + 4u;
+ }
+
+ count += 4u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* if (blockSize - 1u) is not multiple of 4 */
+ blkCnt = (blockSize - 1u) % 4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+ q15_t maxVal1, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* loop counter */
+
+ blkCnt = (blockSize - 1u);
+
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal1)
+ {
+ /* Update the maximum value and it's index */
+ out = maxVal1;
+ outIndex = blockSize - blkCnt;
+ }
+ /* Decrement the loop counter */
+ blkCnt--;
+
+ }
+
+ /* Store the maximum value and its index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+
+/**
+ * @} end of Max group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_max_q31.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_max_q31.c
new file mode 100644
index 0000000..00e6a19
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_max_q31.c
@@ -0,0 +1,168 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_max_q31.c
+*
+* Description: Maximum value of a Q31 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup Max
+ * @{
+ */
+
+
+/**
+ * @brief Maximum value of a Q31 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult maximum value returned here
+ * @param[out] *pIndex index of maximum value returned here
+ * @return none.
+ */
+
+void arm_max_q31(
+ q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult,
+ uint32_t * pIndex)
+{
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q31_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex, count; /* loop counter */
+
+ /* Initialise the count value. */
+ count = 0u;
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+ /* Loop unrolling */
+ blkCnt = (blockSize - 1u) >> 2u;
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ while(blkCnt > 0u)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ maxVal2 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal1)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal1;
+ outIndex = count + 1u;
+ }
+
+ maxVal1 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal2)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal2;
+ outIndex = count + 2u;
+ }
+
+ maxVal2 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal1)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal1;
+ outIndex = count + 3u;
+ }
+
+ /* compare for the maximum value */
+ if(out < maxVal2)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal2;
+ outIndex = count + 4u;
+ }
+
+ count += 4u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* if (blockSize - 1u) is not multiple of 4 */
+ blkCnt = (blockSize - 1u) % 4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+ q31_t maxVal1, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* loop counter */
+
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+ blkCnt = (blockSize - 1u);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal1)
+ {
+ /* Update the maximum value and it's index */
+ out = maxVal1;
+ outIndex = blockSize - blkCnt;
+ }
+
+ /* Decrement the loop counter */
+ blkCnt--;
+
+ }
+
+ /* Store the maximum value and its index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+
+/**
+ * @} end of Max group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_max_q7.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_max_q7.c
new file mode 100644
index 0000000..5461e95
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_max_q7.c
@@ -0,0 +1,168 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_max_q7.c
+*
+* Description: Maximum value of a Q7 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup Max
+ * @{
+ */
+
+
+/**
+ * @brief Maximum value of a Q7 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult maximum value returned here
+ * @param[out] *pIndex index of maximum value returned here
+ * @return none.
+ */
+
+void arm_max_q7(
+ q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
+{
+#ifndef ARM_MATH_CM0
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q7_t maxVal1, maxVal2, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex, count; /* loop counter */
+
+ /* Initialise the count value. */
+ count = 0u;
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+ /* Loop unrolling */
+ blkCnt = (blockSize - 1u) >> 2u;
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ while(blkCnt > 0u)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ maxVal2 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal1)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal1;
+ outIndex = count + 1u;
+ }
+
+ maxVal1 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal2)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal2;
+ outIndex = count + 2u;
+ }
+
+ maxVal2 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal1)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal1;
+ outIndex = count + 3u;
+ }
+
+ /* compare for the maximum value */
+ if(out < maxVal2)
+ {
+ /* Update the maximum value and its index */
+ out = maxVal2;
+ outIndex = count + 4u;
+ }
+
+ count += 4u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* if (blockSize - 1u) is not multiple of 4 */
+ blkCnt = (blockSize - 1u) % 4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+ q7_t maxVal1, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* loop counter */
+
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+ blkCnt = (blockSize - 1u);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* Initialize maxVal to the next consecutive values one by one */
+ maxVal1 = *pSrc++;
+
+ /* compare for the maximum value */
+ if(out < maxVal1)
+ {
+ /* Update the maximum value and it's index */
+ out = maxVal1;
+ outIndex = blockSize - blkCnt;
+ }
+ /* Decrement the loop counter */
+ blkCnt--;
+
+ }
+
+ /* Store the maximum value and its index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+
+}
+
+/**
+ * @} end of Max group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_mean_f32.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_mean_f32.c
new file mode 100644
index 0000000..657553f
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_mean_f32.c
@@ -0,0 +1,130 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mean_f32.c
+*
+* Description: Mean value of a floating-point vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @defgroup mean Mean
+ *
+ * Calculates the mean of the input vector. Mean is defined as the average of the elements in the vector.
+ * The underlying algorithm is used:
+ *
+ *
+ *
+ * There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ * @addtogroup mean
+ * @{
+ */
+
+
+/**
+ * @brief Mean value of a floating-point vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult mean value returned here
+ * @return none.
+ */
+
+
+void arm_mean_f32(
+ float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ float32_t sum = 0.0f; /* Temporary result storage */
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ float32_t in1, in2, in3, in4;
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ in1 = *pSrc++;
+ in2 = *pSrc++;
+ in3 = *pSrc++;
+ in4 = *pSrc++;
+
+ sum += in1;
+ sum += in2;
+ sum += in3;
+ sum += in4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ sum += *pSrc++;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+ /* Store the result to the destination */
+ *pResult = sum / (float32_t) blockSize;
+}
+
+/**
+ * @} end of mean group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_mean_q15.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_mean_q15.c
new file mode 100644
index 0000000..d949f39
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_mean_q15.c
@@ -0,0 +1,124 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mean_q15.c
+*
+* Description: Mean value of a Q15 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup mean
+ * @{
+ */
+
+/**
+ * @brief Mean value of a Q15 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult mean value returned here
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function is implemented using a 32-bit internal accumulator.
+ * The input is represented in 1.15 format and is accumulated in a 32-bit
+ * accumulator in 17.15 format.
+ * There is no risk of internal overflow with this approach, and the
+ * full precision of intermediate result is preserved.
+ * Finally, the accumulator is saturated and truncated to yield a result of 1.15 format.
+ *
+ */
+
+
+void arm_mean_q15(
+ q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
+{
+ q31_t sum = 0; /* Temporary result storage */
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q31_t in;
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ in = *__SIMD32(pSrc)++;
+ sum += ((in << 16) >> 16);
+ sum += (in >> 16);
+ in = *__SIMD32(pSrc)++;
+ sum += ((in << 16) >> 16);
+ sum += (in >> 16);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ sum += *pSrc++;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+ /* Store the result to the destination */
+ *pResult = (q15_t) (sum / blockSize);
+}
+
+/**
+ * @} end of mean group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_mean_q31.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_mean_q31.c
new file mode 100644
index 0000000..d8c39bc
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_mean_q31.c
@@ -0,0 +1,127 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mean_q31.c
+*
+* Description: Mean value of a Q31 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup mean
+ * @{
+ */
+
+/**
+ * @brief Mean value of a Q31 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult mean value returned here
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *\par
+ * The function is implemented using a 64-bit internal accumulator.
+ * The input is represented in 1.31 format and is accumulated in a 64-bit
+ * accumulator in 33.31 format.
+ * There is no risk of internal overflow with this approach, and the
+ * full precision of intermediate result is preserved.
+ * Finally, the accumulator is truncated to yield a result of 1.31 format.
+ *
+ */
+
+
+void arm_mean_q31(
+ q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
+{
+ q63_t sum = 0; /* Temporary result storage */
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q31_t in1, in2, in3, in4;
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ in1 = *pSrc++;
+ in2 = *pSrc++;
+ in3 = *pSrc++;
+ in4 = *pSrc++;
+
+ sum += in1;
+ sum += in2;
+ sum += in3;
+ sum += in4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ sum += *pSrc++;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+ /* Store the result to the destination */
+ *pResult = (q31_t) (sum / (int32_t) blockSize);
+}
+
+/**
+ * @} end of mean group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_mean_q7.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_mean_q7.c
new file mode 100644
index 0000000..aa99828
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_mean_q7.c
@@ -0,0 +1,124 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_mean_q7.c
+*
+* Description: Mean value of a Q7 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup mean
+ * @{
+ */
+
+/**
+ * @brief Mean value of a Q7 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult mean value returned here
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function is implemented using a 32-bit internal accumulator.
+ * The input is represented in 1.7 format and is accumulated in a 32-bit
+ * accumulator in 25.7 format.
+ * There is no risk of internal overflow with this approach, and the
+ * full precision of intermediate result is preserved.
+ * Finally, the accumulator is truncated to yield a result of 1.7 format.
+ *
+ */
+
+
+void arm_mean_q7(
+ q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult)
+{
+ q31_t sum = 0; /* Temporary result storage */
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q31_t in;
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ in = *__SIMD32(pSrc)++;
+
+ sum += ((in << 24) >> 24);
+ sum += ((in << 16) >> 24);
+ sum += ((in << 8) >> 24);
+ sum += (in >> 24);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ sum += *pSrc++;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) / blockSize */
+ /* Store the result to the destination */
+ *pResult = (q7_t) (sum / (int32_t) blockSize);
+}
+
+/**
+ * @} end of mean group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_min_f32.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_min_f32.c
new file mode 100644
index 0000000..73be7a4
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_min_f32.c
@@ -0,0 +1,174 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_min_f32.c
+*
+* Description: Minimum value of a floating-point vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @defgroup Min Minimum
+ *
+ * Computes the minimum value of an array of data.
+ * The function returns both the minimum value and its position within the array.
+ * There are separate functions for floating-point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ * @addtogroup Min
+ * @{
+ */
+
+
+/**
+ * @brief Minimum value of a floating-point vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult minimum value returned here
+ * @param[out] *pIndex index of minimum value returned here
+ * @return none.
+ *
+ */
+
+void arm_min_f32(
+ float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult,
+ uint32_t * pIndex)
+{
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ float32_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex, count; /* loop counter */
+
+ /* Initialise the count value. */
+ count = 0u;
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+ /* Loop unrolling */
+ blkCnt = (blockSize - 1u) >> 2u;
+
+ while(blkCnt > 0)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal1 = *pSrc++;
+ minVal2 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal1)
+ {
+ /* Update the minimum value and its index */
+ out = minVal1;
+ outIndex = count + 1u;
+ }
+
+ minVal1 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal2)
+ {
+ /* Update the minimum value and its index */
+ out = minVal2;
+ outIndex = count + 2u;
+ }
+
+ minVal2 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal1)
+ {
+ /* Update the minimum value and its index */
+ out = minVal1;
+ outIndex = count + 3u;
+ }
+
+ /* compare for the minimum value */
+ if(out > minVal2)
+ {
+ /* Update the minimum value and its index */
+ out = minVal2;
+ outIndex = count + 4u;
+ }
+
+ count += 4u;
+
+ blkCnt--;
+ }
+
+ /* if (blockSize - 1u ) is not multiple of 4 */
+ blkCnt = (blockSize - 1u) % 4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+ float32_t minVal1, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* loop counter */
+
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+ blkCnt = (blockSize - 1u);
+
+#endif // #ifndef ARM_MATH_CM0
+
+ while(blkCnt > 0)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal1 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal1)
+ {
+ /* Update the minimum value and it's index */
+ out = minVal1;
+ outIndex = blockSize - blkCnt;
+ }
+
+ blkCnt--;
+
+ }
+
+ /* Store the minimum value and it's index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+
+/**
+ * @} end of Min group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_min_q15.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_min_q15.c
new file mode 100644
index 0000000..3b58ecc
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_min_q15.c
@@ -0,0 +1,168 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_min_q15.c
+*
+* Description: Minimum value of a Q15 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+
+/**
+ * @addtogroup Min
+ * @{
+ */
+
+
+/**
+ * @brief Minimum value of a Q15 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult minimum value returned here
+ * @param[out] *pIndex index of minimum value returned here
+ * @return none.
+ *
+ */
+
+void arm_min_q15(
+ q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult,
+ uint32_t * pIndex)
+{
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q15_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex, count; /* loop counter */
+
+ /* Initialise the count value. */
+ count = 0u;
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+ /* Loop unrolling */
+ blkCnt = (blockSize - 1u) >> 2u;
+
+ while(blkCnt > 0)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal1 = *pSrc++;
+ minVal2 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal1)
+ {
+ /* Update the minimum value and its index */
+ out = minVal1;
+ outIndex = count + 1u;
+ }
+
+ minVal1 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal2)
+ {
+ /* Update the minimum value and its index */
+ out = minVal2;
+ outIndex = count + 2u;
+ }
+
+ minVal2 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal1)
+ {
+ /* Update the minimum value and its index */
+ out = minVal1;
+ outIndex = count + 3u;
+ }
+
+ /* compare for the minimum value */
+ if(out > minVal2)
+ {
+ /* Update the minimum value and its index */
+ out = minVal2;
+ outIndex = count + 4u;
+ }
+
+ count += 4u;
+
+ blkCnt--;
+ }
+
+ /* if (blockSize - 1u ) is not multiple of 4 */
+ blkCnt = (blockSize - 1u) % 4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+ q15_t minVal1, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* loop counter */
+
+ blkCnt = (blockSize - 1u);
+
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+#endif // #ifndef ARM_MATH_CM0
+
+ while(blkCnt > 0)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal1 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal1)
+ {
+ /* Update the minimum value and it's index */
+ out = minVal1;
+ outIndex = blockSize - blkCnt;
+ }
+
+ blkCnt--;
+
+ }
+
+
+
+ /* Store the minimum value and its index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+
+/**
+ * @} end of Min group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_min_q31.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_min_q31.c
new file mode 100644
index 0000000..96d4ab1
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_min_q31.c
@@ -0,0 +1,167 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_min_q31.c
+*
+* Description: Minimum value of a Q31 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+
+/**
+ * @addtogroup Min
+ * @{
+ */
+
+
+/**
+ * @brief Minimum value of a Q31 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult minimum value returned here
+ * @param[out] *pIndex index of minimum value returned here
+ * @return none.
+ *
+ */
+
+void arm_min_q31(
+ q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult,
+ uint32_t * pIndex)
+{
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q31_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex, count; /* loop counter */
+
+ /* Initialise the count value. */
+ count = 0u;
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+
+ /* Loop unrolling */
+ blkCnt = (blockSize - 1u) >> 2u;
+
+ while(blkCnt > 0)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal1 = *pSrc++;
+ minVal2 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal1)
+ {
+ /* Update the minimum value and its index */
+ out = minVal1;
+ outIndex = count + 1u;
+ }
+
+ minVal1 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal2)
+ {
+ /* Update the minimum value and its index */
+ out = minVal2;
+ outIndex = count + 2u;
+ }
+
+ minVal2 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal1)
+ {
+ /* Update the minimum value and its index */
+ out = minVal1;
+ outIndex = count + 3u;
+ }
+
+ /* compare for the minimum value */
+ if(out > minVal2)
+ {
+ /* Update the minimum value and its index */
+ out = minVal2;
+ outIndex = count + 4u;
+ }
+
+ count += 4u;
+
+ blkCnt--;
+ }
+
+ /* if (blockSize - 1u ) is not multiple of 4 */
+ blkCnt = (blockSize - 1u) % 4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+ q31_t minVal1, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* loop counter */
+
+ blkCnt = (blockSize - 1u);
+
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+#endif // #ifndef ARM_MATH_CM0
+
+ while(blkCnt > 0)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal1 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal1)
+ {
+ /* Update the minimum value and it's index */
+ out = minVal1;
+ outIndex = blockSize - blkCnt;
+ }
+
+ blkCnt--;
+
+ }
+
+ /* Store the minimum value and its index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+}
+
+/**
+ * @} end of Min group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_min_q7.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_min_q7.c
new file mode 100644
index 0000000..076e250
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_min_q7.c
@@ -0,0 +1,169 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_min_q7.c
+*
+* Description: Minimum value of a Q7 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup Min
+ * @{
+ */
+
+
+/**
+ * @brief Minimum value of a Q7 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult minimum value returned here
+ * @param[out] *pIndex index of minimum value returned here
+ * @return none.
+ *
+ */
+
+void arm_min_q7(
+ q7_t * pSrc,
+ uint32_t blockSize,
+ q7_t * pResult,
+ uint32_t * pIndex)
+{
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q7_t minVal1, minVal2, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex, count; /* loop counter */
+
+ /* Initialise the count value. */
+ count = 0u;
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+ /* Loop unrolling */
+ blkCnt = (blockSize - 1u) >> 2u;
+
+ while(blkCnt > 0)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal1 = *pSrc++;
+ minVal2 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal1)
+ {
+ /* Update the minimum value and its index */
+ out = minVal1;
+ outIndex = count + 1u;
+ }
+
+ minVal1 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal2)
+ {
+ /* Update the minimum value and its index */
+ out = minVal2;
+ outIndex = count + 2u;
+ }
+
+ minVal2 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal1)
+ {
+ /* Update the minimum value and its index */
+ out = minVal1;
+ outIndex = count + 3u;
+ }
+
+ /* compare for the minimum value */
+ if(out > minVal2)
+ {
+ /* Update the minimum value and its index */
+ out = minVal2;
+ outIndex = count + 4u;
+ }
+
+ count += 4u;
+
+ blkCnt--;
+ }
+
+ /* if (blockSize - 1u ) is not multiple of 4 */
+ blkCnt = (blockSize - 1u) % 4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ q7_t minVal1, out; /* Temporary variables to store the output value. */
+ uint32_t blkCnt, outIndex; /* loop counter */
+
+ /* Initialise the index value to zero. */
+ outIndex = 0u;
+ /* Load first input value that act as reference value for comparision */
+ out = *pSrc++;
+
+ blkCnt = (blockSize - 1u);
+
+#endif // #ifndef ARM_MATH_CM0
+
+ while(blkCnt > 0)
+ {
+ /* Initialize minVal to the next consecutive values one by one */
+ minVal1 = *pSrc++;
+
+ /* compare for the minimum value */
+ if(out > minVal1)
+ {
+ /* Update the minimum value and it's index */
+ out = minVal1;
+ outIndex = blockSize - blkCnt;
+ }
+
+ blkCnt--;
+
+ }
+
+ /* Store the minimum value and its index into destination pointers */
+ *pResult = out;
+ *pIndex = outIndex;
+
+
+}
+
+/**
+ * @} end of Min group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_power_f32.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_power_f32.c
new file mode 100644
index 0000000..1611d12
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_power_f32.c
@@ -0,0 +1,137 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_power_f32.c
+*
+* Description: Sum of the squares of the elements of a floating-point vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @defgroup power Power
+ *
+ * Calculates the sum of the squares of the elements in the input vector.
+ * The underlying algorithm is used:
+ *
+ *
+ *
+ * There are separate functions for floating point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ * @addtogroup power
+ * @{
+ */
+
+
+/**
+ * @brief Sum of the squares of the elements of a floating-point vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult sum of the squares value returned here
+ * @return none.
+ *
+ */
+
+
+void arm_power_f32(
+ float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ float32_t sum = 0.0f; /* accumulator */
+ float32_t in; /* Temporary variable to store input value */
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* Compute Power and then store the result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in * in;
+ in = *pSrc++;
+ sum += in * in;
+ in = *pSrc++;
+ sum += in * in;
+ in = *pSrc++;
+ sum += in * in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+
+ while(blkCnt > 0u)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* compute power and then store the result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in * in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Store the result to the destination */
+ *pResult = sum;
+}
+
+/**
+ * @} end of power group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_power_q15.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_power_q15.c
new file mode 100644
index 0000000..5c395cd
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_power_q15.c
@@ -0,0 +1,143 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_power_q15.c
+*
+* Description: Sum of the squares of the elements of a Q15 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup power
+ * @{
+ */
+
+/**
+ * @brief Sum of the squares of the elements of a Q15 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult sum of the squares value returned here
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * The input is represented in 1.15 format.
+ * Intermediate multiplication yields a 2.30 format, and this
+ * result is added without saturation to a 64-bit accumulator in 34.30 format.
+ * With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ * full precision of the intermediate multiplication is preserved.
+ * Finally, the return result is in 34.30 format.
+ *
+ */
+
+void arm_power_q15(
+ q15_t * pSrc,
+ uint32_t blockSize,
+ q63_t * pResult)
+{
+ q63_t sum = 0; /* Temporary result storage */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t in32; /* Temporary variable to store input value */
+ q15_t in16; /* Temporary variable to store input value */
+ uint32_t blkCnt; /* loop counter */
+
+
+ /* loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* Compute Power and then store the result in a temporary variable, sum. */
+ in32 = *__SIMD32(pSrc)++;
+ sum = __SMLALD(in32, in32, sum);
+ in32 = *__SIMD32(pSrc)++;
+ sum = __SMLALD(in32, in32, sum);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* Compute Power and then store the result in a temporary variable, sum. */
+ in16 = *pSrc++;
+ sum = __SMLALD(in16, in16, sum);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ q15_t in; /* Temporary variable to store input value */
+ uint32_t blkCnt; /* loop counter */
+
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* Compute Power and then store the result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ /* Store the results in 34.30 format */
+ *pResult = sum;
+}
+
+/**
+ * @} end of power group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_power_q31.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_power_q31.c
new file mode 100644
index 0000000..b515826
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_power_q31.c
@@ -0,0 +1,134 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_power_q31.c
+*
+* Description: Sum of the squares of the elements of a Q31 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup power
+ * @{
+ */
+
+/**
+ * @brief Sum of the squares of the elements of a Q31 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult sum of the squares value returned here
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * The input is represented in 1.31 format.
+ * Intermediate multiplication yields a 2.62 format, and this
+ * result is truncated to 2.48 format by discarding the lower 14 bits.
+ * The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
+ * With 15 guard bits in the accumulator, there is no risk of overflow, and the
+ * full precision of the intermediate multiplication is preserved.
+ * Finally, the return result is in 16.48 format.
+ *
+ */
+
+void arm_power_q31(
+ q31_t * pSrc,
+ uint32_t blockSize,
+ q63_t * pResult)
+{
+ q63_t sum = 0; /* Temporary result storage */
+ q31_t in;
+ uint32_t blkCnt; /* loop counter */
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* Compute Power then shift intermediate results by 14 bits to maintain 16.48 format and then store the result in a temporary variable sum, providing 15 guard bits. */
+ in = *pSrc++;
+ sum += ((q63_t) in * in) >> 14u;
+
+ in = *pSrc++;
+ sum += ((q63_t) in * in) >> 14u;
+
+ in = *pSrc++;
+ sum += ((q63_t) in * in) >> 14u;
+
+ in = *pSrc++;
+ sum += ((q63_t) in * in) >> 14u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* Compute Power and then store the result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += ((q63_t) in * in) >> 14u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Store the results in 16.48 format */
+ *pResult = sum;
+}
+
+/**
+ * @} end of power group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_power_q7.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_power_q7.c
new file mode 100644
index 0000000..e3ed7ca
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_power_q7.c
@@ -0,0 +1,132 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_power_q7.c
+*
+* Description: Sum of the squares of the elements of a Q7 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup power
+ * @{
+ */
+
+/**
+ * @brief Sum of the squares of the elements of a Q7 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult sum of the squares value returned here
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 32-bit internal accumulator.
+ * The input is represented in 1.7 format.
+ * Intermediate multiplication yields a 2.14 format, and this
+ * result is added without saturation to an accumulator in 18.14 format.
+ * With 17 guard bits in the accumulator, there is no risk of overflow, and the
+ * full precision of the intermediate multiplication is preserved.
+ * Finally, the return result is in 18.14 format.
+ *
+ */
+
+void arm_power_q7(
+ q7_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
+{
+ q31_t sum = 0; /* Temporary result storage */
+ q7_t in; /* Temporary variable to store input */
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t input1; /* Temporary variable to store packed input */
+ q31_t in1, in2; /* Temporary variables to store input */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* Reading two inputs of pSrc vector and packing */
+ input1 = *__SIMD32(pSrc)++;
+
+ in1 = __SXTB16(__ROR(input1, 8));
+ in2 = __SXTB16(input1);
+
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* calculate power and accumulate to accumulator */
+ sum = __SMLAD(in1, in1, sum);
+ sum = __SMLAD(in2, in2, sum);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* Compute Power and then store the result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += ((q15_t) in * in);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Store the result in 18.14 format */
+ *pResult = sum;
+}
+
+/**
+ * @} end of power group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_rms_f32.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_rms_f32.c
new file mode 100644
index 0000000..caa59b7
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_rms_f32.c
@@ -0,0 +1,132 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_rms_f32.c
+*
+* Description: Root mean square value of an array of F32 type
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @defgroup RMS Root mean square (RMS)
+ *
+ *
+ * Calculates the Root Mean Sqaure of the elements in the input vector.
+ * The underlying algorithm is used:
+ *
+ *
+ *
+ * There are separate functions for floating point, Q31, and Q15 data types.
+ */
+
+/**
+ * @addtogroup RMS
+ * @{
+ */
+
+
+/**
+ * @brief Root Mean Square of the elements of a floating-point vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult rms value returned here
+ * @return none.
+ *
+ */
+
+void arm_rms_f32(
+ float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ float32_t sum = 0.0f; /* Accumulator */
+ float32_t in; /* Tempoprary variable to store input value */
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* Compute sum of the squares and then store the result in a temporary variable, sum */
+ in = *pSrc++;
+ sum += in * in;
+ in = *pSrc++;
+ sum += in * in;
+ in = *pSrc++;
+ sum += in * in;
+ in = *pSrc++;
+ sum += in * in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* Compute sum of the squares and then store the results in a temporary variable, sum */
+ in = *pSrc++;
+ sum += in * in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Rms and store the result in the destination */
+ arm_sqrt_f32(sum / (float32_t) blockSize, pResult);
+}
+
+/**
+ * @} end of RMS group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_rms_q15.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_rms_q15.c
new file mode 100644
index 0000000..c10d6ee
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_rms_q15.c
@@ -0,0 +1,152 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_rms_q15.c
+*
+* Description: Root Mean Square of the elements of a Q15 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @addtogroup RMS
+ * @{
+ */
+
+/**
+ * @brief Root Mean Square of the elements of a Q15 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult rms value returned here
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * The input is represented in 1.15 format.
+ * Intermediate multiplication yields a 2.30 format, and this
+ * result is added without saturation to a 64-bit accumulator in 34.30 format.
+ * With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ * full precision of the intermediate multiplication is preserved.
+ * Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
+ * 15 bits, and then saturated to yield a result in 1.15 format.
+ *
+ */
+
+void arm_rms_q15(
+ q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
+{
+ q63_t sum = 0; /* accumulator */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t in; /* temporary variable to store the input value */
+ q15_t in1; /* temporary variable to store the input value */
+ uint32_t blkCnt; /* loop counter */
+
+ /* loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute sum of the squares and then store the results in a temporary variable, sum */
+ in = *__SIMD32(pSrc)++;
+ sum = __SMLALD(in, in, sum);
+ in = *__SIMD32(pSrc)++;
+ sum = __SMLALD(in, in, sum);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute sum of the squares and then store the results in a temporary variable, sum */
+ in1 = *pSrc++;
+ sum = __SMLALD(in1, in1, sum);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Truncating and saturating the accumulator to 1.15 format */
+ sum = __SSAT((q31_t) (sum >> 15), 16);
+
+ in1 = (q15_t) (sum / blockSize);
+
+ /* Store the result in the destination */
+ arm_sqrt_q15(in1, pResult);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ q15_t in; /* temporary variable to store the input value */
+ uint32_t blkCnt; /* loop counter */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute sum of the squares and then store the results in a temporary variable, sum */
+ in = *pSrc++;
+ sum += ((q31_t) in * in);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Truncating and saturating the accumulator to 1.15 format */
+ sum = __SSAT((q31_t) (sum >> 15), 16);
+
+ in = (q15_t) (sum / blockSize);
+
+ /* Store the result in the destination */
+ arm_sqrt_q15(in, pResult);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of RMS group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_rms_q31.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_rms_q31.c
new file mode 100644
index 0000000..2845ae4
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_rms_q31.c
@@ -0,0 +1,145 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_rms_q31.c
+*
+* Description: Root Mean Square of the elements of a Q31 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @addtogroup RMS
+ * @{
+ */
+
+
+/**
+ * @brief Root Mean Square of the elements of a Q31 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult rms value returned here
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ *\par
+ * The function is implemented using an internal 64-bit accumulator.
+ * The input is represented in 1.31 format, and intermediate multiplication
+ * yields a 2.62 format.
+ * The accumulator maintains full precision of the intermediate multiplication results,
+ * but provides only a single guard bit.
+ * There is no saturation on intermediate additions.
+ * If the accumulator overflows, it wraps around and distorts the result.
+ * In order to avoid overflows completely, the input signal must be scaled down by
+ * log2(blockSize) bits, as a total of blockSize additions are performed internally.
+ * Finally, the 2.62 accumulator is right shifted by 31 bits to yield a 1.31 format value.
+ *
+ */
+
+void arm_rms_q31(
+ q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
+{
+ q63_t sum = 0; /* accumulator */
+ q31_t in; /* Temporary variable to store the input */
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t in1, in2, in3, in4; /* Temporary input variables */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 8 outputs at a time.
+ ** a second loop below computes the remaining 1 to 7 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* Compute sum of the squares and then store the result in a temporary variable, sum */
+ /* read two samples from source buffer */
+ in1 = pSrc[0];
+ in2 = pSrc[1];
+
+ /* calculate power and accumulate to accumulator */
+ sum += (q63_t) in1 *in1;
+ sum += (q63_t) in2 *in2;
+
+ /* read two samples from source buffer */
+ in3 = pSrc[2];
+ in4 = pSrc[3];
+
+ /* calculate power and accumulate to accumulator */
+ sum += (q63_t) in3 *in3;
+ sum += (q63_t) in4 *in4;
+
+
+ /* update source buffer to process next samples */
+ pSrc += 4u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 8, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = A[0] * A[0] + A[1] * A[1] + A[2] * A[2] + ... + A[blockSize-1] * A[blockSize-1] */
+ /* Compute sum of the squares and then store the results in a temporary variable, sum */
+ in = *pSrc++;
+ sum += (q63_t) in *in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Convert data in 2.62 to 1.31 by 31 right shifts and saturate */
+
+ sum = __SSAT(sum >> 31, 31);
+
+
+ /* Compute Rms and store the result in the destination vector */
+ arm_sqrt_q31((q31_t) ((q31_t) sum / (int32_t) blockSize), pResult);
+}
+
+/**
+ * @} end of RMS group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_std_f32.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_std_f32.c
new file mode 100644
index 0000000..564da49
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_std_f32.c
@@ -0,0 +1,187 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_std_f32.c
+*
+* Description: Standard deviation of the elements of a floating-point vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @defgroup STD Standard deviation
+ *
+ * Calculates the standard deviation of the elements in the input vector.
+ * The underlying algorithm is used:
+ *
+ *
+ *
+ * There are separate functions for floating point, Q31, and Q15 data types.
+ */
+
+/**
+ * @addtogroup STD
+ * @{
+ */
+
+
+/**
+ * @brief Standard deviation of the elements of a floating-point vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult standard deviation value returned here
+ * @return none.
+ *
+ */
+
+
+void arm_std_f32(
+ float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+ float32_t sum = 0.0f; /* Temporary result storage */
+ float32_t sumOfSquares = 0.0f; /* Sum of squares */
+ float32_t in; /* input value */
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ float32_t meanOfSquares, mean, squareOfMean;
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in;
+ sumOfSquares += in * in;
+ in = *pSrc++;
+ sum += in;
+ sumOfSquares += in * in;
+ in = *pSrc++;
+ sum += in;
+ sumOfSquares += in * in;
+ in = *pSrc++;
+ sum += in;
+ sumOfSquares += in * in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in;
+ sumOfSquares += in * in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Mean of squares of the input samples
+ * and then store the result in a temporary variable, meanOfSquares. */
+ meanOfSquares = sumOfSquares / ((float32_t) blockSize - 1.0f);
+
+ /* Compute mean of all input values */
+ mean = sum / (float32_t) blockSize;
+
+ /* Compute square of mean */
+ squareOfMean = (mean * mean) * (((float32_t) blockSize) /
+ ((float32_t) blockSize - 1.0f));
+
+ /* Compute standard deviation and then store the result to the destination */
+ arm_sqrt_f32((meanOfSquares - squareOfMean), pResult);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ float32_t squareOfSum; /* Square of Sum */
+ float32_t var; /* Temporary varaince storage */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sumOfSquares. */
+ in = *pSrc++;
+ sumOfSquares += in * in;
+
+ /* C = (A[0] + A[1] + ... + A[blockSize-1]) */
+ /* Compute Sum of the input samples
+ * and then store the result in a temporary variable, sum. */
+ sum += in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Compute the square of sum */
+ squareOfSum = ((sum * sum) / (float32_t) blockSize);
+
+ /* Compute the variance */
+ var = ((sumOfSquares - squareOfSum) / (float32_t) (blockSize - 1.0f));
+
+ /* Compute standard deviation and then store the result to the destination */
+ arm_sqrt_f32(var, pResult);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of STD group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_std_q15.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_std_q15.c
new file mode 100644
index 0000000..c486f77
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_std_q15.c
@@ -0,0 +1,196 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_std_q15.c
+*
+* Description: Standard deviation of an array of Q15 type.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup STD
+ * @{
+ */
+
+/**
+ * @brief Standard deviation of the elements of a Q15 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult standard deviation value returned here
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * The input is represented in 1.15 format.
+ * Intermediate multiplication yields a 2.30 format, and this
+ * result is added without saturation to a 64-bit accumulator in 34.30 format.
+ * With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ * full precision of the intermediate multiplication is preserved.
+ * Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
+ * 15 bits, and then saturated to yield a result in 1.15 format.
+ */
+
+void arm_std_q15(
+ q15_t * pSrc,
+ uint32_t blockSize,
+ q15_t * pResult)
+{
+ q31_t sum = 0; /* Accumulator */
+ q31_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
+ q15_t mean; /* mean */
+ uint32_t blkCnt; /* loop counter */
+ q15_t t; /* Temporary variable */
+ q63_t sumOfSquares = 0; /* Accumulator */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t in; /* input value */
+ q15_t in1; /* input value */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sum. */
+ in = *__SIMD32(pSrc)++;
+ sum += ((in << 16) >> 16);
+ sum += (in >> 16);
+ sumOfSquares = __SMLALD(in, in, sumOfSquares);
+ in = *__SIMD32(pSrc)++;
+ sum += ((in << 16) >> 16);
+ sum += (in >> 16);
+ sumOfSquares = __SMLALD(in, in, sumOfSquares);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sum. */
+ in1 = *pSrc++;
+ sumOfSquares = __SMLALD(in1, in1, sumOfSquares);
+ sum += in1;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Mean of squares of the input samples
+ * and then store the result in a temporary variable, meanOfSquares. */
+ t = (q15_t) ((1.0 / (blockSize - 1)) * 16384LL);
+ sumOfSquares = __SSAT((sumOfSquares >> 15u), 16u);
+
+ meanOfSquares = (q31_t) ((sumOfSquares * t) >> 14u);
+
+ /* Compute mean of all input values */
+ t = (q15_t) ((1.0 / (blockSize * (blockSize - 1))) * 32768LL);
+ mean = (q15_t) __SSAT(sum, 16u);
+
+ /* Compute square of mean */
+ squareOfMean = ((q31_t) mean * mean) >> 15;
+ squareOfMean = (q31_t) (((q63_t) squareOfMean * t) >> 15);
+
+ /* mean of the squares minus the square of the mean. */
+ in1 = (q15_t) (meanOfSquares - squareOfMean);
+
+ /* Compute standard deviation and store the result to the destination */
+ arm_sqrt_q15(in1, pResult);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+ q15_t in; /* input value */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sumOfSquares. */
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ /* Compute sum of all input values and then store the result in a temporary variable, sum. */
+ sum += in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Mean of squares of the input samples
+ * and then store the result in a temporary variable, meanOfSquares. */
+ t = (q15_t) ((1.0 / (blockSize - 1)) * 16384LL);
+ sumOfSquares = __SSAT((sumOfSquares >> 15u), 16u);
+ meanOfSquares = (q31_t) ((sumOfSquares * t) >> 14u);
+
+ /* Compute mean of all input values */
+ mean = (q15_t) __SSAT(sum, 16u);
+
+ /* Compute square of mean of the input samples
+ * and then store the result in a temporary variable, squareOfMean.*/
+ t = (q15_t) ((1.0 / (blockSize * (blockSize - 1))) * 32768LL);
+ squareOfMean = ((q31_t) mean * mean) >> 15;
+ squareOfMean = (q31_t) (((q63_t) squareOfMean * t) >> 15);
+
+ /* mean of the squares minus the square of the mean. */
+ in = (q15_t) (meanOfSquares - squareOfMean);
+
+ /* Compute standard deviation and store the result to the destination */
+ arm_sqrt_q15(in, pResult);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+
+}
+
+/**
+ * @} end of STD group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_std_q31.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_std_q31.c
new file mode 100644
index 0000000..fa8d088
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_std_q31.c
@@ -0,0 +1,183 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_std_q31.c
+*
+* Description: Standard deviation of an array of Q31 type.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup STD
+ * @{
+ */
+
+
+/**
+ * @brief Standard deviation of the elements of a Q31 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult standard deviation value returned here
+ * @return none.
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ *\par
+ * The function is implemented using an internal 64-bit accumulator.
+ * The input is represented in 1.31 format, and intermediate multiplication
+ * yields a 2.62 format.
+ * The accumulator maintains full precision of the intermediate multiplication results,
+ * but provides only a single guard bit.
+ * There is no saturation on intermediate additions.
+ * If the accumulator overflows it wraps around and distorts the result.
+ * In order to avoid overflows completely the input signal must be scaled down by
+ * log2(blockSize) bits, as a total of blockSize additions are performed internally.
+ * Finally, the 2.62 accumulator is right shifted by 31 bits to yield a 1.31 format value.
+ *
+ */
+
+
+void arm_std_q31(
+ q31_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
+{
+ q63_t sum = 0; /* Accumulator */
+ q31_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
+ q31_t mean; /* mean */
+ q31_t in; /* input value */
+ q31_t t; /* Temporary variable */
+ uint32_t blkCnt; /* loop counter */
+ q63_t sumOfSquares = 0; /* Accumulator */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in;
+ sumOfSquares += ((q63_t) (in) * (in));
+ in = *pSrc++;
+ sum += in;
+ sumOfSquares += ((q63_t) (in) * (in));
+ in = *pSrc++;
+ sum += in;
+ sumOfSquares += ((q63_t) (in) * (in));
+ in = *pSrc++;
+ sum += in;
+ sumOfSquares += ((q63_t) (in) * (in));
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in;
+ sumOfSquares += ((q63_t) (in) * (in));
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ t = (q31_t) ((1.0f / (float32_t) (blockSize - 1u)) * 1073741824.0f);
+
+ /* Compute Mean of squares of the input samples
+ * and then store the result in a temporary variable, meanOfSquares. */
+ sumOfSquares = (sumOfSquares >> 31);
+ meanOfSquares = (q31_t) ((sumOfSquares * t) >> 30);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sumOfSquares. */
+ in = *pSrc++;
+ sumOfSquares += ((q63_t) (in) * (in));
+
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ /* Compute sum of all input values and then store the result in a temporary variable, sum. */
+ sum += in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Mean of squares of the input samples
+ * and then store the result in a temporary variable, meanOfSquares. */
+ t = (q31_t) ((1.0f / (float32_t) (blockSize - 1u)) * 1073741824.0f);
+ sumOfSquares = (sumOfSquares >> 31);
+ meanOfSquares = (q31_t) ((sumOfSquares * t) >> 30);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ /* Compute mean of all input values */
+ t = (q31_t) ((1.0f / (blockSize * (blockSize - 1u))) * 2147483648.0f);
+ mean = (q31_t) (sum);
+
+ /* Compute square of mean */
+ squareOfMean = (q31_t) (((q63_t) mean * mean) >> 31);
+ squareOfMean = (q31_t) (((q63_t) squareOfMean * t) >> 31);
+
+
+ /* Compute standard deviation and then store the result to the destination */
+ arm_sqrt_q31(meanOfSquares - squareOfMean, pResult);
+
+}
+
+/**
+ * @} end of STD group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_var_f32.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_var_f32.c
new file mode 100644
index 0000000..28be961
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_var_f32.c
@@ -0,0 +1,183 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_var_f32.c
+*
+* Description: Variance of the elements of a floating-point vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @defgroup variance Variance
+ *
+ * Calculates the variance of the elements in the input vector.
+ * The underlying algorithm is used:
+ *
+ *
+ *
+ * There are separate functions for floating point, Q31, and Q15 data types.
+ */
+
+/**
+ * @addtogroup variance
+ * @{
+ */
+
+
+/**
+ * @brief Variance of the elements of a floating-point vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult variance value returned here
+ * @return none.
+ *
+ */
+
+
+void arm_var_f32(
+ float32_t * pSrc,
+ uint32_t blockSize,
+ float32_t * pResult)
+{
+
+ float32_t sum = 0.0f; /* Temporary result storage */
+ float32_t sumOfSquares = 0.0f; /* Sum of squares */
+ float32_t in; /* input value */
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ float32_t meanOfSquares, mean, squareOfMean; /* Temporary variables */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in;
+ sumOfSquares += in * in;
+ in = *pSrc++;
+ sum += in;
+ sumOfSquares += in * in;
+ in = *pSrc++;
+ sum += in;
+ sumOfSquares += in * in;
+ in = *pSrc++;
+ sum += in;
+ sumOfSquares += in * in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sum. */
+ in = *pSrc++;
+ sum += in;
+ sumOfSquares += in * in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Mean of squares of the input samples
+ * and then store the result in a temporary variable, meanOfSquares. */
+ meanOfSquares = sumOfSquares / ((float32_t) blockSize - 1.0f);
+
+ /* Compute mean of all input values */
+ mean = sum / (float32_t) blockSize;
+
+ /* Compute square of mean */
+ squareOfMean = (mean * mean) * (((float32_t) blockSize) /
+ ((float32_t) blockSize - 1.0f));
+
+ /* Compute variance and then store the result to the destination */
+ *pResult = meanOfSquares - squareOfMean;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+ float32_t squareOfSum; /* Square of Sum */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sumOfSquares. */
+ in = *pSrc++;
+ sumOfSquares += in * in;
+
+ /* C = (A[0] + A[1] + ... + A[blockSize-1]) */
+ /* Compute Sum of the input samples
+ * and then store the result in a temporary variable, sum. */
+ sum += in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Compute the square of sum */
+ squareOfSum = ((sum * sum) / (float32_t) blockSize);
+
+ /* Compute the variance */
+ *pResult = ((sumOfSquares - squareOfSum) / (float32_t) (blockSize - 1.0f));
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of variance group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_var_q15.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_var_q15.c
new file mode 100644
index 0000000..08a0e98
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_var_q15.c
@@ -0,0 +1,179 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_var_q15.c
+*
+* Description: Variance of an array of Q15 type.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup variance
+ * @{
+ */
+
+/**
+ * @brief Variance of the elements of a Q15 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult variance value returned here
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ * \par
+ * The function is implemented using a 64-bit internal accumulator.
+ * The input is represented in 1.15 format.
+ * Intermediate multiplication yields a 2.30 format, and this
+ * result is added without saturation to a 64-bit accumulator in 34.30 format.
+ * With 33 guard bits in the accumulator, there is no risk of overflow, and the
+ * full precision of the intermediate multiplication is preserved.
+ * Finally, the 34.30 result is truncated to 34.15 format by discarding the lower
+ * 15 bits, and then saturated to yield a result in 1.15 format.
+ *
+ */
+
+
+void arm_var_q15(
+ q15_t * pSrc,
+ uint32_t blockSize,
+ q31_t * pResult)
+{
+ q31_t sum = 0; /* Accumulator */
+ q31_t meanOfSquares, squareOfMean; /* Mean of square and square of mean */
+ q15_t mean; /* mean */
+ uint32_t blkCnt; /* loop counter */
+ q15_t t; /* Temporary variable */
+ q63_t sumOfSquares = 0; /* Accumulator */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t in; /* Input variable */
+ q15_t in1; /* Temporary variable */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sum. */
+ in = *__SIMD32(pSrc)++;
+ sum += ((in << 16) >> 16);
+ sum += (in >> 16);
+ sumOfSquares = __SMLALD(in, in, sumOfSquares);
+ in = *__SIMD32(pSrc)++;
+ sum += ((in << 16) >> 16);
+ sum += (in >> 16);
+ sumOfSquares = __SMLALD(in, in, sumOfSquares);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sum. */
+ in1 = *pSrc++;
+ sum += in1;
+ sumOfSquares = __SMLALD(in1, in1, sumOfSquares);
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Mean of squares of the input samples
+ * and then store the result in a temporary variable, meanOfSquares. */
+ t = (q15_t) ((1.0f / (float32_t) (blockSize - 1u)) * 16384);
+ sumOfSquares = __SSAT((sumOfSquares >> 15u), 16u);
+
+ meanOfSquares = (q31_t) ((sumOfSquares * t) >> 14u);
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ q15_t in; /* Temporary variable */
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sumOfSquares. */
+ in = *pSrc++;
+ sumOfSquares += (in * in);
+
+ /* C = (A[0] + A[1] + A[2] + ... + A[blockSize-1]) */
+ /* Compute sum of all input values and then store the result in a temporary variable, sum. */
+ sum += in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* Compute Mean of squares of the input samples
+ * and then store the result in a temporary variable, meanOfSquares. */
+ t = (q15_t) ((1.0f / (float32_t) (blockSize - 1u)) * 16384);
+ sumOfSquares = __SSAT((sumOfSquares >> 15u), 16u);
+ meanOfSquares = (q31_t) ((sumOfSquares * t) >> 14u);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ /* Compute mean of all input values */
+ t = (q15_t) ((1.0f / (float32_t) (blockSize * (blockSize - 1u))) * 32768);
+ mean = __SSAT(sum, 16u);
+
+ /* Compute square of mean */
+ squareOfMean = ((q31_t) mean * mean) >> 15;
+ squareOfMean = (q31_t) (((q63_t) squareOfMean * t) >> 15);
+
+ /* Compute variance and then store the result to the destination */
+ *pResult = (meanOfSquares - squareOfMean);
+
+}
+
+/**
+ * @} end of variance group
+ */
diff --git a/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_var_q31.c b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_var_q31.c
new file mode 100644
index 0000000..a675d15
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/StatisticsFunctions/arm_var_q31.c
@@ -0,0 +1,169 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_var_q31.c
+*
+* Description: Variance of an array of Q31 type.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupStats
+ */
+
+/**
+ * @addtogroup variance
+ * @{
+ */
+
+/**
+ * @brief Variance of the elements of a Q31 vector.
+ * @param[in] *pSrc points to the input vector
+ * @param[in] blockSize length of the input vector
+ * @param[out] *pResult variance value returned here
+ * @return none.
+ *
+ * @details
+ * Scaling and Overflow Behavior:
+ *
+ *\par
+ * The function is implemented using an internal 64-bit accumulator.
+ * The input is represented in 1.31 format, and intermediate multiplication
+ * yields a 2.62 format.
+ * The accumulator maintains full precision of the intermediate multiplication results,
+ * but provides only a single guard bit.
+ * There is no saturation on intermediate additions.
+ * If the accumulator overflows it wraps around and distorts the result.
+ * In order to avoid overflows completely the input signal must be scaled down by
+ * log2(blockSize) bits, as a total of blockSize additions are performed internally.
+ * Finally, the 2.62 accumulator is right shifted by 31 bits to yield a 1.31 format value.
+ *
+ */
+
+
+void arm_var_q31(
+ q31_t * pSrc,
+ uint32_t blockSize,
+ q63_t * pResult)
+{
+ q63_t sum = 0, sumSquare = 0; /* Accumulator */
+ q31_t meanOfSquares, squareOfMean; /* square of mean and mean of square */
+ q31_t mean; /* mean */
+ q31_t in; /* input value */
+ q31_t t; /* Temporary variable */
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q63_t sumSquare1 = 0; /* Accumulator */
+ q31_t in1, in2, in3, in4; /* Temporary input variables */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sum. */
+ /* read input samples from source buffer */
+ in1 = pSrc[0];
+ in2 = pSrc[1];
+
+ /* calculate sum of inputs */
+ sum += in1;
+ /* calculate sum of squares */
+ sumSquare += ((q63_t) (in1) * (in1));
+ in3 = pSrc[2];
+ sum += in2;
+ sumSquare1 += ((q63_t) (in2) * (in2));
+ in4 = pSrc[3];
+ sum += in3;
+ sumSquare += ((q63_t) (in3) * (in3));
+ sum += in4;
+ sumSquare1 += ((q63_t) (in4) * (in4));
+
+ /* update input pointer to process next samples */
+ pSrc += 4u;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* add two accumulators */
+ sumSquare = sumSquare + sumSquare1;
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = (A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1]) */
+ /* Compute Sum of squares of the input samples
+ * and then store the result in a temporary variable, sum. */
+ in = *pSrc++;
+ sumSquare += ((q63_t) (in) * (in));
+ sum += in;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ t = (q31_t) ((1.0f / (float32_t) (blockSize - 1u)) * 1073741824.0f);
+
+ /* Compute Mean of squares of the input samples
+ * and then store the result in a temporary variable, meanOfSquares. */
+ sumSquare = (sumSquare >> 31);
+ meanOfSquares = (q31_t) ((sumSquare * t) >> 30);
+
+ /* Compute mean of all input values */
+ t = (q31_t) ((1.0f / (blockSize * (blockSize - 1u))) * 2147483648.0f);
+ mean = (q31_t) (sum);
+
+ /* Compute square of mean */
+ squareOfMean = (q31_t) (((q63_t) mean * mean) >> 31);
+ squareOfMean = (q31_t) (((q63_t) squareOfMean * t) >> 31);
+
+ /* Compute variance and then store the result to the destination */
+ *pResult = (q63_t) meanOfSquares - squareOfMean;
+
+}
+
+/**
+ * @} end of variance group
+ */
diff --git a/CMSIS/DSP_Lib/Source/SupportFunctions/arm_copy_f32.c b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_copy_f32.c
new file mode 100644
index 0000000..65b9191
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_copy_f32.c
@@ -0,0 +1,129 @@
+/* ----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_copy_f32.c
+*
+* Description: Copies the elements of a floating-point vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @defgroup copy Vector Copy
+ *
+ * Copies sample by sample from source vector to destination vector.
+ *
+ *
+ * pDst[n] = pSrc[n]; 0 <= n < blockSize.
+ *
+ *
+ * There are separate functions for floating point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ * @addtogroup copy
+ * @{
+ */
+
+/**
+ * @brief Copies the elements of a floating-point vector.
+ * @param[in] *pSrc points to input vector
+ * @param[out] *pDst points to output vector
+ * @param[in] blockSize length of the input vector
+ * @return none.
+ *
+ */
+
+
+void arm_copy_f32(
+ float32_t * pSrc,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ float32_t in1, in2, in3, in4;
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = A */
+ /* Copy and then store the results in the destination buffer */
+ in1 = *pSrc++;
+ in2 = *pSrc++;
+ in3 = *pSrc++;
+ in4 = *pSrc++;
+
+ *pDst++ = in1;
+ *pDst++ = in2;
+ *pDst++ = in3;
+ *pDst++ = in4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = A */
+ /* Copy and then store the results in the destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+
+/**
+ * @} end of BasicCopy group
+ */
diff --git a/CMSIS/DSP_Lib/Source/SupportFunctions/arm_copy_q15.c b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_copy_q15.c
new file mode 100644
index 0000000..c3a3f76
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_copy_q15.c
@@ -0,0 +1,108 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_copy_q15.c
+*
+* Description: Copies the elements of a Q15 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @addtogroup copy
+ * @{
+ */
+/**
+ * @brief Copies the elements of a Q15 vector.
+ * @param[in] *pSrc points to input vector
+ * @param[out] *pDst points to output vector
+ * @param[in] blockSize length of the input vector
+ * @return none.
+ *
+ */
+
+void arm_copy_q15(
+ q15_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = A */
+ /* Read two inputs */
+ *__SIMD32(pDst)++ = *__SIMD32(pSrc)++;
+ *__SIMD32(pDst)++ = *__SIMD32(pSrc)++;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = A */
+ /* Copy and then store the value in the destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+
+/**
+ * @} end of BasicCopy group
+ */
diff --git a/CMSIS/DSP_Lib/Source/SupportFunctions/arm_copy_q31.c b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_copy_q31.c
new file mode 100644
index 0000000..d078272
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_copy_q31.c
@@ -0,0 +1,117 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_copy_q31.c
+*
+* Description: Copies the elements of a Q31 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @addtogroup copy
+ * @{
+ */
+
+/**
+ * @brief Copies the elements of a Q31 vector.
+ * @param[in] *pSrc points to input vector
+ * @param[out] *pDst points to output vector
+ * @param[in] blockSize length of the input vector
+ * @return none.
+ *
+ */
+
+void arm_copy_q31(
+ q31_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counter */
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q31_t in1, in2, in3, in4;
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = A */
+ /* Copy and then store the values in the destination buffer */
+ in1 = *pSrc++;
+ in2 = *pSrc++;
+ in3 = *pSrc++;
+ in4 = *pSrc++;
+
+ *pDst++ = in1;
+ *pDst++ = in2;
+ *pDst++ = in3;
+ *pDst++ = in4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = A */
+ /* Copy and then store the value in the destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+
+/**
+ * @} end of BasicCopy group
+ */
diff --git a/CMSIS/DSP_Lib/Source/SupportFunctions/arm_copy_q7.c b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_copy_q7.c
new file mode 100644
index 0000000..ff24f2a
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_copy_q7.c
@@ -0,0 +1,109 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_copy_q7.c
+*
+* Description: Copies the elements of a Q7 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @addtogroup copy
+ * @{
+ */
+
+/**
+ * @brief Copies the elements of a Q7 vector.
+ * @param[in] *pSrc points to input vector
+ * @param[out] *pDst points to output vector
+ * @param[in] blockSize length of the input vector
+ * @return none.
+ *
+ */
+
+void arm_copy_q7(
+ q7_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = A */
+ /* Copy and then store the results in the destination buffer */
+ /* 4 samples are copied and stored at a time using SIMD */
+ *__SIMD32(pDst)++ = *__SIMD32(pSrc)++;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+
+ while(blkCnt > 0u)
+ {
+ /* C = A */
+ /* Copy and then store the results in the destination buffer */
+ *pDst++ = *pSrc++;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+
+/**
+ * @} end of BasicCopy group
+ */
diff --git a/CMSIS/DSP_Lib/Source/SupportFunctions/arm_fill_f32.c b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_fill_f32.c
new file mode 100644
index 0000000..d903eaa
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_fill_f32.c
@@ -0,0 +1,128 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fill_f32.c
+*
+* Description: Fills a constant value into a floating-point vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @defgroup Fill Vector Fill
+ *
+ * Fills the destination vector with a constant value.
+ *
+ *
+ * pDst[n] = value; 0 <= n < blockSize.
+ *
+ *
+ * There are separate functions for floating point, Q31, Q15, and Q7 data types.
+ */
+
+/**
+ * @addtogroup Fill
+ * @{
+ */
+
+/**
+ * @brief Fills a constant value into a floating-point vector.
+ * @param[in] value input value to be filled
+ * @param[out] *pDst points to output vector
+ * @param[in] blockSize length of the output vector
+ * @return none.
+ *
+ */
+
+
+void arm_fill_f32(
+ float32_t value,
+ float32_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ float32_t in1 = value;
+ float32_t in2 = value;
+ float32_t in3 = value;
+ float32_t in4 = value;
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = value */
+ /* Fill the value in the destination buffer */
+ *pDst++ = in1;
+ *pDst++ = in2;
+ *pDst++ = in3;
+ *pDst++ = in4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+
+ while(blkCnt > 0u)
+ {
+ /* C = value */
+ /* Fill the value in the destination buffer */
+ *pDst++ = value;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+
+/**
+ * @} end of Fill group
+ */
diff --git a/CMSIS/DSP_Lib/Source/SupportFunctions/arm_fill_q15.c b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_fill_q15.c
new file mode 100644
index 0000000..8dad149
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_fill_q15.c
@@ -0,0 +1,114 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fill_q15.c
+*
+* Description: Fills a constant value into a Q15 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @addtogroup Fill
+ * @{
+ */
+
+/**
+ * @brief Fills a constant value into a Q15 vector.
+ * @param[in] value input value to be filled
+ * @param[out] *pDst points to output vector
+ * @param[in] blockSize length of the output vector
+ * @return none.
+ *
+ */
+
+void arm_fill_q15(
+ q15_t value,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t packedValue; /* value packed to 32 bits */
+
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* Packing two 16 bit values to 32 bit value in order to use SIMD */
+ packedValue = __PKHBT(value, value, 16u);
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = value */
+ /* Fill the value in the destination buffer */
+ *__SIMD32(pDst)++ = packedValue;
+ *__SIMD32(pDst)++ = packedValue;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = value */
+ /* Fill the value in the destination buffer */
+ *pDst++ = value;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+
+/**
+ * @} end of Fill group
+ */
diff --git a/CMSIS/DSP_Lib/Source/SupportFunctions/arm_fill_q31.c b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_fill_q31.c
new file mode 100644
index 0000000..d84f161
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_fill_q31.c
@@ -0,0 +1,115 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fill_q31.c
+*
+* Description: Fills a constant value into a Q31 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @addtogroup Fill
+ * @{
+ */
+
+/**
+ * @brief Fills a constant value into a Q31 vector.
+ * @param[in] value input value to be filled
+ * @param[out] *pDst points to output vector
+ * @param[in] blockSize length of the output vector
+ * @return none.
+ *
+ */
+
+void arm_fill_q31(
+ q31_t value,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counter */
+
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+ q31_t in1 = value;
+ q31_t in2 = value;
+ q31_t in3 = value;
+ q31_t in4 = value;
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = value */
+ /* Fill the value in the destination buffer */
+ *pDst++ = in1;
+ *pDst++ = in2;
+ *pDst++ = in3;
+ *pDst++ = in4;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = value */
+ /* Fill the value in the destination buffer */
+ *pDst++ = value;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+
+/**
+ * @} end of Fill group
+ */
diff --git a/CMSIS/DSP_Lib/Source/SupportFunctions/arm_fill_q7.c b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_fill_q7.c
new file mode 100644
index 0000000..ea4e2be
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_fill_q7.c
@@ -0,0 +1,112 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_fill_q7.c
+*
+* Description: Fills a constant value into a Q7 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.7 2010/06/10
+* Misra-C changes done
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @addtogroup Fill
+ * @{
+ */
+
+/**
+ * @brief Fills a constant value into a Q7 vector.
+ * @param[in] value input value to be filled
+ * @param[out] *pDst points to output vector
+ * @param[in] blockSize length of the output vector
+ * @return none.
+ *
+ */
+
+void arm_fill_q7(
+ q7_t value,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ q31_t packedValue; /* value packed to 32 bits */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* Packing four 8 bit values to 32 bit value in order to use SIMD */
+ packedValue = __PACKq7(value, value, value, value);
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = value */
+ /* Fill the value in the destination buffer */
+ *__SIMD32(pDst)++ = packedValue;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = value */
+ /* Fill the value in the destination buffer */
+ *pDst++ = value;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+}
+
+/**
+ * @} end of Fill group
+ */
diff --git a/CMSIS/DSP_Lib/Source/SupportFunctions/arm_float_to_q15.c b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_float_to_q15.c
new file mode 100644
index 0000000..b14d11e
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_float_to_q15.c
@@ -0,0 +1,195 @@
+/* ----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_float_to_q15.c
+*
+* Description: Converts the elements of the floating-point vector to Q15 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @addtogroup float_to_x
+ * @{
+ */
+
+/**
+ * @brief Converts the elements of the floating-point vector to Q15 vector.
+ * @param[in] *pSrc points to the floating-point input vector
+ * @param[out] *pDst points to the Q15 output vector
+ * @param[in] blockSize length of the input vector
+ * @return none.
+ *
+ * \par Description:
+ * \par
+ * The equation used for the conversion process is:
+ *
+ * \par Scaling and Overflow Behavior:
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q15 range [0x8000 0x7FFF] will be saturated.
+ * \note
+ * In order to apply rounding, the library should be rebuilt with the ROUNDING macro
+ * defined in the preprocessor section of project options.
+ *
+ */
+
+
+void arm_float_to_q15(
+ float32_t * pSrc,
+ q15_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+#ifdef ARM_MATH_ROUNDING
+
+ float32_t in;
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+ /* C = A * 32768 */
+ /* convert from float to q15 and then store the results in the destination buffer */
+ in = *pIn++;
+ in = (in * 32768.0f);
+ in += in > 0 ? 0.5 : -0.5;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+ in = *pIn++;
+ in = (in * 32768.0f);
+ in += in > 0 ? 0.5 : -0.5;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+ in = *pIn++;
+ in = (in * 32768.0f);
+ in += in > 0 ? 0.5 : -0.5;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+ in = *pIn++;
+ in = (in * 32768.0f);
+ in += in > 0 ? 0.5 : -0.5;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+#else
+
+ /* C = A * 32768 */
+ /* convert from float to q15 and then store the results in the destination buffer */
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+ /* C = A * 32768 */
+ /* convert from float to q15 and then store the results in the destination buffer */
+ in = *pIn++;
+ in = (in * 32768.0f);
+ in += in > 0 ? 0.5 : -0.5;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+#else
+
+ /* C = A * 32768 */
+ /* convert from float to q15 and then store the results in the destination buffer */
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+ /* C = A * 32768 */
+ /* convert from float to q15 and then store the results in the destination buffer */
+ in = *pIn++;
+ in = (in * 32768.0f);
+ in += in > 0 ? 0.5f : -0.5f;
+ *pDst++ = (q15_t) (__SSAT((q31_t) (in), 16));
+
+#else
+
+ /* C = A * 32768 */
+ /* convert from float to q15 and then store the results in the destination buffer */
+ *pDst++ = (q15_t) __SSAT((q31_t) (*pIn++ * 32768.0f), 16);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of float_to_x group
+ */
diff --git a/CMSIS/DSP_Lib/Source/SupportFunctions/arm_float_to_q31.c b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_float_to_q31.c
new file mode 100644
index 0000000..c45720a
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_float_to_q31.c
@@ -0,0 +1,202 @@
+/* ----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_float_to_q31.c
+*
+* Description: Converts the elements of the floating-point vector to Q31 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @defgroup float_to_x Convert 32-bit floating point value
+ */
+
+/**
+ * @addtogroup float_to_x
+ * @{
+ */
+
+/**
+ * @brief Converts the elements of the floating-point vector to Q31 vector.
+ * @param[in] *pSrc points to the floating-point input vector
+ * @param[out] *pDst points to the Q31 output vector
+ * @param[in] blockSize length of the input vector
+ * @return none.
+ *
+ *\par Description:
+ * \par
+ * The equation used for the conversion process is:
+ *
+ *
+ * Scaling and Overflow Behavior:
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q31 range[0x80000000 0x7FFFFFFF] will be saturated.
+ *
+ * \note In order to apply rounding, the library should be rebuilt with the ROUNDING macro
+ * defined in the preprocessor section of project options.
+ */
+
+
+void arm_float_to_q31(
+ float32_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+#ifdef ARM_MATH_ROUNDING
+
+ float32_t in;
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+
+ /* C = A * 32768 */
+ /* convert from float to Q31 and then store the results in the destination buffer */
+ in = *pIn++;
+ in = (in * 2147483648.0f);
+ in += in > 0 ? 0.5 : -0.5;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+ in = *pIn++;
+ in = (in * 2147483648.0f);
+ in += in > 0 ? 0.5 : -0.5;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+ in = *pIn++;
+ in = (in * 2147483648.0f);
+ in += in > 0 ? 0.5 : -0.5;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+ in = *pIn++;
+ in = (in * 2147483648.0f);
+ in += in > 0 ? 0.5 : -0.5;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+#else
+
+ /* C = A * 2147483648 */
+ /* convert from float to Q31 and then store the results in the destination buffer */
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+
+ /* C = A * 2147483648 */
+ /* convert from float to Q31 and then store the results in the destination buffer */
+ in = *pIn++;
+ in = (in * 2147483648.0f);
+ in += in > 0 ? 0.5 : -0.5;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+#else
+
+ /* C = A * 2147483648 */
+ /* convert from float to Q31 and then store the results in the destination buffer */
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+
+ /* C = A * 2147483648 */
+ /* convert from float to Q31 and then store the results in the destination buffer */
+ in = *pIn++;
+ in = (in * 2147483648.0f);
+ in += in > 0 ? 0.5f : -0.5f;
+ *pDst++ = clip_q63_to_q31((q63_t) (in));
+
+#else
+
+ /* C = A * 2147483648 */
+ /* convert from float to Q31 and then store the results in the destination buffer */
+ *pDst++ = clip_q63_to_q31((q63_t) (*pIn++ * 2147483648.0f));
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of float_to_x group
+ */
diff --git a/CMSIS/DSP_Lib/Source/SupportFunctions/arm_float_to_q7.c b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_float_to_q7.c
new file mode 100644
index 0000000..ce79724
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_float_to_q7.c
@@ -0,0 +1,194 @@
+/* ----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_float_to_q7.c
+*
+* Description: Converts the elements of the floating-point vector to Q7 vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @addtogroup float_to_x
+ * @{
+ */
+
+/**
+ * @brief Converts the elements of the floating-point vector to Q7 vector.
+ * @param[in] *pSrc points to the floating-point input vector
+ * @param[out] *pDst points to the Q7 output vector
+ * @param[in] blockSize length of the input vector
+ * @return none.
+ *
+ *\par Description:
+ * \par
+ * The equation used for the conversion process is:
+ *
+ * \par Scaling and Overflow Behavior:
+ * \par
+ * The function uses saturating arithmetic.
+ * Results outside of the allowable Q7 range [0x80 0x7F] will be saturated.
+ * \note
+ * In order to apply rounding, the library should be rebuilt with the ROUNDING macro
+ * defined in the preprocessor section of project options.
+ */
+
+
+void arm_float_to_q7(
+ float32_t * pSrc,
+ q7_t * pDst,
+ uint32_t blockSize)
+{
+ float32_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+#ifdef ARM_MATH_ROUNDING
+
+ float32_t in;
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+ /* C = A * 128 */
+ /* convert from float to q7 and then store the results in the destination buffer */
+ in = *pIn++;
+ in = (in * 128);
+ in += in > 0 ? 0.5 : -0.5;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+ in = *pIn++;
+ in = (in * 128);
+ in += in > 0 ? 0.5 : -0.5;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+ in = *pIn++;
+ in = (in * 128);
+ in += in > 0 ? 0.5 : -0.5;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+ in = *pIn++;
+ in = (in * 128);
+ in += in > 0 ? 0.5 : -0.5;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+#else
+
+ /* C = A * 128 */
+ /* convert from float to q7 and then store the results in the destination buffer */
+ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+ while(blkCnt > 0u)
+ {
+
+#ifdef ARM_MATH_ROUNDING
+ /* C = A * 128 */
+ /* convert from float to q7 and then store the results in the destination buffer */
+ in = *pIn++;
+ in = (in * 128);
+ in += in > 0 ? 0.5 : -0.5;
+ *pDst++ = (q7_t) (__SSAT((q15_t) (in), 8));
+
+#else
+
+ /* C = A * 128 */
+ /* convert from float to q7 and then store the results in the destination buffer */
+ *pDst++ = __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+ while(blkCnt > 0u)
+ {
+#ifdef ARM_MATH_ROUNDING
+ /* C = A * 128 */
+ /* convert from float to q7 and then store the results in the destination buffer */
+ in = *pIn++;
+ in = (in * 128.0f);
+ in += in > 0 ? 0.5f : -0.5f;
+ *pDst++ = (q7_t) (__SSAT((q31_t) (in), 8));
+
+#else
+
+ /* C = A * 128 */
+ /* convert from float to q7 and then store the results in the destination buffer */
+ *pDst++ = (q7_t) __SSAT((q31_t) (*pIn++ * 128.0f), 8);
+
+#endif /* #ifdef ARM_MATH_ROUNDING */
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of float_to_x group
+ */
diff --git a/CMSIS/DSP_Lib/Source/SupportFunctions/arm_q15_to_float.c b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_q15_to_float.c
new file mode 100644
index 0000000..393a938
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/SupportFunctions/arm_q15_to_float.c
@@ -0,0 +1,125 @@
+/* ----------------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_q15_to_float.c
+*
+* Description: Converts the elements of the Q15 vector to floating-point vector.
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* ---------------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupSupport
+ */
+
+/**
+ * @defgroup q15_to_x Convert 16-bit Integer value
+ */
+
+/**
+ * @addtogroup q15_to_x
+ * @{
+ */
+
+
+
+
+/**
+ * @brief Converts the elements of the Q15 vector to floating-point vector.
+ * @param[in] *pSrc points to the Q15 input vector
+ * @param[out] *pDst points to the floating-point output vector
+ * @param[in] blockSize length of the input vector
+ * @return none.
+ *
+ * \par Description:
+ *
+ * The equation used for the conversion process is:
+ *
+ *
+ *
+ */
+
+
+void arm_q7_to_q31(
+ q7_t * pSrc,
+ q31_t * pDst,
+ uint32_t blockSize)
+{
+ q7_t *pIn = pSrc; /* Src pointer */
+ uint32_t blkCnt; /* loop counter */
+
+#ifndef ARM_MATH_CM0
+
+ q31_t in;
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /*loop Unrolling */
+ blkCnt = blockSize >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ while(blkCnt > 0u)
+ {
+ /* C = (q31_t) A << 24 */
+ /* convert from q7 to q31 and then store the results in the destination buffer */
+ in = *__SIMD32(pIn)++;
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ *pDst++ = (__ROR(in, 8)) & 0xFF000000;
+ *pDst++ = (__ROR(in, 16)) & 0xFF000000;
+ *pDst++ = (__ROR(in, 24)) & 0xFF000000;
+ *pDst++ = (in & 0xFF000000);
+
+#else
+
+ *pDst++ = (in & 0xFF000000);
+ *pDst++ = (__ROR(in, 24)) & 0xFF000000;
+ *pDst++ = (__ROR(in, 16)) & 0xFF000000;
+ *pDst++ = (__ROR(in, 8)) & 0xFF000000;
+
+#endif // #ifndef ARM_MATH_BIG_ENDIAN
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ blkCnt = blockSize % 0x4u;
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Loop over blockSize number of values */
+ blkCnt = blockSize;
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+ while(blkCnt > 0u)
+ {
+ /* C = (q31_t) A << 24 */
+ /* convert from q7 to q31 and then store the results in the destination buffer */
+ *pDst++ = (q31_t) * pIn++ << 24;
+
+ /* Decrement the loop counter */
+ blkCnt--;
+ }
+
+}
+
+/**
+ * @} end of q7_to_x group
+ */
diff --git a/CMSIS/DSP_Lib/Source/TransformFunctions/arm_bitreversal.c b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_bitreversal.c
new file mode 100644
index 0000000..bb02a34
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_bitreversal.c
@@ -0,0 +1,221 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_bitreversal.c
+*
+* Description: This file has common tables like Bitreverse, reciprocal etc which are used across different functions
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Initial Version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/*
+ * @brief In-place bit reversal function.
+ * @param[in, out] *pSrc points to the in-place buffer of floating-point data type.
+ * @param[in] fftSize length of the FFT.
+ * @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table.
+ * @param[in] *pBitRevTab points to the bit reversal table.
+ * @return none.
+ */
+
+void arm_bitreversal_f32(
+ float32_t * pSrc,
+ uint16_t fftSize,
+ uint16_t bitRevFactor,
+ uint16_t * pBitRevTab)
+{
+ uint16_t fftLenBy2, fftLenBy2p1;
+ uint16_t i, j;
+ float32_t in;
+
+ /* Initializations */
+ j = 0u;
+ fftLenBy2 = fftSize >> 1u;
+ fftLenBy2p1 = (fftSize >> 1u) + 1u;
+
+ /* Bit Reversal Implementation */
+ for (i = 0u; i <= (fftLenBy2 - 2u); i += 2u)
+ {
+ if(i < j)
+ {
+ /* pSrc[i] <-> pSrc[j]; */
+ in = pSrc[2u * i];
+ pSrc[2u * i] = pSrc[2u * j];
+ pSrc[2u * j] = in;
+
+ /* pSrc[i+1u] <-> pSrc[j+1u] */
+ in = pSrc[(2u * i) + 1u];
+ pSrc[(2u * i) + 1u] = pSrc[(2u * j) + 1u];
+ pSrc[(2u * j) + 1u] = in;
+
+ /* pSrc[i+fftLenBy2p1] <-> pSrc[j+fftLenBy2p1] */
+ in = pSrc[2u * (i + fftLenBy2p1)];
+ pSrc[2u * (i + fftLenBy2p1)] = pSrc[2u * (j + fftLenBy2p1)];
+ pSrc[2u * (j + fftLenBy2p1)] = in;
+
+ /* pSrc[i+fftLenBy2p1+1u] <-> pSrc[j+fftLenBy2p1+1u] */
+ in = pSrc[(2u * (i + fftLenBy2p1)) + 1u];
+ pSrc[(2u * (i + fftLenBy2p1)) + 1u] =
+ pSrc[(2u * (j + fftLenBy2p1)) + 1u];
+ pSrc[(2u * (j + fftLenBy2p1)) + 1u] = in;
+
+ }
+
+ /* pSrc[i+1u] <-> pSrc[j+1u] */
+ in = pSrc[2u * (i + 1u)];
+ pSrc[2u * (i + 1u)] = pSrc[2u * (j + fftLenBy2)];
+ pSrc[2u * (j + fftLenBy2)] = in;
+
+ /* pSrc[i+2u] <-> pSrc[j+2u] */
+ in = pSrc[(2u * (i + 1u)) + 1u];
+ pSrc[(2u * (i + 1u)) + 1u] = pSrc[(2u * (j + fftLenBy2)) + 1u];
+ pSrc[(2u * (j + fftLenBy2)) + 1u] = in;
+
+ /* Reading the index for the bit reversal */
+ j = *pBitRevTab;
+
+ /* Updating the bit reversal index depending on the fft length */
+ pBitRevTab += bitRevFactor;
+ }
+}
+
+
+
+/*
+ * @brief In-place bit reversal function.
+ * @param[in, out] *pSrc points to the in-place buffer of Q31 data type.
+ * @param[in] fftLen length of the FFT.
+ * @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
+ * @param[in] *pBitRevTab points to bit reversal table.
+ * @return none.
+ */
+
+void arm_bitreversal_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ uint16_t bitRevFactor,
+ uint16_t * pBitRevTable)
+{
+ uint32_t fftLenBy2, fftLenBy2p1, i, j;
+ q31_t in;
+
+ /* Initializations */
+ j = 0u;
+ fftLenBy2 = fftLen / 2u;
+ fftLenBy2p1 = (fftLen / 2u) + 1u;
+
+ /* Bit Reversal Implementation */
+ for (i = 0u; i <= (fftLenBy2 - 2u); i += 2u)
+ {
+ if(i < j)
+ {
+ /* pSrc[i] <-> pSrc[j]; */
+ in = pSrc[2u * i];
+ pSrc[2u * i] = pSrc[2u * j];
+ pSrc[2u * j] = in;
+
+ /* pSrc[i+1u] <-> pSrc[j+1u] */
+ in = pSrc[(2u * i) + 1u];
+ pSrc[(2u * i) + 1u] = pSrc[(2u * j) + 1u];
+ pSrc[(2u * j) + 1u] = in;
+
+ /* pSrc[i+fftLenBy2p1] <-> pSrc[j+fftLenBy2p1] */
+ in = pSrc[2u * (i + fftLenBy2p1)];
+ pSrc[2u * (i + fftLenBy2p1)] = pSrc[2u * (j + fftLenBy2p1)];
+ pSrc[2u * (j + fftLenBy2p1)] = in;
+
+ /* pSrc[i+fftLenBy2p1+1u] <-> pSrc[j+fftLenBy2p1+1u] */
+ in = pSrc[(2u * (i + fftLenBy2p1)) + 1u];
+ pSrc[(2u * (i + fftLenBy2p1)) + 1u] =
+ pSrc[(2u * (j + fftLenBy2p1)) + 1u];
+ pSrc[(2u * (j + fftLenBy2p1)) + 1u] = in;
+
+ }
+
+ /* pSrc[i+1u] <-> pSrc[j+1u] */
+ in = pSrc[2u * (i + 1u)];
+ pSrc[2u * (i + 1u)] = pSrc[2u * (j + fftLenBy2)];
+ pSrc[2u * (j + fftLenBy2)] = in;
+
+ /* pSrc[i+2u] <-> pSrc[j+2u] */
+ in = pSrc[(2u * (i + 1u)) + 1u];
+ pSrc[(2u * (i + 1u)) + 1u] = pSrc[(2u * (j + fftLenBy2)) + 1u];
+ pSrc[(2u * (j + fftLenBy2)) + 1u] = in;
+
+ /* Reading the index for the bit reversal */
+ j = *pBitRevTable;
+
+ /* Updating the bit reversal index depending on the fft length */
+ pBitRevTable += bitRevFactor;
+ }
+}
+
+
+
+/*
+ * @brief In-place bit reversal function.
+ * @param[in, out] *pSrc points to the in-place buffer of Q15 data type.
+ * @param[in] fftLen length of the FFT.
+ * @param[in] bitRevFactor bit reversal modifier that supports different size FFTs with the same bit reversal table
+ * @param[in] *pBitRevTab points to bit reversal table.
+ * @return none.
+ */
+
+void arm_bitreversal_q15(
+ q15_t * pSrc16,
+ uint32_t fftLen,
+ uint16_t bitRevFactor,
+ uint16_t * pBitRevTab)
+{
+ q31_t *pSrc = (q31_t *) pSrc16;
+ q31_t in;
+ uint32_t fftLenBy2, fftLenBy2p1;
+ uint32_t i, j;
+
+ /* Initializations */
+ j = 0u;
+ fftLenBy2 = fftLen / 2u;
+ fftLenBy2p1 = (fftLen / 2u) + 1u;
+
+ /* Bit Reversal Implementation */
+ for (i = 0u; i <= (fftLenBy2 - 2u); i += 2u)
+ {
+ if(i < j)
+ {
+ /* pSrc[i] <-> pSrc[j]; */
+ /* pSrc[i+1u] <-> pSrc[j+1u] */
+ in = pSrc[i];
+ pSrc[i] = pSrc[j];
+ pSrc[j] = in;
+
+ /* pSrc[i + fftLenBy2p1] <-> pSrc[j + fftLenBy2p1]; */
+ /* pSrc[i + fftLenBy2p1+1u] <-> pSrc[j + fftLenBy2p1+1u] */
+ in = pSrc[i + fftLenBy2p1];
+ pSrc[i + fftLenBy2p1] = pSrc[j + fftLenBy2p1];
+ pSrc[j + fftLenBy2p1] = in;
+ }
+
+ /* pSrc[i+1u] <-> pSrc[j+fftLenBy2]; */
+ /* pSrc[i+2] <-> pSrc[j+fftLenBy2+1u] */
+ in = pSrc[i + 1u];
+ pSrc[i + 1u] = pSrc[j + fftLenBy2];
+ pSrc[j + fftLenBy2] = in;
+
+ /* Reading the index for the bit reversal */
+ j = *pBitRevTab;
+
+ /* Updating the bit reversal index depending on the fft length */
+ pBitRevTab += bitRevFactor;
+ }
+}
diff --git a/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_f32.c b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_f32.c
new file mode 100644
index 0000000..c93d76d
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_f32.c
@@ -0,0 +1,510 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_cfft_radix2_f32.c
+*
+* Description: Radix-2 Decimation in Frequency CFFT & CIFFT Floating point processing function
+*
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.3 2010/11/29
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupTransforms
+ */
+
+/**
+ * @defgroup Radix2_CFFT_CIFFT Radix-2 Complex FFT Functions
+ *
+ * \par
+ * Complex Fast Fourier Transform(CFFT) and Complex Inverse Fast Fourier Transform(CIFFT) is an efficient algorithm to compute Discrete Fourier Transform(DFT) and Inverse Discrete Fourier Transform(IDFT).
+ * Computational complexity of CFFT reduces drastically when compared to DFT.
+ * \par
+ * This set of functions implements CFFT/CIFFT
+ * for Q15, Q31, and floating-point data types. The functions operates on in-place buffer which uses same buffer for input and output.
+ * Complex input is stored in input buffer in an interleaved fashion.
+ *
+ * \par
+ * The functions operate on blocks of input and output data and each call to the function processes
+ * 2*fftLen samples through the transform. pSrc points to In-place arrays containing 2*fftLen values.
+ * \par
+ * The pSrc points to the array of in-place buffer of size 2*fftLen and inputs and outputs are stored in an interleaved fashion as shown below.
+ *
{real[0], imag[0], real[1], imag[1],..}
+ *
+ * \par Lengths supported by the transform:
+ * \par
+ * Internally, the function utilize a radix-2 decimation in frequency(DIF) algorithm
+ * and the size of the FFT supported are of the lengths [16, 32, 64, 128, 256, 512, 1024, 2048, 4096].
+ *
+ *
+ * \par Algorithm:
+ *
+ * Complex Fast Fourier Transform:
+ * \par
+ * Input real and imaginary data:
+ *
+ *
+ * \par Instance Structure
+ * A separate instance structure must be defined for each Instance but the twiddle factors and bit reversal tables can be reused.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Initializes twiddle factor table and bit reversal table pointers
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Manually initialize the instance structure as follows:
+ *
+ * \par
+ * where fftLen length of CFFT/CIFFT; ifftFlag Flag for selection of CFFT or CIFFT(Set ifftFlag to calculate CIFFT otherwise calculates CFFT);
+ * bitReverseFlag Flag for selection of output order(Set bitReverseFlag to output in normal order otherwise output in bit reversed order);
+ * pTwiddlepoints to array of twiddle coefficients; pBitRevTable points to the array of bit reversal table.
+ * twidCoefModifier modifier for twiddle factor table which supports all FFT lengths with same table;
+ * pBitRevTable modifier for bit reversal table which supports all FFT lengths with same table.
+ * onebyfftLen value of 1/fftLen to calculate CIFFT;
+ *
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the fixed-point versions of the CFFT/CIFFT function.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+
+/**
+ * @addtogroup Radix2_CFFT_CIFFT
+ * @{
+ */
+
+/**
+ * @details
+ * @brief Processing function for the floating-point Radix-2 CFFT/CIFFT.
+ * @param[in] *S points to an instance of the floating-point Radix-2 CFFT/CIFFT structure.
+ * @param[in, out] *pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
+ * @return none.
+ */
+
+void arm_cfft_radix2_f32(
+ const arm_cfft_radix2_instance_f32 * S,
+ float32_t * pSrc)
+{
+
+ if(S->ifftFlag == 1u)
+ {
+ /* Complex IFFT radix-2 */
+ arm_radix2_butterfly_inverse_f32(pSrc, S->fftLen, S->pTwiddle,
+ S->twidCoefModifier, S->onebyfftLen);
+ }
+ else
+ {
+ /* Complex FFT radix-2 */
+ arm_radix2_butterfly_f32(pSrc, S->fftLen, S->pTwiddle,
+ S->twidCoefModifier);
+ }
+
+ if(S->bitReverseFlag == 1u)
+ {
+ /* Bit Reversal */
+ arm_bitreversal_f32(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+ }
+
+}
+
+
+/**
+ * @} end of Radix2_CFFT_CIFFT group
+ */
+
+
+
+/* ----------------------------------------------------------------------
+** Internal helper function used by the FFTs
+** ------------------------------------------------------------------- */
+
+/*
+ * @brief Core function for the floating-point CFFT butterfly process.
+ * @param[in, out] *pSrc points to the in-place buffer of floating-point data type.
+ * @param[in] fftLen length of the FFT.
+ * @param[in] *pCoef points to the twiddle coefficient buffer.
+ * @param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
+ * @return none.
+ */
+
+void arm_radix2_butterfly_f32(
+ float32_t * pSrc,
+ uint32_t fftLen,
+ float32_t * pCoef,
+ uint16_t twidCoefModifier)
+{
+
+ int i, j, k, l;
+ int n1, n2, ia;
+ float32_t xt, yt, cosVal, sinVal;
+
+#ifndef ARM_MATH_CM0
+
+ /* Initializations for the first stage */
+ n2 = fftLen;
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (i = 0; i < n2; i++)
+ {
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+
+ /* Twiddle coefficients index modifier */
+ ia = ia + twidCoefModifier;
+
+ /* index calculation for the input as, */
+ /* pSrc[i + 0], pSrc[i + fftLen/1] */
+ l = i + n2;
+
+ /* Butterfly implementation */
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = pSrc[2 * i] + pSrc[2 * l];
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+ pSrc[2u * l] = xt * cosVal + yt * sinVal;
+
+ pSrc[2u * l + 1u] = yt * cosVal - xt * sinVal;
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+
+ // loop for stage
+ for (k = fftLen / 2; k > 2; k = k >> 1)
+ {
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (j = 0; j < n2; j++)
+ {
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = j; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = pSrc[2 * i] + pSrc[2 * l];
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+ pSrc[2u * l] = xt * cosVal + yt * sinVal;
+
+ pSrc[2u * l + 1u] = yt * cosVal - xt * sinVal;
+
+ } // butterfly loop end
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+ } // stages loop end
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = 0; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+ pSrc[2u * l] = xt;
+
+ pSrc[2u * l + 1u] = yt;
+
+ } // groups loop end
+
+#else
+
+ //N = fftLen;
+ n2 = fftLen;
+
+ // loop for stage
+ for (k = fftLen; k > 1; k = k >> 1)
+ {
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (j = 0; j < n2; j++)
+ {
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = j; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = pSrc[2 * i] + pSrc[2 * l];
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+ pSrc[2 * l] = (cosVal * xt + sinVal * yt); // >> 15;
+ pSrc[2 * l + 1] = (cosVal * yt - sinVal * xt); // >> 15;
+
+ }
+ }
+ twidCoefModifier = twidCoefModifier << 1u;
+ }
+
+#endif // #ifndef ARM_MATH_CM0
+
+}
+
+
+void arm_radix2_butterfly_inverse_f32(
+ float32_t * pSrc,
+ uint32_t fftLen,
+ float32_t * pCoef,
+ uint16_t twidCoefModifier,
+ float32_t onebyfftLen)
+{
+
+ int i, j, k, l;
+ int n1, n2, ia;
+ float32_t xt, yt, cosVal, sinVal;
+
+#ifndef ARM_MATH_CM0
+
+ //N = fftLen;
+ n2 = fftLen;
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (i = 0; i < n2; i++)
+ {
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = pSrc[2 * i] + pSrc[2 * l];
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+ pSrc[2u * l] = xt * cosVal - yt * sinVal;
+
+ pSrc[2u * l + 1u] = yt * cosVal + xt * sinVal;
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+
+ // loop for stage
+ for (k = fftLen / 2; k > 2; k = k >> 1)
+ {
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (j = 0; j < n2; j++)
+ {
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = j; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = pSrc[2 * i] + pSrc[2 * l];
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+ pSrc[2u * l] = xt * cosVal - yt * sinVal;
+
+ pSrc[2u * l + 1u] = yt * cosVal + xt * sinVal;
+
+ } // butterfly loop end
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+ } // stages loop end
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = 0; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) * onebyfftLen;
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) * onebyfftLen;
+
+ pSrc[2u * l] = xt * onebyfftLen;
+
+ pSrc[2u * l + 1u] = yt * onebyfftLen;
+
+ } // butterfly loop end
+
+#else
+
+ //N = fftLen;
+ n2 = fftLen;
+
+ // loop for stage
+ for (k = fftLen; k > 2; k = k >> 1)
+ {
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (j = 0; j < n2; j++)
+ {
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = j; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = pSrc[2 * i] + pSrc[2 * l];
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = pSrc[2 * l + 1] + pSrc[2 * i + 1];
+
+ pSrc[2u * l] = xt * cosVal - yt * sinVal;
+
+ pSrc[2u * l + 1u] = yt * cosVal + xt * sinVal;
+
+ } // butterfly loop end
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+ } // stages loop end
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = 0; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) * onebyfftLen;
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) * onebyfftLen;
+
+ pSrc[2u * l] = xt * onebyfftLen;
+
+ pSrc[2u * l + 1u] = yt * onebyfftLen;
+
+ } // butterfly loop end
+
+#endif // #ifndef ARM_MATH_CM0
+
+}
diff --git a/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_init_f32.c b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_init_f32.c
new file mode 100644
index 0000000..35a823d
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_init_f32.c
@@ -0,0 +1,197 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_cfft_radix4_init_f32.c
+*
+* Description: Radix-4 Decimation in Frequency Floating-point CFFT & CIFFT Initialization function
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+ * @ingroup groupTransforms
+ */
+
+/**
+ * @addtogroup Radix2_CFFT_CIFFT
+ * @{
+ */
+
+/**
+* @brief Initialization function for the floating-point CFFT/CIFFT.
+* @param[in,out] *S points to an instance of the floating-point CFFT/CIFFT structure.
+* @param[in] fftLen length of the FFT.
+* @param[in] ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
+* @param[in] bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
+* @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if fftLen is not a supported value.
+*
+* \par Description:
+* \par
+* The parameter ifftFlag controls whether a forward or inverse transform is computed.
+* Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+* \par
+* The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+* Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+* \par
+* The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
+* \par
+* This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+*/
+arm_status arm_cfft_radix2_init_f32(
+ arm_cfft_radix2_instance_f32 * S,
+ uint16_t fftLen,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_SUCCESS;
+
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+
+ /* Initialise the Twiddle coefficient pointer */
+ S->pTwiddle = (float32_t *) twiddleCoef;
+
+ /* Initialise the Flag for selection of CFFT or CIFFT */
+ S->ifftFlag = ifftFlag;
+
+ /* Initialise the Flag for calculation Bit reversal or not */
+ S->bitReverseFlag = bitReverseFlag;
+
+ /* Initializations of structure parameters depending on the FFT length */
+ switch (S->fftLen)
+ {
+
+ case 4096u:
+ /* Initializations of structure parameters for 4096 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 1u;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 1u;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) armBitRevTable;
+ /* Initialise the 1/fftLen Value */
+ S->onebyfftLen = 0.000244140625;
+ break;
+
+ case 2048u:
+ /* Initializations of structure parameters for 2048 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 2u;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 2u;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[1];
+ /* Initialise the 1/fftLen Value */
+ S->onebyfftLen = 0.00048828125;
+ break;
+
+ case 1024u:
+ /* Initializations of structure parameters for 1024 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 4u;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 4u;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+ /* Initialise the 1/fftLen Value */
+ S->onebyfftLen = 0.0009765625f;
+ break;
+
+ case 512u:
+ /* Initializations of structure parameters for 512 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 8u;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 8u;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[7];
+ /* Initialise the 1/fftLen Value */
+ S->onebyfftLen = 0.001953125;
+ break;
+
+ case 256u:
+ /* Initializations of structure parameters for 256 point FFT */
+ S->twidCoefModifier = 16u;
+ S->bitRevFactor = 16u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+ S->onebyfftLen = 0.00390625f;
+ break;
+
+ case 128u:
+ /* Initializations of structure parameters for 128 point FFT */
+ S->twidCoefModifier = 32u;
+ S->bitRevFactor = 32u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[31];
+ S->onebyfftLen = 0.0078125;
+ break;
+
+ case 64u:
+ /* Initializations of structure parameters for 64 point FFT */
+ S->twidCoefModifier = 64u;
+ S->bitRevFactor = 64u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+ S->onebyfftLen = 0.015625f;
+ break;
+
+ case 32u:
+ /* Initializations of structure parameters for 64 point FFT */
+ S->twidCoefModifier = 128u;
+ S->bitRevFactor = 128u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[127];
+ S->onebyfftLen = 0.03125;
+ break;
+
+ case 16u:
+ /* Initializations of structure parameters for 16 point FFT */
+ S->twidCoefModifier = 256u;
+ S->bitRevFactor = 256u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+ S->onebyfftLen = 0.0625f;
+ break;
+
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+ return (status);
+}
+
+/**
+ * @} end of Radix2_CFFT_CIFFT group
+ */
diff --git a/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_init_q15.c b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_init_q15.c
new file mode 100644
index 0000000..d8488c4
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_init_q15.c
@@ -0,0 +1,185 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_cfft_radix2_init_q15.c
+*
+* Description: Radix-2 Decimation in Frequency Q15 FFT & IFFT initialization function
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+ * @ingroup groupTransforms
+ */
+
+
+/**
+ * @addtogroup Radix2_CFFT_CIFFT
+ * @{
+ */
+
+/**
+* @brief Initialization function for the Q15 CFFT/CIFFT.
+* @param[in,out] *S points to an instance of the Q15 CFFT/CIFFT structure.
+* @param[in] fftLen length of the FFT.
+* @param[in] ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
+* @param[in] bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
+* @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if fftLen is not a supported value.
+*
+* \par Description:
+* \par
+* The parameter ifftFlag controls whether a forward or inverse transform is computed.
+* Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+* \par
+* The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+* Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+* \par
+* The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
+* \par
+* This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+*/
+
+arm_status arm_cfft_radix2_init_q15(
+ arm_cfft_radix2_instance_q15 * S,
+ uint16_t fftLen,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_SUCCESS;
+
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+
+ /* Initialise the Twiddle coefficient pointer */
+ S->pTwiddle = (q15_t *) twiddleCoefQ15;
+ /* Initialise the Flag for selection of CFFT or CIFFT */
+ S->ifftFlag = ifftFlag;
+ /* Initialise the Flag for calculation Bit reversal or not */
+ S->bitReverseFlag = bitReverseFlag;
+
+ /* Initializations of structure parameters depending on the FFT length */
+ switch (S->fftLen)
+ {
+ case 4096u:
+ /* Initializations of structure parameters for 4096 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 1u;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 1u;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) armBitRevTable;
+
+ break;
+
+ case 2048u:
+ /* Initializations of structure parameters for 2048 point FFT */
+
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 2u;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 2u;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[1];
+
+ break;
+
+ case 1024u:
+ /* Initializations of structure parameters for 1024 point FFT */
+ S->twidCoefModifier = 4u;
+ S->bitRevFactor = 4u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+
+ break;
+
+ case 512u:
+ /* Initializations of structure parameters for 512 point FFT */
+ S->twidCoefModifier = 8u;
+ S->bitRevFactor = 8u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[7];
+
+ break;
+
+ case 256u:
+ /* Initializations of structure parameters for 256 point FFT */
+ S->twidCoefModifier = 16u;
+ S->bitRevFactor = 16u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+
+ break;
+
+ case 128u:
+ /* Initializations of structure parameters for 128 point FFT */
+ S->twidCoefModifier = 32u;
+ S->bitRevFactor = 32u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[31];
+
+ break;
+
+ case 64u:
+ /* Initializations of structure parameters for 64 point FFT */
+ S->twidCoefModifier = 64u;
+ S->bitRevFactor = 64u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+
+ break;
+
+ case 32u:
+ /* Initializations of structure parameters for 32 point FFT */
+ S->twidCoefModifier = 128u;
+ S->bitRevFactor = 128u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[127];
+
+ break;
+
+ case 16u:
+ /* Initializations of structure parameters for 16 point FFT */
+ S->twidCoefModifier = 256u;
+ S->bitRevFactor = 256u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+
+ break;
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+ return (status);
+}
+
+/**
+ * @} end of Radix2_CFFT_CIFFT group
+ */
diff --git a/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_init_q31.c b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_init_q31.c
new file mode 100644
index 0000000..2b79382
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_init_q31.c
@@ -0,0 +1,163 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_cfft_radix2_init_q31.c
+*
+* Description: Radix-2 Decimation in Frequency Fixed-point CFFT & CIFFT Initialization function
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* -------------------------------------------------------------------- */
+
+
+#include "arm_math.h"
+#include "arm_common_tables.h"
+
+/**
+ * @ingroup groupTransforms
+ */
+
+/**
+ * @addtogroup Radix2_CFFT_CIFFT
+ * @{
+ */
+
+
+/**
+*
+* @brief Initialization function for the Q31 CFFT/CIFFT.
+* @param[in,out] *S points to an instance of the Q31 CFFT/CIFFT structure.
+* @param[in] fftLen length of the FFT.
+* @param[in] ifftFlag flag that selects forward (ifftFlag=0) or inverse (ifftFlag=1) transform.
+* @param[in] bitReverseFlag flag that enables (bitReverseFlag=1) or disables (bitReverseFlag=0) bit reversal of output.
+* @return The function returns ARM_MATH_SUCCESS if initialization is successful or ARM_MATH_ARGUMENT_ERROR if fftLen is not a supported value.
+*
+* \par Description:
+* \par
+* The parameter ifftFlag controls whether a forward or inverse transform is computed.
+* Set(=1) ifftFlag for calculation of CIFFT otherwise CFFT is calculated
+* \par
+* The parameter bitReverseFlag controls whether output is in normal order or bit reversed order.
+* Set(=1) bitReverseFlag for output to be in normal order otherwise output is in bit reversed order.
+* \par
+* The parameter fftLen Specifies length of CFFT/CIFFT process. Supported FFT Lengths are 16, 64, 256, 1024.
+* \par
+* This Function also initializes Twiddle factor table pointer and Bit reversal table pointer.
+*/
+
+arm_status arm_cfft_radix2_init_q31(
+ arm_cfft_radix2_instance_q31 * S,
+ uint16_t fftLen,
+ uint8_t ifftFlag,
+ uint8_t bitReverseFlag)
+{
+ /* Initialise the default arm status */
+ arm_status status = ARM_MATH_SUCCESS;
+
+ /* Initialise the FFT length */
+ S->fftLen = fftLen;
+
+ /* Initialise the Twiddle coefficient pointer */
+ S->pTwiddle = (q31_t *) twiddleCoefQ31;
+ /* Initialise the Flag for selection of CFFT or CIFFT */
+ S->ifftFlag = ifftFlag;
+ /* Initialise the Flag for calculation Bit reversal or not */
+ S->bitReverseFlag = bitReverseFlag;
+
+ /* Initializations of Instance structure depending on the FFT length */
+ switch (S->fftLen)
+ {
+ /* Initializations of structure parameters for 4096 point FFT */
+ case 4096u:
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 1u;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 1u;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) armBitRevTable;
+ break;
+
+ /* Initializations of structure parameters for 2048 point FFT */
+ case 2048u:
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 2u;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 2u;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[1];
+ break;
+
+ /* Initializations of structure parameters for 1024 point FFT */
+ case 1024u:
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 4u;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 4u;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[3];
+ break;
+
+ /* Initializations of structure parameters for 512 point FFT */
+ case 512u:
+ /* Initialise the twiddle coef modifier value */
+ S->twidCoefModifier = 8u;
+ /* Initialise the bit reversal table modifier */
+ S->bitRevFactor = 8u;
+ /* Initialise the bit reversal table pointer */
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[7];
+ break;
+
+ case 256u:
+ /* Initializations of structure parameters for 256 point FFT */
+ S->twidCoefModifier = 16u;
+ S->bitRevFactor = 16u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[15];
+ break;
+
+ case 128u:
+ /* Initializations of structure parameters for 128 point FFT */
+ S->twidCoefModifier = 32u;
+ S->bitRevFactor = 32u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[31];
+ break;
+
+ case 64u:
+ /* Initializations of structure parameters for 64 point FFT */
+ S->twidCoefModifier = 64u;
+ S->bitRevFactor = 64u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[63];
+ break;
+
+ case 32u:
+ /* Initializations of structure parameters for 32 point FFT */
+ S->twidCoefModifier = 128u;
+ S->bitRevFactor = 128u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[127];
+ break;
+
+ case 16u:
+ /* Initializations of structure parameters for 16 point FFT */
+ S->twidCoefModifier = 256u;
+ S->bitRevFactor = 256u;
+ S->pBitRevTable = (uint16_t *) & armBitRevTable[255];
+ break;
+
+
+ default:
+ /* Reporting argument error if fftSize is not valid value */
+ status = ARM_MATH_ARGUMENT_ERROR;
+ break;
+ }
+
+ return (status);
+}
+
+/**
+ * @} end of Radix2_CFFT_CIFFT group
+ */
diff --git a/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_q15.c b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_q15.c
new file mode 100644
index 0000000..3dcc599
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_q15.c
@@ -0,0 +1,711 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_cfft_radix2_q15.c
+*
+* Description: Radix-2 Decimation in Frequency CFFT & CIFFT Fixed point processing function
+*
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupTransforms
+ */
+
+/**
+ * @defgroup Radix2_CFFT_CIFFT Radix-2 Complex FFT Functions
+ *
+ * \par
+ * Complex Fast Fourier Transform(CFFT) and Complex Inverse Fast Fourier Transform(CIFFT) is an efficient algorithm to compute Discrete Fourier Transform(DFT) and Inverse Discrete Fourier Transform(IDFT).
+ * Computational complexity of CFFT reduces drastically when compared to DFT.
+ */
+
+
+/**
+ * @addtogroup Radix2_CFFT_CIFFT
+ * @{
+ */
+
+/**
+ * @details
+ * @brief Processing function for the fixed-point CFFT/CIFFT.
+ * @param[in] *S points to an instance of the fixed-point CFFT/CIFFT structure.
+ * @param[in, out] *pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
+ * @return none.
+ */
+
+void arm_cfft_radix2_q15(
+ const arm_cfft_radix2_instance_q15 * S,
+ q15_t * pSrc)
+{
+
+ if(S->ifftFlag == 1u)
+ {
+ arm_radix2_butterfly_inverse_q15(pSrc, S->fftLen,
+ S->pTwiddle, S->twidCoefModifier);
+ }
+ else
+ {
+ arm_radix2_butterfly_q15(pSrc, S->fftLen,
+ S->pTwiddle, S->twidCoefModifier);
+ }
+
+ arm_bitreversal_q15(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+}
+
+/**
+ * @} end of Radix2_CFFT_CIFFT group
+ */
+
+void arm_radix2_butterfly_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ q15_t * pCoef,
+ uint16_t twidCoefModifier)
+{
+#ifndef ARM_MATH_CM0
+
+ int i, j, k, l;
+ int n1, n2, ia;
+ q15_t in;
+ q31_t T, S, R;
+ q31_t coeff, out1, out2;
+
+ //N = fftLen;
+ n2 = fftLen;
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (i = 0; i < n2; i++)
+ {
+ coeff = _SIMD32_OFFSET(pCoef + (ia * 2u));
+
+ ia = ia + twidCoefModifier;
+
+ l = i + n2;
+
+ T = _SIMD32_OFFSET(pSrc + (2 * i));
+ in = ((int16_t) (T & 0xFFFF)) >> 2;
+ T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+
+ S = _SIMD32_OFFSET(pSrc + (2 * l));
+ in = ((int16_t) (S & 0xFFFF)) >> 2;
+ S = ((S >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+
+ R = __QSUB16(T, S);
+
+ _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ out1 = __SMUAD(coeff, R) >> 16;
+ out2 = __SMUSDX(coeff, R);
+
+#else
+
+ out1 = __SMUSDX(R, coeff) >> 16u;
+ out2 = __SMUAD(coeff, R);
+
+#endif // #ifndef ARM_MATH_BIG_ENDIAN
+
+ _SIMD32_OFFSET(pSrc + (2u * l)) =
+ (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+
+ coeff = _SIMD32_OFFSET(pCoef + (ia * 2u));
+
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ i++;
+ l++;
+
+ T = _SIMD32_OFFSET(pSrc + (2 * i));
+ in = ((int16_t) (T & 0xFFFF)) >> 2;
+ T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+
+ S = _SIMD32_OFFSET(pSrc + (2 * l));
+ in = ((int16_t) (S & 0xFFFF)) >> 2;
+ S = ((S >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+
+ R = __QSUB16(T, S);
+
+ _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ out1 = __SMUAD(coeff, R) >> 16;
+ out2 = __SMUSDX(coeff, R);
+
+#else
+
+ out1 = __SMUSDX(R, coeff) >> 16u;
+ out2 = __SMUAD(coeff, R);
+
+#endif // #ifndef ARM_MATH_BIG_ENDIAN
+
+ _SIMD32_OFFSET(pSrc + (2u * l)) =
+ (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+
+ // loop for stage
+ for (k = fftLen / 2; k > 2; k = k >> 1)
+ {
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (j = 0; j < n2; j++)
+ {
+ coeff = _SIMD32_OFFSET(pCoef + (ia * 2u));
+
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = j; i < fftLen; i += n1)
+ {
+ l = i + n2;
+
+ T = _SIMD32_OFFSET(pSrc + (2 * i));
+
+ S = _SIMD32_OFFSET(pSrc + (2 * l));
+
+ R = __QSUB16(T, S);
+
+ _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ out1 = __SMUAD(coeff, R) >> 16;
+ out2 = __SMUSDX(coeff, R);
+
+#else
+
+ out1 = __SMUSDX(R, coeff) >> 16u;
+ out2 = __SMUAD(coeff, R);
+
+#endif // #ifndef ARM_MATH_BIG_ENDIAN
+
+ _SIMD32_OFFSET(pSrc + (2u * l)) =
+ (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+
+ i += n1;
+
+ l = i + n2;
+
+ T = _SIMD32_OFFSET(pSrc + (2 * i));
+
+ S = _SIMD32_OFFSET(pSrc + (2 * l));
+
+ R = __QSUB16(T, S);
+
+ _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ out1 = __SMUAD(coeff, R) >> 16;
+ out2 = __SMUSDX(coeff, R);
+
+#else
+
+ out1 = __SMUSDX(R, coeff) >> 16u;
+ out2 = __SMUAD(coeff, R);
+
+#endif // #ifndef ARM_MATH_BIG_ENDIAN
+
+ _SIMD32_OFFSET(pSrc + (2u * l)) =
+ (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+
+ } // butterfly loop end
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+ } // stages loop end
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ coeff = _SIMD32_OFFSET(pCoef + (ia * 2u));
+
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = 0; i < fftLen; i += n1)
+ {
+ l = i + n2;
+
+ T = _SIMD32_OFFSET(pSrc + (2 * i));
+
+ S = _SIMD32_OFFSET(pSrc + (2 * l));
+
+ R = __QSUB16(T, S);
+
+ _SIMD32_OFFSET(pSrc + (2 * i)) = __QADD16(T, S);
+
+ _SIMD32_OFFSET(pSrc + (2u * l)) = R;
+
+ i += n1;
+ l = i + n2;
+
+ T = _SIMD32_OFFSET(pSrc + (2 * i));
+
+ S = _SIMD32_OFFSET(pSrc + (2 * l));
+
+ R = __QSUB16(T, S);
+
+ _SIMD32_OFFSET(pSrc + (2 * i)) = __QADD16(T, S);
+
+ _SIMD32_OFFSET(pSrc + (2u * l)) = R;
+
+ } // groups loop end
+
+
+#else
+
+ int i, j, k, l;
+ int n1, n2, ia;
+ q15_t xt, yt, cosVal, sinVal;
+
+
+ //N = fftLen;
+ n2 = fftLen;
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (j = 0; j < n2; j++)
+ {
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = j; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = (pSrc[2 * i] >> 2u) - (pSrc[2 * l] >> 2u);
+ pSrc[2 * i] = ((pSrc[2 * i] >> 2u) + (pSrc[2 * l] >> 2u)) >> 1u;
+
+ yt = (pSrc[2 * i + 1] >> 2u) - (pSrc[2 * l + 1] >> 2u);
+ pSrc[2 * i + 1] =
+ ((pSrc[2 * l + 1] >> 2u) + (pSrc[2 * i + 1] >> 2u)) >> 1u;
+
+ pSrc[2u * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) +
+ ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+
+ pSrc[2u * l + 1u] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) -
+ ((int16_t) (((q31_t) xt * sinVal) >> 16)));
+
+ } // butterfly loop end
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+
+ // loop for stage
+ for (k = fftLen / 2; k > 2; k = k >> 1)
+ {
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (j = 0; j < n2; j++)
+ {
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = j; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1u;
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1u;
+
+ pSrc[2u * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) +
+ ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+
+ pSrc[2u * l + 1u] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) -
+ ((int16_t) (((q31_t) xt * sinVal) >> 16)));
+
+ } // butterfly loop end
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+ } // stages loop end
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (j = 0; j < n2; j++)
+ {
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = j; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+ pSrc[2u * l] = xt;
+
+ pSrc[2u * l + 1u] = yt;
+
+ } // butterfly loop end
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+
+#endif // #ifndef ARM_MATH_CM0
+
+}
+
+
+void arm_radix2_butterfly_inverse_q15(
+ q15_t * pSrc,
+ uint32_t fftLen,
+ q15_t * pCoef,
+ uint16_t twidCoefModifier)
+{
+#ifndef ARM_MATH_CM0
+
+ int i, j, k, l;
+ int n1, n2, ia;
+ q15_t in;
+ q31_t T, S, R;
+ q31_t coeff, out1, out2;
+
+ //N = fftLen;
+ n2 = fftLen;
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (i = 0; i < n2; i++)
+ {
+ coeff = _SIMD32_OFFSET(pCoef + (ia * 2u));
+
+ ia = ia + twidCoefModifier;
+
+ l = i + n2;
+
+ T = _SIMD32_OFFSET(pSrc + (2 * i));
+ in = ((int16_t) (T & 0xFFFF)) >> 2;
+ T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+
+ S = _SIMD32_OFFSET(pSrc + (2 * l));
+ in = ((int16_t) (S & 0xFFFF)) >> 2;
+ S = ((S >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+
+ R = __QSUB16(T, S);
+
+ _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ out1 = __SMUSD(coeff, R) >> 16;
+ out2 = __SMUADX(coeff, R);
+#else
+
+ out1 = __SMUADX(R, coeff) >> 16u;
+ out2 = __SMUSD(__QSUB(0, coeff), R);
+
+#endif // #ifndef ARM_MATH_BIG_ENDIAN
+
+ _SIMD32_OFFSET(pSrc + (2u * l)) =
+ (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+
+ coeff = _SIMD32_OFFSET(pCoef + (ia * 2u));
+
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ i++;
+ l++;
+
+ T = _SIMD32_OFFSET(pSrc + (2 * i));
+ in = ((int16_t) (T & 0xFFFF)) >> 2;
+ T = ((T >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+
+ S = _SIMD32_OFFSET(pSrc + (2 * l));
+ in = ((int16_t) (S & 0xFFFF)) >> 2;
+ S = ((S >> 2) & 0xFFFF0000) | (in & 0xFFFF);
+
+ R = __QSUB16(T, S);
+
+ _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ out1 = __SMUSD(coeff, R) >> 16;
+ out2 = __SMUADX(coeff, R);
+#else
+
+ out1 = __SMUADX(R, coeff) >> 16u;
+ out2 = __SMUSD(__QSUB(0, coeff), R);
+
+#endif // #ifndef ARM_MATH_BIG_ENDIAN
+
+ _SIMD32_OFFSET(pSrc + (2u * l)) =
+ (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+
+ // loop for stage
+ for (k = fftLen / 2; k > 2; k = k >> 1)
+ {
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (j = 0; j < n2; j++)
+ {
+ coeff = _SIMD32_OFFSET(pCoef + (ia * 2u));
+
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = j; i < fftLen; i += n1)
+ {
+ l = i + n2;
+
+ T = _SIMD32_OFFSET(pSrc + (2 * i));
+
+ S = _SIMD32_OFFSET(pSrc + (2 * l));
+
+ R = __QSUB16(T, S);
+
+ _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ out1 = __SMUSD(coeff, R) >> 16;
+ out2 = __SMUADX(coeff, R);
+
+#else
+
+ out1 = __SMUADX(R, coeff) >> 16u;
+ out2 = __SMUSD(__QSUB(0, coeff), R);
+
+#endif // #ifndef ARM_MATH_BIG_ENDIAN
+
+ _SIMD32_OFFSET(pSrc + (2u * l)) =
+ (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+
+ i += n1;
+
+ l = i + n2;
+
+ T = _SIMD32_OFFSET(pSrc + (2 * i));
+
+ S = _SIMD32_OFFSET(pSrc + (2 * l));
+
+ R = __QSUB16(T, S);
+
+ _SIMD32_OFFSET(pSrc + (2 * i)) = __SHADD16(T, S);
+
+#ifndef ARM_MATH_BIG_ENDIAN
+
+ out1 = __SMUSD(coeff, R) >> 16;
+ out2 = __SMUADX(coeff, R);
+#else
+
+ out1 = __SMUADX(R, coeff) >> 16u;
+ out2 = __SMUSD(__QSUB(0, coeff), R);
+
+#endif // #ifndef ARM_MATH_BIG_ENDIAN
+
+ _SIMD32_OFFSET(pSrc + (2u * l)) =
+ (q31_t) ((out2) & 0xFFFF0000) | (out1 & 0x0000FFFF);
+
+ } // butterfly loop end
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+ } // stages loop end
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (j = 0; j < n2; j++)
+ {
+ coeff = _SIMD32_OFFSET(pCoef + (ia * 2u));
+
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = j; i < fftLen; i += n1)
+ {
+ l = i + n2;
+
+ T = _SIMD32_OFFSET(pSrc + (2 * i));
+
+ S = _SIMD32_OFFSET(pSrc + (2 * l));
+
+ R = __QSUB16(T, S);
+
+ _SIMD32_OFFSET(pSrc + (2 * i)) = __QADD16(T, S);
+
+ _SIMD32_OFFSET(pSrc + (2u * l)) = R;
+
+ } // butterfly loop end
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+
+#else
+
+
+ int i, j, k, l;
+ int n1, n2, ia;
+ q15_t xt, yt, cosVal, sinVal;
+
+ //N = fftLen;
+ n2 = fftLen;
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (j = 0; j < n2; j++)
+ {
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = j; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = (pSrc[2 * i] >> 2u) - (pSrc[2 * l] >> 2u);
+ pSrc[2 * i] = ((pSrc[2 * i] >> 2u) + (pSrc[2 * l] >> 2u)) >> 1u;
+
+ yt = (pSrc[2 * i + 1] >> 2u) - (pSrc[2 * l + 1] >> 2u);
+ pSrc[2 * i + 1] =
+ ((pSrc[2 * l + 1] >> 2u) + (pSrc[2 * i + 1] >> 2u)) >> 1u;
+
+ pSrc[2u * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) -
+ ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+
+ pSrc[2u * l + 1u] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) +
+ ((int16_t) (((q31_t) xt * sinVal) >> 16)));
+
+ } // butterfly loop end
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+
+ // loop for stage
+ for (k = fftLen / 2; k > 2; k = k >> 1)
+ {
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (j = 0; j < n2; j++)
+ {
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = j; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1u;
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1u;
+
+ pSrc[2u * l] = (((int16_t) (((q31_t) xt * cosVal) >> 16)) -
+ ((int16_t) (((q31_t) yt * sinVal) >> 16)));
+
+ pSrc[2u * l + 1u] = (((int16_t) (((q31_t) yt * cosVal) >> 16)) +
+ ((int16_t) (((q31_t) xt * sinVal) >> 16)));
+
+ } // butterfly loop end
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+ } // stages loop end
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = 0; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+ pSrc[2u * l] = xt;
+
+ pSrc[2u * l + 1u] = yt;
+
+ } // groups loop end
+
+
+#endif // #ifndef ARM_MATH_CM0
+
+}
diff --git a/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_q31.c b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_q31.c
new file mode 100644
index 0000000..9f4d7fb
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix2_q31.c
@@ -0,0 +1,309 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_cfft_radix2_q31.c
+*
+* Description: Radix-2 Decimation in Frequency CFFT & CIFFT Fixed point processing function
+*
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupTransforms
+ */
+
+/**
+ * @defgroup Radix2_CFFT_CIFFT Radix-2 Complex FFT Functions
+ *
+ * \par
+ * Complex Fast Fourier Transform(CFFT) and Complex Inverse Fast Fourier Transform(CIFFT) is an efficient algorithm to compute Discrete Fourier Transform(DFT) and Inverse Discrete Fourier Transform(IDFT).
+ * Computational complexity of CFFT reduces drastically when compared to DFT.
+ */
+
+
+/**
+ * @addtogroup Radix2_CFFT_CIFFT
+ * @{
+ */
+
+/**
+ * @details
+ * @brief Processing function for the fixed-point CFFT/CIFFT.
+ * @param[in] *S points to an instance of the fixed-point CFFT/CIFFT structure.
+ * @param[in, out] *pSrc points to the complex data buffer of size 2*fftLen. Processing occurs in-place.
+ * @return none.
+ */
+
+void arm_cfft_radix2_q31(
+ const arm_cfft_radix2_instance_q31 * S,
+ q31_t * pSrc)
+{
+
+ if(S->ifftFlag == 1u)
+ {
+ arm_radix2_butterfly_inverse_q31(pSrc, S->fftLen,
+ S->pTwiddle, S->twidCoefModifier);
+ }
+ else
+ {
+ arm_radix2_butterfly_q31(pSrc, S->fftLen,
+ S->pTwiddle, S->twidCoefModifier);
+ }
+
+ arm_bitreversal_q31(pSrc, S->fftLen, S->bitRevFactor, S->pBitRevTable);
+}
+
+/**
+ * @} end of Radix2_CFFT_CIFFT group
+ */
+
+void arm_radix2_butterfly_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ q31_t * pCoef,
+ uint16_t twidCoefModifier)
+{
+
+ int i, j, k, l;
+ int n1, n2, ia;
+ q31_t xt, yt, cosVal, sinVal;
+
+ //N = fftLen;
+ n2 = fftLen;
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (i = 0; i < n2; i++)
+ {
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ l = i + n2;
+ xt = (pSrc[2 * i] >> 2u) - (pSrc[2 * l] >> 2u);
+ pSrc[2 * i] = ((pSrc[2 * i] >> 2u) + (pSrc[2 * l] >> 2u)) >> 1u;
+
+ yt = (pSrc[2 * i + 1] >> 2u) - (pSrc[2 * l + 1] >> 2u);
+ pSrc[2 * i + 1] =
+ ((pSrc[2 * l + 1] >> 2u) + (pSrc[2 * i + 1] >> 2u)) >> 1u;
+
+ pSrc[2u * l] = (((int32_t) (((q63_t) xt * cosVal) >> 32)) +
+ ((int32_t) (((q63_t) yt * sinVal) >> 32)));
+
+ pSrc[2u * l + 1u] = (((int32_t) (((q63_t) yt * cosVal) >> 32)) -
+ ((int32_t) (((q63_t) xt * sinVal) >> 32)));
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+
+ // loop for stage
+ for (k = fftLen / 2; k > 2; k = k >> 1)
+ {
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (j = 0; j < n2; j++)
+ {
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = j; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1u;
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1u;
+
+ pSrc[2u * l] = (((int32_t) (((q63_t) xt * cosVal) >> 32)) +
+ ((int32_t) (((q63_t) yt * sinVal) >> 32)));
+
+ pSrc[2u * l + 1u] = (((int32_t) (((q63_t) yt * cosVal) >> 32)) -
+ ((int32_t) (((q63_t) xt * sinVal) >> 32)));
+
+ } // butterfly loop end
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+ } // stages loop end
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = 0; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+ pSrc[2u * l] = xt;
+
+ pSrc[2u * l + 1u] = yt;
+
+ i += n1;
+ l = i + n2;
+
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+ pSrc[2u * l] = xt;
+
+ pSrc[2u * l + 1u] = yt;
+
+ } // butterfly loop end
+
+}
+
+
+void arm_radix2_butterfly_inverse_q31(
+ q31_t * pSrc,
+ uint32_t fftLen,
+ q31_t * pCoef,
+ uint16_t twidCoefModifier)
+{
+
+ int i, j, k, l;
+ int n1, n2, ia;
+ q31_t xt, yt, cosVal, sinVal;
+
+ //N = fftLen;
+ n2 = fftLen;
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (i = 0; i < n2; i++)
+ {
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ l = i + n2;
+ xt = (pSrc[2 * i] >> 2u) - (pSrc[2 * l] >> 2u);
+ pSrc[2 * i] = ((pSrc[2 * i] >> 2u) + (pSrc[2 * l] >> 2u)) >> 1u;
+
+ yt = (pSrc[2 * i + 1] >> 2u) - (pSrc[2 * l + 1] >> 2u);
+ pSrc[2 * i + 1] =
+ ((pSrc[2 * l + 1] >> 2u) + (pSrc[2 * i + 1] >> 2u)) >> 1u;
+
+ pSrc[2u * l] = (((int32_t) (((q63_t) xt * cosVal) >> 32)) -
+ ((int32_t) (((q63_t) yt * sinVal) >> 32)));
+
+ pSrc[2u * l + 1u] = (((int32_t) (((q63_t) yt * cosVal) >> 32)) +
+ ((int32_t) (((q63_t) xt * sinVal) >> 32)));
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+
+ // loop for stage
+ for (k = fftLen / 2; k > 2; k = k >> 1)
+ {
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ // loop for groups
+ for (j = 0; j < n2; j++)
+ {
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = j; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]) >> 1u;
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]) >> 1u;
+
+ pSrc[2u * l] = (((int32_t) (((q63_t) xt * cosVal) >> 32)) -
+ ((int32_t) (((q63_t) yt * sinVal) >> 32)));
+
+ pSrc[2u * l + 1u] = (((int32_t) (((q63_t) yt * cosVal) >> 32)) +
+ ((int32_t) (((q63_t) xt * sinVal) >> 32)));
+
+ } // butterfly loop end
+
+ } // groups loop end
+
+ twidCoefModifier = twidCoefModifier << 1u;
+ } // stages loop end
+
+ n1 = n2;
+ n2 = n2 >> 1;
+ ia = 0;
+
+ cosVal = pCoef[ia * 2];
+ sinVal = pCoef[(ia * 2) + 1];
+ ia = ia + twidCoefModifier;
+
+ // loop for butterfly
+ for (i = 0; i < fftLen; i += n1)
+ {
+ l = i + n2;
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+ pSrc[2u * l] = xt;
+
+ pSrc[2u * l + 1u] = yt;
+
+ i += n1;
+ l = i + n2;
+
+ xt = pSrc[2 * i] - pSrc[2 * l];
+ pSrc[2 * i] = (pSrc[2 * i] + pSrc[2 * l]);
+
+ yt = pSrc[2 * i + 1] - pSrc[2 * l + 1];
+ pSrc[2 * i + 1] = (pSrc[2 * l + 1] + pSrc[2 * i + 1]);
+
+ pSrc[2u * l] = xt;
+
+ pSrc[2u * l + 1u] = yt;
+
+ } // butterfly loop end
+
+}
diff --git a/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix4_f32.c b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix4_f32.c
new file mode 100644
index 0000000..90008a7
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_cfft_radix4_f32.c
@@ -0,0 +1,1235 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_cfft_radix4_f32.c
+*
+* Description: Radix-4 Decimation in Frequency CFFT & CIFFT Floating point processing function
+*
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+*
+* Version 0.0.5 2010/04/26
+* incorporated review comments and updated with latest CMSIS layer
+*
+* Version 0.0.3 2010/03/10
+* Initial version
+* -------------------------------------------------------------------- */
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupTransforms
+ */
+
+/**
+ * @defgroup Radix4_CFFT_CIFFT Radix-4 Complex FFT Functions
+ *
+ * \par
+ * Complex Fast Fourier Transform(CFFT) and Complex Inverse Fast Fourier Transform(CIFFT) is an efficient algorithm to compute Discrete Fourier Transform(DFT) and Inverse Discrete Fourier Transform(IDFT).
+ * Computational complexity of CFFT reduces drastically when compared to DFT.
+ * \par
+ * This set of functions implements CFFT/CIFFT
+ * for Q15, Q31, and floating-point data types. The functions operates on in-place buffer which uses same buffer for input and output.
+ * Complex input is stored in input buffer in an interleaved fashion.
+ *
+ * \par
+ * The functions operate on blocks of input and output data and each call to the function processes
+ * 2*fftLen samples through the transform. pSrc points to In-place arrays containing 2*fftLen values.
+ * \par
+ * The pSrc points to the array of in-place buffer of size 2*fftLen and inputs and outputs are stored in an interleaved fashion as shown below.
+ *
{real[0], imag[0], real[1], imag[1],..}
+ *
+ * \par Lengths supported by the transform:
+ * \par
+ * Internally, the function utilize a radix-4 decimation in frequency(DIF) algorithm
+ * and the size of the FFT supported are of the lengths [16, 64, 256, 1024].
+ *
+ *
+ * \par Algorithm:
+ *
+ * Complex Fast Fourier Transform:
+ * \par
+ * Input real and imaginary data:
+ *
+ *
+ * \par Instance Structure
+ * A separate instance structure must be defined for each Instance but the twiddle factors and bit reversal tables can be reused.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Initializes twiddle factor table and bit reversal table pointers
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Manually initialize the instance structure as follows:
+ *
+ * where \c N is the length of the DCT4; \c Nby2 is half of the length of the DCT4;
+ * \c normalize is normalizing factor used and is equal to sqrt(2/N);
+ * \c pTwiddle points to the twiddle factor table;
+ * \c pCosFactor points to the cosFactor table;
+ * \c pRfft points to the real FFT instance;
+ * \c pCfft points to the complex FFT instance;
+ * The CFFT and RFFT structures also needs to be initialized, refer to arm_cfft_radix4_f32()
+ * and arm_rfft_f32() respectively for details regarding static initialization.
+ *
+ * \par Fixed-Point Behavior
+ * Care must be taken when using the fixed-point versions of the DCT4 transform functions.
+ * In particular, the overflow and saturation behavior of the accumulator used in each function must be considered.
+ * Refer to the function specific documentation below for usage guidelines.
+ */
+
+ /**
+ * @addtogroup DCT4_IDCT4
+ * @{
+ */
+
+/**
+ * @brief Processing function for the floating-point DCT4/IDCT4.
+ * @param[in] *S points to an instance of the floating-point DCT4/IDCT4 structure.
+ * @param[in] *pState points to state buffer.
+ * @param[in,out] *pInlineBuffer points to the in-place input and output buffer.
+ * @return none.
+ */
+
+void arm_dct4_f32(
+ const arm_dct4_instance_f32 * S,
+ float32_t * pState,
+ float32_t * pInlineBuffer)
+{
+ uint32_t i; /* Loop counter */
+ float32_t *weights = S->pTwiddle; /* Pointer to the Weights table */
+ float32_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */
+ float32_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */
+ float32_t in; /* Temporary variable */
+
+
+ /* DCT4 computation involves DCT2 (which is calculated using RFFT)
+ * along with some pre-processing and post-processing.
+ * Computational procedure is explained as follows:
+ * (a) Pre-processing involves multiplying input with cos factor,
+ * r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
+ * where,
+ * r(n) -- output of preprocessing
+ * u(n) -- input to preprocessing(actual Source buffer)
+ * (b) Calculation of DCT2 using FFT is divided into three steps:
+ * Step1: Re-ordering of even and odd elements of input.
+ * Step2: Calculating FFT of the re-ordered input.
+ * Step3: Taking the real part of the product of FFT output and weights.
+ * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
+ * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+ * where,
+ * Y4 -- DCT4 output, Y2 -- DCT2 output
+ * (d) Multiplying the output with the normalizing factor sqrt(2/N).
+ */
+
+ /*-------- Pre-processing ------------*/
+ /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
+ arm_scale_f32(pInlineBuffer, 2.0f, pInlineBuffer, S->N);
+ arm_mult_f32(pInlineBuffer, cosFact, pInlineBuffer, S->N);
+
+ /* ----------------------------------------------------------------
+ * Step1: Re-ordering of even and odd elements as,
+ * pState[i] = pInlineBuffer[2*i] and
+ * pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
+ ---------------------------------------------------------------------*/
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
+ pS2 = pState + (S->N - 1u);
+
+ /* pbuff initialized to input buffer */
+ pbuff = pInlineBuffer;
+
+#ifndef ARM_MATH_CM0
+
+ /* Run the below code for Cortex-M4 and Cortex-M3 */
+
+ /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
+ i = (uint32_t) S->Nby2 >> 2u;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ do
+ {
+ /* Re-ordering of even and odd elements */
+ /* pState[i] = pInlineBuffer[2*i] */
+ *pS1++ = *pbuff++;
+ /* pState[N-i-1] = pInlineBuffer[2*i+1] */
+ *pS2-- = *pbuff++;
+
+ *pS1++ = *pbuff++;
+ *pS2-- = *pbuff++;
+
+ *pS1++ = *pbuff++;
+ *pS2-- = *pbuff++;
+
+ *pS1++ = *pbuff++;
+ *pS2-- = *pbuff++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while(i > 0u);
+
+ /* pbuff initialized to input buffer */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Initializing the loop counter to N/4 instead of N for loop unrolling */
+ i = (uint32_t) S->N >> 2u;
+
+ /* Processing with loop unrolling 4 times as N is always multiple of 4.
+ * Compute 4 outputs at a time */
+ do
+ {
+ /* Writing the re-ordered output back to inplace input buffer */
+ *pbuff++ = *pS1++;
+ *pbuff++ = *pS1++;
+ *pbuff++ = *pS1++;
+ *pbuff++ = *pS1++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while(i > 0u);
+
+
+ /* ---------------------------------------------------------
+ * Step2: Calculate RFFT for N-point input
+ * ---------------------------------------------------------- */
+ /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
+ arm_rfft_f32(S->pRfft, pInlineBuffer, pState);
+
+ /*----------------------------------------------------------------------
+ * Step3: Multiply the FFT output with the weights.
+ *----------------------------------------------------------------------*/
+ arm_cmplx_mult_cmplx_f32(pState, weights, pState, S->N);
+
+ /* ----------- Post-processing ---------- */
+ /* DCT-IV can be obtained from DCT-II by the equation,
+ * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+ * Hence, Y4(0) = Y2(0)/2 */
+ /* Getting only real part from the output and Converting to DCT-IV */
+
+ /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
+ i = ((uint32_t) S->N - 1u) >> 2u;
+
+ /* pbuff initialized to input buffer. */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
+ in = *pS1++ * (float32_t) 0.5;
+ /* input buffer acts as inplace, so output values are stored in the input itself. */
+ *pbuff++ = in;
+
+ /* pState pointer is incremented twice as the real values are located alternatively in the array */
+ pS1++;
+
+ /* First part of the processing with loop unrolling. Compute 4 outputs at a time.
+ ** a second loop below computes the remaining 1 to 3 samples. */
+ do
+ {
+ /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+ /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ /* points to the next real value */
+ pS1++;
+
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ pS1++;
+
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ pS1++;
+
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ pS1++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while(i > 0u);
+
+ /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
+ ** No loop unrolling is used. */
+ i = ((uint32_t) S->N - 1u) % 0x4u;
+
+ while(i > 0u)
+ {
+ /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+ /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ /* points to the next real value */
+ pS1++;
+
+ /* Decrement the loop counter */
+ i--;
+ }
+
+
+ /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+
+ /* Initializing the loop counter to N/4 instead of N for loop unrolling */
+ i = (uint32_t) S->N >> 2u;
+
+ /* pbuff initialized to the pInlineBuffer(now contains the output values) */
+ pbuff = pInlineBuffer;
+
+ /* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */
+ do
+ {
+ /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
+ in = *pbuff;
+ *pbuff++ = in * S->normalize;
+
+ in = *pbuff;
+ *pbuff++ = in * S->normalize;
+
+ in = *pbuff;
+ *pbuff++ = in * S->normalize;
+
+ in = *pbuff;
+ *pbuff++ = in * S->normalize;
+
+ /* Decrement the loop counter */
+ i--;
+ } while(i > 0u);
+
+
+#else
+
+ /* Run the below code for Cortex-M0 */
+
+ /* Initializing the loop counter to N/2 */
+ i = (uint32_t) S->Nby2;
+
+ do
+ {
+ /* Re-ordering of even and odd elements */
+ /* pState[i] = pInlineBuffer[2*i] */
+ *pS1++ = *pbuff++;
+ /* pState[N-i-1] = pInlineBuffer[2*i+1] */
+ *pS2-- = *pbuff++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while(i > 0u);
+
+ /* pbuff initialized to input buffer */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Initializing the loop counter */
+ i = (uint32_t) S->N;
+
+ do
+ {
+ /* Writing the re-ordered output back to inplace input buffer */
+ *pbuff++ = *pS1++;
+
+ /* Decrement the loop counter */
+ i--;
+ } while(i > 0u);
+
+
+ /* ---------------------------------------------------------
+ * Step2: Calculate RFFT for N-point input
+ * ---------------------------------------------------------- */
+ /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
+ arm_rfft_f32(S->pRfft, pInlineBuffer, pState);
+
+ /*----------------------------------------------------------------------
+ * Step3: Multiply the FFT output with the weights.
+ *----------------------------------------------------------------------*/
+ arm_cmplx_mult_cmplx_f32(pState, weights, pState, S->N);
+
+ /* ----------- Post-processing ---------- */
+ /* DCT-IV can be obtained from DCT-II by the equation,
+ * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
+ * Hence, Y4(0) = Y2(0)/2 */
+ /* Getting only real part from the output and Converting to DCT-IV */
+
+ /* pbuff initialized to input buffer. */
+ pbuff = pInlineBuffer;
+
+ /* pS1 initialized to pState */
+ pS1 = pState;
+
+ /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
+ in = *pS1++ * (float32_t) 0.5;
+ /* input buffer acts as inplace, so output values are stored in the input itself. */
+ *pbuff++ = in;
+
+ /* pState pointer is incremented twice as the real values are located alternatively in the array */
+ pS1++;
+
+ /* Initializing the loop counter */
+ i = ((uint32_t) S->N - 1u);
+
+ do
+ {
+ /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
+ /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
+ in = *pS1++ - in;
+ *pbuff++ = in;
+ /* points to the next real value */
+ pS1++;
+
+
+ /* Decrement the loop counter */
+ i--;
+ } while(i > 0u);
+
+
+ /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
+
+ /* Initializing the loop counter */
+ i = (uint32_t) S->N;
+
+ /* pbuff initialized to the pInlineBuffer(now contains the output values) */
+ pbuff = pInlineBuffer;
+
+ do
+ {
+ /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
+ in = *pbuff;
+ *pbuff++ = in * S->normalize;
+
+ /* Decrement the loop counter */
+ i--;
+ } while(i > 0u);
+
+#endif /* #ifndef ARM_MATH_CM0 */
+
+}
+
+/**
+ * @} end of DCT4_IDCT4 group
+ */
diff --git a/CMSIS/DSP_Lib/Source/TransformFunctions/arm_dct4_init_f32.c b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_dct4_init_f32.c
new file mode 100644
index 0000000..6f4109b
--- /dev/null
+++ b/CMSIS/DSP_Lib/Source/TransformFunctions/arm_dct4_init_f32.c
@@ -0,0 +1,16510 @@
+/* ----------------------------------------------------------------------
+* Copyright (C) 2010 ARM Limited. All rights reserved.
+*
+* $Date: 14/06/07 1:57a $Revision: V1.1.0
+*
+* Project: CMSIS DSP Library
+* Title: arm_dct4_init_f32.c
+*
+* Description: Initialization function of DCT-4 & IDCT4 F32
+*
+* Target Processor: Cortex-M4/Cortex-M3/Cortex-M0
+*
+* Version 1.1.0 2012/02/15
+* Updated with more optimizations, bug fixes and minor API changes.
+*
+* Version 1.0.10 2011/7/15
+* Big Endian support added and Merged M0 and M3/M4 Source code.
+*
+* Version 1.0.3 2010/11/29
+* Re-organized the CMSIS folders and updated documentation.
+*
+* Version 1.0.2 2010/11/11
+* Documentation updated.
+*
+* Version 1.0.1 2010/10/05
+* Production release and review comments incorporated.
+*
+* Version 1.0.0 2010/09/20
+* Production release and review comments incorporated.
+* -------------------------------------------------------------------- */
+
+
+#include "arm_math.h"
+
+/**
+ * @ingroup groupTransforms
+ */
+
+/**
+ * @addtogroup DCT4_IDCT4
+ * @{
+ */
+
+/*
+* @brief Weights Table
+*/
+
+/**
+* \par
+* Weights tables are generated using the formula :
+ * Output for real FFT is complex and are in the order of
+ *
{real(0), imag(0), real(1), imag(1), ...}
+ *
+ * Real Inverse Fast Fourier Transform:
+ * \par
+ * Real IFFT of N-point is calculated using Split RIFFT process and CFFT of N/2-point as shown below figure.
+ * \par
+ * \image html RIFFT.gif "Real Inverse Fast Fourier Transform"
+ * \par
+ * The RIFFT functions operate on blocks of input and output data and each call to the function processes
+ * 2*fftLenR samples through the transform. pSrc points to input array containing 2*fftLenR values.
+ * pDst points to output array containing fftLenR values. \n
+ * Input for real IFFT is complex and are in the order of
+ *
{real(0), imag(0), real(1), imag(1), ...}
+ * Output for real IFFT is real and in the order of
+ *
{real[0], real[1], real[2], real[3], ..}
+ *
+ * \par Lengths supported by the transform:
+ * \par
+ * Real FFT/IFFT supports the lengths [128, 512, 2048], as it internally uses CFFT/CIFFT.
+ *
+ * \par Instance Structure
+ * A separate instance structure must be defined for each Instance but the twiddle factors can be reused.
+ * There are separate instance structure declarations for each of the 3 supported data types.
+ *
+ * \par Initialization Functions
+ * There is also an associated initialization function for each data type.
+ * The initialization function performs the following operations:
+ * - Sets the values of the internal structure fields.
+ * - Initializes twiddle factor tables.
+ * - Initializes CFFT data structure fields.
+ * \par
+ * Use of the initialization function is optional.
+ * However, if the initialization function is used, then the instance structure cannot be placed into a const data section.
+ * To place an instance structure into a const data section, the instance structure must be manually initialized.
+ * Manually initialize the instance structure as follows:
+ *
CMSIS-CORE uses the common coding rules for CMSIS components that are documented under Introduction.
+
CMSIS-CORE violates the following MISRA-C:2004 rules:
+
+
Required Rule 8.5, object/function definition in header file.
+ Violated since function definitions in header files are used to allow 'inlining'.
+
+
+
Required Rule 18.4, declaration of union type or object of union type: '{...}'.
+ Violated since unions are used for effective representation of core registers.
+
+
+
Advisory Rule 19.7, Function-like macro defined.
+ Violated since function-like macros are used to allow more efficient code.
ARM supplies CMSIS-CORE template files for the all supported Cortex-M processors and various compiler vendors. Refer to the list of Tested and Verified Toolchains for compliancy. These template files include the following:
+
+
Register names of the Core Peripherals and names of the Core Exception Vectors.
+
Functions to access core peripherals, special CPU instructions and SIMD instructions (for Cortex-M4)
+
Generic startup code and system configuration code.
+
+
The detailed file structure of the CMSIS-CORE is shown in the following picture.
+
+
+
+CMSIS-CORE File Structure
+
+Template Files
+
The CMSIS-CORE template files should be extended by the silicon vendor to reflect the actual device and device peripherals. Silicon vendors add in this context the:
+
+
Device Peripheral Access Layer that provides definitions for device-specific peripherals.
+
Access Functions for Peripherals (optional) that provides additional helper functions to access device-specific peripherals.
+
Interrupt vectors in the startup file that are device specific.
Generic device header file. Needs to be extended with the device-specific peripheral registers. Optionally functions that access the peripherals can be part of that file.
ARM provides CMSIS-CORE files for the supported ARM Processors and for various compiler vendors. These files can be used when standard ARM processors should be used in a project. The table below lists the folder and device names of the ARM processors.
+
+
+
Folder
Processor
Description
+
+
".\Device\ARM\ARMCM0"
Cortex-M0
Contains Include and Source template files configured for the Cortex-M0 processor. The device name is ARMCM0 and the name of the Device Header File <device.h> is <ARMCM0.h>.
+
+
".\Device\ARM\ARMCM0plus"
Cortex-M0+
Contains Include and Source template files configured for the Cortex-M0+ processor. The device name is ARMCM0plus and the name of the Device Header File <device.h> is <ARMCM0plus.h>.
+
+
".\Device\ARM\ARMCM3"
Cortex-M3
Contains Include and Source template files configured for the Cortex-M3 processor. The device name is ARMCM3 and the name of the Device Header File <device.h> is <ARMCM3.h>.
+
+
".\Device\ARM\ARMCM4"
Cortex-M4
Contains Include and Source template files configured for the Cortex-M4 processor. The device name is ARMCM4 and the name of the Device Header File <device.h> is <ARMCM4.h>.
+
+
".\Device\ARM\ARMSC000"
SecurCore SC000
Contains Include and Source template files configured for the SecurCore SC000 processor. The device name is ARMSC000 and the name of the Device Header File <device.h> is <ARMSC000.h>.
+
+
".\Device\ARM\ARMSC300"
SecurCore SC300
Contains Include and Source template files configured for the SecurCore SC300 processor. The device name is ARMSC300 and the name of the Device Header File <device.h> is <ARMSC300.h>.
+
+
+Create generic Libraries with CMSIS
+
The CMSIS Processor and Core Peripheral files allow also to create generic libraries. The CMSIS-DSP Libraries are an example for such a generic library.
+
To build a generic Library set the define __CMSIS_GENERIC and include the relevant core_<cpu>.h CMSIS CPU & Core Access header file for the processor. The define __CMSIS_GENERIC disables device-dependent features such as the SysTick timer and the Interrupt System. Refer to Configuration of the Processor and Core Peripherals for a list of the available core_<cpu>.h header files.
The Startup File startup_<device>.s is executed after reset and calls SystemInit. After the system initialization control is transferred to the C/C++ run-time library which performs initialization and calls the main function in the user code. In addition the Startup File startup_<device>.s contains all exception and interrupt vectors and implements a default function for every interrupt. It may also contain stack and heap configurations for the user application.
The Device Header File <device.h> is the central include file that the application programmer is using in the C source code. It provides the following features:
+
+
Peripheral Access provides a standardized register layout for all peripherals. Optionally functions for device-specific peripherals may be available.
+
Interrupts and Exceptions (NVIC) can be accessed with standardized symbols and functions for the Nested Interrupt Vector Controller (NVIC) are provided.
Debug Access are functions that allow printf-style I/O via the CoreSight Debug Unit and ITM communication.
+
+
+
+
+CMSIS-CORE User Files
+
The CMSIS-CORE are device specific. In addition, the Startup File startup_<device>.s is also compiler vendor specific. The various compiler vendor tool chains may provide folders that contain the CMSIS files for each supported device. Using CMSIS with generic ARM Processors explains how to use CMSIS-CORE for ARM processors.
+
For example, the following files are provided in MDK-ARM to support the STM32F10x Connectivity Line device variants:
Device Peripheral Access Layer provides definitions for the Peripheral Access to all device peripherals. It contains all data structures and the address mapping for device-specific peripherals.
+
Access Functions for Peripherals (optional) provide additional helper functions for peripherals that are useful for programming of these peripherals. Access Functions may be provided as inline functions or can be extern references to a device-specific library provided by the silicon vendor.
Positive IRQn values represent device-specific exceptions (external interrupts). The first device-specific interrupt has the IRQn value 0. The IRQn values needs extension to reflect the device-specific interrupt vector table in the Startup File startup_<device>.s.
+
+
Example:
+
The following example shows the extension of the interrupt vector table for the LPC1100 device family.
+Configuration of the Processor and Core Peripherals
+
The Device Header File <device.h> configures the Cortex-M or SecurCore processor and the core peripherals with #defines that are set prior to including the file core_<cpu>.h.
+
The following tables list the #defines along with the possible values for each processor core. If these #defines are missing default values are used.
+
core_cm0.h
+
+
+
#define
Value Range
Default
Description
+
+
__CM0_REV
0x0000
0x0000
Core revision number ([15:8] revision number, [7:0] patch number)
+
+
__NVIC_PRIO_BITS
2
2
Number of priority bits implemented in the NVIC (device specific)
+
+
__Vendor_SysTickConfig
0 .. 1
0
If this define is set to 1, then the default SysTick_Config function is excluded. In this case, the file device.h must contain a vendor specific implementation of this function.
+
+
core_cm0plus.h
+
+
+
#define
Value Range
Default
Description
+
+
__CM0PLUS_REV
0x0000
0x0000
Core revision number ([15:8] revision number, [7:0] patch number)
+
+
__NVIC_PRIO_BITS
2
2
Number of priority bits implemented in the NVIC (device specific)
+
+
__Vendor_SysTickConfig
0 .. 1
0
If this define is set to 1, then the default SysTick_Config function is excluded. In this case, the file device.h must contain a vendor specific implementation of this function.
+
+
core_cm3.h
+
+
+
#define
Value Range
Default
Description
+
+
__CM3_REV
0x0101 | 0x0200
0x0200
Core revision number ([15:8] revision number, [7:0] patch number)
+
+
__NVIC_PRIO_BITS
2 .. 8
4
Number of priority bits implemented in the NVIC (device specific)
+
+
__MPU_PRESENT
0 .. 1
0
Defines if a MPU is present or not
+
+
__Vendor_SysTickConfig
0 .. 1
0
If this define is set to 1, then the default SysTick_Config function is excluded. In this case, the file device.h must contain a vendor specific implementation of this function.
+
+
core_cm4.h
+
+
+
#define
Value Range
Default
Description
+
+
__CM4_REV
0x0000
0x0000
Core revision number ([15:8] revision number, [7:0] patch number)
+
+
__NVIC_PRIO_BITS
2 .. 8
4
Number of priority bits implemented in the NVIC (device specific)
+
+
__MPU_PRESENT
0 .. 1
0
Defines if a MPU is present or not
+
+
__FPU_PRESENT
0 .. 1
0
Defines if a FPU is present or not
+
+
__Vendor_SysTickConfig
0 .. 1
0
If this define is set to 1, then the default SysTick_Config function is excluded. In this case, the file device.h must contain a vendor specific implementation of this function.
+
+
core_sc000.h
+
+
+
#define
Value Range
Default
Description
+
+
__SC000_REV
0x0000
0x0000
Core revision number ([15:8] revision number, [7:0] patch number)
+
+
__NVIC_PRIO_BITS
2
2
Number of priority bits implemented in the NVIC (device specific)
+
+
__MPU_PRESENT
0 .. 1
0
Defines if a MPU is present or not
+
+
__Vendor_SysTickConfig
0 .. 1
0
If this define is set to 1, then the default SysTick_Config function is excluded. In this case, the file device.h must contain a vendor specific implementation of this function.
+
+
core_sc300.h
+
+
+
#define
Value Range
Default
Description
+
+
__SC300_REV
0x0000
0x0000
Core revision number ([15:8] revision number, [7:0] patch number)
+
+
__NVIC_PRIO_BITS
2 .. 8
4
Number of priority bits implemented in the NVIC (device specific)
+
+
__MPU_PRESENT
0 .. 1
0
Defines if a MPU is present or not
+
+
__Vendor_SysTickConfig
0 .. 1
0
If this define is set to 1, then the default SysTick_Config function is excluded. In this case, the file device.h must contain a vendor specific implementation of this function.
+
+
Example
+
The following code exemplifies the configuration of the Cortex-M4 Processor and Core Peripherals.
+
#define __CM4_REV 0x0001 /* Core revision r0p1 */
+#define __MPU_PRESENT 1 /* MPU present or not */
+#define __NVIC_PRIO_BITS 3 /* Number of Bits used for Priority Levels */
+#define __Vendor_SysTickConfig 0 /* Set to 1 if different SysTick Config is used */
+#define __FPU_PRESENT 1 /* FPU present or not */
+.
+.
+#include <core_cm4.h>/* Cortex-M4 processor and core peripherals */
+
+CMSIS Version and Processor Information
+
Defines in the core_cpu.h file identify the version of the CMSIS-CORE and the processor used. The following shows the defines in the various core_cpu.h files that may be used in the Device Header File <device.h> to verify a minimum version or ensure that the right processor core is used.
+
core_cm0.h
+
#define __CM0_CMSIS_VERSION_MAIN (0x03) /* [31:16] CMSIS HAL main version */
+#define __CM0_CMSIS_VERSION_SUB (0x00) /* [15:0] CMSIS HAL sub version */
+#define __CM0_CMSIS_VERSION ((__CM0_CMSIS_VERSION_MAIN << 16) | \
+ __CM0_CMSIS_VERSION_SUB ) /* CMSIS HAL version number */
+...
+#define __CORTEX_M (0x00) /* Cortex-M Core */
+
core_cm0plus.h
+
#define __CM0PLUS_CMSIS_VERSION_MAIN (0x03) /* [31:16] CMSIS HAL main version */
+#define __CM0PLUS_CMSIS_VERSION_SUB (0x00) /* [15:0] CMSIS HAL sub version */
+#define __CM0PLUS_CMSIS_VERSION ((__CM0P_CMSIS_VERSION_MAIN << 16) | \
+ __CM0P_CMSIS_VERSION_SUB ) /* CMSIS HAL version number */
+...
+#define __CORTEX_M (0x00) /* Cortex-M Core */
+
core_cm3.h
+
#define __CM3_CMSIS_VERSION_MAIN (0x03) /* [31:16] CMSIS HAL main version */
+#define __CM3_CMSIS_VERSION_SUB (0x00) /* [15:0] CMSIS HAL sub version */
+#define __CM3_CMSIS_VERSION ((__CM3_CMSIS_VERSION_MAIN << 16) | \
+ __CM3_CMSIS_VERSION_SUB ) /* CMSIS HAL version number */
+...
+#define __CORTEX_M (0x03) /* Cortex-M Core */
+
core_cm4.h
+
#define __CM4_CMSIS_VERSION_MAIN (0x03) /* [31:16] CMSIS HAL main version */
+#define __CM4_CMSIS_VERSION_SUB (0x00) /* [15:0] CMSIS HAL sub version */
+#define __CM4_CMSIS_VERSION ((__CM4_CMSIS_VERSION_MAIN << 16) | \
+ __CM4_CMSIS_VERSION_SUB ) /* CMSIS HAL version number */
+...
+#define __CORTEX_M (0x04) /* Cortex-M Core */
+
core_sc000.h
+
#define __SC000_CMSIS_VERSION_MAIN (0x03) /* [31:16] CMSIS HAL main version */
+#define __SC000_CMSIS_VERSION_SUB (0x00) /* [15:0] CMSIS HAL sub version */
+#define __SC000_CMSIS_VERSION ((__SC000_CMSIS_VERSION_MAIN << 16) | \
+ __SC000_CMSIS_VERSION_SUB ) /* CMSIS HAL version number */
+...
+#define __CORTEX_SC (0) /* Cortex secure core */
+
core_sc300.h
+
#define __SC300_CMSIS_VERSION_MAIN (0x03) /* [31:16] CMSIS HAL main version */
+#define __SC300_CMSIS_VERSION_SUB (0x00) /* [15:0] CMSIS HAL sub version */
+#define __SC300_CMSIS_VERSION ((__SC300_CMSIS_VERSION_MAIN << 16) | \
+ __SC300_CMSIS_VERSION_SUB ) /* CMSIS HAL version number */
+...
+#define __CORTEX_SC (300) /* Cortex secure core */
+
The section Peripheral Access shows examples for peripheral definitions.
+
+Device.h Template File
+
The silicon vendor needs to extend the Device.h template file with the CMSIS features described above. In addition the Device Header File <device.h> may contain functions to access device-specific peripherals. The system_Device.h Template File which is provided as part of the CMSIS specification is shown below.
+
/**************************************************************************//**
+ * @file <Device>.h
+ * @brief CMSIS Cortex-M# Core Peripheral Access Layer Header File for
+ * Device <Device>
+ * @version V3.01
+ * @date 06. March 2012
+ *
+ * @note
+ * Copyright (C) 2010-2012 ARM Limited. All rights reserved.
+ *
+ * @par
+ * ARM Limited (ARM) is supplying this software for use with Cortex-M
+ * processor based microcontrollers. This file can be freely distributed
+ * within development tools that are supporting such ARM based processors.
+ *
+ * @par
+ * THIS SOFTWARE IS PROVIDED "AS IS". NO WARRANTIES, WHETHER EXPRESS, IMPLIED
+ * OR STATUTORY, INCLUDING, BUT NOT LIMITED TO, IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE APPLY TO THIS SOFTWARE.
+ * ARM SHALL NOT, IN ANY CIRCUMSTANCES, BE LIABLE FOR SPECIAL, INCIDENTAL, OR
+ * CONSEQUENTIAL DAMAGES, FOR ANY REASON WHATSOEVER.
+ *
+ ******************************************************************************/
+
+
+#ifndef <Device>_H /* ToDo: replace '<Device>' with your device name */
+#define <Device>_H
+
+#ifdef __cplusplus
+ extern "C" {
+#endif
+
+/* ToDo: replace '<Device>' with your device name; add your doxyGen comment */
+/** @addtogroup <Device>_Definitions <Device> Definitions
+ This file defines all structures and symbols for <Device>:
+ - registers and bitfields
+ - peripheral base address
+ - peripheral ID
+ - Peripheral definitions
+ @{
+*/
+
+
+/******************************************************************************/
+/* Processor and Core Peripherals */
+/******************************************************************************/
+/** @addtogroup <Device>_CMSIS Device CMSIS Definitions
+ Configuration of the Cortex-M# Processor and Core Peripherals
+ @{
+*/
+
+/*
+ * ==========================================================================
+ * ---------- Interrupt Number Definition -----------------------------------
+ * ==========================================================================
+ */
+
+typedef enum IRQn
+{
+/****** Cortex-M# Processor Exceptions Numbers ***************************************************/
+
+/* ToDo: use this Cortex interrupt numbers if your device is a CORTEX-M0 device */
+ NonMaskableInt_IRQn = -14, /*!< 2 Non Maskable Interrupt */
+ HardFault_IRQn = -13, /*!< 3 Hard Fault Interrupt */
+ SVCall_IRQn = -5, /*!< 11 SV Call Interrupt */
+ PendSV_IRQn = -2, /*!< 14 Pend SV Interrupt */
+ SysTick_IRQn = -1, /*!< 15 System Tick Interrupt */
+
+/* ToDo: use this Cortex interrupt numbers if your device is a CORTEX-M3 / Cortex-M4 device */
+ NonMaskableInt_IRQn = -14, /*!< 2 Non Maskable Interrupt */
+ MemoryManagement_IRQn = -12, /*!< 4 Memory Management Interrupt */
+ BusFault_IRQn = -11, /*!< 5 Bus Fault Interrupt */
+ UsageFault_IRQn = -10, /*!< 6 Usage Fault Interrupt */
+ SVCall_IRQn = -5, /*!< 11 SV Call Interrupt */
+ DebugMonitor_IRQn = -4, /*!< 12 Debug Monitor Interrupt */
+ PendSV_IRQn = -2, /*!< 14 Pend SV Interrupt */
+ SysTick_IRQn = -1, /*!< 15 System Tick Interrupt */
+
+/****** Device Specific Interrupt Numbers ********************************************************/
+/* ToDo: add here your device specific external interrupt numbers
+ according the interrupt handlers defined in startup_Device.s
+ eg.: Interrupt for Timer#1 TIM1_IRQHandler -> TIM1_IRQn */
+ <DeviceInterrupt>_IRQn = 0, /*!< Device Interrupt */
+} IRQn_Type;
+
+
+/*
+ * ==========================================================================
+ * ----------- Processor and Core Peripheral Section ------------------------
+ * ==========================================================================
+ */
+
+/* Configuration of the Cortex-M# Processor and Core Peripherals */
+/* ToDo: set the defines according your Device */
+/* ToDo: define the correct core revision
+ __CM0_REV if your device is a CORTEX-M0 device
+ __CM3_REV if your device is a CORTEX-M3 device
+ __CM4_REV if your device is a CORTEX-M4 device */
+#define __CM#_REV 0x0201 /*!< Core Revision r2p1 */
+#define __NVIC_PRIO_BITS 2 /*!< Number of Bits used for Priority Levels */
+#define __Vendor_SysTickConfig 0 /*!< Set to 1 if different SysTick Config is used */
+#define __MPU_PRESENT 0 /*!< MPU present or not */
+/* ToDo: define __FPU_PRESENT if your devise is a CORTEX-M4 */
+#define __FPU_PRESENT 0 /*!< FPU present or not */
+
+/*@}*/ /* end of group <Device>_CMSIS */
+
+
+/* ToDo: include the correct core_cm#.h file
+ core_cm0.h if your device is a CORTEX-M0 device
+ core_cm3.h if your device is a CORTEX-M3 device
+ core_cm4.h if your device is a CORTEX-M4 device */
+#include <core_cm#.h> /* Cortex-M# processor and core peripherals */
+/* ToDo: include your system_<Device>.h file
+ replace '<Device>' with your device name */
+#include "system_<Device>.h" /* <Device> System include file */
+
+
+/******************************************************************************/
+/* Device Specific Peripheral registers structures */
+/******************************************************************************/
+/** @addtogroup <Device>_Peripherals <Device> Peripherals
+ <Device> Device Specific Peripheral registers structures
+ @{
+*/
+
+#if defined ( __CC_ARM )
+#pragma anon_unions
+#endif
+
+/* ToDo: add here your device specific peripheral access structure typedefs
+ following is an example for a timer */
+
+/*------------- 16-bit Timer/Event Counter (TMR) -----------------------------*/
+/** @addtogroup <Device>_TMR <Device> 16-bit Timer/Event Counter (TMR)
+ @{
+*/
+typedef struct
+{
+ __IO uint32_t EN; /*!< Offset: 0x0000 Timer Enable Register */
+ __IO uint32_t RUN; /*!< Offset: 0x0004 Timer RUN Register */
+ __IO uint32_t CR; /*!< Offset: 0x0008 Timer Control Register */
+ __IO uint32_t MOD; /*!< Offset: 0x000C Timer Mode Register */
+ uint32_t RESERVED0[1];
+ __IO uint32_t ST; /*!< Offset: 0x0014 Timer Status Register */
+ __IO uint32_t IM; /*!< Offset: 0x0018 Interrupt Mask Register */
+ __IO uint32_t UC; /*!< Offset: 0x001C Timer Up Counter Register */
+ __IO uint32_t RG0 /*!< Offset: 0x0020 Timer Register */
+ uint32_t RESERVED1[2];
+ __IO uint32_t CP; /*!< Offset: 0x002C Capture register */
+} <DeviceAbbreviation>_TMR_TypeDef;
+/*@}*/ /* end of group <Device>_TMR */
+
+
+#if defined ( __CC_ARM )
+#pragma no_anon_unions
+#endif
+
+/*@}*/ /* end of group <Device>_Peripherals */
+
+
+/******************************************************************************/
+/* Peripheral memory map */
+/******************************************************************************/
+/* ToDo: add here your device peripherals base addresses
+ following is an example for timer */
+/** @addtogroup <Device>_MemoryMap <Device> Memory Mapping
+ @{
+*/
+
+/* Peripheral and SRAM base address */
+#define <DeviceAbbreviation>_FLASH_BASE (0x00000000UL) /*!< (FLASH ) Base Address */
+#define <DeviceAbbreviation>_SRAM_BASE (0x20000000UL) /*!< (SRAM ) Base Address */
+#define <DeviceAbbreviation>_PERIPH_BASE (0x40000000UL) /*!< (Peripheral) Base Address */
+
+/* Peripheral memory map */
+#define <DeviceAbbreviation>TIM0_BASE (<DeviceAbbreviation>_PERIPH_BASE) /*!< (Timer0 ) Base Address */
+#define <DeviceAbbreviation>TIM1_BASE (<DeviceAbbreviation>_PERIPH_BASE + 0x0800) /*!< (Timer1 ) Base Address */
+#define <DeviceAbbreviation>TIM2_BASE (<DeviceAbbreviation>_PERIPH_BASE + 0x1000) /*!< (Timer2 ) Base Address */
+/*@}*/ /* end of group <Device>_MemoryMap */
+
+
+/******************************************************************************/
+/* Peripheral declaration */
+/******************************************************************************/
+/* ToDo: add here your device peripherals pointer definitions
+ following is an example for timer */
+
+/** @addtogroup <Device>_PeripheralDecl <Device> Peripheral Declaration
+ @{
+*/
+
+#define <DeviceAbbreviation>_TIM0 ((<DeviceAbbreviation>_TMR_TypeDef *) <DeviceAbbreviation>TIM0_BASE)
+#define <DeviceAbbreviation>_TIM1 ((<DeviceAbbreviation>_TMR_TypeDef *) <DeviceAbbreviation>TIM0_BASE)
+#define <DeviceAbbreviation>_TIM2 ((<DeviceAbbreviation>_TMR_TypeDef *) <DeviceAbbreviation>TIM0_BASE)
+/*@}*/ /* end of group <Device>_PeripheralDecl */
+
+/*@}*/ /* end of group <Device>_Definitions */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* <Device>_H */
+
The function reads the Application Program Status Register (APSR) using the instruction MRS.
+
+ The APSR contains the current state of the condition flags from instructions executed previously. The APSR is essential for controlling conditional branches. The following flags are used:
+
+
N (APSR[31]) (Negative flag)
+
=1 The instruction result has a negative value (when interpreted as signed integer).
+
=0 The instruction result has a positive value or equal zero.
+
+
+
+
+
Z (APSR[30]) (Zero flag)
+
=1 The instruction result is zero. Or, after a compare instruction, when the two values are the same.
+
+
+
+
+
C (APSR[29]) (Carry or borrow flag)
+
=1 For unsigned additions, if an unsigned overflow occurred.
+
=inverse of borrow output status For unsigned subtract operations.
+
+
+
+
+
V (APSR[28]) (Overflow flag)
+
=1 A signed overflow occurred (for signed additions or subtractions).
+
+
+
+
+
Q (APSR[27]) (DSP overflow or saturation flag) [not Cortex-M0]
+
This flag is a sticky flag. Saturating and certain mutliplying instructions can set the flag, but cannot clear it.
+
=1 When saturation or an overflow occurred.
+
+
+
+
+
GE (APSR[19:16]) (Greater than or Equal flags) [not Cortex-M0]
+
Can be set by the parallel add and subtract instructions.
+
Are used by the SEL instruction to perform byte-based selection from two registers.
+
+
+
+
Returns:
APSR register value
+
Remarks:
+
Some instructions update all flags; some instructions update a subset of the flags.
+
If a flag is not updated, the original value is preserved.
+
Conditional instructions that are not executed have no effect on the flags.
+
The CMSIS does not provide a function to update this register.
The function returns the Base Priority Mask register (BASEPRI) using the instruction MRS.
+
+ BASEPRI defines the minimum priority for exception processing. When BASEPRI is set to a non-zero value, it prevents the activation of all exceptions with the same or lower priority level as the BASEPRI value.
The function reads the CONTROL register value using the instruction MRS.
+
+ The CONTROL register controls the stack used and the privilege level for software execution when the processor is in thread mode and, if implemented, indicates whether the FPU state is active. This register uses the following bits:
+
+
+
CONTROL[2] [only Cortex-M4]
+
=0 FPU not active
+
=1 FPU active
+
+
+
+
+
CONTROL[1]
+
=0 In handler mode - MSP is selected. No alternate stack possible for handler mode.
+
=0 In thread mode - Default stack pointer MSP is used.
+
=1 In thread mode - Alternate stack pointer PSP is used.
+
+
+
+
+
CONTROL[0] [not Cortex-M0]
+
=0 In thread mode and privileged state.
+
=1 In thread mode and user state.
+
+
+
+
Returns:
CONTROL register value
+
Remarks:
+
The processor can be in user state or privileged state when running in thread mode.
+
Exception handlers always run in privileged state.
+
On reset, the processor is in thread mode with privileged access rights.
The function reads the Fault Mask register (FAULTMASK) value using the instruction MRS.
+
+ FAULTMASK prevents activation of all exceptions except for the Non-Maskable Interrupt (NMI).
+
Returns:
FAULTMASK register value
+
Remarks:
+
not for Cortex-M0 variants.
+
Is cleared automatically upon exiting the exception handler, except when returning from the NMI handler.
The function reads the Floating-Point Status Control Register (FPSCR) value.
+
+ FPSCR provides all necessary User level controls of the floating-point system.
The function reads the Interrupt Program Status Register (IPSR) using the instruction MRS.
+
+ The ISPR contains the exception type number of the current Interrupt Service Routine (ISR). Each exception has an assocciated unique IRQn number. The following bits are used:
The function reads the Main Status Pointer (MSP) value using the instruction MRS.
+
+ Physically two different stack pointers (SP) exist:
+
+
The Main Stack Pointer (MSP) is the default stack pointer after reset. It is also used when running exception handlers (handler mode).
+
The Process Stack Pointer (PSP), which can be used only in thread mode.
+
+
Register R13 banks the SP. The SP selection is determined by the bit[1] of the CONTROL register:
+
+
=0 MSP is the current stack pointer. This is also the default SP. The initial value is loaded from the first 32-bit word of the vector table from the program memory.
+
=1 PSP is the current stack pointer. The initial value is undefined.
+
+
Returns:
MSP Register value
+
Remarks:
+
Only one of the two SPs is visible at a time.
+
For many applications, the system can completely rely on the MSP.
+
The PSP is normally used in designs with an OS where the stack memory for OS Kernel must be separated from the application code.
The function reads the Priority Mask register (PRIMASK) value using the instruction MRS.
+
+ PRIMASK is a 1-bit-wide interrupt mask register. When set, it blocks all interrupts apart from the non-maskable interrupt (NMI) and the hard fault exception. The PRIMASK prevents activation of all exceptions with configurable priority.
+
Returns:
PRIMASK register value
+
=0 no effect
+
=1 prevents the activation of all exceptions with configurable priority
The function reads the Program Status Pointer (PSP) value using the instruction MRS.
+
+ Physically two different stack pointers (SP) exist:
+
+
The Main Stack Pointer (MSP) is the default stack pointer after reset. It is also used when running exception handlers (handler mode).
+
The Process Stack Pointer (PSP), which can be used only in thread mode.
+
+
Register R13 banks the SP. The SP selection is determined by the bit[1] of the CONTROL register:
+
+
=0 MSP is the current stack pointer. This is also the default SP. The initial value is loaded from the first 32-bit word of the vector table from the program memory.
+
=1 PSP is the current stack pointer. The initial value is undefined.
+
+
Returns:
PSP register value
+
Remarks:
+
Only one of the two SPs is visible at a time.
+
For many applications, the system can completely rely on the MSP.
+
The PSP is normally used in designs with an OS where the stack memory for OS Kernel must be separated from the application code.
The function reads the combined Program Status Register (xPSR) using the instruction MRS.
+
+ xPSR provides information about program execution and the APSR flags. It consists of the following PSRs:
+
+
Application Program Status Register (APSR)
+
Interrupt Program Status Register (IPSR)
+
Execution Program Status Register (EPSR)
+
+
In addition to the flags described in __get_APSR and __get_IPSR, the register provides the following flags:
+
+
IT (xPSR[26:25]) (If-Then condition instruction)
+
Contains up to four instructions following an IT instruction.
+
Each instruction in the block is conditional.
+
The conditions for the instructions are either all the same, or some can be the inverse of others.
+
+
+
+
+
T (xPSR[24]) (Thumb bit)
+
=1 Indicates that that the processor is in Thumb state.
+
=0 Attempting to execute instructions when the T bit is 0 results in a fault or lockup.
+
The conditions for the instructions are either all the same, or some can be the inverse of others.
+
+
+
+
Returns:
xPSR register value
+
Remarks:
+
The CMSIS does not provide functions that access EPSR.
The function sets the Base Priority Mask register (BASEPRI) value using the instruction MSR.
+
+ BASEPRI defines the minimum priority for exception processing. When BASEPRI is set to a non-zero value, it prevents the activation of all exceptions with the same or lower priority level as the BASEPRI value.
+
Parameters:
+
+
[in]
basePri
BASEPRI value to set
+
+
+
+
Remarks:
+
not for Cortex-M0 variants.
+
Cannot be set in user state.
+
Useful for changing the masking level or disabling the masking.
The function sets the CONTROL register value using the instruction MSR.
+
+ The CONTROL register controls the stack used and the privilege level for software execution when the processor is in thread mode and, if implemented, indicates whether the FPU state is active. This register uses the following bits:
+
+
+
CONTROL[2] [only Cortex-M4]
+
=0 FPU not active
+
=1 FPU active
+
+
+
+
+
CONTROL[1]
+
Writeable only when the processor is in thread mode and privileged state (CONTROL[0]=0).
+
=0 In handler mode - MSP is selected. No alternate stack pointer possible for handler mode.
+
=0 In thread mode - Default stack pointer MSP is used.
+
=1 In thread mode - Alternate stack pointer PSP is used.
+
+
+
+
+
CONTROL[0] [not writeable for Cortex-M0]
+
Writeable only when the processor is in privileged state.
+
Can be used to switch the processor to user state (thread mode).
+
Once in user state, trigger an interrupt and change the state to privileged in the exception handler (the only way).
+
=0 In thread mode and privileged state.
+
=1 In thread mode and user state.
+
+
+
+
Parameters:
+
+
[in]
control
CONTROL register value to set
+
+
+
+
Remarks:
+
The processor can be in user state or privileged state when running in thread mode.
+
Exception handlers always run in privileged state.
+
On reset, the processor is in thread mode with privileged access rights.
The function sets the Fault Mask register (FAULTMASK) value using the instruction MSR.
+
+ FAULTMASK prevents activation of all exceptions except for Non-Maskable Interrupt (NMI). FAULTMASK can be used to escalate a configurable fault handler (BusFault, usage fault, or memory management fault) to hard fault level without invoking a hard fault. This allows the fault handler to pretend to be the hard fault handler, whith the ability to:
+
+
Mask BusFault by setting the BFHFNMIGN in the Configuration Control register. It can be used to test the bus system without causing a lockup.
+
Bypass the MPU, allowing accessing the MPU protected memory location without reprogramming the MPU to just carry out a few transfers for fixing faults.
+
+
Parameters:
+
+
[in]
faultMask
FAULTMASK register value to set
+
+
+
+
Remarks:
+
not for Cortex-M0 variants.
+
Is cleared automatically upon exiting the exception handler, except when returning from the NMI handler.
+
When set, it changes the effective current priority level to -1, so that even the hard fault handler is blocked.
+
Can be used by fault handlers to change their priority to -1 to have access to some features for hard fault exceptions (see above).
+
When set, lockups can still be caused by incorrect or undefined instructions, or by using SVC in the wrong priority level.
The function sets the Floating-Point Status Control Register (FPSCR) value.
+
+ FPSCR provides all necessary User level control of the floating-point system.
+
+
+
N (FPSC[31]) (Negative flag)
+
=1 The instruction result has a negative value (when interpreted as signed integer).
+
=0 The instruction result has a positive value or equal zero.
+
+
+
+
+
Z (FPSC[30]) (Zero flag)
+
=1 The instruction result is zero. Or, after a compare instruction, when the two values are the same.
+
+
+
+
+
C (FPSC[29]) (Carry or borrow flag)
+
=1 For unsigned additions, if an unsigned overflow occurred.
+
=inverse of borrow output status For unsigned subtract operations.
+
+
+
+
+
V (FPSC[28]) (Overflow flag)
+
=1 A signed overflow occurred (for signed additions or subtractions).
+
+
+
+
+
AHP (FPSC[26]) (Alternative half-precision flag)
+
=1 Alternative half-precision format selected.
+
=0 IEEE half-precision format selected.
+
+
+
+
+
DN (FPSC[25]) (Default NaN mode control flag)
+
=1 Any operation involving one or more NaNs returns the Default NaN.
+
=0 NaN operands propagate through to the output of a floating-point operation.
+
+
+
+
+
FZ (FPSC[24]) (Flush-to-zero mode control flag)
+
=1 Flush-to-zero mode enabled.
+
=0 Flush-to-zero mode disabled. Behavior of the floating-point system is fully compliant with the IEEE 754 standard.
+
+
+
+
+
RMode (FPSC[23:22]) (Rounding Mode control flags)
+
=0b00 Round to Nearest (RN) mode.
+
=0b01 Round towards Plus Infinity (RP) mode.
+
=0b10 Round towards Minus Infinity (RM) mode.
+
=0b11 Round towards Zero (RZ) mode.
+
The specified rounding mode is used by almost all floating-point instructions.
+
+
The function sets the Main Status Pointer (MSP) value using the instruction MSR.
+
+ Physically two different stack pointers (SP) exist:
+
+
The Main Stack Pointer (MSP) is the default stack pointer after reset. It is also used when running exception handlers (handler mode).
+
The Process Stack Pointer (PSP), which can be used only in thread mode.
+
+
Register R13 banks the SP. The SP selection is determined by the bit[1] of the CONTROL register:
+
+
=0 MSP is the current stack pointer. This is also the default SP. The initial value is loaded from the first 32-bit word of the vector table from the program memory.
+
=1 PSP is the current stack pointer. The initial value is undefined.
+
+
Parameters:
+
+
[in]
topOfMainStack
MSP value to set
+
+
+
+
Remarks:
+
Only one of the two SPs is visible at a time.
+
For many applications, the system can completely rely on the MSP.
+
The PSP is normally used in designs with an OS where the stack memory for OS Kernel must be separated from the application code.
The function sets the Priority Mask register (PRIMASK) value using the instruction MSR.
+
+ PRIMASK is a 1-bit-wide interrupt mask register. When set, it blocks all interrupts apart from the non-maskable interrupt (NMI) and the hard fault exception. The PRIMASK prevents activation of all exceptions with configurable priority.
+
Parameters:
+
+
[in]
priMask
Priority Mask
+
=0 no effect
+
=1 prevents the activation of all exceptions with configurable priority
+
+
+
+
+
+
Remarks:
+
When set, PRIMASK effectively changes the current priority level to 0. This is the highest programmable level.
+
When set and a fault occurs, the hard fault handler will be executed.
+
Useful for temprorarily disabling all interrupts for timing critical tasks.
+
Does not have the ability to mask BusFault or bypass MPU.
The function sets the Program Status Pointer (PSP) value using the instruction MSR.
+
+ Physically two different stack pointers (SP) exist:
+
+
The Main Stack Pointer (MSP) is the default stack pointer after reset. It is also used when running exception handlers (handler mode).
+
The Process Stack Pointer (PSP), which can be used only in thread mode.
+
+
Register R13 banks the SP. The SP selection is determined by the bit[1] of the CONTROL register:
+
+
=0 MSP is the current stack pointer. This is also the default SP. The initial value is loaded from the first 32-bit word of the vector table from the program memory.
+
=1 PSP is the current stack pointer. The initial value is undefined.
+
+
Parameters:
+
+
[in]
topOfProcStack
PSP value to set
+
+
+
+
Remarks:
+
Only one of the two SPs is visible at a time.
+
For many applications, the system can completely rely on the MSP.
+
The PSP is normally used in designs with an OS where the stack memory for OS Kernel must be separated from the application code.
CMSIS provides additional debug functions to enlarge the Debug Access. Data can be transmitted via a certain global buffer variable towards the target system.
+
The Cortex-M3 / Cortex-M4 incorporates the Instrumented Trace Macrocell (ITM) that provides together with the Serial Viewer Output (SVO) trace capabilities for the microcontroller system. The ITM has 32 communication channels; two ITM communication channels are used by CMSIS to output the following information:
+
+
ITM Channel 0: implements the ITM_SendChar function which can be used for printf-style output via the debug interface.
+
+
+
ITM Channel 31: is reserved for the RTOS kernel and can be used for kernel awareness debugging.
+
+
Remarks:
+
ITM channels have 4 groups with 8 channels each, whereby each group can be configured for access rights in the Unprivileged level.
+
The ITM channel 0 can be enabled for the user task.
+
ITM channel 31 can be accessed only in Privileged mode from the RTOS kernel itself. The ITM channel 31 has been selected for the RTOS kernel because some kernels may use the Privileged level for program execution.
+
+
+
+
+ITM Debug Support in uVision
+
In a debug session, uVision uses the Debug (printf) Viewer window to display data.
+
Direction: Microcontroller --> uVision:
+
+
Characters received via ITM communication channel 0 are written in a printf-style to the Debug (printf) Viewer window.
+
+
Direction: uVision --> Microcontroller:
+
+
Check if ITM_RxBuffer variable is available (only performed once).
+
Read the character from the Debug (printf) Viewer window.
This function reads the external variable ITM_RxBuffer and checks whether a character is available or not.
+
Returns:
+
=0 - No character available
+
=1 - Character available
+
+
+
+
+
+
+
+
+
+
+
int32_t ITM_ReceiveChar
+
(
+
void
+
)
+
+
+
+
+
+
This function inputs a character via the external variable ITM_RxBuffer. It returns when no debugger is connected that has booked the output. It is blocking when a debugger is connected, but the previously sent character has not been transmitted.
+
Returns:
+
Received character
+
=1 - No character received
+
+
+
+
+
+
+
+
+
+
+
uint32_t ITM_SendChar
+
(
+
uint32_t
+
ch
)
+
+
+
+
+
+
This function transmits a character via the ITM channel 0. It returns when no debugger is connected that has booked the output. It is blocking when a debugger is connected, but the previously sent character has not been transmitted.
ARM provides a template file startup_device for each supported compiler. The file must be adapted by the silicon vendor to include interrupt vectors for all device-specific interrupt handlers. Each interrupt handler is defined as a weak function to an dummy handler. These interrupt handlers can be used directly in application software without being adapted by the programmer.
+
The table below describes the core exception names and their availability in various Cortex-M cores.
+
+
+
Core Exception Name
IRQn Value
M0
M0p
M3
M4
SC000
SC300
Description
+
+
NonMaskableInt_IRQn
-14
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Non Maskable Interrupt
+
+
HardFault_IRQn
-13
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Hard Fault Interrupt
+
+
MemoryManagement_IRQn
-12
+
+
+
+
+
+
+
+
+
Memory Management Interrupt
+
+
BusFault_IRQn
-11
+
+
+
+
+
+
+
+
+
Bus Fault Interrupt
+
+
UsageFault_IRQn
-10
+
+
+
+
+
+
+
+
+
Usage Fault Interrupt
+
+
SVCall_IRQn
-5
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
SV Call Interrupt
+
+
DebugMonitor_IRQn
-4
+
+
+
+
+
+
+
+
+
Debug Monitor Interrupt
+
+
PendSV_IRQn
-2
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Pend SV Interrupt
+
+
SysTick_IRQn
-1
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
System Tick Interrupt
+
+
+For Cortex-M0 and Cortex-M0+
+
The following exception names are fixed and define the start of the vector table for Cortex-M0 variants:
This function decodes an interrupt priority value with the priority group PriorityGroup to preemptive priority value pPreemptPriority and subpriority value pSubPriority. In case of a conflict between priority grouping and available priority bits (__NVIC_PRIO_BITS) the smallest possible priority group is set.
This function enables the specified device-specific interrupt IRQn. IRQn cannot be a negative value.
+
Parameters:
+
+
[in]
IRQn
Interrupt number
+
+
+
+
Remarks:
+
The registers that control the enabling and disabling of interrupts are called SETENA and CLRENA.
+
The number of supported interrupts depends on the implementation of the chip designer and can be read form the Interrupt Controller Type Register (ICTR) in granularities of 32:
+ ICTR[4:0]
This function encodes the priority for an interrupt with the priority group PriorityGroup, preemptive priority value PreemptPriority, and subpriority value SubPriority. In case of a conflict between priority grouping and available priority bits (__NVIC_PRIO_BITS) the smallest possible priority group is set.
This function reads the Interrupt Active Register (NVIC_IABR0-NVIC_IABR7) in NVIC and returns the active bit of the interrupt IRQn.
+
Parameters:
+
+
[in]
IRQn
Interrupt number
+
+
+
+
Returns:
+
=0 Interrupt is not active
+
=1 Interrupt is active, or active and pending
+
+
+
Remarks:
+
not for Cortex-M0 variants.
+
Each external interrupt has an active status bit. When the processor starts the interrupt handler the bit is set to 1 and cleared when the interrupt return is executed.
+
When an ISR is preempted and the processor executes anohter interrupt handler, the previous interrupt is still defined as active.
This function reads the priority for the specified interrupt IRQn. IRQn can can specify any device-specific (external) interrupt, or core (internal) interrupt.
+
The returned priority value is automatically aligned to the implemented priority bits of the microcontroller.
+
Parameters:
+
+
[in]
IRQn
Interrupt number
+
+
+
+
Returns:
Interrupt priority
+
Remarks:
+
Each external interrupt has an associated priority-level register.
Sets the priority for the interrupt specified by IRQn.IRQn can can specify any device-specific (external) interrupt, or core (internal) interrupt. The priority specifies the interrupt priority value, whereby lower values indicate a higher priority. The default priority is 0 for every interrupt. This is the highest possible priority.
+
The priority cannot be set for every core interrupt. HardFault and NMI have a fixed (negative) priority that is higher than any configurable exception or interrupt.
+
Parameters:
+
+
[in]
IRQn
Interrupt Number
+
[in]
priority
Priority to set
+
+
+
+
Remarks:
+
The number of priority levels is configurable and depends on the implementation of the chip designer. To determine the number of bits implemented for interrupt priority-level registers, write 0xFF to one of the priority-level register, then read back the value. For example, if the minimum number of 3 bits have been implemented, the read-back value is 0xE0.
+
Writes to unimplemented bits are ignored.
+
For Cortex-M0:
+
Dynamic switching of interrupt priority levels is not supported. The priority level of an interrupt should not be changed after it has been enabled.
+
Supports 0 to 192 priority levels.
+
Priority-level registers are 2 bit wide, occupying the two MSBs. Each Interrupt Priority Level Register is 1-byte wide.
+
+
+
For Cortex-M3 and Cortex-M4:
+
Dynamic switching of interrupt priority levels is supported.
+
Supports 0 to 255 priority levels.
+
Priority-level registers have a maximum width of 8 bits and a minumum of 3 bits. Each register can be further devided into preempt priority level and subpriority level.
The function sets the priority grouping PriorityGroup using the required unlock sequence. PriorityGroup is assigned to the field PRIGROUP (register AIRCR[10:8]). This field determines the split of group priority from subpriority. Only values from 0..7 are used. In case of a conflict between priority grouping and available priority bits (__NVIC_PRIO_BITS), the smallest possible priority group is set.
This function requests a system reset by setting the SYSRESETREQ flag in the AIRCR register.
+
Remarks:
+
In most microcontroller designs, setting the SYSRESETREQ flag resets the processor and most parts of the system, but should not affect the debug system.
The System Tick Time (SysTick) generates interrupt requests on a regular basis. This allows an OS to carry out context switching to support multiple tasking. For applications that do not require an OS, the SysTick can be used for time keeping, time measurement, or as an interrupt source for tasks that need to be executed regularly.
+
+Code Example
+
The code below shows the usage of the function SysTick_Config() with an LPC1700.
+
#include "LPC17xx.h"
+
+uint32_t msTicks = 0; /* Variable to store millisecond ticks */
+
+
+void SysTick_Handler(void) { /* SysTick interrupt Handler.
+ msTicks++; See startup file startup_LPC17xx.s for SysTick vector */
+}
+
+
+int main (void) {
+ uint32_t returnCode;
+
+ returnCode = SysTick_Config(SystemCoreClock / 1000); /* Configure SysTick to generate an interrupt every millisecond */
+
+ if (returnCode != 0) { /* Check return code for errors */
+ // Error Handling
+ }
+
+ while(1);
+}
+
Function Documentation
+
+
+
+
+
+
uint32_t SysTick_Config
+
(
+
uint32_t
+
ticks
)
+
+
+
+
+
+
Initialises and starts the System Tick Timer and its interrupt. After this call, the SysTick timer creates interrupts with the specified time interval. Counter is in free running mode to generate periodical interrupts.
+
Parameters:
+
+
[in]
ticks
Number of ticks between two interrupts
+
+
+
+
Returns:
0 - success
+
+1 - failure
+
Note:
When #define __Vendor_SysTickConfig is set to 1, the standard function SysTick_Config is excluded. In this case, the file device.h must contain a vendor specific implementation of this function.
The following functions generate specific Cortex-M instructions that cannot be directly accessed by the C/C++ Compiler.
+
Function Documentation
+
+
+
+
+
+
void __CLREX
+
(
+
void
+
)
+
+
+
+
+
+
This function removes the exclusive lock which is created by LDREX [not for Cortex-M0 variants].
+
+
+
+
+
+
+
+
+
uint8_t __CLZ
+
(
+
uint32_t
+
value
)
+
+
+
+
+
+
This function counts the number of leading zeros of a data value [not for Cortex-M0 variants].
+
Parameters:
+
+
[in]
value
Value to count the leading zeros
+
+
+
+
Returns:
number of leading zeros in value
+
+
+
+
+
+
+
+
+
void __DMB
+
(
+
void
+
)
+
+
+
+
+
+
This function ensures the apparent order of the explicit memory operations before and after the instruction, without ensuring their completion.
+
+
+
+
+
+
+
+
+
void __DSB
+
(
+
void
+
)
+
+
+
+
+
+
This function acts as a special kind of Data Memory Barrier. It completes when all explicit memory accesses before this instruction complete.
+
+
+
+
+
+
+
+
+
void __ISB
+
(
+
void
+
)
+
+
+
+
+
+
Instruction Synchronization Barrier flushes the pipeline in the processor, so that all instructions following the ISB are fetched from cache or memory, after the instruction has been completed.
+
+
+
+
+
+
+
+
+
uint8_t __LDREXB
+
(
+
volatile uint8_t *
+
addr
)
+
+
+
+
+
+
This function performs a exclusive LDR command for 8 bit value [not for Cortex-M0 variants].
+
Parameters:
+
+
[in]
*addr
Pointer to data
+
+
+
+
Returns:
value of type uint8_t at (*addr)
+
+
+
+
+
+
+
+
+
uint16_t __LDREXH
+
(
+
volatile uint16_t *
+
addr
)
+
+
+
+
+
+
This function performs a exclusive LDR command for 16 bit values [not for Cortex-M0 variants].
+
Parameters:
+
+
[in]
*addr
Pointer to data
+
+
+
+
Returns:
value of type uint16_t at (*addr)
+
+
+
+
+
+
+
+
+
uint32_t __LDREXW
+
(
+
volatile uint32_t *
+
addr
)
+
+
+
+
+
+
This function performs a exclusive LDR command for 32 bit values [not for Cortex-M0 variants].
+
Parameters:
+
+
[in]
*addr
Pointer to data
+
+
+
+
Returns:
value of type uint32_t at (*addr)
+
+
+
+
+
+
+
+
+
void __NOP
+
(
+
void
+
)
+
+
+
+
+
+
This function does nothing. This instruction can be used for code alignment purposes.
+
+
+
+
+
+
+
+
+
uint32_t __RBIT
+
(
+
uint32_t
+
value
)
+
+
+
+
+
+
This function reverses the bit order of the given value [not for Cortex-M0 variants].
+
Parameters:
+
+
[in]
value
Value to reverse
+
+
+
+
Returns:
Reversed value
+
+
+
+
+
+
+
+
+
uint32_t __REV
+
(
+
uint32_t
+
value
)
+
+
+
+
+
+
This function reverses the byte order in integer value.
+
Parameters:
+
+
[in]
value
Value to reverse
+
+
+
+
Returns:
Reversed value
+
+
+
+
+
+
+
+
+
uint32_t __REV16
+
(
+
uint32_t
+
value
)
+
+
+
+
+
+
This function reverses the byte order in two unsigned short values.
+
Parameters:
+
+
[in]
value
Value to reverse
+
+
+
+
Returns:
Reversed value
+
+
+
+
+
+
+
+
+
int32_t __REVSH
+
(
+
int32_t
+
value
)
+
+
+
+
+
+
This function reverses the byte order in a signed short value with sign extension to integer.
+
Parameters:
+
+
[in]
value
Value to reverse
+
+
+
+
Returns:
Reversed value
+
+
+
+
+
+
+
+
+
uint32_t __ROR
+
(
+
uint32_t
+
value,
+
+
+
+
+
uint32_t
+
shift
+
+
+
+
)
+
+
+
+
+
+
This function rotates a value right by a specified number of bits.
+
Parameters:
+
+
[in]
value
Value to be shifted right
+
[in]
shift
Number of bits in the range [1..31]
+
+
+
+
Returns:
Rotated value
+
+
+
+
+
+
+
+
+
void __SEV
+
(
+
void
+
)
+
+
+
+
+
+
Send Event is a hint instruction. It causes an event to be signaled to the CPU.
+
+
+
+
+
+
+
+
+
uint32_t __SSAT
+
(
+
unint32_t
+
value,
+
+
+
+
+
uint32_t
+
sat
+
+
+
+
)
+
+
+
+
+
+
This function saturates a signed value [not for Cortex-M0 variants].
+
Parameters:
+
+
[in]
value
Value to be saturated
+
[in]
sat
Bit position to saturate to [1..32]
+
+
+
+
Returns:
Saturated value
+
+
+
+
+
+
+
+
+
uint32_t __STREXB
+
(
+
uint8_t
+
value,
+
+
+
+
+
volatile uint8_t *
+
addr
+
+
+
+
)
+
+
+
+
+
+
This function performs a exclusive STR command for 8 bit values [not for Cortex-M0 variants].
+
Parameters:
+
+
[in]
value
Value to store
+
[in]
*addr
Pointer to location
+
+
+
+
Returns:
0 Function succeeded
+
+1 Function failed
+
+
+
+
+
+
+
+
+
uint32_t __STREXH
+
(
+
uint16_t
+
value,
+
+
+
+
+
volatile uint16_t *
+
addr
+
+
+
+
)
+
+
+
+
+
+
This function performs a exclusive STR command for 16 bit values [not for Cortex-M0 variants].
+
Parameters:
+
+
[in]
value
Value to store
+
[in]
*addr
Pointer to location
+
+
+
+
Returns:
0 Function succeeded
+
+1 Function failed
+
+
+
+
+
+
+
+
+
uint32_t __STREXW
+
(
+
uint32_t
+
value,
+
+
+
+
+
volatile uint32_t *
+
addr
+
+
+
+
)
+
+
+
+
+
+
This function performs a exclusive STR command for 32 bit values [not for Cortex-M0 variants].
+
Parameters:
+
+
[in]
value
Value to store
+
[in]
*addr
Pointer to location
+
+
+
+
Returns:
0 Function succeeded
+
+1 Function failed
+
+
+
+
+
+
+
+
+
uint32_t __USAT
+
(
+
uint32_t
+
value,
+
+
+
+
+
uint32_t
+
sat
+
+
+
+
)
+
+
+
+
+
+
This function saturates an unsigned value [not for Cortex-M0 variants].
+
Parameters:
+
+
[in]
value
Value to be saturated
+
[in]
sat
Bit position to saturate to [0..31]
+
+
+
+
Returns:
Saturated value
+
+
+
+
+
+
+
+
+
void __WFE
+
(
+
void
+
)
+
+
+
+
+
+
Wait For Event is a hint instruction that permits the processor to enter a low-power state until an events occurs:
+
+
If the event register is 0, then WFE suspends execution until one of the following events occurs:
+
An exception, unless masked by the exception mask registers or the current priority level.
+
An exception enters the Pending state, if SEVONPEND in the System Control Register is set.
+
A Debug Entry request, if Debug is enabled.
+
An event signaled by a peripheral or another processor in a multiprocessor system using the SEV instruction.
+
+
+
+
+
If the event register is 1, then WFE clears it to 0 and returns immediately.
+
+
+
+
+
+
+
+
+
+
void __WFI
+
(
+
void
+
)
+
+
+
+
+
+
WFI is a hint instruction that suspends execution until one of the following events occurs:
Halfword packing instruction. Combines bits[31:16] of val1 with bits[15:0] of val2 right-shifted with the val3.
+
+
Description
+
Single Instruction Multiple Data (SIMD) extensions are provided only for Cortex-M4 cores to simplify development of application software. SIMD extensions increase the processing capability without materially increasing the power consumption. The SIMD extensions are completely transparent to the operating system (OS), allowing existing OS ports to be used.
+
SIMD Features:
+
+
Simultaneous computation of 2x16-bit or 4x8-bit operands
+
Fractional arithmetic
+
User definable saturation modes (arbitrary word-width)
+
Dual 16x16 multiply-add/subtract 32x32 fractional MAC
+
Simultaneous 8/16-bit select operations
+
Performance up to 3.2 GOPS at 800MHz
+
Performance is achieved with a "near zero" increase in power consumption on a typical implementation
Combine a halfword from one register with a halfword from another register. The second argument can be left-shifted before extraction of the halfword. The registers PC and SP are not allowed as arguments. This instruction does not change the flags.
+
Parameters:
+
+
val1
first 16-bit operands
+
val2
second 16-bit operands
+
val3
value for left-shifting val2. Value range [0..31].
Combines a halfword from one register with a halfword from another register. The second argument can be right-shifted before extraction of the halfword. The registers PC and SP are not allowed as arguments. This instruction does not change the flags.
+
Parameters:
+
+
val1
second 16-bit operands
+
val2
first 16-bit operands
+
val3
value for right-shifting val2. Value range [1..32].
This function enables you to obtain the saturating add of two integers.
+ The Q bit is set if the operation saturates.
+
Parameters:
+
+
val1
first summand of the saturating add operation.
+
val2
second summand of the saturating add operation.
+
+
+
+
Returns:
the saturating addition of val1 and val2.
+
Operation:
res[31:0] = SAT(val1 + SAT(val2 * 2))
+
+
+
+
+
+
+
+
+
+
uint32_t __QADD16
+
(
+
uint32_t
+
val1,
+
+
+
+
+
uint32_t
+
val2
+
+
+
+
)
+
+
+
+
+
+
This function enables you to perform two 16-bit integer arithmetic additions in parallel, saturating the results to the 16-bit signed integer range -215 <= x <= 215 - 1.
+
Parameters:
+
+
val1
first two 16-bit summands.
+
val2
second two 16-bit summands.
+
+
+
+
Returns:
+
the saturated addition of the low halfwords, in the low halfword of the return value.
+
the saturated addition of the high halfwords, in the high halfword of the return value.
+
+
+
The returned results are saturated to the 16-bit signed integer range -215 <= x <= 215 - 1
This function enables you to exchange the halfwords of the one operand, then add the high halfwords and subtract the low halfwords, saturating the results to the 16-bit signed integer range -215 <= x <= 215 - 1.
+
Parameters:
+
+
val1
first operand for the subtraction in the low halfword, and the first operand for the addition in the high halfword.
+
val2
second operand for the subtraction in the high halfword, and the second operand for the addition in the low halfword.
+
+
+
+
Returns:
+
the saturated subtraction of the high halfword in the second operand from the low halfword in the first operand, in the low halfword of the return value.
+
the saturated addition of the high halfword in the first operand and the low halfword in the second operand, in the high halfword of the return value.
+
+
+
The returned results are saturated to the 16-bit signed integer range -215 <= x <= 215 - 1.
This function enables you to exchange the halfwords of one operand, then subtract the high halfwords and add the low halfwords, saturating the results to the 16-bit signed integer range -215 <= x <= 215 - 1.
+
Parameters:
+
+
val1
first operand for the addition in the low halfword, and the first operand for the subtraction in the high halfword.
+
val2
second operand for the addition in the high halfword, and the second operand for the subtraction in the low halfword.
+
+
+
+
Returns:
+
the saturated addition of the low halfword of the first operand and the high halfword of the second operand, in the low halfword of the return value.
+
the saturated subtraction of the low halfword of the second operand from the high halfword of the first operand, in the high halfword of the return value.
+
+
+
The returned results are saturated to the 16-bit signed integer range -215 <= x <= 215 - 1.
This function enables you to obtain the saturating subtraction of two integers.
+ The Q bit is set if the operation saturates.
+
Parameters:
+
+
val1
minuend of the saturating subtraction operation.
+
val2
subtrahend of the saturating subtraction operation.
+
+
+
+
Returns:
the saturating subtraction of val1 and val2.
+
Operation:
res[31:0] = SAT(val1 - SAT(val2 * 2))
+
+
+
+
+
+
+
+
+
+
uint32_t __QSUB16
+
(
+
uint32_t
+
val1,
+
+
+
+
+
uint32_t
+
val2
+
+
+
+
)
+
+
+
+
+
+
This function enables you to perform two 16-bit integer subtractions, saturating the results to the 16-bit signed integer range -215 <= x <= 215 - 1.
+
Parameters:
+
+
val1
first two 16-bit operands.
+
val2
second two 16-bit operands.
+
+
+
+
Returns:
+
the saturated subtraction of the low halfword in the second operand from the low halfword in the first operand, in the low halfword of the returned result.
+
the saturated subtraction of the high halfword in the second operand from the high halfword in the first operand, in the high halfword of the returned result.
+
+
+
The returned results are saturated to the 16-bit signed integer range -215 <= x <= 215 - 1.
This function inserts an SASX instruction into the instruction stream generated by the compiler. It enables you to exchange the halfwords of the second operand, add the high halfwords and subtract the low halfwords.
+ The GE bits in the APRS are set according to the results.
+
Parameters:
+
+
val1
first operand for the subtraction in the low halfword, and the first operand for the addition in the high halfword.
+
val2
second operand for the subtraction in the high halfword, and the second operand for the addition in the low halfword.
+
+
+
+
Returns:
+
the subtraction of the high halfword in the second operand from the low halfword in the first operand, in the low halfword of the return value.
+
the addition of the high halfword in the first operand and the low halfword in the second operand, in the high halfword of the return value.
+
+
+
Each bit in APSR.GE is set or cleared for each byte in the return value, depending on the results of the operation.
This function inserts a SEL instruction into the instruction stream generated by the compiler. It enables you to select bytes from the input parameters, whereby the bytes that are selected depend upon the results of previous SIMD instruction function. The results of previous SIMD instruction function are represented by the Greater than or Equal flags in the Application Program Status Register (APSR). The __SEL function works equally well on both halfword and byte operand function results. This is because halfword operand operations set two (duplicate) GE bits per value.
+
Parameters:
+
+
val1
four selectable 8-bit values.
+
val2
four selectable 8-bit values.
+
+
+
+
Returns:
The function selects bytes from the input parameters and returns them in the return value, res, according to the following criteria:
+
if APSR.GE[0] == 1 then res[7:0] = val1[7:0] else res[7:0] = val2[7:0]
+
if APSR.GE[1] == 1 then res[15:8] = val1[15:8] else res[15:8] = val2[15:8]
+
if APSR.GE[2] == 1 then res[23:16] = val1[23:16] else res[23:16] = val2[23:16]
+
if APSR.GE[3] == 1 then res[31;24] = val1[31:24] else res = val2[31:24]
+
+
+
+
+
+
+
+
+
+
+
uint32_t __SHADD16
+
(
+
uint32_t
+
val1,
+
+
+
+
+
uint32_t
+
val2
+
+
+
+
)
+
+
+
+
+
+
This function enables you to perform two signed 16-bit integer additions, halving the results.
+
Parameters:
+
+
val1
first two 16-bit summands.
+
val2
second two 16-bit summands.
+
+
+
+
Returns:
+
the halved addition of the low halfwords, in the low halfword of the return value.
+
the halved addition of the high halfwords, in the high halfword of the return value.
This function enables you to exchange the two halfwords of one operand, perform one signed 16-bit integer addition and one signed 16-bit subtraction, and halve the results.
+
Parameters:
+
+
val1
first 16-bit operands.
+
val2
second 16-bit operands.
+
+
+
+
Returns:
+
the halved subtraction of the high halfword in the second operand from the low halfword in the first operand, in the low halfword of the return value.
+
the halved subtraction of the low halfword in the second operand from the high halfword in the first operand, in the high halfword of the return value.
This function enables you to exchange the two halfwords of one operand, perform one signed 16-bit integer subtraction and one signed 16-bit addition, and halve the results.
+
Parameters:
+
+
val1
first 16-bit operands.
+
val2
second 16-bit operands.
+
+
+
+
Returns:
+
the halved addition of the low halfword in the first operand and the high halfword in the second operand, in the low halfword of the return value.
+
the halved subtraction of the low halfword in the second operand from the high halfword in the first operand, in the high halfword of the return value.
This function enables you to perform two signed 16-bit integer subtractions, halving the results.
+
Parameters:
+
+
val1
first two 16-bit operands.
+
val2
second two 16-bit operands.
+
+
+
+
Returns:
+
the halved subtraction of the low halfword in the second operand from the low halfword in the first operand, in the low halfword of the returned result.
+
the halved subtraction of the high halfword in the second operand from the high halfword in the first operand, in the high halfword of the returned result.
This function enables you to perform two signed 16-bit multiplications, adding both results to a 32-bit accumulate operand.
+ The Q bit is set if the addition overflows. Overflow cannot occur during the multiplications.
+
Parameters:
+
+
val1
first 16-bit operands for each multiplication.
+
val2
second 16-bit operands for each multiplication.
+
val3
accumulate value.
+
+
+
+
Returns:
the product of each multiplication added to the accumulate value, as a 32-bit integer.
This function enables you to perform two signed 16-bit multiplications with exchanged halfwords of the second operand, adding both results to a 32-bit accumulate operand.
+ The Q bit is set if the addition overflows. Overflow cannot occur during the multiplications.
+
Parameters:
+
+
val1
first 16-bit operands for each multiplication.
+
val2
second 16-bit operands for each multiplication.
+
val3
accumulate value.
+
+
+
+
Returns:
the product of each multiplication with exchanged halfwords of the second operand added to the accumulate value, as a 32-bit integer.
This function enables you to perform two signed 16-bit multiplications, adding both results to a 64-bit accumulate operand. Overflow is only possible as a result of the 64-bit addition. This overflow is not detected if it occurs. Instead, the result wraps around modulo264.
+
Parameters:
+
+
val1
first 16-bit operands for each multiplication.
+
val2
second 16-bit operands for each multiplication.
+
val3
accumulate value.
+
+
+
+
Returns:
the product of each multiplication added to the accumulate value.
This function enables you to exchange the halfwords of the second operand, and perform two signed 16-bit multiplications, adding both results to a 64-bit accumulate operand. Overflow is only possible as a result of the 64-bit addition. This overflow is not detected if it occurs. Instead, the result wraps around modulo264.
+
Parameters:
+
+
val1
first 16-bit operands for each multiplication.
+
val2
second 16-bit operands for each multiplication.
+
val3
accumulate value.
+
+
+
+
Returns:
the product of each multiplication added to the accumulate value.
This function enables you to perform two 16-bit signed multiplications, take the difference of the products, subtracting the high halfword product from the low halfword product, and add the difference to a 32-bit accumulate operand.
+ The Q bit is set if the accumulation overflows. Overflow cannot occur during the multiplications or the subtraction.
+
Parameters:
+
+
val1
first 16-bit operands for each multiplication.
+
val2
second 16-bit operands for each multiplication.
+
val3
accumulate value.
+
+
+
+
Returns:
the difference of the product of each multiplication, added to the accumulate value.
This function enables you to exchange the halfwords in the second operand, then perform two 16-bit signed multiplications. The difference of the products is added to a 32-bit accumulate operand.
+ The Q bit is set if the addition overflows. Overflow cannot occur during the multiplications or the subtraction.
+
Parameters:
+
+
val1
first 16-bit operands for each multiplication.
+
val2
second 16-bit operands for each multiplication.
+
val3
accumulate value.
+
+
+
+
Returns:
the difference of the product of each multiplication, added to the accumulate value.
This function It enables you to perform two 16-bit signed multiplications, take the difference of the products, subtracting the high halfword product from the low halfword product, and add the difference to a 64-bit accumulate operand. Overflow cannot occur during the multiplications or the subtraction. Overflow can occur as a result of the 64-bit addition, and this overflow is not detected. Instead, the result wraps round to modulo264.
+
Parameters:
+
+
val1
first 16-bit operands for each multiplication.
+
val2
second 16-bit operands for each multiplication.
+
val3
accumulate value.
+
+
+
+
Returns:
the difference of the product of each multiplication, added to the accumulate value.
This function enables you to exchange the halfwords of the second operand, perform two 16-bit multiplications, adding the difference of the products to a 64-bit accumulate operand. Overflow cannot occur during the multiplications or the subtraction. Overflow can occur as a result of the 64-bit addition, and this overflow is not detected. Instead, the result wraps round to modulo264.
+
Parameters:
+
+
val1
first 16-bit operands for each multiplication.
+
val2
second 16-bit operands for each multiplication.
+
val3
accumulate value.
+
+
+
+
Returns:
the difference of the product of each multiplication, added to the accumulate value.
This function enables you to perform two 16-bit signed multiplications with exchanged halfwords of the second operand, adding the products together.
+ The Q bit is set if the addition overflows.
+
Parameters:
+
+
val1
first 16-bit operands for each multiplication.
+
val2
second 16-bit operands for each multiplication.
+
+
+
+
Returns:
the sum of the products of the two 16-bit signed multiplications with exchanged halfwords of the second operand.
This function enables you to perform two 16-bit signed multiplications, taking the difference of the products by subtracting the high halfword product from the low halfword product.
+
Parameters:
+
+
val1
first 16-bit operands for each multiplication.
+
val2
second 16-bit operands for each multiplication.
+
+
+
+
Returns:
the difference of the products of the two 16-bit signed multiplications.
This function enables you to perform two 16-bit signed multiplications, subtracting one of the products from the other. The halfwords of the second operand are exchanged before performing the arithmetic. This produces top * bottom and bottom * top multiplication.
+
Parameters:
+
+
val1
first 16-bit operands for each multiplication.
+
val2
second 16-bit operands for each multiplication.
+
+
+
+
Returns:
the difference of the products of the two 16-bit signed multiplications.
This function enables you to saturate two signed 16-bit values to a selected signed range.
+ The Q bit is set if either operation saturates.
+
Parameters:
+
+
val1
two signed 16-bit values to be saturated.
+
val2
bit position for saturation, an integral constant expression in the range 1 to 16.
+
+
+
+
Returns:
the sum of the absolute differences of the following bytes, added to the accumulation value:
+
the signed saturation of the low halfword in val1, saturated to the bit position specified in val2 and returned in the low halfword of the return value.
+
the signed saturation of the high halfword in val1, saturated to the bit position specified in val2 and returned in the high halfword of the return value.
+
+
+
Operation:
Saturate halfwords in val1 to the signed range specified by the bit position in val2
+
+
+
+
+
+
+
+
+
+
uint32_t __SSAX
+
(
+
uint32_t
+
val1,
+
+
+
+
+
uint32_t
+
val2
+
+
+
+
)
+
+
+
+
+
+
This function enables you to exchange the two halfwords of one operand and perform one 16-bit integer subtraction and one 16-bit addition.
+ The GE bits in the APSR are set according to the results.
+
Parameters:
+
+
val1
first operand for the addition in the low halfword, and the first operand for the subtraction in the high halfword.
+
val2
second operand for the addition in the high halfword, and the second operand for the subtraction in the low halfword.
+
+
+
+
Returns:
+
the addition of the low halfword in the first operand and the high halfword in the second operand, in the low halfword of the return value.
+
the subtraction of the low halfword in the second operand from the high halfword in the first operand, in the high halfword of the return value.
+
+
+
Each bit in APSR.GE is set or cleared for each byte in the return value, depending on the results of the operation.
This function enables you to extract two 8-bit values from the second operand (at bit positions [7:0] and [23:16]), sign-extend them to 16-bits each, and add the results to the first operand.
+
Parameters:
+
+
val1
values added to the zero-extended to 16-bit values.
+
val2
two 8-bit values to be extracted and zero-extended.
+
+
+
+
Returns:
the addition of val1 and val2, where the 8-bit values in val2[7:0] and val2[23:16] have been extracted and sign-extended prior to the addition.
This function enables you to exchange the two halfwords of the second operand, add the high halfwords and subtract the low halfwords.
+ The GE bits in the APSR are set according to the results.
+
Parameters:
+
+
val1
first operand for the subtraction in the low halfword, and the first operand for the addition in the high halfword.
+
val2
second operand for the subtraction in the high halfword and the second operand for the addition in the low halfword.
+
+
+
+
Returns:
+
the subtraction of the high halfword in the second operand from the low halfword in the first operand, in the low halfword of the return value.
+
the addition of the high halfword in the first operand and the low halfword in the second operand, in the high halfword of the return value.
+
+
+
Each bit in APSR.GE is set or cleared for each byte in the return value, depending on the results of the operation.
+
If res is the return value, then:
+
if res[15:0] >= 0 then APSR.GE[1:0] = 11 else 00
+
if res[31:16] >= 0x10000 then APSR.GE[3:2] = 11 else 00
This function enables you to exchange the halfwords of the second operand, subtract the high halfwords and add the low halfwords, halving the results.
+
Parameters:
+
+
val1
first operand for the addition in the low halfword, and the first operand for the subtraction in the high halfword.
+
val2
second operand for the addition in the high halfword, and the second operand for the subtraction in the low halfword.
+
+
+
+
Returns:
+
the halved addition of the high halfword in the second operand and the low halfword in the first operand, in the low halfword of the return value.
+
the halved subtraction of the low halfword in the second operand from the high halfword in the first operand, in the high halfword of the return value.
This function enables you to perform two unsigned 16-bit integer subtractions, halving the results.
+
Parameters:
+
+
val1
first two 16-bit operands.
+
val2
second two 16-bit operands.
+
+
+
+
Returns:
+
the halved subtraction of the low halfword in the second operand from the low halfword in the first operand, in the low halfword of the return value.
+
the halved subtraction of the high halfword in the second operand from the high halfword in the first operand, in the high halfword of the return value.
This function enables you to perform two unsigned 16-bit integer additions, saturating the results to the 16-bit unsigned integer range 0 < x < 216 - 1.
+
Parameters:
+
+
val1
first two 16-bit summands.
+
val2
second two 16-bit summands.
+
+
+
+
Returns:
+
the addition of the low halfword in the first operand and the low halfword in the second operand, in the low halfword of the return value.
+
the addition of the high halfword in the first operand and the high halfword in the second operand, in the high halfword of the return value.
+
+
+
The results are saturated to the 16-bit unsigned integer range 0 < x < 216 - 1.
This function enables you to exchange the halfwords of the second operand and perform one unsigned 16-bit integer addition and one unsigned 16-bit subtraction, saturating the results to the 16-bit unsigned integer range 0 <= x <= 216 - 1.
+
Parameters:
+
+
val1
first two 16-bit operands.
+
val2
second two 16-bit operands.
+
+
+
+
Returns:
+
the subtraction of the high halfword in the second operand from the low halfword in the first operand, in the low halfword of the return value.
+
the subtraction of the low halfword in the second operand from the high halfword in the first operand, in the high halfword of the return value.
+
+
+
The results are saturated to the 16-bit unsigned integer range 0 <= x <= 216 - 1.
This function enables you to exchange the halfwords of the second operand and perform one unsigned 16-bit integer subtraction and one unsigned 16-bit addition, saturating the results to the 16-bit unsigned integer range 0 <= x <= 216 - 1.
+
Parameters:
+
+
val1
first 16-bit operand for the addition in the low halfword, and the first 16-bit operand for the subtraction in the high halfword.
+
val2
second 16-bit halfword for the addition in the high halfword, and the second 16-bit halfword for the subtraction in the low halfword.
+
+
+
+
Returns:
+
the addition of the low halfword in the first operand and the high halfword in the second operand, in the low halfword of the return value.
+
the subtraction of the low halfword in the second operand from the high halfword in the first operand, in the high halfword of the return value.
+
+
+
The results are saturated to the 16-bit unsigned integer range 0 <= x <= 216 - 1.
This function enables you to perform two unsigned 16-bit integer subtractions, saturating the results to the 16-bit unsigned integer range 0 < x < 216 - 1.
+
Parameters:
+
+
val1
first two 16-bit operands for each subtraction.
+
val2
second two 16-bit operands for each subtraction.
+
+
+
+
Returns:
+
the subtraction of the low halfword in the second operand from the low halfword in the first operand, in the low halfword of the return value.
+
the subtraction of the high halfword in the second operand from the high halfword in the first operand, in the high halfword of the return value.
+
+
+
The results are saturated to the 16-bit unsigned integer range 0 < x < 216 - 1.
This function enables you to perform four unsigned 8-bit integer subtractions, saturating the results to the 8-bit unsigned integer range 0 < x < 28 - 1.
+
Parameters:
+
+
val1
first four 8-bit operands.
+
val2
second four 8-bit operands.
+
+
+
+
Returns:
+
the subtraction of the first byte in the second operand from the first byte in the first operand, in the first bytes of the return value.
+
the subtraction of the second byte in the second operand from the second byte in the first operand, in the second byte of the return value.
+
the subtraction of the third byte in the second operand from the third byte in the first operand, in the third byte of the return value.
+
the subtraction of the fourth byte in the second operand from the fourth byte in the first operand, in the fourth byte of the return value.
+
+
+
The results are saturated to the 8-bit unsigned integer range 0 < x < 28 - 1.
This function enables you to perform four unsigned 8-bit subtractions, and add the absolute values of the differences together, returning the result as a single unsigned integer.
+
Parameters:
+
+
val1
first four 8-bit operands for the subtractions.
+
val2
second four 8-bit operands for the subtractions.
+
+
+
+
Returns:
+
the subtraction of the first byte in the second operand from the first byte in the first operand.
+
the subtraction of the second byte in the second operand from the second byte in the first operand.
+
the subtraction of the third byte in the second operand from the third byte in the first operand.
+
the subtraction of the fourth byte in the second operand from the fourth byte in the first operand.
This function enables you to saturate two signed 16-bit values to a selected unsigned range.
+ The Q bit is set if either operation saturates.
+
Parameters:
+
+
val1
two 16-bit values that are to be saturated.
+
val2
bit position for saturation, and must be an integral constant expression in the range 0 to 15.
+
+
+
+
Returns:
the saturation of the two signed 16-bit values, as non-negative values.
+
the saturation of the low halfword in val1, saturated to the bit position specified in val2 and returned in the low halfword of the return value.
+
the saturation of the high halfword in val1, saturated to the bit position specified in val2 and returned in the high halfword of the return value.
+
+
+
Operation:
Saturate halfwords in val1 to the unsigned range specified by the bit position in val2
+
+
+
+
+
+
+
+
+
+
uint32_t __USAX
+
(
+
uint32_t
+
val1,
+
+
+
+
+
uint32_t
+
val2
+
+
+
+
)
+
+
+
+
+
+
This function enables you to exchange the halfwords of the second operand, subtract the high halfwords and add the low halfwords.
+ The GE bits in the APSR are set according to the results.
+
Parameters:
+
+
val1
first operand for the addition in the low halfword, and the first operand for the subtraction in the high halfword.
+
val2
second operand for the addition in the high halfword, and the second operand for the subtraction in the low halfword.
+
+
+
+
Returns:
+
the addition of the low halfword in the first operand and the high halfword in the second operand, in the low halfword of the return value.
+
the subtraction of the low halfword in the second operand from the high halfword in the first operand, in the high halfword of the return value.
+
+
+
Each bit in APSR.GE is set or cleared for each byte in the return value, depending on the results of the operation.
+
If res is the return value, then:
+
if res[15:0] >= 0x10000 then APSR.GE[1:0] = 11 else 00
This function enables you to extract two 8-bit values from one operand, zero-extend them to 16 bits each, and add the results to two 16-bit values from another operand.
+
Parameters:
+
+
val1
value added to the zero-extended to 16-bit values.
+
val2
two 8-bit values to be extracted and zero-extended.
+
+
+
+
Returns:
the 8-bit values in val2, zero-extended to 16-bit values and added to val1.
+
Operation:
res[15:0] = ZeroExt(val2[7:0] to 16 bits) + val1[15:0]
+ res[31:16] = ZeroExt(val2[31:16] to 16 bits) + val1[31:16]
+
+
+
+
+
+
+
+
+
+
uint32_t __UXTB16
+
(
+
uint32_t
+
val
)
+
+
+
+
+
+
This function enables you to extract two 8-bit values from an operand and zero-extend them to 16 bits each.
+
Parameters:
+
+
val
two 8-bit values in val[7:0] and val[23:16] to be sign-extended.
+
+
+
+
Returns:
the 8-bit values zero-extended to 16-bit values.
+
zero-extended value of val[7:0] in the low halfword of the return value.
+
zero-extended value of val[23:16] in the high halfword of the return value.
Describes naming conventions, requirements, and optional features for accessing peripherals.
+More...
+
Each peripheral provides a data type definition with a name that is composed of a prefix <device abbreviation>_ and the <peripheral name>_, for example LPC_UART for the device LPC and the peripheral UART. The intention is to avoid name collisions caused by short names. If more peripherals exist of the same type, identifiers have a postfix consisting of a digit or letter, for example LPC_UART0, LPC_UART1.
+
+
The data type definition uses the standard C data types from the ANSI C header file <stdint.h>. IO Type Qualifiers are used to specify the access to peripheral variables. IO Type Qualifiers are indented to be used for automatic generation of debug information of peripheral registers and are defined as shown below:
+
The following typedef is an example for a UART. <device abbreviation>_UART_TypeDef: defines the generic register layout for all UART channels in a device.
+
To access the registers of the UART defined above, pointers to a register structure are defined. In this example <device abbreviation>_UART# are two pointers to UARTs defined with above register structure.
+
The registers in the various UARTs can now be referred in the user code as shown below:
+
LPC_UART1->DR // is the data register of UART1.
+
+
+
+
+Minimal Requirements
+
To access the peripheral registers and related function in a device, the files device.h and core_cm#.h define as a minimum:
+
+
+
+
The Register Layout Typedef for each peripheral that defines all register names. RESERVED is used to introduce space into the structure for adjusting the addresses of the peripheral registers.
+
+ Example:
typedefstruct
+{
+ __IO uint32_t CTRL; /* Offset: 0x000 (R/W) SysTick Control and Status Register */
+ __IO uint32_t LOAD; /* Offset: 0x004 (R/W) SysTick Reload Value Register */
+ __IO uint32_t VAL; /* Offset: 0x008 (R/W) SysTick Current Value Register */
+ __I uint32_t CALIB; /* Offset: 0x00C (R/ ) SysTick Calibration Register */
+} SysTick_Type;
+
+
+
+
Base Address for each peripheral (in case of multiple peripherals that use the same register layout typedef multiple base addresses are defined).
+
+ Example:
Access Definitions for each peripheral. In case of multiple peripherals that are using the same register layout typdef, multiple access definitions exist (LPC_UART0, LPC_UART2).
+
+ Example:
These definitions allow accessing peripheral registers with simple assignments.
+
Example:
+
+
SysTick->CTRL = 0;
+
+
+Optional Features
+
Optionally, the file device.h may define:
+
+
#define constants, which simplify access to peripheral registers. These constants define bit-positions or other specific patterns that are required for programming peripheral registers. The identifiers start with <device abbreviation>_ and <peripheral name>_. It is recommended to use CAPITAL letters for such #define constants.
+
+
+
More complex functions (i.e. status query before a sending register is accessed). Again, these functions start with <device abbreviation>_ and <peripheral name>_.
ARM provides a template file system_device.c that must be adapted by the silicon vendor to match their actual device. As a minimum requirement, this file must provide:
+
+
A device-specific system configuration function, SystemInit().
+
A global variable that contains the system frequency, SystemCoreClock.
+
+
The file configures the device and, typically, initializes the oscillator (PLL) that is part of the microcontroller device. This file might export other functions or variables that provide a more flexible configuration of the microcontroller system.
#include "LPC17xx.h"
+
+uint32_t coreClock_1 = 0; /* Variables to store core clock values */
+uint32_t coreClock_2 = 0;
+
+
+int main (void) {
+
+ coreClock_1 = SystemCoreClock; /* Store value of predefined SystemCoreClock */
+
+ SystemCoreClockUpdate(); /* Update SystemCoreClock according to register settings */
+
+ coreClock_2 = SystemCoreClock; /* Store value of calculated SystemCoreClock */
+
+ if (coreClock_2 != coreClock_1) { /* Without changing the clock setting both core clock values should be the same */
+ // Error Handling
+ }
+
+ while(1);
+}
+
Holds the system core clock, which is the system clock frequency supplied to the SysTick timer and the processor core clock. This variable can be used by debuggers to query the frequency of the debug timer or to configure the trace clock speed.
+
Attention:
Compilers must be configured to avoid removing this variable in case the application program is not using it. Debugging systems require the variable to be physically present in memory so that it can be examined to configure the debugger.
+
+
+
+
Function Documentation
+
+
+
+
+
+
void SystemCoreClockUpdate
+
(
+
void
+
)
+
+
+
+
+
+
Updates the variable SystemCoreClock and must be called whenever the core clock is changed during program execution. The function evaluates the clock register settings and calculates the current core clock.
+
+
+
+
+
+
+
+
+
void SystemInit
+
(
+
void
+
)
+
+
+
+
+
+
Initializes the microcontroller system. Typically, this function configures the oscillator (PLL) that is part of the microcontroller device. For systems with a variable clock speed, it updates the variable SystemCoreClock. SystemInit is called from the file startup_device.
CMSIS-CORE implements the basic run-time system for a Cortex-M device and gives the user access to the processor core and the device peripherals. In detail it defines:
+
+
Hardware Abstraction Layer (HAL) for Cortex-M processor registers with standardized definitions for the SysTick, NVIC, System Control Block registers, MPU registers, FPU registers, and core access functions.
+
System exception names to interface to system exceptions without having compatibility issues.
+
Methods to organize header files that makes it easy to learn new Cortex-M microcontroller products and improve software portability. This includes naming conventions for device-specific interrupts.
+
Methods for system initialization to be used by each MCU vendor. For example, the standardized SystemInit() function is essential for configuring the clock system of the device.
+
Intrinsic functions used to generate CPU instructions that are not supported by standard C functions.
+
A variable to determine the system clock frequency which simplifies the setup the SysTick timer.
+
+
+
This chapter provides details about the CMSIS-CORE and contains the following sections:
The Cortex-M Reference Manuals are generic user guides for devices that implement the various ARM Cortex-M processors. These manuals contain the programmers model and detailed information about the core peripherals.