/////////////////////////////////////////////////////////////////////////////// // BSD 3-Clause License // // Copyright (c) 2018-2020, The Regents of the University of California // All rights reserved. // // Redistribution and use in source and binary forms, with or without // modification, are permitted provided that the following conditions are met: // // * Redistributions of source code must retain the above copyright notice, this // list of conditions and the following disclaimer. // // * Redistributions in binary form must reproduce the above copyright notice, // this list of conditions and the following disclaimer in the documentation // and/or other materials provided with the distribution. // // * Neither the name of the copyright holder nor the names of its // contributors may be used to endorse or promote products derived from // this software without specific prior written permission. // // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE // ARE // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /////////////////////////////////////////////////////////////////////////////// /* Fast Fourier/Cosine/Sine Transform dimension :two data length :power of 2 decimation :frequency radix :split-radix, row-column data :inplace table :use functions cdft2d: Complex Discrete Fourier Transform rdft2d: Real Discrete Fourier Transform ddct2d: Discrete Cosine Transform ddst2d: Discrete Sine Transform function prototypes void cdft2d(int, int, int, float **, float *, int *, float *); void rdft2d(int, int, int, float **, float *, int *, float *); void rdft2dsort(int, int, int, float **); void ddct2d(int, int, int, float **, float *, int *, float *); void ddst2d(int, int, int, float **, float *, int *, float *); necessary package fftsg.c : 1D-FFT package macro definitions USE_FFT2D_PTHREADS : default=not defined FFT2D_MAX_THREADS : must be 2^N, default=4 FFT2D_THREADS_BEGIN_N : default=65536 USE_FFT2D_WINTHREADS : default=not defined FFT2D_MAX_THREADS : must be 2^N, default=4 FFT2D_THREADS_BEGIN_N : default=131072 -------- Complex DFT (Discrete Fourier Transform) -------- [definition] X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] * exp(2*pi*i*j1*k1/n1) * exp(2*pi*i*j2*k2/n2), 0<=k1 X[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 x[j1][j2] * exp(-2*pi*i*j1*k1/n1) * exp(-2*pi*i*j2*k2/n2), 0<=k1 ip[0] = 0; // first time only cdft2d(n1, 2*n2, 1, a, t, ip, w); ip[0] = 0; // first time only cdft2d(n1, 2*n2, -1, a, t, ip, w); [parameters] n1 :data length (int) n1 >= 1, n1 = power of 2 2*n2 :data length (int) n2 >= 1, n2 = power of 2 a[0...n1-1][0...2*n2-1] :input/output data (float **) input data a[j1][2*j2] = Re(x[j1][j2]), a[j1][2*j2+1] = Im(x[j1][j2]), 0<=j1= 8*n1, if single thread, length of t >= 8*n1*FFT2D_MAX_THREADS, if multi threads, t is dynamically allocated, if t == NULL. ip[0...*] :work area for bit reversal (int *) length of ip >= 2+sqrt(n) (n = max(n1, n2)) ip[0],ip[1] are pointers of the cos/sin table. w[0...*] :cos/sin table (float *) length of w >= max(n1/2, n2/2) w[],ip[] are initialized if ip[0] == 0. [remark] Inverse of cdft2d(n1, 2*n2, -1, a, t, ip, w); is cdft2d(n1, 2*n2, 1, a, t, ip, w); for (j1 = 0; j1 <= n1 - 1; j1++) { for (j2 = 0; j2 <= 2 * n2 - 1; j2++) { a[j1][j2] *= 1.0 / n1 / n2; } } . -------- Real DFT / Inverse of Real DFT -------- [definition] RDFT R[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2), 0<=k1 IRDFT (excluding scale) a[k1][k2] = (1/2) * sum_j1=0^n1-1 sum_j2=0^n2-1 (R[j1][j2] * cos(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2) + I[j1][j2] * sin(2*pi*j1*k1/n1 + 2*pi*j2*k2/n2)), 0<=k1 ip[0] = 0; // first time only rdft2d(n1, n2, 1, a, t, ip, w); ip[0] = 0; // first time only rdft2d(n1, n2, -1, a, t, ip, w); [parameters] n1 :data length (int) n1 >= 2, n1 = power of 2 n2 :data length (int) n2 >= 2, n2 = power of 2 a[0...n1-1][0...n2-1] :input/output data (float **) output data a[k1][2*k2] = R[k1][k2] = R[n1-k1][n2-k2], a[k1][2*k2+1] = I[k1][k2] = -I[n1-k1][n2-k2], 0 input data a[j1][2*j2] = R[j1][j2] = R[n1-j1][n2-j2], a[j1][2*j2+1] = I[j1][j2] = -I[n1-j1][n2-j2], 0= 8*n1, if single thread, length of t >= 8*n1*FFT2D_MAX_THREADS, if multi threads, t is dynamically allocated, if t == NULL. ip[0...*] :work area for bit reversal (int *) length of ip >= 2+sqrt(n) (n = max(n1, n2/2)) ip[0],ip[1] are pointers of the cos/sin table. w[0...*] :cos/sin table (float *) length of w >= max(n1/2, n2/4) + n2/4 w[],ip[] are initialized if ip[0] == 0. [remark] Inverse of rdft2d(n1, n2, 1, a, t, ip, w); is rdft2d(n1, n2, -1, a, t, ip, w); for (j1 = 0; j1 <= n1 - 1; j1++) { for (j2 = 0; j2 <= n2 - 1; j2++) { a[j1][j2] *= 2.0 / n1 / n2; } } . -------- DCT (Discrete Cosine Transform) / Inverse of DCT -------- [definition] IDCT (excluding scale) C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * cos(pi*j1*(k1+1/2)/n1) * cos(pi*j2*(k2+1/2)/n2), 0<=k1 DCT C[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * cos(pi*(j1+1/2)*k1/n1) * cos(pi*(j2+1/2)*k2/n2), 0<=k1 ip[0] = 0; // first time only ddct2d(n1, n2, 1, a, t, ip, w); ip[0] = 0; // first time only ddct2d(n1, n2, -1, a, t, ip, w); [parameters] n1 :data length (int) n1 >= 2, n1 = power of 2 n2 :data length (int) n2 >= 2, n2 = power of 2 a[0...n1-1][0...n2-1] :input/output data (float **) output data a[k1][k2] = C[k1][k2], 0<=k1= 4*n1, if single thread, length of t >= 4*n1*FFT2D_MAX_THREADS, if multi threads, t is dynamically allocated, if t == NULL. ip[0...*] :work area for bit reversal (int *) length of ip >= 2+sqrt(n) (n = max(n1/2, n2/2)) ip[0],ip[1] are pointers of the cos/sin table. w[0...*] :cos/sin table (float *) length of w >= max(n1*3/2, n2*3/2) w[],ip[] are initialized if ip[0] == 0. [remark] Inverse of ddct2d(n1, n2, -1, a, t, ip, w); is for (j1 = 0; j1 <= n1 - 1; j1++) { a[j1][0] *= 0.5; } for (j2 = 0; j2 <= n2 - 1; j2++) { a[0][j2] *= 0.5; } ddct2d(n1, n2, 1, a, t, ip, w); for (j1 = 0; j1 <= n1 - 1; j1++) { for (j2 = 0; j2 <= n2 - 1; j2++) { a[j1][j2] *= 4.0 / n1 / n2; } } . -------- DST (Discrete Sine Transform) / Inverse of DST -------- [definition] IDST (excluding scale) S[k1][k2] = sum_j1=1^n1 sum_j2=1^n2 A[j1][j2] * sin(pi*j1*(k1+1/2)/n1) * sin(pi*j2*(k2+1/2)/n2), 0<=k1 DST S[k1][k2] = sum_j1=0^n1-1 sum_j2=0^n2-1 a[j1][j2] * sin(pi*(j1+1/2)*k1/n1) * sin(pi*(j2+1/2)*k2/n2), 0 ip[0] = 0; // first time only ddst2d(n1, n2, 1, a, t, ip, w); ip[0] = 0; // first time only ddst2d(n1, n2, -1, a, t, ip, w); [parameters] n1 :data length (int) n1 >= 2, n1 = power of 2 n2 :data length (int) n2 >= 2, n2 = power of 2 a[0...n1-1][0...n2-1] :input/output data (float **) input data a[j1][j2] = A[j1][j2], 0 output data a[k1][k2] = S[k1][k2], 0= 4*n1, if single thread, length of t >= 4*n1*FFT2D_MAX_THREADS, if multi threads, t is dynamically allocated, if t == NULL. ip[0...*] :work area for bit reversal (int *) length of ip >= 2+sqrt(n) (n = max(n1/2, n2/2)) ip[0],ip[1] are pointers of the cos/sin table. w[0...*] :cos/sin table (float *) length of w >= max(n1*3/2, n2*3/2) w[],ip[] are initialized if ip[0] == 0. [remark] Inverse of ddst2d(n1, n2, -1, a, t, ip, w); is for (j1 = 0; j1 <= n1 - 1; j1++) { a[j1][0] *= 0.5; } for (j2 = 0; j2 <= n2 - 1; j2++) { a[0][j2] *= 0.5; } ddst2d(n1, n2, 1, a, t, ip, w); for (j1 = 0; j1 <= n1 - 1; j1++) { for (j2 = 0; j2 <= n2 - 1; j2++) { a[j1][j2] *= 4.0 / n1 / n2; } } . */ #include #include #define fft2d_alloc_error_check(p) \ { \ if((p) == NULL) { \ fprintf(stderr, "fft2d memory allocation error\n"); \ exit(1); \ } \ } #ifdef USE_FFT2D_PTHREADS #define USE_FFT2D_THREADS #ifndef FFT2D_MAX_THREADS #define FFT2D_MAX_THREADS 4 #endif #ifndef FFT2D_THREADS_BEGIN_N #define FFT2D_THREADS_BEGIN_N 65536 #endif #include #define fft2d_thread_t pthread_t #define fft2d_thread_create(thp, func, argp) \ { \ if(pthread_create(thp, NULL, func, (void *)(argp)) != 0) { \ fprintf(stderr, "fft2d thread error\n"); \ exit(1); \ } \ } #define fft2d_thread_wait(th) \ { \ if(pthread_join(th, NULL) != 0) { \ fprintf(stderr, "fft2d thread error\n"); \ exit(1); \ } \ } #endif /* USE_FFT2D_PTHREADS */ #ifdef USE_FFT2D_WINTHREADS #define USE_FFT2D_THREADS #ifndef FFT2D_MAX_THREADS #define FFT2D_MAX_THREADS 4 #endif #ifndef FFT2D_THREADS_BEGIN_N #define FFT2D_THREADS_BEGIN_N 131072 #endif #define NOMINMAX #include #define fft2d_thread_t HANDLE #define fft2d_thread_create(thp, func, argp) \ { \ DWORD thid; \ *(thp) = CreateThread(NULL, 0, (LPTHREAD_START_ROUTINE)(func), \ (LPVOID)(argp), 0, &thid); \ if(*(thp) == 0) { \ fprintf(stderr, "fft2d thread error\n"); \ exit(1); \ } \ } #define fft2d_thread_wait(th) \ { \ WaitForSingleObject(th, INFINITE); \ CloseHandle(th); \ } #endif /* USE_FFT2D_WINTHREADS */ #include "nextpnr_namespaces.h" NEXTPNR_NAMESPACE_BEGIN void ddct(int n, int isgn, float *a, int *ip, float *w); void ddst(int n, int isgn, float *a, int *ip, float *w); void cdft(int n, int isgn, float *a, int *ip, float *w); void *cdft2d_th(void *p); void *ddxt2d0_th(void *p); void *ddxt2d_th(void *p); void rdft(int n, int isgn, float *a, int *ip, float *w); void makewt(int nw, int *ip, float *w); void cdft2d_sub(int n1, int n2, int isgn, float **a, float *t, int *ip, float *w); void makect(int nc, int *ip, float *c); void rdft2d_sub(int n1, int isgn, float **a); void ddxt2d_sub(int n1, int n2, int ics, int isgn, float **a, float *t, int *ip, float *w); #ifdef USE_FFT2D_THREADS void xdft2d0_subth(int n1, int n2, int icr, int isgn, float **a, int *ip, float *w); void cdft2d_subth(int n1, int n2, int isgn, float **a, float *t, int *ip, float *w); void ddxt2d0_subth(int n1, int n2, int ics, int isgn, float **a, int *ip, float *w); void ddxt2d_subth(int n1, int n2, int ics, int isgn, float **a, float *t, int *ip, float *w); #endif /* USE_FFT2D_THREADS */ void cdft2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w) { int n, itnull, nthread, nt, i; n = n1 << 1; if (n < n2) { n = n2; } if (n > (ip[0] << 2)) { makewt(n >> 2, ip, w); } itnull = 0; if (t == NULL) { itnull = 1; nthread = 1; #ifdef USE_FFT2D_THREADS nthread = FFT2D_MAX_THREADS; #endif /* USE_FFT2D_THREADS */ nt = 8 * nthread * n1; if (n2 == 4 * nthread) { nt >>= 1; } else if (n2 < 4 * nthread) { nt >>= 2; } t = (float*) malloc(sizeof(float) * nt); fft2d_alloc_error_check(t); } #ifdef USE_FFT2D_THREADS if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) { xdft2d0_subth(n1, n2, 0, isgn, a, ip, w); cdft2d_subth(n1, n2, isgn, a, t, ip, w); } else #endif /* USE_FFT2D_THREADS */ { for (i = 0; i < n1; i++) { cdft(n2, isgn, a[i], ip, w); } cdft2d_sub(n1, n2, isgn, a, t, ip, w); } if (itnull != 0) { free(t); } } void rdft2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w) { int n, nw, nc, itnull, nthread, nt, i; n = n1 << 1; if (n < n2) { n = n2; } nw = ip[0]; if (n > (nw << 2)) { nw = n >> 2; makewt(nw, ip, w); } nc = ip[1]; if (n2 > (nc << 2)) { nc = n2 >> 2; makect(nc, ip, w + nw); } itnull = 0; if (t == NULL) { itnull = 1; nthread = 1; #ifdef USE_FFT2D_THREADS nthread = FFT2D_MAX_THREADS; #endif /* USE_FFT2D_THREADS */ nt = 8 * nthread * n1; if (n2 == 4 * nthread) { nt >>= 1; } else if (n2 < 4 * nthread) { nt >>= 2; } t = (float*) malloc(sizeof(float) * nt); fft2d_alloc_error_check(t); } #ifdef USE_FFT2D_THREADS if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) { if (isgn < 0) { rdft2d_sub(n1, isgn, a); cdft2d_subth(n1, n2, isgn, a, t, ip, w); } xdft2d0_subth(n1, n2, 1, isgn, a, ip, w); if (isgn >= 0) { cdft2d_subth(n1, n2, isgn, a, t, ip, w); rdft2d_sub(n1, isgn, a); } } else #endif /* USE_FFT2D_THREADS */ { if (isgn < 0) { rdft2d_sub(n1, isgn, a); cdft2d_sub(n1, n2, isgn, a, t, ip, w); } for (i = 0; i < n1; i++) { rdft(n2, isgn, a[i], ip, w); } if (isgn >= 0) { cdft2d_sub(n1, n2, isgn, a, t, ip, w); rdft2d_sub(n1, isgn, a); } } if (itnull != 0) { free(t); } } void rdft2dsort(int n1, int n2, int isgn, float** a) { int n1h, i; float x, y; n1h = n1 >> 1; if (isgn < 0) { for (i = n1h + 1; i < n1; i++) { a[i][0] = a[i][n2 + 1]; a[i][1] = a[i][n2]; } a[0][1] = a[0][n2]; a[n1h][1] = a[n1h][n2]; } else { for (i = n1h + 1; i < n1; i++) { y = a[i][0]; x = a[i][1]; a[i][n2] = x; a[i][n2 + 1] = y; a[n1 - i][n2] = x; a[n1 - i][n2 + 1] = -y; a[i][0] = a[n1 - i][0]; a[i][1] = -a[n1 - i][1]; } a[0][n2] = a[0][1]; a[0][n2 + 1] = 0; a[0][1] = 0; a[n1h][n2] = a[n1h][1]; a[n1h][n2 + 1] = 0; a[n1h][1] = 0; } } void ddcst2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w) { int n, nw, nc, itnull, nthread, nt, i; n = n1; if (n < n2) { n = n2; } nw = ip[0]; if (n > (nw << 2)) { nw = n >> 2; makewt(nw, ip, w); } nc = ip[1]; if (n > nc) { nc = n; makect(nc, ip, w + nw); } itnull = 0; if (t == NULL) { itnull = 1; nthread = 1; #ifdef USE_FFT2D_THREADS nthread = FFT2D_MAX_THREADS; #endif /* USE_FFT2D_THREADS */ nt = 4 * nthread * n1; if (n2 == 2 * nthread) { nt >>= 1; } else if (n2 < 2 * nthread) { nt >>= 2; } t = (float*) malloc(sizeof(float) * nt); fft2d_alloc_error_check(t); } #ifdef USE_FFT2D_THREADS if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) { ddxt2d0_subth(n1, n2, 1, isgn, a, ip, w); ddxt2d_subth(n1, n2, 0, isgn, a, t, ip, w); } else #endif /* USE_FFT2D_THREADS */ { for (i = 0; i < n1; i++) { ddst(n2, isgn, a[i], ip, w); } ddxt2d_sub(n1, n2, 0, isgn, a, t, ip, w); } if (itnull != 0) { free(t); } } void ddsct2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w) { int n, nw, nc, itnull, nthread, nt, i; n = n1; if (n < n2) { n = n2; } nw = ip[0]; if (n > (nw << 2)) { nw = n >> 2; makewt(nw, ip, w); } nc = ip[1]; if (n > nc) { nc = n; makect(nc, ip, w + nw); } itnull = 0; if (t == NULL) { itnull = 1; nthread = 1; #ifdef USE_FFT2D_THREADS nthread = FFT2D_MAX_THREADS; #endif /* USE_FFT2D_THREADS */ nt = 4 * nthread * n1; if (n2 == 2 * nthread) { nt >>= 1; } else if (n2 < 2 * nthread) { nt >>= 2; } t = (float*) malloc(sizeof(float) * nt); fft2d_alloc_error_check(t); } #ifdef USE_FFT2D_THREADS if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) { ddxt2d0_subth(n1, n2, 0, isgn, a, ip, w); ddxt2d_subth(n1, n2, 1, isgn, a, t, ip, w); } else #endif /* USE_FFT2D_THREADS */ { for (i = 0; i < n1; i++) { ddct(n2, isgn, a[i], ip, w); } ddxt2d_sub(n1, n2, 1, isgn, a, t, ip, w); } if (itnull != 0) { free(t); } } void ddct2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w) { int n, nw, nc, itnull, nthread, nt, i; n = n1; if (n < n2) { n = n2; } nw = ip[0]; if (n > (nw << 2)) { nw = n >> 2; makewt(nw, ip, w); } nc = ip[1]; if (n > nc) { nc = n; makect(nc, ip, w + nw); } itnull = 0; if (t == NULL) { itnull = 1; nthread = 1; #ifdef USE_FFT2D_THREADS nthread = FFT2D_MAX_THREADS; #endif /* USE_FFT2D_THREADS */ nt = 4 * nthread * n1; if (n2 == 2 * nthread) { nt >>= 1; } else if (n2 < 2 * nthread) { nt >>= 2; } t = (float*) malloc(sizeof(float) * nt); fft2d_alloc_error_check(t); } #ifdef USE_FFT2D_THREADS if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) { ddxt2d0_subth(n1, n2, 0, isgn, a, ip, w); ddxt2d_subth(n1, n2, 0, isgn, a, t, ip, w); } else #endif /* USE_FFT2D_THREADS */ { for (i = 0; i < n1; i++) { ddct(n2, isgn, a[i], ip, w); } ddxt2d_sub(n1, n2, 0, isgn, a, t, ip, w); } if (itnull != 0) { free(t); } } void ddst2d(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w) { int n, nw, nc, itnull, nthread, nt, i; n = n1; if (n < n2) { n = n2; } nw = ip[0]; if (n > (nw << 2)) { nw = n >> 2; makewt(nw, ip, w); } nc = ip[1]; if (n > nc) { nc = n; makect(nc, ip, w + nw); } itnull = 0; if (t == NULL) { itnull = 1; nthread = 1; #ifdef USE_FFT2D_THREADS nthread = FFT2D_MAX_THREADS; #endif /* USE_FFT2D_THREADS */ nt = 4 * nthread * n1; if (n2 == 2 * nthread) { nt >>= 1; } else if (n2 < 2 * nthread) { nt >>= 2; } t = (float*) malloc(sizeof(float) * nt); fft2d_alloc_error_check(t); } #ifdef USE_FFT2D_THREADS if ((float) n1 * n2 >= (float) FFT2D_THREADS_BEGIN_N) { ddxt2d0_subth(n1, n2, 1, isgn, a, ip, w); ddxt2d_subth(n1, n2, 1, isgn, a, t, ip, w); } else #endif /* USE_FFT2D_THREADS */ { for (i = 0; i < n1; i++) { ddst(n2, isgn, a[i], ip, w); } ddxt2d_sub(n1, n2, 1, isgn, a, t, ip, w); } if (itnull != 0) { free(t); } } /* -------- child routines -------- */ void cdft2d_sub(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w) { int i, j; if (n2 > 4) { for (j = 0; j < n2; j += 8) { for (i = 0; i < n1; i++) { t[2 * i] = a[i][j]; t[2 * i + 1] = a[i][j + 1]; t[2 * n1 + 2 * i] = a[i][j + 2]; t[2 * n1 + 2 * i + 1] = a[i][j + 3]; t[4 * n1 + 2 * i] = a[i][j + 4]; t[4 * n1 + 2 * i + 1] = a[i][j + 5]; t[6 * n1 + 2 * i] = a[i][j + 6]; t[6 * n1 + 2 * i + 1] = a[i][j + 7]; } cdft(2 * n1, isgn, t, ip, w); cdft(2 * n1, isgn, &t[2 * n1], ip, w); cdft(2 * n1, isgn, &t[4 * n1], ip, w); cdft(2 * n1, isgn, &t[6 * n1], ip, w); for (i = 0; i < n1; i++) { a[i][j] = t[2 * i]; a[i][j + 1] = t[2 * i + 1]; a[i][j + 2] = t[2 * n1 + 2 * i]; a[i][j + 3] = t[2 * n1 + 2 * i + 1]; a[i][j + 4] = t[4 * n1 + 2 * i]; a[i][j + 5] = t[4 * n1 + 2 * i + 1]; a[i][j + 6] = t[6 * n1 + 2 * i]; a[i][j + 7] = t[6 * n1 + 2 * i + 1]; } } } else if (n2 == 4) { for (i = 0; i < n1; i++) { t[2 * i] = a[i][0]; t[2 * i + 1] = a[i][1]; t[2 * n1 + 2 * i] = a[i][2]; t[2 * n1 + 2 * i + 1] = a[i][3]; } cdft(2 * n1, isgn, t, ip, w); cdft(2 * n1, isgn, &t[2 * n1], ip, w); for (i = 0; i < n1; i++) { a[i][0] = t[2 * i]; a[i][1] = t[2 * i + 1]; a[i][2] = t[2 * n1 + 2 * i]; a[i][3] = t[2 * n1 + 2 * i + 1]; } } else if (n2 == 2) { for (i = 0; i < n1; i++) { t[2 * i] = a[i][0]; t[2 * i + 1] = a[i][1]; } cdft(2 * n1, isgn, t, ip, w); for (i = 0; i < n1; i++) { a[i][0] = t[2 * i]; a[i][1] = t[2 * i + 1]; } } } void rdft2d_sub(int n1, int isgn, float** a) { int n1h, i, j; float xi; n1h = n1 >> 1; if (isgn < 0) { for (i = 1; i < n1h; i++) { j = n1 - i; xi = a[i][0] - a[j][0]; a[i][0] += a[j][0]; a[j][0] = xi; xi = a[j][1] - a[i][1]; a[i][1] += a[j][1]; a[j][1] = xi; } } else { for (i = 1; i < n1h; i++) { j = n1 - i; a[j][0] = 0.5 * (a[i][0] - a[j][0]); a[i][0] -= a[j][0]; a[j][1] = 0.5 * (a[i][1] + a[j][1]); a[i][1] -= a[j][1]; } } } void ddxt2d_sub(int n1, int n2, int ics, int isgn, float** a, float* t, int* ip, float* w) { int i, j; if (n2 > 2) { for (j = 0; j < n2; j += 4) { for (i = 0; i < n1; i++) { t[i] = a[i][j]; t[n1 + i] = a[i][j + 1]; t[2 * n1 + i] = a[i][j + 2]; t[3 * n1 + i] = a[i][j + 3]; } if (ics == 0) { ddct(n1, isgn, t, ip, w); ddct(n1, isgn, &t[n1], ip, w); ddct(n1, isgn, &t[2 * n1], ip, w); ddct(n1, isgn, &t[3 * n1], ip, w); } else { ddst(n1, isgn, t, ip, w); ddst(n1, isgn, &t[n1], ip, w); ddst(n1, isgn, &t[2 * n1], ip, w); ddst(n1, isgn, &t[3 * n1], ip, w); } for (i = 0; i < n1; i++) { a[i][j] = t[i]; a[i][j + 1] = t[n1 + i]; a[i][j + 2] = t[2 * n1 + i]; a[i][j + 3] = t[3 * n1 + i]; } } } else if (n2 == 2) { for (i = 0; i < n1; i++) { t[i] = a[i][0]; t[n1 + i] = a[i][1]; } if (ics == 0) { ddct(n1, isgn, t, ip, w); ddct(n1, isgn, &t[n1], ip, w); } else { ddst(n1, isgn, t, ip, w); ddst(n1, isgn, &t[n1], ip, w); } for (i = 0; i < n1; i++) { a[i][0] = t[i]; a[i][1] = t[n1 + i]; } } } #ifdef USE_FFT2D_THREADS struct fft2d_arg_st { int nthread; int n0; int n1; int n2; int ic; int isgn; float** a; float* t; int* ip; float* w; }; typedef struct fft2d_arg_st fft2d_arg_t; void xdft2d0_subth(int n1, int n2, int icr, int isgn, float** a, int* ip, float* w) { void* xdft2d0_th(void* p); fft2d_thread_t th[FFT2D_MAX_THREADS]; fft2d_arg_t ag[FFT2D_MAX_THREADS]; int nthread, i; nthread = FFT2D_MAX_THREADS; if (nthread > n1) { nthread = n1; } for (i = 0; i < nthread; i++) { ag[i].nthread = nthread; ag[i].n0 = i; ag[i].n1 = n1; ag[i].n2 = n2; ag[i].ic = icr; ag[i].isgn = isgn; ag[i].a = a; ag[i].ip = ip; ag[i].w = w; fft2d_thread_create(&th[i], xdft2d0_th, &ag[i]); } for (i = 0; i < nthread; i++) { fft2d_thread_wait(th[i]); } } void cdft2d_subth(int n1, int n2, int isgn, float** a, float* t, int* ip, float* w) { fft2d_thread_t th[FFT2D_MAX_THREADS]; fft2d_arg_t ag[FFT2D_MAX_THREADS]; int nthread, nt, i; nthread = FFT2D_MAX_THREADS; nt = 8 * n1; if (n2 == 4 * FFT2D_MAX_THREADS) { nt >>= 1; } else if (n2 < 4 * FFT2D_MAX_THREADS) { nthread = n2 >> 1; nt >>= 2; } for (i = 0; i < nthread; i++) { ag[i].nthread = nthread; ag[i].n0 = i; ag[i].n1 = n1; ag[i].n2 = n2; ag[i].isgn = isgn; ag[i].a = a; ag[i].t = &t[nt * i]; ag[i].ip = ip; ag[i].w = w; fft2d_thread_create(&th[i], cdft2d_th, &ag[i]); } for (i = 0; i < nthread; i++) { fft2d_thread_wait(th[i]); } } void ddxt2d0_subth(int n1, int n2, int ics, int isgn, float** a, int* ip, float* w) { fft2d_thread_t th[FFT2D_MAX_THREADS]; fft2d_arg_t ag[FFT2D_MAX_THREADS]; int nthread, i; nthread = FFT2D_MAX_THREADS; if (nthread > n1) { nthread = n1; } for (i = 0; i < nthread; i++) { ag[i].nthread = nthread; ag[i].n0 = i; ag[i].n1 = n1; ag[i].n2 = n2; ag[i].ic = ics; ag[i].isgn = isgn; ag[i].a = a; ag[i].ip = ip; ag[i].w = w; fft2d_thread_create(&th[i], ddxt2d0_th, &ag[i]); } for (i = 0; i < nthread; i++) { fft2d_thread_wait(th[i]); } } void ddxt2d_subth(int n1, int n2, int ics, int isgn, float** a, float* t, int* ip, float* w) { fft2d_thread_t th[FFT2D_MAX_THREADS]; fft2d_arg_t ag[FFT2D_MAX_THREADS]; int nthread, nt, i; nthread = FFT2D_MAX_THREADS; nt = 4 * n1; if (n2 == 2 * FFT2D_MAX_THREADS) { nt >>= 1; } else if (n2 < 2 * FFT2D_MAX_THREADS) { nthread = n2; nt >>= 2; } for (i = 0; i < nthread; i++) { ag[i].nthread = nthread; ag[i].n0 = i; ag[i].n1 = n1; ag[i].n2 = n2; ag[i].ic = ics; ag[i].isgn = isgn; ag[i].a = a; ag[i].t = &t[nt * i]; ag[i].ip = ip; ag[i].w = w; fft2d_thread_create(&th[i], ddxt2d_th, &ag[i]); } for (i = 0; i < nthread; i++) { fft2d_thread_wait(th[i]); } } void* xdft2d0_th(void* p) { int nthread, n0, n1, n2, icr, isgn, *ip, i; float **a, *w; nthread = ((fft2d_arg_t*) p)->nthread; n0 = ((fft2d_arg_t*) p)->n0; n1 = ((fft2d_arg_t*) p)->n1; n2 = ((fft2d_arg_t*) p)->n2; icr = ((fft2d_arg_t*) p)->ic; isgn = ((fft2d_arg_t*) p)->isgn; a = ((fft2d_arg_t*) p)->a; ip = ((fft2d_arg_t*) p)->ip; w = ((fft2d_arg_t*) p)->w; if (icr == 0) { for (i = n0; i < n1; i += nthread) { cdft(n2, isgn, a[i], ip, w); } } else { for (i = n0; i < n1; i += nthread) { rdft(n2, isgn, a[i], ip, w); } } return (void*) 0; } void* cdft2d_th(void* p) { int nthread, n0, n1, n2, isgn, *ip, i, j; float **a, *t, *w; nthread = ((fft2d_arg_t*) p)->nthread; n0 = ((fft2d_arg_t*) p)->n0; n1 = ((fft2d_arg_t*) p)->n1; n2 = ((fft2d_arg_t*) p)->n2; isgn = ((fft2d_arg_t*) p)->isgn; a = ((fft2d_arg_t*) p)->a; t = ((fft2d_arg_t*) p)->t; ip = ((fft2d_arg_t*) p)->ip; w = ((fft2d_arg_t*) p)->w; if (n2 > 4 * nthread) { for (j = 8 * n0; j < n2; j += 8 * nthread) { for (i = 0; i < n1; i++) { t[2 * i] = a[i][j]; t[2 * i + 1] = a[i][j + 1]; t[2 * n1 + 2 * i] = a[i][j + 2]; t[2 * n1 + 2 * i + 1] = a[i][j + 3]; t[4 * n1 + 2 * i] = a[i][j + 4]; t[4 * n1 + 2 * i + 1] = a[i][j + 5]; t[6 * n1 + 2 * i] = a[i][j + 6]; t[6 * n1 + 2 * i + 1] = a[i][j + 7]; } cdft(2 * n1, isgn, t, ip, w); cdft(2 * n1, isgn, &t[2 * n1], ip, w); cdft(2 * n1, isgn, &t[4 * n1], ip, w); cdft(2 * n1, isgn, &t[6 * n1], ip, w); for (i = 0; i < n1; i++) { a[i][j] = t[2 * i]; a[i][j + 1] = t[2 * i + 1]; a[i][j + 2] = t[2 * n1 + 2 * i]; a[i][j + 3] = t[2 * n1 + 2 * i + 1]; a[i][j + 4] = t[4 * n1 + 2 * i]; a[i][j + 5] = t[4 * n1 + 2 * i + 1]; a[i][j + 6] = t[6 * n1 + 2 * i]; a[i][j + 7] = t[6 * n1 + 2 * i + 1]; } } } else if (n2 == 4 * nthread) { for (i = 0; i < n1; i++) { t[2 * i] = a[i][4 * n0]; t[2 * i + 1] = a[i][4 * n0 + 1]; t[2 * n1 + 2 * i] = a[i][4 * n0 + 2]; t[2 * n1 + 2 * i + 1] = a[i][4 * n0 + 3]; } cdft(2 * n1, isgn, t, ip, w); cdft(2 * n1, isgn, &t[2 * n1], ip, w); for (i = 0; i < n1; i++) { a[i][4 * n0] = t[2 * i]; a[i][4 * n0 + 1] = t[2 * i + 1]; a[i][4 * n0 + 2] = t[2 * n1 + 2 * i]; a[i][4 * n0 + 3] = t[2 * n1 + 2 * i + 1]; } } else if (n2 == 2 * nthread) { for (i = 0; i < n1; i++) { t[2 * i] = a[i][2 * n0]; t[2 * i + 1] = a[i][2 * n0 + 1]; } cdft(2 * n1, isgn, t, ip, w); for (i = 0; i < n1; i++) { a[i][2 * n0] = t[2 * i]; a[i][2 * n0 + 1] = t[2 * i + 1]; } } return (void*) 0; } void* ddxt2d0_th(void* p) { int nthread, n0, n1, n2, ics, isgn, *ip, i; float **a, *w; nthread = ((fft2d_arg_t*) p)->nthread; n0 = ((fft2d_arg_t*) p)->n0; n1 = ((fft2d_arg_t*) p)->n1; n2 = ((fft2d_arg_t*) p)->n2; ics = ((fft2d_arg_t*) p)->ic; isgn = ((fft2d_arg_t*) p)->isgn; a = ((fft2d_arg_t*) p)->a; ip = ((fft2d_arg_t*) p)->ip; w = ((fft2d_arg_t*) p)->w; if (ics == 0) { for (i = n0; i < n1; i += nthread) { ddct(n2, isgn, a[i], ip, w); } } else { for (i = n0; i < n1; i += nthread) { ddst(n2, isgn, a[i], ip, w); } } return (void*) 0; } void* ddxt2d_th(void* p) { int nthread, n0, n1, n2, ics, isgn, *ip, i, j; float **a, *t, *w; nthread = ((fft2d_arg_t*) p)->nthread; n0 = ((fft2d_arg_t*) p)->n0; n1 = ((fft2d_arg_t*) p)->n1; n2 = ((fft2d_arg_t*) p)->n2; ics = ((fft2d_arg_t*) p)->ic; isgn = ((fft2d_arg_t*) p)->isgn; a = ((fft2d_arg_t*) p)->a; t = ((fft2d_arg_t*) p)->t; ip = ((fft2d_arg_t*) p)->ip; w = ((fft2d_arg_t*) p)->w; if (n2 > 2 * nthread) { for (j = 4 * n0; j < n2; j += 4 * nthread) { for (i = 0; i < n1; i++) { t[i] = a[i][j]; t[n1 + i] = a[i][j + 1]; t[2 * n1 + i] = a[i][j + 2]; t[3 * n1 + i] = a[i][j + 3]; } if (ics == 0) { ddct(n1, isgn, t, ip, w); ddct(n1, isgn, &t[n1], ip, w); ddct(n1, isgn, &t[2 * n1], ip, w); ddct(n1, isgn, &t[3 * n1], ip, w); } else { ddst(n1, isgn, t, ip, w); ddst(n1, isgn, &t[n1], ip, w); ddst(n1, isgn, &t[2 * n1], ip, w); ddst(n1, isgn, &t[3 * n1], ip, w); } for (i = 0; i < n1; i++) { a[i][j] = t[i]; a[i][j + 1] = t[n1 + i]; a[i][j + 2] = t[2 * n1 + i]; a[i][j + 3] = t[3 * n1 + i]; } } } else if (n2 == 2 * nthread) { for (i = 0; i < n1; i++) { t[i] = a[i][2 * n0]; t[n1 + i] = a[i][2 * n0 + 1]; } if (ics == 0) { ddct(n1, isgn, t, ip, w); ddct(n1, isgn, &t[n1], ip, w); } else { ddst(n1, isgn, t, ip, w); ddst(n1, isgn, &t[n1], ip, w); } for (i = 0; i < n1; i++) { a[i][2 * n0] = t[i]; a[i][2 * n0 + 1] = t[n1 + i]; } } else if (n2 == nthread) { for (i = 0; i < n1; i++) { t[i] = a[i][n0]; } if (ics == 0) { ddct(n1, isgn, t, ip, w); } else { ddst(n1, isgn, t, ip, w); } for (i = 0; i < n1; i++) { a[i][n0] = t[i]; } } return (void*) 0; } #endif /* USE_FFT2D_THREADS */ NEXTPNR_NAMESPACE_END