1/*
  2Copyright (c) 2003-2010, Mark Borgerding
  3
  4All rights reserved.
  5
  6Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
  7
  8    * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
  9    * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution.
 10    * Neither the author nor the names of any contributors may be used to endorse or promote products derived from this software without specific prior written permission.
 11
 12THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 13*/
 14
 15
 16#include "_kiss_fft_guts.h"
 17/* The guts header contains all the multiplication and addition macros that are defined for
 18 fixed or floating point complex numbers.  It also delares the kf_ internal functions.
 19 */
 20
 21static void kf_bfly2(
 22        kiss_fft_cpx * Fout,
 23        const size_t fstride,
 24        const kiss_fft_cfg st,
 25        int m
 26        )
 27{
 28    kiss_fft_cpx * Fout2;
 29    kiss_fft_cpx * tw1 = st->twiddles;
 30    kiss_fft_cpx t;
 31    Fout2 = Fout + m;
 32    do{
 33        C_FIXDIV(*Fout,2); C_FIXDIV(*Fout2,2);
 34
 35        C_MUL (t,  *Fout2 , *tw1);
 36        tw1 += fstride;
 37        C_SUB( *Fout2 ,  *Fout , t );
 38        C_ADDTO( *Fout ,  t );
 39        ++Fout2;
 40        ++Fout;
 41    }while (--m);
 42}
 43
 44static void kf_bfly4(
 45        kiss_fft_cpx * Fout,
 46        const size_t fstride,
 47        const kiss_fft_cfg st,
 48        const size_t m
 49        )
 50{
 51    kiss_fft_cpx *tw1,*tw2,*tw3;
 52    kiss_fft_cpx scratch[6];
 53    size_t k=m;
 54    const size_t m2=2*m;
 55    const size_t m3=3*m;
 56
 57
 58    tw3 = tw2 = tw1 = st->twiddles;
 59
 60    do {
 61        C_FIXDIV(*Fout,4); C_FIXDIV(Fout[m],4); C_FIXDIV(Fout[m2],4); C_FIXDIV(Fout[m3],4);
 62
 63        C_MUL(scratch[0],Fout[m] , *tw1 );
 64        C_MUL(scratch[1],Fout[m2] , *tw2 );
 65        C_MUL(scratch[2],Fout[m3] , *tw3 );
 66
 67        C_SUB( scratch[5] , *Fout, scratch[1] );
 68        C_ADDTO(*Fout, scratch[1]);
 69        C_ADD( scratch[3] , scratch[0] , scratch[2] );
 70        C_SUB( scratch[4] , scratch[0] , scratch[2] );
 71        C_SUB( Fout[m2], *Fout, scratch[3] );
 72        tw1 += fstride;
 73        tw2 += fstride*2;
 74        tw3 += fstride*3;
 75        C_ADDTO( *Fout , scratch[3] );
 76
 77        if(st->inverse) {
 78            Fout[m].r = scratch[5].r - scratch[4].i;
 79            Fout[m].i = scratch[5].i + scratch[4].r;
 80            Fout[m3].r = scratch[5].r + scratch[4].i;
 81            Fout[m3].i = scratch[5].i - scratch[4].r;
 82        }else{
 83            Fout[m].r = scratch[5].r + scratch[4].i;
 84            Fout[m].i = scratch[5].i - scratch[4].r;
 85            Fout[m3].r = scratch[5].r - scratch[4].i;
 86            Fout[m3].i = scratch[5].i + scratch[4].r;
 87        }
 88        ++Fout;
 89    }while(--k);
 90}
 91
 92static void kf_bfly3(
 93         kiss_fft_cpx * Fout,
 94         const size_t fstride,
 95         const kiss_fft_cfg st,
 96         size_t m
 97         )
 98{
 99     size_t k=m;
100     const size_t m2 = 2*m;
101     kiss_fft_cpx *tw1,*tw2;
102     kiss_fft_cpx scratch[5];
103     kiss_fft_cpx epi3;
104     epi3 = st->twiddles[fstride*m];
105
106     tw1=tw2=st->twiddles;
107
108     do{
109         C_FIXDIV(*Fout,3); C_FIXDIV(Fout[m],3); C_FIXDIV(Fout[m2],3);
110
111         C_MUL(scratch[1],Fout[m] , *tw1);
112         C_MUL(scratch[2],Fout[m2] , *tw2);
113
114         C_ADD(scratch[3],scratch[1],scratch[2]);
115         C_SUB(scratch[0],scratch[1],scratch[2]);
116         tw1 += fstride;
117         tw2 += fstride*2;
118
119         Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
120         Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
121
122         C_MULBYSCALAR( scratch[0] , epi3.i );
123
124         C_ADDTO(*Fout,scratch[3]);
125
126         Fout[m2].r = Fout[m].r + scratch[0].i;
127         Fout[m2].i = Fout[m].i - scratch[0].r;
128
129         Fout[m].r -= scratch[0].i;
130         Fout[m].i += scratch[0].r;
131
132         ++Fout;
133     }while(--k);
134}
135
136static void kf_bfly5(
137        kiss_fft_cpx * Fout,
138        const size_t fstride,
139        const kiss_fft_cfg st,
140        int m
141        )
142{
143    kiss_fft_cpx *Fout0,*Fout1,*Fout2,*Fout3,*Fout4;
144    int u;
145    kiss_fft_cpx scratch[13];
146    kiss_fft_cpx * twiddles = st->twiddles;
147    kiss_fft_cpx *tw;
148    kiss_fft_cpx ya,yb;
149    ya = twiddles[fstride*m];
150    yb = twiddles[fstride*2*m];
151
152    Fout0=Fout;
153    Fout1=Fout0+m;
154    Fout2=Fout0+2*m;
155    Fout3=Fout0+3*m;
156    Fout4=Fout0+4*m;
157
158    tw=st->twiddles;
159    for ( u=0; u<m; ++u ) {
160        C_FIXDIV( *Fout0,5); C_FIXDIV( *Fout1,5); C_FIXDIV( *Fout2,5); C_FIXDIV( *Fout3,5); C_FIXDIV( *Fout4,5);
161        scratch[0] = *Fout0;
162
163        C_MUL(scratch[1] ,*Fout1, tw[u*fstride]);
164        C_MUL(scratch[2] ,*Fout2, tw[2*u*fstride]);
165        C_MUL(scratch[3] ,*Fout3, tw[3*u*fstride]);
166        C_MUL(scratch[4] ,*Fout4, tw[4*u*fstride]);
167
168        C_ADD( scratch[7],scratch[1],scratch[4]);
169        C_SUB( scratch[10],scratch[1],scratch[4]);
170        C_ADD( scratch[8],scratch[2],scratch[3]);
171        C_SUB( scratch[9],scratch[2],scratch[3]);
172
173        Fout0->r += scratch[7].r + scratch[8].r;
174        Fout0->i += scratch[7].i + scratch[8].i;
175
176        scratch[5].r = scratch[0].r + S_MUL(scratch[7].r,ya.r) + S_MUL(scratch[8].r,yb.r);
177        scratch[5].i = scratch[0].i + S_MUL(scratch[7].i,ya.r) + S_MUL(scratch[8].i,yb.r);
178
179        scratch[6].r =  S_MUL(scratch[10].i,ya.i) + S_MUL(scratch[9].i,yb.i);
180        scratch[6].i = -S_MUL(scratch[10].r,ya.i) - S_MUL(scratch[9].r,yb.i);
181
182        C_SUB(*Fout1,scratch[5],scratch[6]);
183        C_ADD(*Fout4,scratch[5],scratch[6]);
184
185        scratch[11].r = scratch[0].r + S_MUL(scratch[7].r,yb.r) + S_MUL(scratch[8].r,ya.r);
186        scratch[11].i = scratch[0].i + S_MUL(scratch[7].i,yb.r) + S_MUL(scratch[8].i,ya.r);
187        scratch[12].r = - S_MUL(scratch[10].i,yb.i) + S_MUL(scratch[9].i,ya.i);
188        scratch[12].i = S_MUL(scratch[10].r,yb.i) - S_MUL(scratch[9].r,ya.i);
189
190        C_ADD(*Fout2,scratch[11],scratch[12]);
191        C_SUB(*Fout3,scratch[11],scratch[12]);
192
193        ++Fout0;++Fout1;++Fout2;++Fout3;++Fout4;
194    }
195}
196
197/* perform the butterfly for one stage of a mixed radix FFT */
198static void kf_bfly_generic(
199        kiss_fft_cpx * Fout,
200        const size_t fstride,
201        const kiss_fft_cfg st,
202        int m,
203        int p
204        )
205{
206    int u,k,q1,q;
207    kiss_fft_cpx * twiddles = st->twiddles;
208    kiss_fft_cpx t;
209    int Norig = st->nfft;
210
211    kiss_fft_cpx * scratch = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC(sizeof(kiss_fft_cpx)*p);
212
213    for ( u=0; u<m; ++u ) {
214        k=u;
215        for ( q1=0 ; q1<p ; ++q1 ) {
216            scratch[q1] = Fout[ k  ];
217            C_FIXDIV(scratch[q1],p);
218            k += m;
219        }
220
221        k=u;
222        for ( q1=0 ; q1<p ; ++q1 ) {
223            int twidx=0;
224            Fout[ k ] = scratch[0];
225            for (q=1;q<p;++q ) {
226                twidx += fstride * k;
227                if (twidx>=Norig) twidx-=Norig;
228                C_MUL(t,scratch[q] , twiddles[twidx] );
229                C_ADDTO( Fout[ k ] ,t);
230            }
231            k += m;
232        }
233    }
234    KISS_FFT_TMP_FREE(scratch);
235}
236
237static
238void kf_work(
239        kiss_fft_cpx * Fout,
240        const kiss_fft_cpx * f,
241        const size_t fstride,
242        int in_stride,
243        int * factors,
244        const kiss_fft_cfg st
245        )
246{
247    kiss_fft_cpx * Fout_beg=Fout;
248    const int p=*factors++; /* the radix  */
249    const int m=*factors++; /* stage's fft length/p */
250    const kiss_fft_cpx * Fout_end = Fout + p*m;
251
252#ifdef _OPENMP
253    // use openmp extensions at the 
254    // top-level (not recursive)
255    if (fstride==1 && p<=5)
256    {
257        int k;
258
259        // execute the p different work units in different threads
260#       pragma omp parallel for
261        for (k=0;k<p;++k) 
262            kf_work( Fout +k*m, f+ fstride*in_stride*k,fstride*p,in_stride,factors,st);
263        // all threads have joined by this point
264
265        switch (p) {
266            case 2: kf_bfly2(Fout,fstride,st,m); break;
267            case 3: kf_bfly3(Fout,fstride,st,m); break; 
268            case 4: kf_bfly4(Fout,fstride,st,m); break;
269            case 5: kf_bfly5(Fout,fstride,st,m); break; 
270            default: kf_bfly_generic(Fout,fstride,st,m,p); break;
271        }
272        return;
273    }
274#endif
275
276    if (m==1) {
277        do{
278            *Fout = *f;
279            f += fstride*in_stride;
280        }while(++Fout != Fout_end );
281    }else{
282        do{
283            // recursive call:
284            // DFT of size m*p performed by doing
285            // p instances of smaller DFTs of size m, 
286            // each one takes a decimated version of the input
287            kf_work( Fout , f, fstride*p, in_stride, factors,st);
288            f += fstride*in_stride;
289        }while( (Fout += m) != Fout_end );
290    }
291
292    Fout=Fout_beg;
293
294    // recombine the p smaller DFTs 
295    switch (p) {
296        case 2: kf_bfly2(Fout,fstride,st,m); break;
297        case 3: kf_bfly3(Fout,fstride,st,m); break; 
298        case 4: kf_bfly4(Fout,fstride,st,m); break;
299        case 5: kf_bfly5(Fout,fstride,st,m); break; 
300        default: kf_bfly_generic(Fout,fstride,st,m,p); break;
301    }
302}
303
304/*  facbuf is populated by p1,m1,p2,m2, ...
305    where 
306    p[i] * m[i] = m[i-1]
307    m0 = n                  */
308static 
309void kf_factor(int n,int * facbuf)
310{
311    int p=4;
312    double floor_sqrt;
313    floor_sqrt = floor( sqrt((double)n) );
314
315    /*factor out powers of 4, powers of 2, then any remaining primes */
316    do {
317        while (n % p) {
318            switch (p) {
319                case 4: p = 2; break;
320                case 2: p = 3; break;
321                default: p += 2; break;
322            }
323            if (p > floor_sqrt)
324                p = n;          /* no more factors, skip to end */
325        }
326        n /= p;
327        *facbuf++ = p;
328        *facbuf++ = n;
329    } while (n > 1);
330}
331
332/*
333 *
334 * User-callable function to allocate all necessary storage space for the fft.
335 *
336 * The return value is a contiguous block of memory, allocated with malloc.  As such,
337 * It can be freed with free(), rather than a kiss_fft-specific function.
338 * */
339kiss_fft_cfg kiss_fft_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem )
340{
341    kiss_fft_cfg st=NULL;
342    size_t memneeded = sizeof(struct kiss_fft_state)
343        + sizeof(kiss_fft_cpx)*(nfft-1); /* twiddle factors*/
344
345    if ( lenmem==NULL ) {
346        st = ( kiss_fft_cfg)KISS_FFT_MALLOC( memneeded );
347    }else{
348        if (mem != NULL && *lenmem >= memneeded)
349            st = (kiss_fft_cfg)mem;
350        *lenmem = memneeded;
351    }
352    if (st) {
353        int i;
354        st->nfft=nfft;
355        st->inverse = inverse_fft;
356
357        for (i=0;i<nfft;++i) {
358            const double pi=3.141592653589793238462643383279502884197169399375105820974944;
359            double phase = -2*pi*i / nfft;
360            if (st->inverse)
361                phase *= -1;
362            kf_cexp(st->twiddles+i, phase );
363        }
364
365        kf_factor(nfft,st->factors);
366    }
367    return st;
368}
369
370
371void kiss_fft_stride(kiss_fft_cfg st,const kiss_fft_cpx *fin,kiss_fft_cpx *fout,int in_stride)
372{
373    if (fin == fout) {
374        //NOTE: this is not really an in-place FFT algorithm.
375        //It just performs an out-of-place FFT into a temp buffer
376        kiss_fft_cpx * tmpbuf = (kiss_fft_cpx*)KISS_FFT_TMP_ALLOC( sizeof(kiss_fft_cpx)*st->nfft);
377        kf_work(tmpbuf,fin,1,in_stride, st->factors,st);
378        memcpy(fout,tmpbuf,sizeof(kiss_fft_cpx)*st->nfft);
379        KISS_FFT_TMP_FREE(tmpbuf);
380    }else{
381        kf_work( fout, fin, 1,in_stride, st->factors,st );
382    }
383}
384
385void kiss_fft(kiss_fft_cfg cfg,const kiss_fft_cpx *fin,kiss_fft_cpx *fout)
386{
387    kiss_fft_stride(cfg,fin,fout,1);
388}
389
390
391void kiss_fft_cleanup(void)
392{
393    // nothing needed any more
394}
395
396int kiss_fft_next_fast_size(int n)
397{
398    while(1) {
399        int m=n;
400        while ( (m%2) == 0 ) m/=2;
401        while ( (m%3) == 0 ) m/=3;
402        while ( (m%5) == 0 ) m/=5;
403        if (m<=1)
404            break; /* n is completely factorable by twos, threes, and fives */
405        n++;
406    }
407    return n;
408}