FFTCPU0.c
Revision as of 15:51, 14 September 2020 by imported>Bkavanagh (Created page with " /* ============================================================================ Name : FFTCPU0.c Author : Michael Version : Copyright : Copyright...")
/* ============================================================================ Name : FFTCPU0.c Author : Michael Version : Copyright : Copyright OCE Technology Description : Multicore FFT This program demonstrates the use of 1, 2, or 4 CPUs on a S698PM multi-core processor to calculate a Fast Fourier Transform (FFT). A selection of different input arrays is provided, additional input arrays can be created easily. At present the input array length must be a power of 2, and the scalar type must be C float. The functions used are defined in fft.h and documented there. Please consult fft.h for more information ============================================================================ */ #include <stdio.h> #include <stdlib.h> #include "fft.h" /* * main() * * Usage: * * Assign values to each of the variables below * * lenP2 the power of 2 that gives the length of the input array * (at present from 5 to 17, i.e. length 32 to 131,072) * * whichcpus the CPUs to use, from 1 to 0xf (binary xxxx, 1 = use) * * numruns number of test runs - timings are given for total runs * * type 0 for forward FFT, 1 for inverse FFT * * normalise 0 for not normalized, 1 for normalized * * arrange 0 if data already arranged, 1 if need to arrange * * realonly 0 for (in-phase, quadrature) input, 1 for (in-phase, all 0); * * displayLines number of lines of data to display * * Set up the complex data input in cin. * (a number of choices are already set up and commented out, it is simple to add more) * * Compile the program. * * Use the DMON script provided to load and run the program * This also sets up the CPU stacks and enables the L2 cache. * * N.B. L2 cache is disabled by default, * if not switched on timings are affected dramatically. * */ int main(int argc,char ** argv) { /* * Initial entry point for all CPUs -- DO NOT CHANGE */ if(START_CPU_INDEX != fftCpusStart()) // N.B. only START_CPU returns from this call { printf("\nAbandoning program, startup problem\n"); return 0; } // if /* * START HERE - select input size, CPUs to use, number of test runs */ int lenP2 = 12; // input array length is this power of 2 (from 5 to 17 at present) int whichcpus = 0xf; // CPUs to use, hex value xxxx, x = 1 to turn on, 0xf for all on const int numruns = 200; // number of test runs const int type = 0; // 0 for forward FFT, 1 for inverse FFT const int normalise = 0; // 0 for not normalized, 1 for normalized const int arrange = 1; // 0 for don't arrange, 1 for arrange const int realonly = 0; // 0 for (in-phase, quadrature), 1 for (in-phase,all 0); int displayLines = 64; // number of lines of the data to display, if enabled below // check input and correct if possible, give message and exit if settings incorrect if(settingsIncorrect(lenP2, &whichcpus, numruns, type, &displayLines)) return 0; /* * VARIOUS EXAMPLES * Example lengths depend on lenP2 above * The examples not in use are commented out * Other input choices can be set up as (in-phase, quadrature)in cin as desired * N.B. If realonly is not selected both components should be set up, * otherwise previous data may cause anomalous results. * If realonly is selected, the quadrature part of the input is overwritten * with 0s. */ fft_cpx * cin = (void *)FFT_CIN; // start of complex number input data array int length = 1 << lenP2; // length of complex numbers input array int i; // test: combination of frequencies and phases //for (i=0;i<length;++i) { // cin[i].re = 1.0 + sin(M_PI*i/(length >> 1)) // in-phase part // + 3*cos(M_PI*2*i/(length >> 1)) // + 2*cos(M_PI*6*i/(length >> 1)) // + 7*cos(M_PI*((length >> 1) - 1)*i/(length >> 1)) // + 5*cos(M_PI*i); // Nyquist // cin[i].im = sin(M_PI*3*i/(length >> 1)); // quadrature part // } // for // test: ramp // for (i=0;i<length;++i) { // cin[i].re = i; // cin[i].im = 0; // } // for // test: random // srand(time(0)); // for (i=0;i<length;++i) { // cin[i].re = rand(); // cin[i].im = 0; // } // for // square wave for (i=0;i<(length >> 1);++i) { cin[i].re = 3; cin[i].im = 0; } // for for (i=(length >> 1); i < length;++i) { cin[i].re = 0; cin[i].im = 0; } // for // show the input data // printf("Complex data:\n"); // showC(cin, displayLines, 0); /* * Do FFT test * Does numruns, first with one processor, then with the requested processors (whichcpus) */ // set up the twiddles array - its length is half that of the input array. fft_cpx * tw = (void *)FFT_TWIDDLES; // preassigned space in SRAM starting at this address fillTwiddles(tw,lenP2,type); // 0 for forward FFT, 1 for inverse FFT // if data is in-phase only, i.e. no quadrature component, pack into first half if(realonly) { fftRealOnlyArrange(cin,lenP2); lenP2 -= 1; // half the size, but leave value of length as is } // if // if need to arrange, set up mapping array - N.B. it is its own inverse if(arrange) { fftArrangeIndex((void *)FFT_MAP, lenP2); } // if // set up a pointer for rearranged input and clear it // if 'arrange' set result will be here fft_cpx * rain = (void *)FFT_RAIN; // preassigned space in SRAM starting at this address for(i = 0; i < length; i++) { rain[i].re = 0; rain[i].im = 0; } // int map[1 << lenP2]; // fftArrangeIndex(map, lenP2); // for (i=0;i<length;++i) { // rain[map[i]] = cin[i]; // } // for // start the cpus being used other than this one - they will halt waiting for work startCpus(whichcpus); // get ready to time clock_t start, finish; // start and finish times unsigned long single, multiple; // times with single, multiple C{Us printf("\n\nDoing %d runs of %d length FFT with 1 processor\n", numruns, length); start = clock(); for(i = 0; i < numruns; i++) { if(arrange) // will do rearrangement in parallel { // do it using this CPU, the startup CPU, usually CPU 0 fftMulti(cin, lenP2, tw, 1<<START_CPU_INDEX, normalise, arrange, realonly); } else { // first, rearrange the input into rain fftArrangeOut(cin, rain,lenP2); // do it using this CPU, the startup CPU, usually CPU 0 fftMulti(rain, lenP2, tw, 1<<START_CPU_INDEX, normalise, arrange, realonly); } // else } // for finish = clock(); single = finish - start; printf("\n\nDone %d runs of %d length FFT with 1 processor, time: %ld\n", numruns, length, single); int numcpus = getNumCpus(whichcpus); // how many CPUs being used printf("\nDoing %d runs of %d length FFT with %d processors\n", numruns, length, numcpus); start = clock(); for(i = 0; i < numruns; i++) { if(arrange) // will do rearrangement in parallel { // do it using selected cpus fftMulti(cin, lenP2, tw, whichcpus, normalise, arrange, realonly); } else { // first, rearrange the input fftArrangeOut(cin, rain, lenP2); fftMulti(rain, lenP2, tw, whichcpus, normalise, arrange, realonly); } // else } // for finish = clock(); multiple = finish - start; /* * FINISH - show FFT result and times */ printf("Complex FFT:\n"); show8C(rain, length, 0.001, realonly); printf("\n\n%d runs of %d length FFT with %d processor, time: %ld\n", numruns, length, 1, single); printf("\n%d runs of %d length FFT with %d processors, time: %ld\n", numruns, length, numcpus, multiple); if(realonly) { printf("\nSpeedup factor for %d real data with %d processors: %2.2f\n", length, numcpus, ((float)single)/(float)multiple); } else { printf("\nSpeedup factor for %d (in-phase,quadrature) with %d processors: %2.2f\n", length, numcpus, ((float)single)/(float)multiple); } // else return 0; } // main() // see fft.h for description static inline unsigned int get_asr17(void) { unsigned int reg; __asm__ (" mov %%asr17, %0 " : "=r"(reg) :); return reg; } // see fft.h for description int fftCpusStart(void) { volatile unsigned int status = 0; volatile fft_cpu *thisCpu; // find out which CPU is running int LEON3_Cpu_Index = (get_asr17() >> 28) & 3; // max 4 cpus, 0 .. 3 // set up reference to data for this CPU thisCpu = (void *)(FFT_FLAGS + LEON3_Cpu_Index*sizeof(fft_cpu)); // if start CPU (usually CPU0), initialize flags area if (START_CPU_INDEX == LEON3_Cpu_Index){ // START_CPU (usually 0) initializes settings and returns volatile fft_cpu *thisCpu; int i = 0; for(i = 0; i < MAX_CPUS; i++) { thisCpu = (void *)(FFT_FLAGS + i*sizeof(fft_cpu)); thisCpu->status = 0; thisCpu->cinadrs = 0; thisCpu->lenP2 = 0; thisCpu->begin = 0; thisCpu->nextbegin = 0; thisCpu->oddadd = 0; thisCpu->startP2 = 0; thisCpu->twadrs = 0; thisCpu->toffset = 0; thisCpu->tinc = 0; thisCpu->normalise = 0; thisCpu->arrange = 0; thisCpu->realonly = 0; thisCpu->spare1 = 0x01010101; thisCpu->spare2 = 0x10101010; } // for }else { // other CPUS loop forever, check for work, do it, set done status while(1) { thisCpu->spare1 = 0xcafe; // for debug status = thisCpu->status; if(FFT_START == (FFT_WORK_MASK & status)) { thisCpu->spare1 = 0xbabecafe; // for debug fftProc((void *)thisCpu->cinadrs, thisCpu->lenP2, thisCpu->begin, thisCpu->nextbegin, thisCpu->oddadd, thisCpu->startP2, (void *)thisCpu->twadrs, thisCpu->toffset, thisCpu->tinc, thisCpu->normalise,thisCpu->arrange, thisCpu->realonly, thisCpu->secondpass); thisCpu->spare2 = 0xcafebabe; // for debug thisCpu->status &= ~FFT_WORK_MASK; // finished } // if // halt this CPU, will be restarted when work available __asm__ (" wr %g0, %asr19 "); } // while } // else return LEON3_Cpu_Index; // should only get here for CPU 0 } // fftCpusStart() // see fft.h for description void startCpus(const int whichcpus) { int i; int LEON3_Cpu_Index = (get_asr17() >> 28) & 3; // max 4 cpus, 0 .. 3 int this_cpu = 1 << LEON3_Cpu_Index; int current_cpu = LAST_CPU; // usually 8 while(current_cpu) { for(i=0; i < CPU_START_PAUSE; i++){} // pause if((current_cpu & whichcpus)&&(current_cpu != this_cpu)) { *((int *)IRQMP_ADRS) = 0x380b0000 | current_cpu; } // if current_cpu >>= 1; } // while for(i=0; i < CPU_START_PAUSE; i++){} // pause return; } // startCpus() /* * INPUT ARRAY REARRANGEMENT */ // see fft.h for description void fftArrangeOut(fft_cpx* cin, fft_cpx* result, int lenP2) { // validation check if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); return; } // if int len = 1 << lenP2; int mask = len -1; int i = 0; for(i = 0; i < len; i++){ /* * Find the index to swap with. * - treating index as potentially 32 bits, * number of bits in use should be less */ int v = i; // swap odd and even bits v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); // swap consecutive pairs v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); // swap nibbles ... v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); // swap bytes v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); // swap 2-byte long pairs v = ( v >> 16 ) | ( v << 16); v >>= (32 - lenP2); // N.B. arithmetic shift v &= mask; if(v >= i) // otherwise already done { result[i] = cin[v]; result[v] = cin[i]; } // if } // for return; } // fftArrangeOut() // see fft.h for description void fftArrangeIn(fft_cpx* x, int lenP2) { // validation check if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); return; } // if // do it int len = 1 << lenP2; // power of 2 int mask = len - 1; fft_cpx temp; // for swap int i = 0; for(i = 0; i < len; i++){ // must check full array, not just half // Find the index to swap with // - treating index as no more than 32 bits, int v = i; // swap odd and even bits v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); // swap consecutive pairs v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); // swap nibbles ... v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); // swap bytes v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); // swap 2-byte long pairs v = ( v >> 16 ) | ( v << 16); v >>= (32 - lenP2); // arithmetic shift v &= mask; // kill off any leading 1s due to arithmetic shift if(v > i) // otherwise already done or same { temp = x[i]; x[i] = x[v]; x[v] = temp; } // if } // for return; } // fftArrangeIn() // see fft.h for description void fftArrangeIndex(int* mapindex, int lenP2) { // validation check if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); return; } // if // do it int len = 1 << lenP2; // power of 2 int mask = len - 1; int i = 0; for(i = 0; i < len; i++){ // must check full array, not just half // Find the index to swap with // - treating index as no more than 32 bits, int v = i; // swap odd and even bits v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); // swap consecutive pairs v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); // swap nibbles ... v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); // swap bytes v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); // swap 2-byte long pairs v = ( v >> 16 ) | ( v << 16); v >>= (32 - lenP2); // arithmetic shift v &= mask; // kill off any leading ones mapindex[i] = v; } // for return; } // fftArrangeIndex() // see fft.h for description void fftRealOnlyArrange(fft_cpx * cin, int lenP2) { int i,j; int halflen = 1 << (lenP2-1); for (i = 0; i < halflen; i++) { j = i << 1; cin[i].re = cin[j].re; cin[i].im = cin[j+1].re; } // for for (i=halflen; i < (1 << lenP2); i++) { cin[i].re = 0; cin[i].im = 0; } // for } // fftRealOnlyArrange() /* * TWIDDLES ARRAY */ // see fft.h for description void fillTwiddles(fft_cpx* twiddles, const int lenP2, const int type) { // validation checks if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\nfillTwiddles(): Invalid power of 2 for array length: %d, should be between %d and %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); return; } // if if ((0 != type)&& (1 != type)) // invalid, should be forward(0) or inverse(1) { printf("\nInvalid type for twiddles array, should be 0 for forward, 1 for inverse, actual was: 0x%x\n", type); return; } // if /* * Do it - using doubles */ int n = 1 << lenP2; double twoPiDivNsigned; if(1 != type) // negative exponent for forward transform, positive for inverse { twoPiDivNsigned= -M_PI / (double)(n >> 1); // 2*Pi/n } else { twoPiDivNsigned = M_PI / (double)(n >> 1); // positive exponent for inverse transform } // else double root2Inv = 1.0/sqrt(2.0); // quicker than sine or cosine /* * Fill in the values */ int quarter; // will be N/4, quarter way round int eight; // will be N/8, eight of way around twiddles[0].re = 1.0; // always same twiddles[0].im = 0; if(1 != type) // forward FFT - N.B. using negative exponent { quarter = n >> 2; twiddles[quarter].re = 0; // quarter way around with negative exponent (0,-i) twiddles[quarter].im = -1; eight = quarter >> 1; // eight of way around (1/root2, -i*1/root2) twiddles[eight].re = root2Inv; twiddles[eight].im = -root2Inv; twiddles[quarter+eight].re = -root2Inv; // three eights of way around (-1/root2, -i*1/root2) twiddles[quarter+eight].im = -root2Inv; int j; for(j=1; j<eight; j++) { double theta = (double)j * twoPiDivNsigned; // negative twiddles[j].re = cos(theta); // positive twiddles[j].im = sin(theta); // negative int pos = quarter-j; twiddles[pos].re = -twiddles[j].im; twiddles[pos].im = -twiddles[j].re; pos = quarter+j; twiddles[pos].re = twiddles[j].im; twiddles[pos].im = -twiddles[j].re; pos = (quarter << 1) - j; twiddles[pos].re = -twiddles[j].re; twiddles[pos].im = twiddles[j].im; } // for } else { // inverse FFT - N.B. uses positive exponent quarter = n >> 2; twiddles[quarter].re = 0; // quarter way around (0,i) twiddles[quarter].im = 1; // N.B. positive exponent eight = quarter >> 1; // eight of way around (1/root2, i*1/root2) twiddles[eight].re = root2Inv; twiddles[eight].im = root2Inv; twiddles[quarter+eight].re = -root2Inv; // three eights of way around (-1/root2, i*1/root2) twiddles[quarter+eight].im = root2Inv; int j; for(j=1; j<eight; j++) { double theta = (double)j * twoPiDivNsigned; // positive twiddles[j].re = cos(theta); // positive twiddles[j].im = sin(theta); // positive int pos = quarter-j; twiddles[pos].re = twiddles[j].im; twiddles[pos].im = twiddles[j].re; pos = quarter+j; twiddles[pos].re = -twiddles[j].im; twiddles[pos].im = twiddles[j].re; pos = (quarter << 1) - j; twiddles[pos].re = -twiddles[j].re; twiddles[pos].im = twiddles[j].im; } // for } // else } // fillTwiddles() /* * FFT PROCESSING */ // see fft.h for description void fftProc(fft_cpx* cin, const int lenP2, const int start, const int nextstart, const int oddadd, const int startP2, const fft_cpx* twiddles, const int toffset, const int twinc, const int normalise, const int arrange, const int realonly, const int secondpass) { /* * Input validation */ if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\nfftProc():Invalid power of 2 for array length: %d, should be between %d and %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); return; } // if int length = 1 << lenP2; int subarraylen = nextstart - start; if((0 > start)||(0 > nextstart)||(0 >= subarraylen)) { printf("\nInvalid input sub-array indices, start index: %d, next start: %d, sub-array length %d\n", start, nextstart, subarraylen); return; } // if if ((start >= length)||(nextstart > length)) { printf("\nSub-array indices incompatible with array length, start index: %d, next start: %d, array length: %d\n", start, nextstart, length); return; } // if if (subarraylen & (subarraylen -1)) // non-zero if not power of 2 { printf("\nInvalid sub-array length, not a power of 2, start index: %d, next start: %d, sub-array length: %d\n", start, nextstart, subarraylen); return; } // if //printf("fftProc(): start %d, nextstart %d, oddadd %d, startP2 %d, toffset %d, tinc %d, realonly %d\n", // start,nextstart, oddadd, startP2, toffset, twinc, realonly); /* * Processing */ int bi; // index where this butterfly starts int ei; // index of even int oi; // index of odd fft_cpx temp; // holds current twiddles value int tstart = toffset; // position in twiddles array from which to begin int tindex; // index into twiddles array int tlimit; // tindex should be less than this int tinc = twinc; // increment used stepping through twiddles array int lasttinc = 1; // when tinc gets to this, on last butterfly int oddoffset; // amount to add to even index to get odd index in inputarray int butterflysize; // holds current butterfly size int bsh; // half of the butterfly size float norm; // used to hold normalizing divisor register float twre; // holds real part of twiddles entry register float twim; // holds imaginary part of twiddles entry fft_cpx * arranged = cin; fft_cpx * rain = (void *)FFT_RAIN; int * map = (void *)FFT_MAP; /* * If this is the second pass, do post processing of FFT result obtained so far * N.B. the FFT data being processed is in the second half of the underlying array, * the FFT result is put in the first half of the underlying array plus the Nyquist * component at index [length], the remaining entries in the underlying array * are not updated with negative frequencies, these are just the complex conjugates * of the positive frequencies. */ if(secondpass) { float value; int i = start; int j = (length << 1) - start; // if at start of twiddles array, avoid need to use value at index [2*length] if(0 == start) { value = (arranged[0].re + arranged[length].re + arranged[0].im + arranged[length].im)/2.0; arranged[0].im = (arranged[0].im - arranged[length].im - arranged[0].re + arranged[length].re)/2.0; arranged[0].re = value; i++; j--; } // do the remaining entries for(; i < nextstart; i++) { value = (arranged[i].re + arranged[j].re + twiddles[i].re*(arranged[i].im + arranged[j].im) + twiddles[i].im*(arranged[i].re - arranged[j].re))/2.0; arranged[i].im = (arranged[i].im - arranged[j].im + twiddles[i].im*(arranged[i].im + arranged[j].im) - twiddles[i].re*(arranged[i].re - arranged[j].re))/2.0; arranged[i].re = value; j--; } // for // Nyquist case if(length == nextstart) { arranged[length].re = arranged[length].re - arranged[length].im; arranged[length].im = 0; } // if return; } /* * Do FFT - only get here if this is not second pass * * N.B. If the original data was in-phase only, * the underlying array is assumed to be 2*length */ // if real only is true, adjust the twiddle array steps if(realonly) { tstart <<= 1; // twiddles was set up for full-length complex array tinc <<= 1; // rather than for half length reals only lasttinc <<= 1; } // if if(arrange) { int i; for(i = start; i < nextstart; i++) { rain[i] = arranged[map[i]]; } // for arranged = rain; } // if // double size of butterfly each time around outer loop for this sub-array for(butterflysize = 1 << startP2; butterflysize <= subarraylen; butterflysize <<= 1) { bsh = butterflysize >> 1; oddoffset = bsh + oddadd; if(lasttinc != tinc) // not last butterfly { // do all the butterflies in this sub-array for this size of butterfly for( bi = start; bi < nextstart; bi += butterflysize) { ei = bi; oi = ei + oddoffset; tindex = tstart; tlimit = tstart + bsh*tinc; // as log n repeats, worth doing first case separately if(0 == tindex) { tindex += tinc; temp.re = arranged[oi].re; temp.im = arranged[oi].im; arranged[oi].re = arranged[ei].re - temp.re; arranged[oi++].im = arranged[ei].im - temp.im; arranged[ei].re += temp.re; arranged[ei++].im += temp.im; } // if // doing each butterfly while(tindex < tlimit) { twre = twiddles[tindex].re; twim = twiddles[tindex].im; tindex += tinc; temp.re = arranged[oi].re*twre - arranged[oi].im*twim; temp.im = arranged[oi].re*twim + arranged[oi].im*twre; arranged[oi].re = arranged[ei].re - temp.re; arranged[oi++].im = arranged[ei].im - temp.im; arranged[ei].re += temp.re; arranged[ei++].im += temp.im; } // while } // for } else { // last butterfly, only one butterfly to do. Normalize if requested. ei = start; oi = ei + oddoffset; tindex = tstart; tlimit = tindex + lasttinc*bsh; // lasttinc will be 1 or 2 if(normalise) { norm = (float)(realonly ?(1 << (lenP2 + 1)): 1 << lenP2); // not worth doing single case separately // if(0 == tindex) // { // tindex += tinc; // // temp.re = arranged[oi].re; // temp.im = arranged[oi].im; // // arranged[oi].re = (arranged[ei].re - temp.re)/norm; // arranged[oi++].im = (arranged[ei].im - temp.im)/norm; // arranged[ei].re += temp.re; // arranged[ei].re /= 2.0; // arranged[ei].im += temp.im; // arranged[ei++].im /= 2.0; // } // if while(tindex < tlimit) { // twre = twiddles[tindex].re; // not faster // twim = twiddles[tindex].im; // // tindex += lasttinc; // // temp.re = arranged[oi].re*twre // - arranged[oi].im*twim; // temp.im = arranged[oi].re*twim // + arranged[oi].im*twre; temp.re = arranged[oi].re*twiddles[tindex].re - arranged[oi].im*twiddles[tindex].im; temp.im = arranged[oi].re*twiddles[tindex].im + arranged[oi].im*twiddles[tindex].re; tindex += lasttinc; arranged[oi].re = (arranged[ei].re - temp.re)/norm; arranged[oi++].im = (arranged[ei].im - temp.im)/norm; arranged[ei].re += temp.re; arranged[ei].re /= norm; arranged[ei].im += temp.im; arranged[ei++].im /= norm; } // while } else { // not normalizing // not worth doing single case separately // if(0 == tindex) // { // tindex += tinc; // // temp.re = arranged[oi].re; // temp.im = arranged[oi].im; // // arranged[oi].re = arranged[ei].re - temp.re; // arranged[oi++].im = arranged[ei].im - temp.im; // arranged[ei].re += temp.re; // arranged[ei++].im += temp.im; // } // if while(tindex < tlimit) { // twre = twiddles[tindex].re; // not faster // twim = twiddles[tindex].im; // // tindex += lasttinc; // // temp.re = arranged[oi].re*twre // - arranged[oi].im*twim; // temp.im = arranged[oi].re*twim // + arranged[oi].im*twre; temp.re = arranged[oi].re*twiddles[tindex].re - arranged[oi].im*twiddles[tindex].im; temp.im = arranged[oi].re*twiddles[tindex].im + arranged[oi].im*twiddles[tindex].re; tindex += lasttinc; arranged[oi].re = arranged[ei].re - temp.re; arranged[oi++].im = arranged[ei].im - temp.im; arranged[ei].re += temp.re; arranged[ei++].im += temp.im; } // while } // else // if original data was in-phase only, with no quadrature component, // make a copy of the FFT result to be ready for post-processing // N.B. the underlying array needs to be twice the current array size if(realonly) { int i,j,k,m; // copying originals into second half of underlying array j = start + length; m = start + oddoffset; k = start + oddoffset + length; for(i = start; i < nextstart; i++){ arranged[j].re = arranged[i].re; arranged[j++].im = arranged[i].im; arranged[k].re = arranged[m].re; arranged[k++].im = arranged[m++].im; } // for } // if } // else tinc >>= 1; // as the butterfly size is doubled the steps are halved } // for } // fftProc() // see fft.h for description void fftMulti(fft_cpx* cin, const int lenP2, const fft_cpx* twiddles, const int whichcpus, const int normalise, const int arrange, const int realonly) { /* * Input validation */ if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\nfftMulti():Invalid power of 2 for array length: %d, should be between %d and %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); return; } // if if ((0 == whichcpus)||(whichcpus & ~CPUS_MASK)) { printf("\nInvalid choice of CPUs, input : 0x%x, must be between 0 and 0x%x\n", whichcpus, CPUS_MASK); return; } // if /* * Processing */ int length = 1 << lenP2; int blocksize; int numcpus = getNumCpus(whichcpus); int first,second,next; fft_cpx * use; if(arrange) { use = (void *)FFT_RAIN; } else { use = cin; } // else switch (numcpus) { case 1: blocksize = 1 << lenP2; // length // main processing giveToCpu(whichcpus, cin, lenP2, 0, blocksize, 0, 1, twiddles, 0, 1 << (lenP2-1), normalise, arrange, realonly,0); // MUST WAIT FOR THiS TO HAVE FINISHED fftWait(whichcpus); if(realonly) { giveToCpu(whichcpus, use, lenP2, 0, blocksize, 0, 1, twiddles, 0, 2, normalise, 0, realonly,1); } // if // MUST WAIT FOR THiS TO HAVE FINISHED fftWait(whichcpus); break; case 2: case 3: blocksize = 1 << (lenP2 - 1); // half of the length // get the CPU to use if (0 != (whichcpus & 8))first = 8; // 1xxx else if (0 != (whichcpus & 4))first = 4; // 01xx else first = 2; // 001x next = first >> 1; if (0 != (whichcpus & next))second = next; else if (0 != (whichcpus & (next >> 1)))second = (next >> 1); else second = 1; // main processing - most of the time is spent here giveToCpu(first, cin, lenP2, 0, blocksize, 0, 1, twiddles, 0, 1 << (lenP2-1), normalise, arrange, realonly, 0); giveToCpu(second, cin, lenP2, blocksize, length, 0, 1, twiddles, 0, 1 << (lenP2-1), normalise, arrange, realonly,0); // MUST WAIT FOR THESE TO HAVE FINISHED fftWait(first|second); // wait for both to finish // combine results giveToCpu(first, use, lenP2, 0, blocksize, blocksize >> 1, lenP2-1, twiddles, 0, 1, normalise, 0, realonly, 0); giveToCpu(second, use, lenP2, blocksize >> 1, (blocksize >> 1) + blocksize, blocksize >> 1, lenP2-1, twiddles, blocksize >> 1, 1, normalise, 0, realonly, 0); // MUST WAIT FOR THESE TO HAVE FINISHED fftWait(first|second); // wait for both to finish // complete the real only processing if(realonly) { giveToCpu(first, use, lenP2, 0, blocksize, 0, 1, twiddles, 0, 2, normalise, 0, realonly, 1); giveToCpu(second, use, lenP2, blocksize, length, 0, 1, twiddles, blocksize, 2, normalise, 0, realonly, 1); // MUST WAIT FOR THESE TO HAVE FINISHED fftWait(first|second); // wait for both to finish } // if break; case 4: blocksize = 1 << (lenP2 - 2); // quarter of the length // main processing - most of the time is spent here giveToCpu(8, cin, lenP2, 0, blocksize, 0, 1, twiddles, 0, 1 << (lenP2-1), normalise, arrange, realonly, 0); giveToCpu(4, cin, lenP2, blocksize, blocksize << 1, 0, 1, twiddles, 0, 1 << (lenP2-1), normalise, arrange, realonly, 0); giveToCpu(2, cin, lenP2, blocksize<<1, (blocksize << 1) + blocksize, 0, 1, twiddles, 0, 1 << (lenP2-1), normalise, arrange, realonly, 0); giveToCpu(1, cin, lenP2, (blocksize << 1) + blocksize, length, 0, 1, twiddles, 0, 1 << (lenP2-1), normalise, arrange, realonly, 0); // MUST WAIT FOR THESE TO HAVE FINISHED fftWait(0xf); // wait for all to finish // combine results giveToCpu(8, use, lenP2, 0, blocksize, blocksize>>1, lenP2 - 2, twiddles, 0, 2, normalise, 0, realonly, 0); giveToCpu(4, use, lenP2, blocksize>>1, (blocksize >> 1) + blocksize, blocksize >> 1, lenP2 -2, twiddles, blocksize, 2, normalise, 0, realonly, 0); giveToCpu(2, use, lenP2, blocksize<<1, 3*blocksize, blocksize >> 1, lenP2 -2, twiddles, 0, 2, normalise, 0, realonly, 0); giveToCpu(1, use, lenP2, (blocksize <<1) + (blocksize >> 1), 3 * blocksize + (blocksize >> 1), blocksize >> 1, lenP2 -2, twiddles, blocksize, 2, normalise, 0, realonly, 0); // MUST WAIT FOR THESE TO HAVE FINISHED fftWait(0xf); // wait for all to finish // combine again giveToCpu(8, use, lenP2, 0, blocksize, 3*(blocksize >> 1), lenP2-2, twiddles, 0, 1, normalise, 0, realonly, 0); giveToCpu(4, use, lenP2, blocksize >> 1,(blocksize >> 1) + blocksize, 3*(blocksize >> 1), lenP2-2, twiddles, blocksize>>1, 1, normalise, 0, realonly, 0); giveToCpu(2, use, lenP2, blocksize,(blocksize<<1), 3*(blocksize >> 1), lenP2-2, twiddles, blocksize, 1, normalise, 0, realonly, 0); giveToCpu(1, use, lenP2 , blocksize + (blocksize >> 1),(blocksize << 1) + (blocksize >>1), 3*(blocksize >> 1), lenP2-2, twiddles, 3*(blocksize >> 1), 1, normalise, 0, realonly, 0); // MUST WAIT FOR THESE TO HAVE FINISHED fftWait(0xf); // wait for all to finish // complete the real only processing if(realonly) { giveToCpu(8, use, lenP2, 0, blocksize, 0, 1, twiddles, 0, 2, normalise, 0, realonly, 1); giveToCpu(4, use, lenP2, blocksize, blocksize << 1, 0, 1, twiddles, 0, 2, normalise, 0, realonly, 1); giveToCpu(2, use, lenP2, blocksize<<1, 3*blocksize, 0, 1, twiddles, 0, 2, normalise, 0, realonly, 1); giveToCpu(1, use, lenP2 , 3*blocksize,blocksize << 2, 0, 1, twiddles, 0, 2, normalise, 0, realonly, 1); // MUST WAIT FOR THESE TO HAVE FINISHED fftWait(first|second); // wait for both to finish } // if break; } // switch } // fftMulti() // see fft.h for description void giveToCpu(const int which, // 1,2,4 or 8 fft_cpx* cin, const int lenP2, // input array const int begin, const int nextbegin, // part indices const int oddadd, const int startP2, const fft_cpx* twiddles, const int toffset, const int twinc, const int normalise, const int arrange, const int realonly, const int secondpass) { if(0 == which) return; // nothing to do this time if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\ngiveToCpu(): Invalid power of 2 for array length: %d, should be from %d to %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); return; } // if // get the index into the FFT_FLAGS int cpuindex; if(1 == which) cpuindex = 0; else if(2 == which) cpuindex = 1; else if(4 == which) cpuindex = 2; else if(8 == which) cpuindex = 3; else return; volatile fft_cpu *thisCpu = (void *)(FFT_FLAGS + cpuindex*sizeof(fft_cpu)); thisCpu->cinadrs = (unsigned int)cin; thisCpu->lenP2 = (unsigned int)lenP2; thisCpu->begin = (unsigned int)begin; thisCpu->nextbegin = (unsigned int)nextbegin; thisCpu->oddadd = (unsigned int)oddadd; thisCpu->startP2 = (unsigned int)startP2; thisCpu->twadrs = (unsigned int)twiddles; thisCpu->toffset = (unsigned int)toffset; thisCpu->tinc = (unsigned int)twinc; thisCpu->normalise = (unsigned int)normalise; thisCpu->arrange = (unsigned int)arrange; thisCpu->realonly = (unsigned int)realonly; thisCpu->secondpass = (unsigned int)secondpass; thisCpu->spare2 = 0xa5a5a5a5; // for debug thisCpu->status = (unsigned int)FFT_START; // do this last // if the CPU is the one running this code, do the FFT here if(START_CPU_INDEX == cpuindex) { thisCpu->status |= FFT_START; thisCpu->spare1 = 0xbabecafe; // for debug fftProc((void *)thisCpu->cinadrs, thisCpu->lenP2, thisCpu->begin, thisCpu->nextbegin, thisCpu->oddadd, thisCpu->startP2, (void *)thisCpu->twadrs, thisCpu->toffset, thisCpu->tinc, thisCpu->normalise, thisCpu->arrange, thisCpu->realonly, thisCpu->secondpass); thisCpu->spare2 = 0xcafebabe; // for debug thisCpu->status &= ~FFT_WORK_MASK; // finished } else { *((int *)IRQMP_ADRS) = 0x380b0000 | which; } // else } // giveToCpu() // see fft.h for description void fftWait(int whichcpus){ // other CPUS loop forever, check for work, do it, set done status volatile fft_cpu *cpu0 = (void *)(FFT_FLAGS); volatile fft_cpu *cpu1 = (void *)(FFT_FLAGS + sizeof(fft_cpu)); volatile fft_cpu *cpu2 = (void *)(FFT_FLAGS + 2*sizeof(fft_cpu)); volatile fft_cpu *cpu3 = (void *)(FFT_FLAGS + 3*sizeof(fft_cpu)); // volatile fft_cpu * cpus = (void *)FFT_FLAGS; int notdone = 0; do { notdone = (8 & whichcpus)? FFT_WORK_MASK & cpu3->status : 0; notdone |= (4 & whichcpus)? FFT_WORK_MASK & cpu2->status : 0; notdone |= (2 & whichcpus)? FFT_WORK_MASK & cpu1->status: 0; notdone |= (1 & whichcpus)? FFT_WORK_MASK & cpu0->status: 0; } while (notdone); return; } // fftWait() // see fft.h for description void fftRealReturn(fft_cpx * cin, const int halflenP2, fft_cpx * twiddles) { fft_scalar value; int i,j; int length = 1 << halflenP2; // results overwrite originals that are needed subsequently, // so copy originals into second half of underlying array j = length; for(i =0; i <= length; i++){ cin[j].re = cin[i].re; cin[j++].im = cin[i].im; } // for j = (length << 1) +1; for(i = 0; i < length; i++) { j--; value = (cin[i].re + cin[j].re + twiddles[i].re*(cin[i].im + cin[j].im) + twiddles[i].im*(cin[i].re - cin[j].re))/2.0; cin[i].im = (cin[i].im - cin[j].im + twiddles[i].im*(cin[i].im + cin[j].im) - twiddles[i].re*(cin[i].re - cin[j].re))/2.0; cin[i].re = value; } // for // Nyquist case // if they existed, twiddles[length].re would be -1, twiddles[length].im would be 0; cin[length].re = cin[length].re - cin[length].im; cin[length].im = 0; // copy positive frequency results to negative, doing complex conjugate for(i = 1; i < length ; i++) { cin[length+i].re = cin[length-i].re; cin[length+i].im = -cin[length-i].im; } // for return; } // fftRealReturn() /* * UTILITIES */ // see fft.h for description int getNumCpus(const int whichcpus) { // set up count of cpus to use int numcpus = 0; // how many cpus to use numcpus += (whichcpus & 1)? 1 : 0; numcpus += (whichcpus & 2)? 1 : 0; numcpus += (whichcpus & 4)? 1 : 0; numcpus += (whichcpus & 8)? 1 : 0; return numcpus; } // getNumCpus // see fft.h for description int settingsIncorrect(const int lenP2, int *cpusrequest, const int numruns, const int type, int* lines) { int result = 0; // array size is limited by the way memory is being allocated at present if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2)) { printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n", lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2); result |= 1; } // if int whichcpus = *cpusrequest; if((0 == whichcpus)||(whichcpus & ~CPUS_MASK)) { printf("\nInvalid choice of CPUs: 0x%x, must be from 1 to 0x%x\n", whichcpus, CPUS_MASK); result |= 1; } // if // get the count of cpus being requested int numcpus = getNumCpus(whichcpus); // only allow 1, 2 or 4 cpus, if 3 deselect the lowest order one if(3 == numcpus) { if (whichcpus & 1) whichcpus &= 0xe; // deselect CPU 0 else whichcpus &= 0xd; // deselect CPU 1 *cpusrequest = whichcpus; numcpus = 2; printf("\nChoice of 3 CPUs invalid, will use these 2 CPUs : 0x%x\n", whichcpus); } // if // find how many CPUs are present on system volatile int status = *(int *)IRQMP_ADRS; // current system configuration int statuscpus = (((status & CPUS_PM1_MASK)>>CPUS_PM1_SHIFTS)&0xf) + 1; printf("\nMain, Status: 0x%x, CPUs present %d, CPUs requested: %d, WhichCPUs: 0x%x\n", status,statuscpus,numcpus,whichcpus); // debug if((0 == numcpus)||(numcpus > statuscpus)) // sanity check { printf("Abandoning, number of CPUs requested: %d, number present %d",numcpus, statuscpus); result |= 1; } // if // check the type if((0 != type)&&(1 != type)) { printf("Abandoning, invalid type input: %d, should be 0 for forward FFT or 1 for inverse FFT\n",type); result |= 1; } // if // check the choice of number of lines to display if(0 >= *lines) { printf("Number of lines chosen was %d, no data or result will be displayed\n",*lines); } else if ((1 << lenP2) < *lines){ printf("Number of lines chosen %d exceeds array length %d, array length will be used\n",*lines,1<<lenP2); *lines = 1 << lenP2; } return result; } // settingsIncorrect() // see fft.h for description void showC(const fft_cpx* in, const int length, const double tolerance) { int i=0; float temp1 = 0.0; float temp2 = 0.0; printf("Complex numbers array, showing first %d elements\n", length); for (i=0; i<length; i++) { temp1 = in[i].re; if( ((0.0 < temp1)&&( tolerance > temp1)) ||((0.0 > temp1)&&(-tolerance < temp1))){ temp1 = 0.0; } // if temp2 = in[i].im; if( ((0 < temp2)&&( tolerance > temp2)) ||((0 > temp2)&&(-tolerance < temp2))){ temp2 = 0.0; } // if printf("%d: %g %g\n", i, temp1, temp2); } // for } // showC() // see fft.h for description void show8C(const fft_cpx* in, const int length, const double tolerance, const int realonly) { if(32 > length) { showC(in,length,tolerance); } // if int i,k; fft_cpx temp; float temp1 = 0.0; float temp2 = 0.0; printf("Complex numbers array, showing certain elements\n\n"); int nyindex = length>>1; int begins[4] = {0, nyindex - 8, nyindex, length-8}; for(k =0; k<4; k++) { for (i=0; i<8; i++) { temp = in[begins[k]+i]; temp1 = temp.re; if( ((0.0 < temp1)&&( tolerance > temp1)) ||((0.0 > temp1)&&(-tolerance < temp1))){ temp1 = 0.0; } // if temp2 = temp.im; if( ((0 < temp2)&&( tolerance > temp2)) ||((0 > temp2)&&(-tolerance < temp2))){ temp2 = 0.0; } // if printf("%d:\t\t\t%g %g\n", begins[k]+i, temp1, temp2); if((0 == begins[k]+i)||(nyindex == begins[k]+i)) printf("\n"); if(realonly && (nyindex == begins[k] + i)) { printf("In-phase data only, last frequency shown above is Nyquist\n"); printf("Negative frequencies not shown (complex conjugates of positive frequencies)\n\n"); break; // for loop } // if } // for printf("\n"); if(realonly && (2 == k)) break; } // for } // show8C()