FFTCPU0.c

From wiki
Jump to navigation Jump to search
/*
 ============================================================================
 Name        : FFTCPU0.c
 Author      : Michael
 Version     :
 Copyright   : Copyright OCE Technology
 Description : Multicore FFT

 This program demonstrates the use of 1, 2, or 4 CPUs on a S698PM
 multi-core processor to calculate a Fast Fourier Transform (FFT).

 A selection of different input arrays is provided,
 additional input arrays can be created easily.

 At present the input array length must be a power of 2,
 and the scalar type must be C float.

 The functions used are defined in fft.h and documented there.
 Please consult fft.h for more information

 ============================================================================
 */
#include <stdio.h>
#include <stdlib.h>
#include "fft.h"

/*
 * main()
 *
 * Usage:
 *
 * 		Assign values to each of the variables below
 *
 * 			lenP2	 		the power of 2 that gives the length of the input array
 *							(at present from 5 to 17, i.e. length 32 to 131,072)
 *
 * 			whichcpus		the CPUs to use, from 1 to 0xf (binary xxxx, 1 = use)
 *
 * 			numruns			number of test runs - timings are given for total runs
 *
 * 			type 			0 for forward FFT, 1 for inverse FFT
 *
 * 			normalise		0 for not normalized, 1 for normalized
 *
 * 			arrange			0 if data already arranged, 1 if need to arrange
 *
 *    		realonly		0 for (in-phase, quadrature) input, 1 for (in-phase, all 0);
 *
 * 			displayLines	number of lines of data to display
 *
 * 		Set up the complex data input in cin.
 * 		(a number of choices are already set up and commented out, it is simple to add more)
 *
 *		Compile the program.
 *
 *		Use the DMON script provided to load and run the program
 *		This also sets up the CPU stacks and enables the L2 cache.
 *
 *		N.B. L2 cache is disabled by default,
 *			 if not switched on timings are affected dramatically.
 *
 */
int main(int argc,char ** argv)
{
	/*
	 * Initial entry point for all CPUs  -- DO NOT CHANGE
	 */
	if(START_CPU_INDEX != fftCpusStart())		// N.B. only START_CPU returns from this call
	{
		printf("\nAbandoning program, startup problem\n");
		return 0;
	}	// if

	/*
	 * START HERE	- select input size, CPUs to use, number of test runs
	 */
	int lenP2 = 12;					// input array length is this power of 2 (from 5 to 17 at present)

	int whichcpus = 0xf;			// CPUs to use, hex value xxxx, x = 1 to turn on, 0xf for all on

    const int numruns = 200;		// number of test runs

    const int type = 0;				// 0 for forward FFT, 1 for inverse FFT

    const int normalise = 0;		// 0 for not normalized, 1 for normalized

    const int arrange = 1;			// 0 for don't arrange, 1 for arrange

    const int realonly = 0;			// 0 for (in-phase, quadrature), 1 for (in-phase,all 0);

    int displayLines = 64;			// number of lines of the data to display, if enabled below

    // check input and correct if possible, give message and exit if settings incorrect
    if(settingsIncorrect(lenP2, &whichcpus, numruns, type, &displayLines)) return 0;

	/*
	 *  VARIOUS EXAMPLES
	 *  Example lengths depend on lenP2 above
	 *  The examples not in use are commented out
	 *  Other input choices can be set up as (in-phase, quadrature)in cin as desired
	 *  N.B. If realonly is not selected both components should be set up,
	 *  	 otherwise previous data may cause anomalous results.
	 *  	 If realonly is selected, the quadrature part of the input is overwritten
	 *  	 with 0s.
	 */
	fft_cpx * cin = (void *)FFT_CIN;	// start of complex number input data array
	int length = 1 << lenP2;			// length of complex numbers input array

    int i;

	// test: combination of frequencies and phases
	//for (i=0;i<length;++i) {
//		cin[i].re = 1.0 + sin(M_PI*i/(length >> 1))			// in-phase part
	//				    + 3*cos(M_PI*2*i/(length >> 1))
	//				    + 2*cos(M_PI*6*i/(length >> 1))
	//					+ 7*cos(M_PI*((length >> 1) - 1)*i/(length >> 1))
	//				    + 5*cos(M_PI*i);					// Nyquist
	//    cin[i].im = sin(M_PI*3*i/(length >> 1));			// quadrature part
//	}	// for

	// test: ramp
//	for (i=0;i<length;++i) {
//		cin[i].re = i;
//	    cin[i].im = 0;
//	}	// for

	// test: random
//	srand(time(0));
//	for (i=0;i<length;++i) {
//      cin[i].re = rand();
//      cin[i].im = 0;
//  }	// for
// square wave
	for (i=0;i<(length >> 1);++i) {
      cin[i].re = 3;
      cin[i].im = 0;
  }	// for

  	for (i=(length >> 1); i < length;++i) {
      cin[i].re = 0;
      cin[i].im = 0;
  }	// for
  
  
	// show the input data
//	printf("Complex data:\n");
//	showC(cin, displayLines, 0);

	/*
	 * Do FFT test
	 * Does numruns, first with one processor, then with the requested processors (whichcpus)
	 */
	// set up the twiddles array - its length is half that of the input array.
	fft_cpx * tw = (void *)FFT_TWIDDLES;	// preassigned space in SRAM starting at this address
	fillTwiddles(tw,lenP2,type);			// 0 for forward FFT, 1 for inverse FFT

	// if data is in-phase only, i.e. no quadrature component, pack into first half
	if(realonly)
	{
		fftRealOnlyArrange(cin,lenP2);
		lenP2 -= 1;							// half the size, but leave value of length as is
	}	// if

	// if need to arrange, set up mapping array - N.B. it is its own inverse
	if(arrange)
	{
		fftArrangeIndex((void *)FFT_MAP, lenP2);
	}	// if

	// set up a pointer for rearranged input and clear it
	// if 'arrange' set result will be here
	fft_cpx * rain = (void *)FFT_RAIN; 		// preassigned space in SRAM starting at this address
	for(i = 0; i < length; i++)
	{
		rain[i].re = 0;
		rain[i].im = 0;
	}

//	int map[1 << lenP2];
//	fftArrangeIndex(map, lenP2);
//	for (i=0;i<length;++i) {
//		rain[map[i]] = cin[i];
//	}	// for

	// start the cpus being used other than this one - they will halt waiting for work
	startCpus(whichcpus);

	// get ready to time
	clock_t start, finish;					// start and finish times
	unsigned long single, multiple;			// times with single, multiple C{Us

	printf("\n\nDoing %d runs of %d length FFT with 1 processor\n", numruns, length);

	start = clock();
	for(i = 0; i < numruns; i++)
	{
		if(arrange)							// will do rearrangement in parallel
		{
			// do it using this CPU, the startup CPU, usually CPU 0
			fftMulti(cin, lenP2, tw, 1<<START_CPU_INDEX, normalise, arrange, realonly);
		} else {

			// first, rearrange the input into rain
			fftArrangeOut(cin, rain,lenP2);

			// do it using this CPU, the startup CPU, usually CPU 0
			fftMulti(rain, lenP2, tw, 1<<START_CPU_INDEX, normalise, arrange, realonly);
		}	// else
	}	// for

	finish = clock();
	single = finish - start;

	printf("\n\nDone %d runs of %d length FFT with 1 processor, time: %ld\n", numruns, length, single);

	int numcpus = getNumCpus(whichcpus);	// how many CPUs being used

	printf("\nDoing %d runs of %d length FFT with %d processors\n", numruns, length, numcpus);

	start = clock();
	for(i = 0; i < numruns; i++)
	{

		if(arrange)						// will do rearrangement in parallel
		{
			// do it using selected cpus
			fftMulti(cin, lenP2, tw, whichcpus, normalise, arrange, realonly);
		} else {
			// first, rearrange the input
			fftArrangeOut(cin, rain, lenP2);
			fftMulti(rain, lenP2, tw, whichcpus, normalise, arrange, realonly);
		}	// else
	}	// for

	finish = clock();
	multiple = finish - start;

	/*
	 * FINISH - show FFT result and times
	 */
	printf("Complex FFT:\n");
	show8C(rain, length, 0.001, realonly);

	printf("\n\n%d runs of %d length FFT with %d processor, time: %ld\n", numruns, length, 1, single);
	printf("\n%d runs of %d length FFT with %d processors, time: %ld\n", numruns, length, numcpus, multiple);
	if(realonly)
	{
		printf("\nSpeedup factor for %d real data with %d processors: %2.2f\n", length, numcpus, ((float)single)/(float)multiple);
	} else {
		printf("\nSpeedup factor for %d (in-phase,quadrature) with %d processors: %2.2f\n", length, numcpus, ((float)single)/(float)multiple);
	}	// else

	return 0;

}	// main()


// see fft.h for description
static inline unsigned int get_asr17(void)
{
  unsigned int reg;
  __asm__ (" mov %%asr17, %0 " : "=r"(reg) :);
  return reg;
}


// see fft.h for description
int fftCpusStart(void)
{
	volatile unsigned int status = 0;
	volatile fft_cpu *thisCpu;

	// find out which CPU is running
	int LEON3_Cpu_Index = (get_asr17() >> 28) & 3;		// max 4 cpus, 0 .. 3

	// set up reference to data for this CPU
	thisCpu = (void *)(FFT_FLAGS + LEON3_Cpu_Index*sizeof(fft_cpu));

	// if start CPU (usually CPU0), initialize flags area
	if (START_CPU_INDEX == LEON3_Cpu_Index){
		// START_CPU (usually 0) initializes settings and returns
		volatile fft_cpu *thisCpu;
		int i = 0;
		for(i = 0; i < MAX_CPUS; i++)
		{
			thisCpu = (void *)(FFT_FLAGS + i*sizeof(fft_cpu));

			thisCpu->status = 0;
			thisCpu->cinadrs = 0;
			thisCpu->lenP2 = 0;
			thisCpu->begin = 0;
			thisCpu->nextbegin = 0;
			thisCpu->oddadd = 0;
			thisCpu->startP2 = 0;
			thisCpu->twadrs = 0;
			thisCpu->toffset = 0;
			thisCpu->tinc = 0;
			thisCpu->normalise = 0;
			thisCpu->arrange = 0;
			thisCpu->realonly = 0;
			thisCpu->spare1 = 0x01010101;
			thisCpu->spare2 = 0x10101010;
		}	// for

	}else {
		// other CPUS loop forever, check for work, do it, set done status
		while(1)
		{
			thisCpu->spare1 = 0xcafe;						// for debug

			status = thisCpu->status;
			if(FFT_START == (FFT_WORK_MASK & status))
			{
				thisCpu->spare1 = 0xbabecafe;				// for debug

				fftProc((void *)thisCpu->cinadrs, thisCpu->lenP2,
						thisCpu->begin, thisCpu->nextbegin,
						thisCpu->oddadd, thisCpu->startP2,
						(void *)thisCpu->twadrs,
						thisCpu->toffset, thisCpu->tinc,
						thisCpu->normalise,thisCpu->arrange,
						thisCpu->realonly, thisCpu->secondpass);

				thisCpu->spare2 = 0xcafebabe;				// for debug
				thisCpu->status &= ~FFT_WORK_MASK;			// finished

			}	// if

			// halt this CPU, will be restarted when work available
			__asm__ (" wr %g0, %asr19 ");

		}	// while

	}	// else

	return LEON3_Cpu_Index;		// should only get here for CPU 0

}	// fftCpusStart()


// see fft.h for description
void startCpus(const int whichcpus)
{
	int i;
	int LEON3_Cpu_Index = (get_asr17() >> 28) & 3;		// max 4 cpus, 0 .. 3
	int this_cpu = 1 << LEON3_Cpu_Index;

	int current_cpu = LAST_CPU;							// usually 8

	while(current_cpu)
	{
		for(i=0; i < CPU_START_PAUSE; i++){}						// pause
		if((current_cpu & whichcpus)&&(current_cpu != this_cpu))
		{
			*((int *)IRQMP_ADRS) = 0x380b0000 | current_cpu;
		}	// if

		current_cpu >>= 1;

	}	// while

	for(i=0; i < CPU_START_PAUSE; i++){}							// pause
	return;
}	// startCpus()


/*
 * INPUT ARRAY REARRANGEMENT
 */

// see fft.h for description
void fftArrangeOut(fft_cpx* cin, fft_cpx* result, int lenP2)
{

	// validation check
	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		return;
	}	// if

	int len = 1 << lenP2;
	int mask = len -1;

	int i = 0;
    for(i = 0; i < len; i++){

    	/*
    	 * Find the index to swap with.
    	 * - treating index as potentially 32 bits,
    	 *   number of bits in use should be less
    	 */
    	int v = i;
    	// swap odd and even bits
    	v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
    	// swap consecutive pairs
    	v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
    	// swap nibbles ...
    	v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
    	// swap bytes
    	v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
    	// swap 2-byte long pairs
    	v = ( v >> 16             ) | ( v               << 16);

    	v >>= (32 - lenP2);				// N.B. arithmetic shift
    	v &= mask;

	    if(v >= i)					// otherwise already done
	    {
	    	result[i] = cin[v];
	    	result[v] = cin[i];
	    }	// if

    }	// for

    return;

}	// fftArrangeOut()


// see fft.h for description
void fftArrangeIn(fft_cpx* x, int lenP2)
{
	// validation check
	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		return;
	}	// if

	// do it
	int len = 1 << lenP2;			// power of 2
	int mask = len - 1;

	fft_cpx temp;					// for swap

	int i = 0;
    for(i = 0; i < len; i++){		// must check full array, not just half

    	// Find the index to swap with
    	// - treating index as no more than 32 bits,
    	int v = i;
    	// swap odd and even bits
    	v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
    	// swap consecutive pairs
    	v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
    	// swap nibbles ...
    	v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
    	// swap bytes
    	v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
    	// swap 2-byte long pairs
    	v = ( v >> 16             ) | ( v               << 16);

    	v >>= (32 - lenP2);			// arithmetic shift
    	v &=  mask;					// kill off any leading 1s due to arithmetic shift

	    if(v > i)					// otherwise already done or same
	    {
	    	temp = x[i];
	    	x[i] = x[v];
	    	x[v] = temp;
	    }	// if

    }	// for

    return;

}	// fftArrangeIn()


// see fft.h for description
void fftArrangeIndex(int* mapindex, int lenP2)
{
	// validation check
	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		return;
	}	// if

	// do it
	int len = 1 << lenP2;			// power of 2
	int mask = len - 1;

	int i = 0;
    for(i = 0; i < len; i++){		// must check full array, not just half

    	// Find the index to swap with
    	// - treating index as no more than 32 bits,
    	int v = i;
    	// swap odd and even bits
    	v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
    	// swap consecutive pairs
    	v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
    	// swap nibbles ...
    	v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
    	// swap bytes
    	v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
    	// swap 2-byte long pairs
    	v = ( v >> 16             ) | ( v               << 16);

    	v >>= (32 - lenP2);			// arithmetic shift
    	v &=  mask;						// kill off any leading ones

    	mapindex[i] = v;

    }	// for

    return;

}	// fftArrangeIndex()


// see fft.h for description
void fftRealOnlyArrange(fft_cpx * cin, int lenP2)
{
	int i,j;
	int halflen = 1 << (lenP2-1);

	for (i = 0; i < halflen; i++)
	{
		j = i << 1;
		cin[i].re = cin[j].re;
		cin[i].im = cin[j+1].re;
	}	// for

	for (i=halflen; i < (1 << lenP2); i++)
	{
		cin[i].re = 0;
		cin[i].im = 0;
	}	// for

}	// fftRealOnlyArrange()


/*
 * TWIDDLES ARRAY
 */

// see fft.h for description
void fillTwiddles(fft_cpx* twiddles, const int lenP2, const int type)
{
	// validation checks
	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\nfillTwiddles(): Invalid power of 2 for array length: %d, should be between %d and %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		return;
	}	// if

    if ((0 != type)&& (1 != type))		// invalid, should be forward(0) or inverse(1)
    {
		printf("\nInvalid type for twiddles array, should be 0 for forward, 1 for inverse, actual was: 0x%x\n",
				type);
		return;
    }	// if

    /*
     * Do it - using doubles
     */
    int n = 1 << lenP2;
    double twoPiDivNsigned;

    if(1 != type)						// negative exponent for forward transform, positive for inverse
    {
        twoPiDivNsigned= -M_PI / (double)(n >> 1);  // 2*Pi/n
    } else {
        twoPiDivNsigned = M_PI / (double)(n >> 1);	// positive exponent for inverse transform
    }	// else

    double root2Inv  = 1.0/sqrt(2.0);	// quicker than sine or cosine

    /*
     * Fill in the values
     */
    int quarter;						// will be N/4, quarter way round
    int eight;							// will be N/8, eight of way around

    twiddles[0].re = 1.0;				// always same
    twiddles[0].im = 0;

    if(1 != type)						// forward FFT - N.B. using negative exponent
    {
    	quarter = n >> 2;
        twiddles[quarter].re = 0;		// quarter way around with negative exponent (0,-i)
        twiddles[quarter].im = -1;

        eight = quarter >> 1;			// eight of way around (1/root2, -i*1/root2)
        twiddles[eight].re = root2Inv;
        twiddles[eight].im = -root2Inv;

        twiddles[quarter+eight].re = -root2Inv;	// three eights of way around (-1/root2, -i*1/root2)
        twiddles[quarter+eight].im = -root2Inv;

        int j;
        for(j=1; j<eight; j++)
        {
        	double theta = (double)j * twoPiDivNsigned;	// negative

        	twiddles[j].re = cos(theta);			// positive
        	twiddles[j].im = sin(theta);			// negative

        	int pos = quarter-j;
        	twiddles[pos].re = -twiddles[j].im;
        	twiddles[pos].im = -twiddles[j].re;

        	pos = quarter+j;
           	twiddles[pos].re =  twiddles[j].im;
            twiddles[pos].im = -twiddles[j].re;

        	pos = (quarter << 1) - j;
        	twiddles[pos].re = -twiddles[j].re;
        	twiddles[pos].im =  twiddles[j].im;
        }	// for

    } else {										// inverse FFT - N.B. uses positive exponent

        quarter = n >> 2;
        twiddles[quarter].re = 0;					// quarter way around (0,i)
        twiddles[quarter].im = 1;					// N.B. positive exponent

        eight = quarter >> 1;						// eight of way around (1/root2, i*1/root2)
        twiddles[eight].re = root2Inv;
        twiddles[eight].im = root2Inv;

        twiddles[quarter+eight].re = -root2Inv;		// three eights of way around (-1/root2, i*1/root2)
        twiddles[quarter+eight].im =  root2Inv;

        int j;
        for(j=1; j<eight; j++)
        {
        	double theta = (double)j * twoPiDivNsigned;		// positive

        	twiddles[j].re = cos(theta);			// positive
        	twiddles[j].im = sin(theta);			// positive

        	int pos = quarter-j;
        	twiddles[pos].re = twiddles[j].im;
        	twiddles[pos].im = twiddles[j].re;

        	pos = quarter+j;
        	twiddles[pos].re =  -twiddles[j].im;
        	twiddles[pos].im = twiddles[j].re;

        	pos = (quarter << 1) - j;
        	twiddles[pos].re = -twiddles[j].re;
        	twiddles[pos].im =  twiddles[j].im;
        }	// for

    }	// else

}	// fillTwiddles()


/*
 * FFT PROCESSING
 */

// see fft.h for description
void fftProc(fft_cpx* cin, const int lenP2,
			 const int start, const int nextstart,
			 const int oddadd, const int startP2,
    		 const fft_cpx* twiddles,
    		 const int toffset, const int twinc,
    		 const int normalise, const int arrange,
    		 const int realonly, const int secondpass)
{
	/*
	 * Input validation
	 */
	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\nfftProc():Invalid power of 2 for array length: %d, should be between %d and %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		return;
	}	// if

	int length = 1 << lenP2;
	int subarraylen = nextstart - start;

    if((0 > start)||(0 > nextstart)||(0 >= subarraylen))
    {
    	printf("\nInvalid input sub-array indices, start index: %d, next start: %d, sub-array length %d\n", start, nextstart, subarraylen);
    	return;
    }	// if

    if ((start >= length)||(nextstart > length))
    {
    	printf("\nSub-array indices incompatible with array length, start index: %d, next start: %d, array length: %d\n",
    			start, nextstart, length);
    	return;
    }	// if

    if (subarraylen & (subarraylen -1))	// non-zero if not power of 2
    {
    	printf("\nInvalid sub-array length, not a power of 2, start index: %d, next start: %d, sub-array length: %d\n",
    			start, nextstart, subarraylen);
    	return;
    }	// if
//printf("fftProc(): start %d, nextstart %d, oddadd %d, startP2 %d, toffset %d, tinc %d, realonly %d\n",
//			start,nextstart, oddadd, startP2, toffset, twinc, realonly);

	/*
	 * Processing
	 */
    int bi;					// index where this butterfly starts
    int ei;					// index of even
    int oi;					// index of odd

    fft_cpx temp;			// holds current twiddles value
    int tstart = toffset;	// position in twiddles array from which to begin
    int tindex;				// index into twiddles array
    int tlimit;				// tindex should be less than this
    int tinc = twinc;		// increment used stepping through twiddles array
    int lasttinc = 1;		// when tinc gets to this, on last butterfly

    int oddoffset;			// amount to add to even index to get odd index in inputarray
    int butterflysize;		// holds current butterfly size
    int bsh;    			// half of the butterfly size

    float norm;				// used to hold normalizing divisor
    register float twre;	// holds real part of twiddles entry
    register float twim;	// holds imaginary part of twiddles entry

    fft_cpx * arranged = cin;
    fft_cpx * rain = (void *)FFT_RAIN;
    int * map = (void *)FFT_MAP;

    /*
     * If this is the second pass, do post processing of FFT result obtained so far
     * N.B. the FFT data being processed is in the second half of the underlying array,
     * 		the FFT result is put in the first half of the underlying array plus the Nyquist
     * 		component at index [length], the remaining entries in the underlying array
     * 		are not updated with negative frequencies, these are just the complex conjugates
     * 		of the positive frequencies.
     */
    if(secondpass)
    {
    	float value;
    	int i = start;
    	int j = (length << 1) - start;

    	// if at start of twiddles array, avoid need to use value at index [2*length]
    	if(0 == start)
    	{
    		value =  (arranged[0].re + arranged[length].re
    		    	  + arranged[0].im + arranged[length].im)/2.0;

    		arranged[0].im = (arranged[0].im - arranged[length].im
    		     			  - arranged[0].re + arranged[length].re)/2.0;

    		arranged[0].re = value;

    		i++;
    		j--;
    	}

    	// do the remaining entries
    	for(; i < nextstart; i++)
    	{
    		value =  (arranged[i].re + arranged[j].re
    				  + twiddles[i].re*(arranged[i].im + arranged[j].im)
    				  + twiddles[i].im*(arranged[i].re - arranged[j].re))/2.0;

    	    arranged[i].im = (arranged[i].im - arranged[j].im
     			         	  + twiddles[i].im*(arranged[i].im + arranged[j].im)
     			         	  - twiddles[i].re*(arranged[i].re - arranged[j].re))/2.0;

    	    arranged[i].re = value;

    	    j--;
    	}	// for

    	// Nyquist case
       	if(length == nextstart)
    	{
       		arranged[length].re = arranged[length].re - arranged[length].im;
       		arranged[length].im = 0;
    	}	// if

    	return;
    }

    /*
     * Do FFT - only get here if this is not second pass
     *
     * N.B. If the original data was in-phase only,
     * 		the underlying array is assumed to be 2*length
     */

    // if real only is true, adjust the twiddle array steps
    if(realonly)
    {
    	tstart <<= 1;		// twiddles was set up for full-length complex array
    	tinc <<= 1;			// rather than for half length reals only
    	lasttinc <<= 1;
    }	// if

    if(arrange)
    {
    	int i;
    	for(i = start; i < nextstart; i++)
    	{
    		rain[i] = arranged[map[i]];
    	}	// for

    	arranged = rain;
    }	// if

    // double size of butterfly each time around outer loop for this sub-array
    for(butterflysize = 1 << startP2; butterflysize <= subarraylen; butterflysize <<= 1)
    {
     	bsh = butterflysize >> 1;
     	oddoffset = bsh + oddadd;

     	if(lasttinc != tinc)	// not last butterfly
     	{
     		// do all the butterflies in this sub-array for this size of butterfly
     		for( bi = start; bi < nextstart; bi += butterflysize)
     		{
     			ei = bi;
     			oi = ei + oddoffset;

     			tindex = tstart;
     			tlimit = tstart + bsh*tinc;

     			// as log n repeats, worth doing first case separately
     	    	if(0 == tindex)
     	    	{
     				tindex += tinc;

     	    		temp.re = arranged[oi].re;
     				temp.im = arranged[oi].im;

     				arranged[oi].re = arranged[ei].re - temp.re;
     				arranged[oi++].im = arranged[ei].im - temp.im;
     				arranged[ei].re += temp.re;
     				arranged[ei++].im += temp.im;
     	    	}	// if

     			// doing each butterfly
     			while(tindex < tlimit)
     			{
     				twre = twiddles[tindex].re;
     				twim = twiddles[tindex].im;

     				tindex += tinc;

     				temp.re = arranged[oi].re*twre
     						  - arranged[oi].im*twim;
     				temp.im = arranged[oi].re*twim
     				     	  + arranged[oi].im*twre;

     				arranged[oi].re = arranged[ei].re - temp.re;
     				arranged[oi++].im = arranged[ei].im - temp.im;
     				arranged[ei].re += temp.re;
     				arranged[ei++].im += temp.im;




     			}	// while

     		}	// for

     	} else {	// last butterfly, only one butterfly to do. Normalize if requested.

     		ei = start;
     		oi = ei + oddoffset;

     		tindex = tstart;
     		tlimit = tindex + lasttinc*bsh;		// lasttinc will be 1 or 2

     		if(normalise)
     		{
     			norm = (float)(realonly ?(1 << (lenP2 + 1)): 1 << lenP2);

     			// not worth doing single case separately
//    	    	if(0 == tindex)
//     	    	{
//     				tindex += tinc;
//
//     	    		temp.re = arranged[oi].re;
//     				temp.im = arranged[oi].im;
//
//     				arranged[oi].re = (arranged[ei].re - temp.re)/norm;
//     				arranged[oi++].im = (arranged[ei].im - temp.im)/norm;
//     				arranged[ei].re += temp.re;
//     				arranged[ei].re /= 2.0;
//     				arranged[ei].im += temp.im;
//     				arranged[ei++].im /= 2.0;
//     	    	}	// if

     			while(tindex < tlimit)
     			{
//     				twre = twiddles[tindex].re;		// not faster
//     				twim = twiddles[tindex].im;
//
//     				tindex += lasttinc;
//
//     				temp.re = arranged[oi].re*twre
//     						  - arranged[oi].im*twim;
//     				temp.im = arranged[oi].re*twim
//     				     	  + arranged[oi].im*twre;

     				temp.re =  arranged[oi].re*twiddles[tindex].re
     						   - arranged[oi].im*twiddles[tindex].im;
     				temp.im =  arranged[oi].re*twiddles[tindex].im
     						   + arranged[oi].im*twiddles[tindex].re;

     				tindex += lasttinc;

     				arranged[oi].re = (arranged[ei].re - temp.re)/norm;
     				arranged[oi++].im = (arranged[ei].im - temp.im)/norm;
     				arranged[ei].re += temp.re;
     				arranged[ei].re /= norm;
     				arranged[ei].im += temp.im;
     				arranged[ei++].im /= norm;

     			}	// while

     		} else {		// not normalizing

     			// not worth doing single case separately
//     			if(0 == tindex)
//     			{
//     				tindex += tinc;
//
//     		   		temp.re = arranged[oi].re;
//     			    temp.im = arranged[oi].im;
//
//     			    arranged[oi].re = arranged[ei].re - temp.re;
//     			    arranged[oi++].im = arranged[ei].im - temp.im;
//     			    arranged[ei].re += temp.re;
//     			    arranged[ei++].im += temp.im;
//     			}	// if

     			while(tindex < tlimit)
     			{
//     				twre = twiddles[tindex].re;	// not faster
//     				twim = twiddles[tindex].im;
//
//     				tindex += lasttinc;
//
//     				temp.re = arranged[oi].re*twre
//     						  - arranged[oi].im*twim;
//     				temp.im = arranged[oi].re*twim
//     				     	  + arranged[oi].im*twre;

     				temp.re =  arranged[oi].re*twiddles[tindex].re
     			     		   - arranged[oi].im*twiddles[tindex].im;
     			    temp.im =  arranged[oi].re*twiddles[tindex].im
     			     		   + arranged[oi].im*twiddles[tindex].re;

    				tindex += lasttinc;

     			    arranged[oi].re = arranged[ei].re - temp.re;
     			    arranged[oi++].im = arranged[ei].im - temp.im;
     			    arranged[ei].re += temp.re;
     			    arranged[ei++].im += temp.im;

     			}	// while

     		}	// else

     		// if original data was in-phase only, with no quadrature component,
     		// make a copy of the FFT result to be ready for post-processing
     		// N.B. the underlying array needs to be twice the current array size
     		if(realonly)
     		{
    			int i,j,k,m;

     			// copying originals into second half of underlying array
     			j = start + length;
     			m = start + oddoffset;
     			k = start + oddoffset + length;

     			for(i = start; i < nextstart; i++){
     				arranged[j].re 	 = arranged[i].re;
     				arranged[j++].im = arranged[i].im;

     				arranged[k].re   = arranged[m].re;
     				arranged[k++].im   = arranged[m++].im;
     			}	// for

     		}	// if

     	}	 // else

     	tinc >>= 1;		// as the butterfly size is doubled the steps are halved

    }	// for

}	// fftProc()


// see fft.h for description
void fftMulti(fft_cpx* cin, const int lenP2,
			  const fft_cpx* twiddles,
    		  const int whichcpus, const int normalise,
    		  const int arrange, const int realonly)
{
	/*
	 * Input validation
	 */
	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\nfftMulti():Invalid power of 2 for array length: %d, should be between %d and %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		return;
	}	// if

	if ((0 == whichcpus)||(whichcpus & ~CPUS_MASK))
	{
	    printf("\nInvalid choice of CPUs, input : 0x%x, must be between 0 and 0x%x\n", whichcpus, CPUS_MASK);
	    return;
	}	// if


	/*
	 * Processing
	 */
	int length = 1 << lenP2;

	int blocksize;

	int numcpus = getNumCpus(whichcpus);
	int first,second,next;

	fft_cpx * use;
	if(arrange)
	{
		use = (void *)FFT_RAIN;
	} else {
		use = cin;
	}	// else

	switch (numcpus)
	{
		case 1:
			blocksize = 1 << lenP2;			// length

			// main processing
			giveToCpu(whichcpus,
					  cin, lenP2,
					  0, blocksize,
					  0, 1,
					  twiddles, 0, 1 << (lenP2-1),
					  normalise, arrange,
					  realonly,0);

			// MUST WAIT FOR THiS TO HAVE FINISHED
			fftWait(whichcpus);

			if(realonly)
			{
				giveToCpu(whichcpus,
						  use, lenP2,
						  0, blocksize,
						  0, 1,
						  twiddles, 0, 2,
						  normalise, 0,
						  realonly,1);
			}	// if

			// MUST WAIT FOR THiS TO HAVE FINISHED
			fftWait(whichcpus);
			break;

		case 2:
		case 3:
			blocksize = 1 << (lenP2 - 1);	// half of the length

			// get the CPU to use
			if (0 != (whichcpus & 8))first = 8;			// 1xxx
			else if (0 != (whichcpus & 4))first = 4;	// 01xx
			else first = 2;								// 001x

			next = first >> 1;

			if (0 != (whichcpus & next))second = next;
			else if (0 != (whichcpus & (next >> 1)))second = (next >> 1);
			else second = 1;

			// main processing - most of the time is spent here
			giveToCpu(first,
					  cin, lenP2,
					  0, blocksize,
					  0, 1,
					  twiddles, 0, 1 << (lenP2-1),
					  normalise, arrange,
					  realonly, 0);

			giveToCpu(second,
					  cin, lenP2,
					  blocksize, length,
					  0, 1,
					  twiddles, 0, 1 << (lenP2-1),
					  normalise, arrange,
					  realonly,0);

			// MUST WAIT FOR THESE TO HAVE FINISHED
			fftWait(first|second);			// wait for both to finish

			// combine results
			giveToCpu(first,
					  use, lenP2,
					  0, blocksize,
					  blocksize >> 1, lenP2-1,
					  twiddles, 0, 1,
					  normalise, 0,
					  realonly, 0);

			giveToCpu(second,
					  use, lenP2,
					  blocksize >> 1, (blocksize >> 1) + blocksize,
					  blocksize >> 1, lenP2-1,
					  twiddles, blocksize >> 1, 1,
					  normalise, 0,
					  realonly, 0);

			// MUST WAIT FOR THESE TO HAVE FINISHED
			fftWait(first|second);			// wait for both to finish

			// complete the real only processing
			if(realonly)
			{
				giveToCpu(first,
						  use, lenP2,
						  0, blocksize,
						  0, 1,
						  twiddles, 0, 2,
						  normalise, 0,
						  realonly, 1);

				giveToCpu(second,
						  use, lenP2,
						  blocksize, length,
						  0, 1,
						  twiddles, blocksize, 2,
						  normalise, 0,
						  realonly, 1);

				// MUST WAIT FOR THESE TO HAVE FINISHED
				fftWait(first|second);			// wait for both to finish
			}	// if

			break;

		case 4:

			blocksize = 1 << (lenP2 - 2);	// quarter of the length

	    	// main processing - most of the time is spent here
	    	giveToCpu(8,
	    			  cin, lenP2,
	    			  0, blocksize,
	    			  0, 1,
	    			  twiddles, 0, 1 << (lenP2-1),
	    			  normalise, arrange,
	    			  realonly, 0);
	    	giveToCpu(4,
	    			  cin, lenP2,
	    			  blocksize, blocksize << 1,
	    			  0, 1,
	    			  twiddles, 0, 1 << (lenP2-1),
	    			  normalise, arrange,
	    			  realonly, 0);
	    	giveToCpu(2,
	    			  cin, lenP2,
	    			  blocksize<<1, (blocksize << 1) + blocksize,
	    			  0, 1,
	    			  twiddles, 0, 1 << (lenP2-1),
	    			  normalise, arrange,
	    			  realonly, 0);
	    	giveToCpu(1,
	    			  cin, lenP2,
	    			  (blocksize << 1) + blocksize, length,
	    			  0, 1,
	    			  twiddles, 0, 1 << (lenP2-1),
	    			  normalise, arrange,
	    			  realonly, 0);

			// MUST WAIT FOR THESE TO HAVE FINISHED
			fftWait(0xf);			// wait for all to finish

			// combine results
	    	giveToCpu(8,
	    			  use, lenP2,
	    			  0, blocksize,
	    			  blocksize>>1, lenP2 - 2,
	    			  twiddles, 0, 2,
	    			  normalise, 0,
	    			  realonly, 0);
	    	giveToCpu(4,
	    			  use, lenP2,
	    			  blocksize>>1, (blocksize >> 1) + blocksize,
	    			  blocksize >> 1, lenP2 -2,
	    			  twiddles, blocksize, 2,
	    			  normalise, 0,
	    			  realonly, 0);
	    	giveToCpu(2,
	    			  use, lenP2,
	    			  blocksize<<1, 3*blocksize,
	    			  blocksize >> 1, lenP2 -2,
	    			  twiddles, 0, 2,
	    			  normalise, 0,
	    			  realonly, 0);
	    	giveToCpu(1,
	    			  use, lenP2,
	    			  (blocksize <<1) + (blocksize >> 1), 3 * blocksize + (blocksize >> 1),
	    			  blocksize >> 1, lenP2 -2,
	    			  twiddles, blocksize, 2,
	    			  normalise, 0,
	    			  realonly, 0);

			// MUST WAIT FOR THESE TO HAVE FINISHED
			fftWait(0xf);			// wait for all to finish

			// combine again
	        giveToCpu(8,
	        		  use, lenP2,
	        		  0, blocksize,
	        		  3*(blocksize >> 1), lenP2-2,
	        		  twiddles, 0, 1,
	        		  normalise, 0,
	        		  realonly, 0);
	        giveToCpu(4,
	        		  use, lenP2,
	        		  blocksize >> 1,(blocksize >> 1) + blocksize,
	        		  3*(blocksize >> 1), lenP2-2,
	        		  twiddles, blocksize>>1, 1,
	        		  normalise, 0,
	        		  realonly, 0);
	        giveToCpu(2,
	        		  use, lenP2,
	        		  blocksize,(blocksize<<1),
	        		  3*(blocksize >> 1), lenP2-2,
	        		  twiddles, blocksize, 1,
	        		  normalise, 0,
	        		  realonly, 0);
	        giveToCpu(1,
	        		  use, lenP2 ,
	        		  blocksize + (blocksize >> 1),(blocksize << 1) + (blocksize >>1),
	        		  3*(blocksize >> 1), lenP2-2,
	        		  twiddles, 3*(blocksize >> 1), 1,
	        		  normalise, 0,
	        		  realonly, 0);

			// MUST WAIT FOR THESE TO HAVE FINISHED
			fftWait(0xf);			// wait for all to finish

			// complete the real only processing
			if(realonly)
			{
				giveToCpu(8,
						  use, lenP2,
					      0, blocksize,
					      0, 1,
					      twiddles, 0, 2,
					      normalise, 0,
					      realonly, 1);

				giveToCpu(4,
						  use, lenP2,
					      blocksize, blocksize << 1,
					      0, 1,
					      twiddles, 0, 2,
					      normalise, 0,
					      realonly, 1);

				giveToCpu(2,
						  use, lenP2,
					      blocksize<<1, 3*blocksize,
					      0, 1,
					      twiddles, 0, 2,
					      normalise, 0,
					      realonly, 1);

				giveToCpu(1,
						  use, lenP2 ,
					      3*blocksize,blocksize << 2,
					      0, 1,
					      twiddles, 0, 2,
					      normalise, 0,
					      realonly, 1);

				// MUST WAIT FOR THESE TO HAVE FINISHED
				fftWait(first|second);			// wait for both to finish
			}	// if

			break;

	}	// switch

}	// fftMulti()


// see fft.h for description
void giveToCpu(const int which,								// 1,2,4 or 8
			   fft_cpx* cin, const int lenP2,				// input array
			   const int begin, const int nextbegin,		// part indices
			   const int oddadd, const int startP2,
			   const fft_cpx* twiddles,
			   const int toffset, const int twinc,
			   const int normalise, const int arrange,
			   const int realonly, const int secondpass)
{
	if(0 == which) return;					// nothing to do this time


	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\ngiveToCpu(): Invalid power of 2 for array length: %d, should be from %d to %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		return;
	}	// if

	// get the index into the FFT_FLAGS
    int cpuindex;
    if(1 == which) cpuindex = 0;
    else if(2 == which) cpuindex = 1;
    else if(4 == which) cpuindex = 2;
    else if(8 == which) cpuindex = 3;

    else return;

	volatile fft_cpu *thisCpu = (void *)(FFT_FLAGS + cpuindex*sizeof(fft_cpu));

	thisCpu->cinadrs	= (unsigned int)cin;
	thisCpu->lenP2		= (unsigned int)lenP2;
	thisCpu->begin		= (unsigned int)begin;
	thisCpu->nextbegin	= (unsigned int)nextbegin;
	thisCpu->oddadd		= (unsigned int)oddadd;
	thisCpu->startP2	= (unsigned int)startP2;
	thisCpu->twadrs 	= (unsigned int)twiddles;
	thisCpu->toffset	= (unsigned int)toffset;
	thisCpu->tinc		= (unsigned int)twinc;
	thisCpu->normalise 	= (unsigned int)normalise;
	thisCpu->arrange 	= (unsigned int)arrange;
	thisCpu->realonly   = (unsigned int)realonly;
	thisCpu->secondpass = (unsigned int)secondpass;

	thisCpu->spare2		= 0xa5a5a5a5;					// for debug
	thisCpu->status		= (unsigned int)FFT_START;		// do this last

    // if the CPU is the one running this code, do the FFT here
    if(START_CPU_INDEX == cpuindex)
    {
    	thisCpu->status |= FFT_START;
		thisCpu->spare1 = 0xbabecafe;				// for debug

    	fftProc((void *)thisCpu->cinadrs, thisCpu->lenP2,
				thisCpu->begin, thisCpu->nextbegin,
				thisCpu->oddadd, thisCpu->startP2,
				(void *)thisCpu->twadrs,
				thisCpu->toffset, thisCpu->tinc,
				thisCpu->normalise, thisCpu->arrange,
				thisCpu->realonly, thisCpu->secondpass);

    	thisCpu->spare2 = 0xcafebabe;				// for debug
		thisCpu->status &= ~FFT_WORK_MASK;			// finished

    } else {
    	*((int *)IRQMP_ADRS) = 0x380b0000 | which;
    }	// else

}	// giveToCpu()


// see fft.h for description
void fftWait(int whichcpus){

	// other CPUS loop forever, check for work, do it, set done status

	volatile fft_cpu *cpu0 = (void *)(FFT_FLAGS);
	volatile fft_cpu *cpu1 = (void *)(FFT_FLAGS + sizeof(fft_cpu));
	volatile fft_cpu *cpu2 = (void *)(FFT_FLAGS + 2*sizeof(fft_cpu));
	volatile fft_cpu *cpu3 = (void *)(FFT_FLAGS + 3*sizeof(fft_cpu));

//	volatile fft_cpu * cpus = (void *)FFT_FLAGS;
	int notdone = 0;
	do
	{
		notdone =  (8 & whichcpus)? FFT_WORK_MASK & cpu3->status : 0;
		notdone |= (4 & whichcpus)? FFT_WORK_MASK & cpu2->status : 0;
		notdone |= (2 & whichcpus)? FFT_WORK_MASK & cpu1->status: 0;
		notdone |= (1 & whichcpus)? FFT_WORK_MASK & cpu0->status: 0;

	} while (notdone);

	return;

}	// fftWait()

// see fft.h for description
void fftRealReturn(fft_cpx * cin, const int halflenP2, fft_cpx * twiddles)
{
	fft_scalar value;
	int i,j;

	int length = 1 << halflenP2;

	// results overwrite originals that are needed subsequently,
	// so copy originals into second half of underlying array
	j = length;
	for(i =0; i <= length; i++){
		cin[j].re 	 = cin[i].re;
		cin[j++].im  = cin[i].im;
	}	// for

	j = (length << 1) +1;
	for(i = 0; i < length; i++)
	{
		j--;
		value =  (cin[i].re + cin[j].re
			      + twiddles[i].re*(cin[i].im + cin[j].im)
			      + twiddles[i].im*(cin[i].re - cin[j].re))/2.0;

		cin[i].im = (cin[i].im - cin[j].im
			         + twiddles[i].im*(cin[i].im + cin[j].im)
			         - twiddles[i].re*(cin[i].re - cin[j].re))/2.0;

		cin[i].re = value;

	}	// for

	// Nyquist case
	// if they existed, twiddles[length].re would be -1, twiddles[length].im would be 0;
	cin[length].re = cin[length].re - cin[length].im;
	cin[length].im = 0;

	// copy positive frequency  results to negative, doing complex conjugate
	for(i = 1; i < length ; i++)
 	{
 		cin[length+i].re = cin[length-i].re;
 		cin[length+i].im = -cin[length-i].im;
 	}	// for

	return;
}	// fftRealReturn()


/*
 * UTILITIES
 */

// see fft.h for description
int getNumCpus(const int whichcpus)
{
	// set up count of cpus to use
	int numcpus = 0;				// how many cpus to use

	numcpus += (whichcpus & 1)? 1 : 0;
	numcpus += (whichcpus & 2)? 1 : 0;
	numcpus += (whichcpus & 4)? 1 : 0;
	numcpus += (whichcpus & 8)? 1 : 0;

    return numcpus;

}	// getNumCpus


// see fft.h for description
int settingsIncorrect(const int lenP2, int *cpusrequest, const int numruns, const int type, int* lines)
{
	int result = 0;
	// array size is limited by the way memory is being allocated at present
	if((FFT_MIN_LENP2 > lenP2)||(FFT_MAX_LENP2 < lenP2))
	{
		printf("\nInvalid power of 2 for array length: %d, should be between %d and %d\n",
				lenP2, FFT_MIN_LENP2, FFT_MAX_LENP2);
		result |= 1;
	}	// if

	int whichcpus = *cpusrequest;
	if((0 == whichcpus)||(whichcpus & ~CPUS_MASK))
	{
		printf("\nInvalid choice of CPUs: 0x%x, must be from 1 to 0x%x\n",
				whichcpus, CPUS_MASK);
		result |= 1;
	}	// if

	// get the count of cpus being requested
	int numcpus = getNumCpus(whichcpus);

	// only allow 1, 2 or 4 cpus, if 3 deselect the lowest order one
	if(3 == numcpus)
	{
	    if (whichcpus & 1) whichcpus &= 0xe;		// deselect CPU 0
	    else whichcpus &= 0xd;						// deselect CPU 1
	    *cpusrequest = whichcpus;
	    numcpus = 2;
	    printf("\nChoice of 3 CPUs invalid, will use these 2 CPUs : 0x%x\n", whichcpus);
	}	// if

	// find how many CPUs are present on system
	volatile int status = *(int *)IRQMP_ADRS;		// current system configuration
	int statuscpus = (((status & CPUS_PM1_MASK)>>CPUS_PM1_SHIFTS)&0xf) + 1;
	printf("\nMain, Status: 0x%x, CPUs present %d,  CPUs requested: %d, WhichCPUs: 0x%x\n",
			status,statuscpus,numcpus,whichcpus);	// debug

	if((0 == numcpus)||(numcpus > statuscpus))		// sanity check
	{
		printf("Abandoning, number of CPUs requested: %d, number present %d",numcpus, statuscpus);
		result |= 1;
	}	// if

	// check the type
	if((0 != type)&&(1 != type))
	{
		printf("Abandoning, invalid type input: %d, should be 0 for forward FFT or 1 for inverse FFT\n",type);
		result |= 1;
	}	// if

	// check the choice of number of lines to display
	if(0 >= *lines)
	{
		printf("Number of lines chosen was %d, no data or result will be displayed\n",*lines);
	} else if ((1 << lenP2) < *lines){
		printf("Number of lines chosen %d exceeds array length %d, array length will be used\n",*lines,1<<lenP2);
		*lines = 1 << lenP2;
	}
	return result;

}	// settingsIncorrect()


// see fft.h for description
void showC(const fft_cpx* in, const int length, const double tolerance)
{
	int i=0;
	float temp1 = 0.0;
	float temp2 = 0.0;
	printf("Complex numbers array, showing first %d elements\n", length);

	for (i=0; i<length; i++) {

	    temp1 = in[i].re;
	    if(  ((0.0 < temp1)&&( tolerance > temp1))
	       ||((0.0 > temp1)&&(-tolerance < temp1))){
	   	   temp1 = 0.0;
	    }	// if

	    temp2 = in[i].im;
	    if(  ((0 < temp2)&&( tolerance > temp2))
	       ||((0 > temp2)&&(-tolerance < temp2))){
	  	   temp2 = 0.0;
	    }	// if

	    printf("%d:   %g      %g\n", i, temp1, temp2);
	}	// for

}	// showC()


// see fft.h for description
void show8C(const fft_cpx* in, const int length, const double tolerance, const int realonly)
{
	if(32 > length)
	{
		showC(in,length,tolerance);
	}	// if

	int i,k;
	fft_cpx temp;
	float temp1 = 0.0;
	float temp2 = 0.0;
	printf("Complex numbers array, showing certain elements\n\n");

	int nyindex = length>>1;
	int begins[4] = {0, nyindex - 8, nyindex, length-8};

	for(k =0; k<4; k++)
	{
		for (i=0; i<8; i++) {

			temp  = in[begins[k]+i];
			temp1 = temp.re;
			if(  ((0.0 < temp1)&&( tolerance > temp1))
			   ||((0.0 > temp1)&&(-tolerance < temp1))){
		   	   temp1 = 0.0;
			}	// if

			temp2 = temp.im;
			if(  ((0 < temp2)&&( tolerance > temp2))
		       ||((0 > temp2)&&(-tolerance < temp2))){
		  	   temp2 = 0.0;
		    }	// if

		    printf("%d:\t\t\t%g      %g\n", begins[k]+i, temp1, temp2);

		    if((0 == begins[k]+i)||(nyindex == begins[k]+i)) printf("\n");

		    if(realonly && (nyindex == begins[k] + i))
		    {
		    	printf("In-phase data only, last frequency shown above is Nyquist\n");
		    	printf("Negative frequencies not shown (complex conjugates of positive frequencies)\n\n");
		    	break;	// for loop
		    }	// if

		}	// for

		printf("\n");
		if(realonly && (2 == k)) break;

	}	// for

}	// show8C()