Fft.h

From wiki
Revision as of 16:53, 14 September 2020 by imported>Bkavanagh (Created page with " →‎* fft.h * * Created on: Jan 21, 2016 * Author: mryan: #ifndef FFT_H_ #define FFT_H_ #include <math.h> #include <time.h> #include <unistd.h> // assu...")
(diff) ← Older revision | Latest revision (diff) | Newer revision → (diff)
Jump to navigation Jump to search
/*
 * fft.h
 *
 *  Created on: Jan 21, 2016
 *  Author: mryan
 */
#ifndef FFT_H_
#define FFT_H_

#include <math.h>
#include <time.h>
#include <unistd.h>

// assuming basic number type is float (for now)
# ifndef fft_scalar
#   define fft_scalar float
# endif

// (in-phase, quadrature) data is treated as a complex number
typedef struct {
    fft_scalar re;					// real, i.e. in-phase
    fft_scalar im;					// imaginary, i.e. quadrature
}fft_cpx;

// each CPU has a struct of this type set up in SRAM
typedef struct {
    unsigned int status;
    unsigned int cinadrs;
    unsigned int lenP2;				// power of 2 that gives length
    unsigned int begin;
    unsigned int nextbegin;
    unsigned int oddadd;
    unsigned int startP2;			// power of 2 that gives butterfly to start with
    unsigned int twadrs;
    unsigned int toffset;
    unsigned int tinc;
    unsigned int normalise;
    unsigned int arrange;
    unsigned int realonly;
    unsigned int secondpass;
    unsigned int spare1;			// bring up to a 16 word boundary
    unsigned int spare2;
}fft_cpu;

#define FFT_WORK_MASK   1			// pick off last bit
#define FFT_NOWORK		0			// nothing to be done/finished
#define FFT_START		1			// work to be done/working

#define MAX_CPUS 		4
#define LAST_CPU		8			// 0x1000
#define START_CPU_INDEX	0			// N.B. fft_multi() assumes is 0, CPU that starts on power up

#define IRQMP_ADRS		0x80000210  // address of multiprocessor status register
#define CPUS_PM1_MASK	0xf0000000	// tells how many CPUs in system minus 1
#define CPUS_PM1_SHIFTS 28			// shift them to low order
#define	CPUS_MASK    	0xf			// bits used to power-up CPUs
#define CPU_START_PAUSE	1000		// pause between and after CPU startups

// set up space for the various arrays. S698PM SRAM is 32 Mbyte from 0x40000000.
#define FFT_SRAM 	 	0x60000000				// half way
#define FFT_FLAGS	 	FFT_SRAM+0x80000
#define FFT_CIN  	 	FFT_SRAM+0x100000		// space for 2 power 20 bytes,
#define FFT_RAIN	 	FFT_SRAM+0x200000		// i.e. 2 power 18 doubles
#define FFT_TWIDDLES 	FFT_SRAM+0x400000		// i.e. 2 power 17 complex nos
#define FFT_COUT	 	FFT_SRAM+0x600000
#define FFT_MAP			FFT_SRAM+0x800000		// maps input index to rearranged index

#define FFT_MIN_LENP2	5			// minimum input 32 (in-phase,quadrature)
#define FFT_MAX_LENP2	17			// for consistency with space allocation above

/*
 * FUNCTION DECLARATIONS
 */

/*
 *  The functions declared below are used to create an FFT
 *  using 1, 2 or 4 processors acting in parallel.
 *
 *	At present the input array length must be a power of 2.
 *
 *  At present the input data in each (in-phase, quadrature) pair must be C floats.
 *
 *	The processing is based on arrays of complex numbers corresponding to
 *	the (in-phase, quadrature) data.
 *
 *	If the input data is in-phase only, the input array will have 0 in each entry's
 *	quadrature component. An option is provided to allow this be exploited so as
 *	to get the result roughly twice as fast
 *
 *  Usage:
 *		0.	Create an array holding the in-phase and quadrature data
 *			as complex numbers (fft_cpx).
 *
 *			At present, the length of this array must be a power of 2.
 *			At present, data elements should be of the C float type.
 *
 *		1.	The complex array must be arranged in an order suitable
 *			for 'discrimination in time' (DIT) processing.
 *
 *			This can be done as the entries are made one at a time.
 *			An index array can first be created for use to map
 *			successive input to the correct position in the
 *			array. This can be stored in ROM, and reused to allocate data
 *			as it comes in to the appropriate place in the array.
 *
 *			The function fftArrangeIndex() below is provided for
 *			this purpose.
 *
 *			If the input data is not rearranged as it arrives,
 *			a parameter can be set to cause rearrangement to be done
 *			in parallel in the first phase of FFT processing on each CPU.
 *			In that case the FFT result does not overwrite the input data,
 *			but can be found in the FFT_RAIN area defined below.
 *
 *			Alternatively, if the array already exists (as an array of complex numbers),
 *			it can be rearranged in situ using fftArrangeIn(),
 *			or as a new complex array using fftArrangOut().
 *
 *			The data in the rearranged array is overwritten by the FFT
 *			results, so if it is desired to keep the original data
 *			either the parallel rearrangement parameter should be set
 *			or the fftArrangeOut() function should be used.
 *
 *			To avoid the overhead of rearrangement it is best to rearrange
 *			the data as it arrives, if possible.
 *
 *		3.	The 'twiddles' array, containing complex roots of unity,
 *			must be created. This is half the length of the input
 *			array of complex numbers, as due to symmetry only half
 *			of the roots need to be created and stored.
 *			The function fillTwiddles() is provided for this purpose.
 *
 *		N.B.	If the size of the data array does not change,
 *				the twiddles array can be stored in ROM.
 *				Two versions are needed in that case, one for the forward FFT,
 *				one for the inverse FFT. Both can be generated using fillTwiddles().
 *
 *		4.	The appropriately arranged complex input array is passed to fftMulti(),
 *			together with the twiddles array and which processors to use.
 *			The result overwrites the original data in the rearranged array.
 *
 *		5.	Other than input rearrangement, no copying is involved in passing data
 *			to the different processors, each is given a reference to a contiguous
 *			portion of the input array.
 *			A reference to the twiddles array is also passed, though the entries
 *			used in this by a processor are not contiguous.
 *
 *		6.	All stages of processing up to the last two (4 CPUs) or last one (2 CPUs)
 *			are done independently on the CPUs, with no interaction between them.
 *			The last stage or stages require results generated by a different CPU,
 *			so must wait for the previous stages to complete before progressing.
 *
 *		7.	If the 'real only', i.e in-phase only, option is selected any 'imaginary',
 *			i.e quadrature, component in the input is ignored and the input array compressed
 *			into half its length with the entries in the remaining half set to (0.0,0.0).
 *			The processing is done roughly twice as fast.
 *			Only the positive frequency components are produced,
 *			the negative frequency components are easily derived if needed by taking
 *			the complex conjugates of the positive frequency components,
 *			i.e. changing the sign of the imaginary component and leave the rest the same.
 */

/*
 * FUNCTION DECLARATIONS
 */
/*
 * This ASM-function returns the CPU index (0 to 3) of the current S698PM CPU.
 */
 static inline unsigned int get_asr17(void);


/*
 * Starting CPUs
 *
 * All CPUs have the same entry point.
 * The linker defaults to 0x40000000.
 * DMON's ep command is used to set each CPU's initial PC to this.
 *
 * This function is entered immediately on entering main().
 * It determines the index (0 to 3) of the CPU running the code.
 * If this is the START_CPU_INDEX it initializes the data areas used by all CPUs
 * and returns to main().
 * Otherwise, it checks to see if any work is waiting, does it,
 * and halts the CPU. When work becomes available, the CPU is restarted,
 * does the work, and halts again. This function never returns if running on
 * these CPUs.
 *
 */
int fftCpusStart(void);


/*
 * Preliminary start of the selected CPUs if other than the one running this.
 * Each such CPU starts, runs through its initialization code, and powers down
 * Each CPU shares the same initialization code, so there is a pause between
 * each startup just in case ...
 *
 * A CPU is restarted when there is work for it to do
 * (restarts are done directly, not by this).
 *
 *  Inputs:
 *  	const int		which cpus		from 0 to 0xf, the current CPU, if selected, is ignored
 *
 *  Returns:							None
 *
 * 						N.B. This routine should not be used to restart CPUs
 */
void startCpus(const int whichcpus);


/*
 * INPUT ARRAY REARRANGEMENT
 *
 * This is used with a discrimination in time (DIT) implementation of the FFT.
 * Separate parts of the rearranged array can be processed independently
 * on different CPUs without any interaction between the CPUs being required.
 * The last two stages (4 CPUS) or just the last stage (2 CPUS)can only be done when
 * the previous stages are finished, and are again distributed across all CPUs.
 *
 * The best approach to input rearrangement is to do it as the data arrives.
 *
 * Alternatively, by setting a parameter rearrangement can be done in parallel
 * one each CPU.
 *
 * Three very similar functions, not parallel, are given to allow a complete input
 * array be rearranged:
 * 	fftArrangeOut		an out of place rearrange, the input array is unchanged
 * 	fftArrangeIn		an in place rearrange, the input array is rearranged in situ
 * 	fftaArrangeIndex	creates an array of indexes. Entry [i] gives the index where
 * 						the (in-phase, quadrature) complex pair should be stored when
 * 						data is read one pair at a time
 *						- the input array will then not need further rearrangement
 *
 * All three functions assume the array length is a power of 2, and this power is what
 * is input to give the array length.
 *
 * (Thanks for bit-swap method used to graphics.stanford.edu/~seander/bithacks.html#ReverseParallel)
 *
 */

/*
 *	Out of place rearrangement of input array to prepare for FFT.
 *	Assumes array length is a power of 2.
 *
 *	Inputs:
 *		fft_cpx* 		cin		pointer to an array of complex numbers
 *								(the length of this array must be a power of 2)
 *		int				lenP2	log to base 2 of array length. 0 <= lenP2 <= 30.
 *
 *		fft_cpx* 		result	pointer to resulting array of complex numbers
 *								(same length as cin array)
 *
 *	Result:				A rearranged version of cin input array in the result array
 *
 *	N.B.				Space for the result array of complex numbers must be allocated
 *						before calling this function, same length as the cin input array.
 *
 */
void fftArrangeOut(fft_cpx* cin, fft_cpx* result, int lenP2);


 /*
  *	In place rearrangement of input array to prepare for FFT.
  *	Assumes array length is a power of 2.
  *
  *	Inputs:
  *		fft_cpx* 		cin		pointer to an array of complex numbers
  *								(the length of this array must be a power of 2)
  *		int				lenP2	log to base 2 of array length.
  *
  *	Result:				A rearranged version of cin input array in the same space
  *
  */
 void fftArrangeIn(fft_cpx* x, int lenP2);


 /*
  *	Generate an array of indices giving the array position where each successive input
  *	should be placed in the input array.
  *	This array could be stored in ROM and used when inputting data so as to avoid
  *	any subsequent need to spend time rearranging the input array.
  *
  *	Assumes amount of input datais a power of 2.
  *
  *	Inputs:
  *		int * 			mapindex	pointer to an array of ints that will hold the index to be used
  *									in the complex input array for each successive data input.
  *									(the length of this array must be a power of 2
  *		int				lenP2		log to base 2 of array length. 0 <= lenP2 <= 30
  *									(regarding the array as an array of complex numbers)
  *
  *	Result:				mapindex will hold appropriate index in data array to use to hold complex input i
  *
  *	N.B. if the input data is an array of scalars, the length of mapindex[] will be half of the length
  *		 of the array to hold the scalars, and the input index k used to look up the mapindex array should only
  *	     increment after each pair of reals have been added to dataarray[mapindex[k]<< 1] and dataarray[(mapindex[k]<< 1) +1]
  *	     (they will be treated as the real and imaginary parts of the complex number at the index given by mapindex[k])
  *
  *	N.B. Space for the mapindex array of ints must be allocated before calling this function,
  *		 same length as complex data input array, half length of real data input arrays.
  *
  *	N.B. The mapping array is its own inverse
  *
  */
 void fftArrangeIndex(int* mapindex, int lenP2);


 /*
   *	If the 'realonly' option is selected, the input data is treated as having in-phase components only
   *	and any quadrature component is ignored and will be overwritten in processing.
   *	The 'real', i.e. in-phase input components are rearranged into half the number of inputs
   *	with every second in-phase input now in the quadrature position, and zero in remaining half of the input array.
   *	This allows faster FFT processing but requires post-processing.
   *	(Post processing is distributed across all processors and done later in a final processing phase.
   *	 It produces the positive frequency components only, the negative components can be obtained by taking
   *	 the complex conjugates of these.)
   *
   *	Assumes amount of input data is a power of 2.
   *
   *	Inputs:
   *		fft_cpx * 		cin			input array with all 0s in quadrature (imaginary) positions.
   *									Altered so that quadrature positions hold alternate real inputs,
   *									and remaining array entries are made (0.0,0.0)
   *		int				lenP2		log to base 2 of array length. 0 <= lenP2 <= 30.
   *									(regarding the array as an array of complex numbers)
   *
   *	Result:				first half of cin holds rearranged data, rest is (0.0,0.0)
   *
   *						N.B. 		use of this function can usually be avoided by simply casting an input
   *									array of scalars to be an array of complex numbers.
   */
 void fftRealOnlyArrange(fft_cpx * cin, const int lenP2);


 /*
  * TWIDDLES ARRAY
  */

 /*
  * For an input data array of length N the 'twiddle' factors are the N complex roots
  * of unity. Only N/2 of these are calculated, as the others are just minus these,
  * and are provided by using subtraction rather than addition.
  *
  * Getting the complex roots of unity is time consuming, so where the size of
  * array to be processed is known in advance the twiddles array should be stored
  * in ROM. Two arrays should be stored, one for the forward FFT, one for the inverse.
  * Each has length N/2, half that of the complex data array.
  *
  * Time consuming calculations of sines and cosines are involved, but in some cases
  * can be avoided by exploiting various symmetries. These require the complex data array
  * size to be at least 8, but it is restricted here to at least 1 << FFT_MIN_LENP2, or 32 at present.
  *
  *
  * Inputs:
  *		fft_cpx* 		twiddles	pointer to an array of complex numbers
  *									(the length of this array will be half the length
  *									 of the complex data array with which the twiddles
  *									 array will be used)
  *		const int		lenP2		log to base 2 of complex data array length.
  *									(from FFT_MIN_LENP2	to FFT_MAX_LENP2, i.e. 5 to	17)
  *
  *		const int		type 		0 for FFT, 1 for inverse FFT
  *
  * Result:				N/2 of the N complex roots of unity in twiddles array
  *
  * N.B.				Space for the twiddles array of N/2 complex numbers must be allocated
  *						before calling this function, where N is the length of the complex data
  *						array. i.e. 1 << lenP2.
  *
  */
 void fillTwiddles(fft_cpx* twiddles, const int lenP2, const int type);


 /*
  * FFT PROCESSING
  *
  * The (in-phase, quadrature) input data is regarded as an array of complex numbers.
  * The resulting FFT is also an array of complex numbers.
  *
  * If the data is not already in rearranged order, by setting the 'arrange' parameter
  * the input array can be rearranged in parallel, with each CPU doing its own section.
  * This involves copying the original input data into the area at FFT_RAIN,
  * where it will eventually be overwritten by the FFT result, with the original
  * input data left unchanged.
  *
  */

 /*
  * This function calculates all or part of the FFT.
  * It can be used on separate processors or in separate threads.
  *
  * It processes part or all of a complex data array, replacing parts of the
  * array with appropriate FFT data. After any initial rearrangement
  * all processing is done in situ .
  *
  * The length of a sub-array must be a power of 2 and no less than 8.
  * The sub-array can be the whole array.
  *
  * The size of the butterfly blocks with which to start processing a sub-array is
  * input as a power of 2.
  *
  * Where the blocks in the subarray have not already been processed,
  * the starting power of 2 is 1.
  *
  * If parts of a subarray have already been processed, the correct butterfly
  * step must be given. All parts of the subarray must have the same size.
  *
  * The post-processing phrase needed when the original data is in-phase only
  * is done if 'secondpass' is set.
  *
  * Rearrangement of the data for this section is done if the 'arrange' is set.
  *
  *
  * Inputs:
  *		fft_cpx* 		arranged	pointer to the array of complex numbers that
  *									initially contains the rearranged complex data input.
  *									Will hold result.
  *		const int		lenP2		Power of 2 that gives length of input array.
  *
  *		const int		start		position in array at which sub-array starts
  *		const int		nextstart	position one after end of data to process
  *									N.B. (nextstart - start) must be a power of 2
  *
  *		const int		oddadd		Added internally to half of sub-array size.
  *									Used to get distance from even to odd entries.
  *
  *		const int		startP2		power of 2 that gives butterfly at which to start.
  *									Must be at least 1.
  *									1 if no previous processing done on this sub-array.
  *									Otherwise gives size of butterfly to start with.
  *
  *		const fft_cpx*	twiddles	pointer to twiddles array - this must correspond
  *									to the original complete complex input array.
  *		const int		toffset		initial offset into twiddles array
  *		const int		twinc		initial increment to using going through twiddles array
  *
  *		const int		normalise	0 for don't normalize, anything else -> normalize
  *
  *		const int		arrange		0 for don't arrange, anything else -> arrange
  *
  *		const int		realonly	0 if original data complex, 1 if in-phase only
  *									and has been rearranged
  *
  *		const int		secondpass	0 if not ready to do post-processing,
  *									1 to do post-processing only
  *
  * Result:				FFT processed values in situ in this subarray
  *
  *	N.B.				If 'realonly' is selected, assumes that the input array is a compressed
  *						version of something twice as long, and that this additional space
  *						is available.
  *
  *	N.B.				If 'realonly' is selected, FFT result contains the positive frequencies
  *						only. Negative frequencies are easily derived if needed as the complex
  *						conjugates of these.
  *
  */
 void fftProc(fft_cpx* arranged, const int lenP2,
 			 const int start, const int nextstart,
 			 const int oddadd, const int startP2,
     		 const fft_cpx* twiddles,
     		 const int toffset, const int twinc,
     		 const int normalise, const int arrange,
     		 const int realonly, const int secondpass);


 /*
  * Allocates work of doing FFT to one or more CPUs
  * and combines results from them to give FFT.
  *
  * The input is an array of complex numbers representing the
  * (in-phase,quadrature) data.
  * Its length must be a power of 2 - this power rather than the
  * length is input.
  *
  * The complex number array giving the FFT is returned in situ
  * in the input array, unless the 'arrange parameter is set,
  * in which case the result is in the area at FFT_RAIN.
  *
  * At present the number of CPUs to use must be 1, 2 or 4.
  *
  * Inputs:
  *		fft_cpx* 		arranged	pointer to an array of complex numbers that
  *									holds the rearranged complex data input.
  *									Will hold result.
  *		const int		lenP2		power of 2 that gives size of array
  *
  *		const fft_cpx*	twiddles	pointer to twiddles array
  *									- this must correspond to input array
  *									  (it will be half its size)
  *
  *		const int		whichcpus	1 to 0xf, which CPUs to use (binary xxxx, x=1 for use)
  *
  *		const int		normalise	0 for don't normalize, anything else -> normalize
  *
  *		const int		arrange		0 for don't arrange, anything else -> arrange
  *
  *		const int		realonly	0 if original data has both components, 1 if in-phase only
  *									and has been rearranged
  *
  * Result:				Complex valued FFT in situ in arranged
  *
  * N.B.				Assumes START_CPU_INDEX is 0
  *
  */
 void fftMulti(fft_cpx* cin, const int lenP2,
 			  const fft_cpx* twiddles,
     		  const int whichcpus,
     		  const int normalise,
     		  const int arrange,
     		  const int realonly);


 /*
  * Gives one part of the FFT calculation to the selected CPU for execution.
  *
  * The input is part of an array of complex numbers.
  * Its length must be a power of 2 - this power rather than the
  * length is input.
  *
  * The output is placed in the appropriate locations in the input array,
  * unless the 'arrange' parameter is set, in which case the output is
  * in the area at FFT_RAIN.
  *
  * At present the identifier of the CPU to use must be 1, 2, 4 or 8, anything
  * else the function returns without doing anything.
  *
  * Inputs:
  *		const int		which		which CPU to use - must be 1, 2, 4 or 8
  *
  *		fft_cpx* 		arranged	pointer to an array of complex numbers that
  *									holds the rearranged complex data input.
  *									Will hold result.
  *		const int		lenP2		power of 2 that gives size of array
  *
  *		const int		begin		index into arranged, location where this processing is to start
  *
  *		const int		nextbegin	index into arranged, begin <= index < nextbegin
  *
  *		const int		oddadd		Added internally to half of sub-array size.
  *									Used to get distance from even to odd entries.
  * 	const int		startP2		1 << (lenP2 - startP2) gives initial step size in twiddles array
  *
  *		const fft_cpx*	twiddles	pointer to twiddles array
  *									- this must correspond to input array
  *									  (it will be half its size)
  *
  *  	const int		toffset		initial offset into twiddles array
  *		const int		twinc		initial increment to using going through twiddles array
  *
  *		const int		normalise	0 for don't normalize, anything else -> normalize
  *
  *		const int		arrange		0 for don't arrange, anything else -> arrange
  *
  * 	const int		realonly	0 if original data complex, 1 if in-phase only
  *									and has been rearranged
  *
  * Result:				Updated complex values for this stage of FFT processing in arranged
  *
  */
 void giveToCpu(const int which,							// 1,2,4 or 8
 			   fft_cpx* cin, const int lenP2,				// input array
 			   const int begin, const int nextbegin,		// part indices
 			   const int oddadd, const int startP2,
 			   const fft_cpx* twiddles,
 			   const int toffset, const int twinc,
 			   const int normalise, const int arrange,
 			   const int realonly, const int secondpass);


 /*
  * Wait for selected CPUs to have finished.
  * Returns when no CPU has work pending.
  * Can wait indefinitely. TODO put in a time check
  *
  * All stages of the FFT except at most the last two (for the S698PM) proceed independently
  * with no need for communication between CPUs.
  *
  * If using 4 CPUs, each of the last two stages must wait for all CPUs to have finished
  * the preceding stage.
  *
  * If using 2 CPUs, they must wait to process the last stage until both CPUs have finished
  * the earlier stages.
  *
  * Inputs:
  *		const int		whichcpus	which CPUs to check - xxxx, 1 to check, 0 not to check
  *
  *
  * Result:				Wait, only return when no selected CPU has work pending
  */
 void fftWait();


/*
 * UTILITIES
 */

 /*
  * Get the number of CPUs enabled by the selections in whichcpus.
  *
  * 	Input:
  * 		const int 	whichcpus	0 to 0xf  (binary xxxx, 1 = use CPU
  *
  * 	Output:
  * 		int			Count of currently selected CPUs ( 0 to 4)
  *
  */
 int getNumCpus(const int whichcpus);


 /*
  * Validate the input settings and correct if possible
  *
  *	Inputs:
  *		const int		lenP2			must be from FFT_MIN_LENP2	to FFT_MAX_LENP2, (i.e. 5 to 17)
  *
  *		int *			cpusrequest		should be from 0 to 0xf, corresponding to 1, 2 or 4 cpus (but not 3)
  *										(correction attempt made if three CPUs chosen)
  *
  *		const int		numruns			no validation
  *
  *		const int		type			must be 0 or 1
  *
  *		int *			lines			corrected to 1 << lenP2 if greater than this
  *
  *	Returns:
  *		int				0 if all o.k., 1 if a problem (messages given),
  *
  * Inputs:
  *
  */
 int settingsIncorrect(const int lenP2, int *cpusrequest, const int numruns, const int type, int* lines);


 /*
  * Display an array of complex numbers on the console.
  * Can be used to display data input or FFT result.
  *
  * N.B. At present assumes floats being used
  *
  * Inputs:
  * 	fft_cpx* 		in			input array of complex numbers
  *
  *		const int		length		length of the array
  *
  *		const double	tolerance	absolute values less than this are shown as 0.0
  *
  *	Result:				Array is output to the console
  *
  */
 void showC(const fft_cpx* in, const int length, const double tolerance);


 /*
   * Display main parts of FFT result on the console.
   * Shows initial 8 positive frequency components including 0 component,
   * show 8 components before Nyquist frequency, 8 components from Nyquist,
   * and last 8 components.
   * If 'realonly' is selected, shows result up to Nyquist frequency only.
   *
   * N.B. At present assumes floats being used
   *
   * Inputs:
   * 		fft_cpx* 		in			input array of complex numbers
   *
   *		const int		length		length of the array
   *
   *		const double	tolerance	absolute values less than this are shown as 0.0
   *
   *		const int 		realonly	0 to show full FFT, 1 to show non-negative frequencies only.
   *
   *	Result:				Array is output to the console
   *
   */
 void show8C(const fft_cpx* in, const int length, const double tolerance, const int realonly);


#endif // FFT_H_