Performing an FFT is currently has the biggest impact on performance in my program. I'm using an algorithm I found, which I've already trimmed down quite a bit, but since it's called many times per second, any more performance I can squeeze out of it would be worthwhile.
If anyone can spot where there might be room for improvement, I would be grateful. I'm only interested in performance, not accuracy.
public static void fft(double[] input, double[] realOut, double[] imagOut)
{
int numSamples = input.length;
int i, j;
int numBits = Integer.numberOfTrailingZeros(numSamples);
for(i = 0; i < numSamples; i++)
{
j = reflect(i, numBits);
realOut[j] = input;
imagOut[j] = 0.0f;
}
int BlockEnd = 1;
int k, n;
double[] ar = new double[3];
double[] ai = new double[3];
double angle_numerator = 2.0 * Math.PI;
double tr, ti;
for(int BlockSize = 2; BlockSize <= numSamples; BlockSize <<= 1)
{
double delta_angle = -angle_numerator / (double)BlockSize;
double sm2 = Math.sin(2 * delta_angle);
double sm1 = Math.sin(delta_angle);
double cm2 = Math.cos(2 * delta_angle);
double cm1 = Math.cos(delta_angle);
double w = 2 * cm1;
for(i = 0; i < numSamples; i += BlockSize)
{
ar[2] = cm2;
ar[1] = cm1;
ai[2] = sm2;
ai[1] = sm1;
for(j = i, n = 0; n < BlockEnd; j++, n++)
{
ar[0] = w * ar[1] - ar[2];
ar[2] = ar[1];
ar[1] = ar[0];
ai[0] = w * ai[1] - ai[2];
ai[2] = ai[1];
ai[1] = ai[0];
k = j + BlockEnd;
tr = ar[0] * realOut[k] - ai[0] * imagOut[k];
ti = ar[0] * imagOut[k] + ai[0] * realOut[k];
realOut[k] = realOut[j] - tr;
imagOut[k] = imagOut[j] - ti;
realOut[j] += tr;
imagOut[j] += ti;
}
}
BlockEnd = BlockSize;
}
}