Performance Guide

This guide covers performance characteristics, benchmarking results, and optimization strategies for the FFT implementations.

Benchmark Results

Algorithm Comparison

Performance measurements on Intel i7-8700K @ 3.7GHz, single-threaded:

Algorithm	N=1024	N=4096	N=16384	N=65536
Radix-2 DIT	0.016 ms	0.081 ms	0.363 ms	1.8 ms
Radix-2 DIF	0.015 ms	0.071 ms	0.346 ms	1.7 ms
Radix-4 (Optimized)	0.015 ms	0.071 ms	0.358 ms	1.6 ms
Split-Radix	0.016 ms	0.079 ms	0.358 ms	1.5 ms
Bluestein	0.124 ms	0.613 ms	2.8 ms	12.0 ms
Mixed-Radix	0.016 ms	0.076 ms	0.4 ms	2.0 ms

Benchmarks on Apple M2 Pro @ 3.5GHz, single-threaded, with v2.0.0 optimizations

Optimization Impact

Implementation	N=16384 Time	Speedup
Naive DFT	2500 ms	1.0x
Basic Radix-2	1.80 ms	1389x
SIMD (AVX2)	0.45 ms	5556x
Parallel (4 cores)	0.50 ms	5000x
SIMD + Parallel	0.15 ms	16667x

Operation Count Analysis

Theoretical Complexity

Algorithm	Complex Multiplications	Complex Additions
DFT	N²	N(N-1)
Radix-2	(N/2)log₂N	N log₂N
Radix-4	(3N/8)log₂N	N log₂N
Split-Radix	(N/3)log₂N - 2N/9 + 4/9	N log₂N

Memory Access Patterns

Radix-2 DIT Memory Access Pattern (N=16):

Stage 1: Stride = 8
[0]←→[8], [1]←→[9], [2]←→[10], [3]←→[11], ...

Stage 2: Stride = 4  
[0]←→[4], [1]←→[5], [8]←→[12], [9]←→[13], ...

Stage 3: Stride = 2
[0]←→[2], [1]←→[3], [4]←→[6], [5]←→[7], ...

Stage 4: Stride = 1
[0]←→[1], [2]←→[3], [4]←→[5], [6]←→[7], ...

v2.0.0 Performance Optimizations

Compiler-Level Optimizations

The library now includes several low-level optimizations:

// Branch prediction hints for better CPU pipeline utilization
if (LIKELY(i < j)) {
    // Common case - swap elements
    complex_t temp = x[i];
    x[i] = x[j];
    x[j] = temp;
}

// Force inlining for critical functions
static FORCE_INLINE complex_t twiddle_factor(int k, int n, fft_direction dir) {
    // Optimized special cases
    if (k == 0) return 1.0;
    if (k * 4 == n) return (dir == FFT_FORWARD) ? -I : I;
    // ... general case
}

Bit Reversal Optimization

Enhanced bit reversal for small sizes using SIMD-friendly operations:

// Optimized for sizes ≤ 256 (8 bits)
static inline unsigned int bit_reverse_optimized(unsigned int x, int log2n) {
    if (log2n <= 8) {
        // Use bit manipulation tricks
        x = ((x & 0xAAAA) >> 1) | ((x & 0x5555) << 1);
        x = ((x & 0xCCCC) >> 2) | ((x & 0x3333) << 2);
        x = ((x & 0xF0F0) >> 4) | ((x & 0x0F0F) << 4);
        if (log2n > 4) x = ((x & 0xFF00) >> 8) | ((x & 0x00FF) << 8);
        return x >> (16 - log2n);
    }
    // Fall back to general case for larger sizes
}

Memory Management Improvements

Enhanced error checking and validation
Proper cleanup functions to prevent memory leaks
Input validation for robustness

Optimization Strategies

1. Algorithm Selection

Choose the right algorithm for your use case:

// Decision tree for algorithm selection
fft_algorithm_t select_fft_algorithm(int size, bool real_time, bool accuracy_critical) {
    if (!is_power_of_two(size)) {
        if (is_prime(size)) {
            return BLUESTEIN_FFT;  // Only option for prime sizes
        } else {
            return MIXED_RADIX_FFT;  // Better for composite sizes
        }
    }

    if (real_time && size <= 1024) {
        return RADIX_4_FFT;  // Good balance
    } else if (size >= 16384) {
        return RADIX_2_DIT_FFT;  // Better cache behavior
    } else {
        return SPLIT_RADIX_FFT;  // Minimum operations
    }
}

2. Memory Optimization

Cache-Friendly Access

// Block processing for large FFTs
void cache_optimized_fft(complex_t* x, int n) {
    const int CACHE_LINE = 64;  // bytes
    const int COMPLEX_SIZE = sizeof(complex_t);
    const int BLOCK_SIZE = CACHE_LINE / COMPLEX_SIZE;

    // Process in cache-friendly blocks
    for (int block = 0; block < n; block += BLOCK_SIZE) {
        // Process block...
    }
}

Memory Reuse

// Reuse twiddle factors
typedef struct {
    int size;
    complex_t* twiddles;
} twiddle_cache_t;

twiddle_cache_t* create_twiddle_cache(int n) {
    twiddle_cache_t* cache = malloc(sizeof(twiddle_cache_t));
    cache->size = n;
    cache->twiddles = malloc(n/2 * sizeof(complex_t));

    // Precompute all twiddle factors
    for (int k = 0; k < n/2; k++) {
        cache->twiddles[k] = cexp(-2.0 * PI * I * k / n);
    }

    return cache;
}

3. SIMD Optimization

Example using AVX2 intrinsics:

#include <immintrin.h>

void butterfly_avx2(double* ar, double* ai, double* br, double* bi,
                   double wr, double wi) {
    __m256d a_real = _mm256_load_pd(ar);
    __m256d a_imag = _mm256_load_pd(ai);
    __m256d b_real = _mm256_load_pd(br);
    __m256d b_imag = _mm256_load_pd(bi);

    __m256d w_real = _mm256_set1_pd(wr);
    __m256d w_imag = _mm256_set1_pd(wi);

    // Complex multiplication: (br + bi*i) * (wr + wi*i)
    __m256d temp_real = _mm256_sub_pd(
        _mm256_mul_pd(b_real, w_real),
        _mm256_mul_pd(b_imag, w_imag)
    );
    __m256d temp_imag = _mm256_add_pd(
        _mm256_mul_pd(b_real, w_imag),
        _mm256_mul_pd(b_imag, w_real)
    );

    // Butterfly operation
    _mm256_store_pd(ar, _mm256_add_pd(a_real, temp_real));
    _mm256_store_pd(ai, _mm256_add_pd(a_imag, temp_imag));
    _mm256_store_pd(br, _mm256_sub_pd(a_real, temp_real));
    _mm256_store_pd(bi, _mm256_sub_pd(a_imag, temp_imag));
}

4. Parallel Processing

OpenMP parallelization example:

void parallel_fft_stage(complex_t* x, int n, int stage) {
    int m = 1 << stage;
    int half_m = m >> 1;
    complex_t w_m = twiddle_factor(1, m, FFT_FORWARD);

    #pragma omp parallel for
    for (int k = 0; k < n; k += m) {
        complex_t w = 1.0;
        for (int j = 0; j < half_m; j++) {
            int t = k + j;
            int u = t + half_m;

            complex_t temp = x[u] * w;
            x[u] = x[t] - temp;
            x[t] = x[t] + temp;

            w *= w_m;
        }
    }
}

Profiling and Analysis

Using perf (Linux)

# Profile FFT execution
perf record ./bin/benchmark_all
perf report

# Measure cache misses
perf stat -e cache-misses,cache-references ./bin/radix2_dit

Using Instruments (macOS)

# Time Profiler
instruments -t "Time Profiler" ./bin/benchmark_all

# Memory analysis
instruments -t "Allocations" ./bin/benchmark_all

Built-in Profiling

// Use the built-in timer
void profile_fft_algorithms() {
    int sizes[] = {256, 1024, 4096, 16384};
    fft_timer_t timer;

    for (int i = 0; i < 4; i++) {
        int n = sizes[i];
        complex_t* data = allocate_complex_array(n);

        // Generate test data
        generate_random_signal(data, n);

        // Profile each algorithm
        timer_start(&timer);
        radix2_dit_fft(data, n, FFT_FORWARD);
        timer_stop(&timer);

        printf("Size %d: %.3f ms\n", n, timer.elapsed_ms);

        free_complex_array(data);
    }
}

Performance Tips

1. General Guidelines

Use power-of-2 sizes whenever possible
Align data to cache boundaries (64-byte alignment)
Minimize memory allocations in hot paths
Precompute twiddle factors for repeated transforms
Use appropriate data types (float vs double)

2. Platform-Specific

x86/x64

Enable AVX2/AVX-512 if available
Use -march=native compiler flag
Consider Intel MKL for production

ARM

Use NEON intrinsics
Optimize for specific ARM cores
Consider ARM Performance Libraries

Embedded Systems

Use fixed-point arithmetic
Minimize memory usage
Consider lookup tables for twiddle factors

3. Real-Time Considerations

For real-time applications:

// Pre-allocate all memory
typedef struct {
    complex_t* workspace;
    complex_t* twiddles;
    int size;
} realtime_fft_t;

realtime_fft_t* init_realtime_fft(int size) {
    realtime_fft_t* rt = malloc(sizeof(realtime_fft_t));
    rt->size = size;
    rt->workspace = allocate_complex_array(size);
    rt->twiddles = precompute_twiddles(size);
    return rt;
}

// Lock memory pages (Linux)
#include <sys/mman.h>
void lock_memory(void* addr, size_t size) {
    mlockall(MCL_CURRENT | MCL_FUTURE);
    mlock(addr, size);
}

Benchmarking Your Code

Simple Benchmark Template

void benchmark_my_application() {
    const int NUM_RUNS = 1000;
    const int WARMUP_RUNS = 100;

    // Warmup
    for (int i = 0; i < WARMUP_RUNS; i++) {
        run_my_fft();
    }

    // Actual benchmark
    fft_timer_t timer;
    timer_start(&timer);

    for (int i = 0; i < NUM_RUNS; i++) {
        run_my_fft();
    }

    timer_stop(&timer);

    double avg_time = timer.elapsed_ms / NUM_RUNS;
    printf("Average time: %.3f ms\n", avg_time);
    printf("Throughput: %.2f transforms/second\n", 1000.0 / avg_time);
}

Conclusion

Performance optimization is a balance between: - Algorithm complexity - Memory access patterns
- Hardware capabilities - Implementation complexity

Start with the right algorithm, then optimize based on profiling results. Remember that premature optimization is the root of all evil - profile first, optimize second!