From aebe6bcd230f96c0aaf0e5dec978834ca4a41e99 Mon Sep 17 00:00:00 2001 From: Themaister Date: Tue, 17 May 2011 02:45:10 +0200 Subject: [PATCH] Add example SSE3-optimized IIR filter. --- audio/ext/ssnes_iir_filter.c | 240 +++++++++++++++++++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 audio/ext/ssnes_iir_filter.c diff --git a/audio/ext/ssnes_iir_filter.c b/audio/ext/ssnes_iir_filter.c new file mode 100644 index 0000000000..e6c9c7d8b9 --- /dev/null +++ b/audio/ext/ssnes_iir_filter.c @@ -0,0 +1,240 @@ +#include "ssnes_dsp.h" +#include +#include +#include + +// Simple IIR filter implementation. + +//#define USE_SSE3 + +#ifdef USE_SSE3 +#include +#endif + +// Make a test build with standalone main() +//#define TEST + +#ifdef TEST +#include +#include +#endif + +#define min(x, y) ((x) < (y) ? (x) : (y)) + +// Taps needs to be power-of-two. +#define TAPS (128) + +// FIR coefficients, up to TAPS taps. +// Basic passthrough. +static const float fir_coeff[] = { + 1.0 +}; + +// IIR coefficients, up to TAPS taps. +static const float iir_coeff[] = { +}; + +#ifdef USE_SSE3 +typedef union +{ + __m128 vec[TAPS / 4 + 1]; // Wraparound for unaligned reads. + float f[TAPS + 4]; +} vec_filt; +#else +typedef float vec_filt[TAPS + 4]; +#endif + +typedef struct +{ + vec_filt buffer[2]; + vec_filt iir_buffer[2]; + vec_filt fir_coeff; + vec_filt iir_coeff; + + float out_buffer[4096]; + unsigned buf_ptr; +} dsp_state; + +static void* dsp_init(const ssnes_dsp_info_t *info) +{ + (void)info; + dsp_state *state = calloc(1, sizeof(*state)); + if (!state) + return NULL; + + memcpy(&state->fir_coeff, fir_coeff, min(sizeof(fir_coeff), TAPS * sizeof(float))); + memcpy(&state->iir_coeff, iir_coeff, min(sizeof(iir_coeff), TAPS * sizeof(float))); +#ifdef USE_SSE3 + memcpy(state->fir_coeff.f + TAPS, fir_coeff, min(sizeof(fir_coeff), 4 * sizeof(float))); + memcpy(state->iir_coeff.f + TAPS, iir_coeff, min(sizeof(iir_coeff), 4 * sizeof(float))); +#endif + + return state; +} + +#ifdef USE_SSE3 +static void calculate_iir(float *out, dsp_state *dsp) +{ + unsigned buf_ptr = (dsp->buf_ptr - 1) & (TAPS - 1); + unsigned iir_ptr = (TAPS - dsp->buf_ptr) & (TAPS - 1); + + const float * restrict samples_left = dsp->buffer[0].f; + const float * restrict samples_right = dsp->buffer[1].f; + const float * restrict iir_left = dsp->iir_buffer[0].f; + const float * restrict iir_right = dsp->iir_buffer[1].f; + + __m128 sum_left = _mm_setzero_ps(); + __m128 sum_right = _mm_setzero_ps(); + + for (unsigned i = 0; i < TAPS; i += 4) + { + __m128 left = _mm_load_ps(samples_left + i); + __m128 right = _mm_load_ps(samples_right + i); + const __m128 ileft = _mm_load_ps(iir_left + i); + const __m128 iright = _mm_load_ps(iir_right + i); + + // Need unaligned reads here. + const __m128 fir = _mm_loadu_ps(dsp->fir_coeff.f + iir_ptr); + const __m128 iir = _mm_loadu_ps(dsp->iir_coeff.f + iir_ptr); + + const __m128 fir_res_left = _mm_mul_ps(fir, left); + const __m128 fir_res_right = _mm_mul_ps(fir, right); + const __m128 iir_res_left = _mm_mul_ps(iir, ileft); + const __m128 iir_res_right = _mm_mul_ps(iir, iright); + + left = _mm_add_ps(fir_res_left, iir_res_left); + right = _mm_add_ps(fir_res_right, iir_res_right); + + sum_left = _mm_add_ps(sum_left, left); + sum_right = _mm_add_ps(sum_right, right); + + iir_ptr = (iir_ptr + 4) & (TAPS - 1); + } + + __m128 res = _mm_hadd_ps(sum_left, sum_right); + res = _mm_hadd_ps(res, res); + + union + { + __m128 vec; + float f[4]; + } u; + u.vec = res; + + out[0] = u.f[0]; + out[1] = u.f[1]; + dsp->iir_buffer[0].f[buf_ptr] = u.f[0]; + dsp->iir_buffer[1].f[buf_ptr] = u.f[1]; +} +#else +static void calculate_iir(float *out, dsp_state *dsp) +{ + const float * restrict samples_left = dsp->buffer[0]; + const float * restrict samples_right = dsp->buffer[1]; + const float * restrict iir_left = dsp->iir_buffer[0]; + const float * restrict iir_right = dsp->iir_buffer[1]; + const float * restrict fir = dsp->fir_coeff; + const float * restrict iir = dsp->iir_coeff; + + unsigned iir_ptr = 0; + unsigned sample_ptr = dsp->buf_ptr; + + float sum[2] = { 0.0f, 0.0f }; + for (unsigned i = 0; i < TAPS; i++) + { + sum[0] += fir[iir_ptr] * samples_left[sample_ptr] + iir[iir_ptr] * iir_left[sample_ptr]; + sum[1] += fir[iir_ptr] * samples_right[sample_ptr] + iir[iir_ptr] * iir_right[sample_ptr]; + iir_ptr++; + sample_ptr = (sample_ptr + 1) & (TAPS - 1); + } + + // Stick our output value in the IIR buffer. + for (unsigned i = 0; i < 2; i++) + { + out[i] = sum[i]; + dsp->iir_buffer[i][(dsp->buf_ptr - 1) & (TAPS - 1)] = sum[i]; + } +} +#endif + +static void dsp_process(void *data, ssnes_dsp_output_t *output, const ssnes_dsp_input_t *input) +{ + dsp_state *dsp = data; + output->should_resample = SSNES_TRUE; + output->frames = input->frames; + output->samples = dsp->out_buffer; + + for (unsigned i = 0; i < input->frames; i++) + { +#ifdef USE_SSE3 + dsp->buffer[0].f[dsp->buf_ptr] = input->samples[(i << 1) + 0]; + dsp->buffer[1].f[dsp->buf_ptr] = input->samples[(i << 1) + 1]; +#else + dsp->buffer[0][dsp->buf_ptr] = input->samples[(i << 1) + 0]; + dsp->buffer[1][dsp->buf_ptr] = input->samples[(i << 1) + 1]; +#endif + + calculate_iir(&dsp->out_buffer[i << 1], dsp); + dsp->buf_ptr = (dsp->buf_ptr - 1) & (TAPS - 1); + } +} + +static void dsp_free(void *data) +{ + free(data); +} + +const ssnes_dsp_plugin_t dsp_plug = { + .init = dsp_init, + .process = dsp_process, + .free = dsp_free, + .api_version = SSNES_API_VERSION +}; + +SSNES_API_EXPORT const ssnes_dsp_plugin_t* + SSNES_API_CALLTYPE ssnes_dsp_plugin_init(void) +{ + return &dsp_plug; +} + +#ifdef TEST +int main(void) +{ + const ssnes_dsp_plugin_t *plug = ssnes_dsp_plugin_init(); + assert(plug); + + ssnes_dsp_info_t info; + void *handle = plug->init(&info); // Info isn't used. + assert(handle); + + int16_t buf[64]; + float fbuf[64]; + + for (;;) + { + size_t rd = fread(buf, sizeof(int16_t), 64, stdin); + for (unsigned i = 0; i < rd; i++) + fbuf[i] = (float)buf[i]; + + ssnes_dsp_input_t input = { + .samples = fbuf, + .frames = rd >> 1 + }; + ssnes_dsp_output_t output; + plug->process(handle, &output, &input); + + for (unsigned i = 0; i < output.frames << 1; i++) + { + int32_t sample = (int32_t)output.samples[i]; + buf[i] = (sample > 0x7fff) ? 0x7fff : ((sample < -0x8000) ? -0x8000 : (int16_t)sample); + } + + fwrite(buf, sizeof(int16_t), output.frames << 1, stdout); + + if (rd < 64) + break; + } + + plug->free(handle); +} +#endif