From 0496ffc00773ddbe734affe286fef1a284c33a96 Mon Sep 17 00:00:00 2001 From: Themaister Date: Fri, 24 Feb 2012 21:01:29 +0100 Subject: [PATCH] Optimize sinc resampler a bit. --- audio/sinc.c | 155 +++++++++++++++++++++------------------------------ 1 file changed, 65 insertions(+), 90 deletions(-) diff --git a/audio/sinc.c b/audio/sinc.c index fade12bbb6..2e50f1398f 100644 --- a/audio/sinc.c +++ b/audio/sinc.c @@ -19,6 +19,7 @@ // Only suitable as an upsampler, as there is no low-pass filter stage. #include "resampler.h" +#include "../general.h" #include #include #include @@ -29,6 +30,10 @@ #include #endif +#if __SSE3__ +#include +#endif + #define PHASE_BITS 8 #define SUBPHASE_BITS 16 @@ -41,11 +46,14 @@ #define FRAMES_SHIFT (PHASE_BITS + SUBPHASE_BITS) #define SIDELOBES 8 +#define TAPS (SIDELOBES * 2) + +#define PHASE_INDEX 0 +#define DELTA_INDEX 1 struct ssnes_resampler { - float phase_table[PHASES + 1][SIDELOBES]; - float delta_table[PHASES + 1][SIDELOBES]; + float phase_table[PHASES][2][2 * SIDELOBES]; float buffer_l[2 * SIDELOBES]; float buffer_r[2 * SIDELOBES]; @@ -75,19 +83,26 @@ static inline double blackman(double index) static void init_sinc_table(ssnes_resampler_t *resamp) { - for (unsigned i = 0; i <= PHASES; i++) + // Sinc phases: [..., p + 3, p + 2, p + 1, p + 0, p - 1, p - 2, p - 3, p - 4, ...] + for (int i = 0; i < PHASES; i++) { - for (unsigned j = 0; j < SIDELOBES; j++) + for (int j = 0; j < 2 * SIDELOBES; j++) { - double sinc_phase = M_PI * ((double)i / PHASES + (double)j); - resamp->phase_table[i][j] = sinc(sinc_phase) * blackman(sinc_phase / SIDELOBES); + double p = (double)i / PHASES; + double sinc_phase = M_PI * (p + (SIDELOBES - 1 - j)); + resamp->phase_table[i][PHASE_INDEX][j] = sinc(sinc_phase) * blackman(sinc_phase / SIDELOBES); } } // Optimize linear interpolation. - for (unsigned i = 0; i < PHASES; i++) - for (unsigned j = 0; j < SIDELOBES; j++) - resamp->delta_table[i][j] = resamp->phase_table[i + 1][j] - resamp->phase_table[i][j]; + for (int i = 0; i < PHASES - 1; i++) + { + for (int j = 0; j < 2 * SIDELOBES; j++) + { + resamp->phase_table[i][DELTA_INDEX][j] = + (resamp->phase_table[i + 1][PHASE_INDEX][j] - resamp->phase_table[i][PHASE_INDEX][j]) / SUBPHASES; + } + } } ssnes_resampler_t *resampler_new(void) @@ -99,6 +114,15 @@ ssnes_resampler_t *resampler_new(void) memset(re, 0, sizeof(*re)); init_sinc_table(re); + +#if __SSE3__ + SSNES_LOG("Sinc resampler [SSE3]\n"); +#elif __SSE__ + SSNES_LOG("Sinc resampler [SSE]\n"); +#else + SSNES_LOG("Sinc resampler [C]\n"); +#endif + return re; } @@ -113,68 +137,41 @@ static void process_sinc(ssnes_resampler_t *resamp, float *out_buffer) unsigned phase = resamp->time >> PHASES_SHIFT; unsigned delta = (resamp->time >> SUBPHASES_SHIFT) & SUBPHASES_MASK; - __m128 delta_f = _mm_set1_ps((float)delta / SUBPHASES); + __m128 delta_f = _mm_set1_ps(delta); - const float *phase_table = resamp->phase_table[phase]; - const float *delta_table = resamp->delta_table[phase]; + const float *phase_table = resamp->phase_table[phase][PHASE_INDEX]; + const float *delta_table = resamp->phase_table[phase][DELTA_INDEX]; - __m128 sinc_vals[SIDELOBES / 4]; - for (unsigned i = 0; i < SIDELOBES; i += 4) + for (unsigned i = 0; i < TAPS; i += 4) { - __m128 phases = _mm_load_ps(phase_table + i); - __m128 deltas = _mm_load_ps(delta_table + i); - sinc_vals[i / 4] = _mm_add_ps(phases, _mm_mul_ps(deltas, delta_f)); + __m128 buf_l = _mm_load_ps(buffer_l + i); + __m128 buf_r = _mm_load_ps(buffer_r + i); + + __m128 phases = _mm_load_ps(phase_table + i); + __m128 deltas = _mm_load_ps(delta_table + i); + + __m128 sinc = _mm_add_ps(phases, _mm_mul_ps(deltas, delta_f)); + + sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc)); + sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc)); } - // Older data. - for (unsigned i = 0; i < SIDELOBES; i += 4) - { - __m128 buf_l = _mm_loadr_ps(buffer_l + SIDELOBES - 4 - i); - sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc_vals[i / 4])); - - __m128 buf_r = _mm_loadr_ps(buffer_r + SIDELOBES - 4 - i); - sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc_vals[i / 4])); - } - - // Newer data. - unsigned reverse_phase = PHASES_WRAP - resamp->time; - phase = reverse_phase >> PHASES_SHIFT; - delta = (reverse_phase >> SUBPHASES_SHIFT) & SUBPHASES_MASK; - delta_f = _mm_set1_ps((float)delta / SUBPHASES); - - phase_table = resamp->phase_table[phase]; - delta_table = resamp->delta_table[phase]; - - for (unsigned i = 0; i < SIDELOBES; i += 4) - { - __m128 phases = _mm_load_ps(phase_table + i); - __m128 deltas = _mm_load_ps(delta_table + i); - sinc_vals[i / 4] = _mm_add_ps(phases, _mm_mul_ps(deltas, delta_f)); - } - - for (unsigned i = 0; i < SIDELOBES; i += 4) - { - __m128 buf_l = _mm_load_ps(buffer_l + SIDELOBES + i); - sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, sinc_vals[i / 4])); - - __m128 buf_r = _mm_load_ps(buffer_r + SIDELOBES + i); - sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, sinc_vals[i / 4])); - } - - // This can be done faster with _mm_hadd_ps(), but it's SSE3 :( - __m128 sum_shuf_l = _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(1, 0, 1, 0)); - __m128 sum_shuf_r = _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)); - __m128 sum = _mm_add_ps(sum_shuf_l, sum_shuf_r); - +#if __SSE3__ + __m128 res = _mm_hadd_ps(_mm_hadd_ps(sum_l, sum_r), _mm_setzero_ps()); + _mm_storeu_ps(out_buffer, res); // Overwriting, but this is safe. +#else // Meh, compiler should optimize this to something sane. union { float f[4]; __m128 v; - } u; - u.v = sum; + } u[2] = { + [0] = { .v = sum_l }, + [1] = { .v = sum_r }, + }; - out_buffer[0] = u.f[0] + u.f[1]; - out_buffer[1] = u.f[2] + u.f[3]; + out_buffer[0] = u[0].f[0] + u[0].f[1] + u[0].f[2] + u[0].f[3]; + out_buffer[1] = u[1].f[0] + u[1].f[1] + u[1].f[2] + u[1].f[3]; +#endif } #else // Plain ol' C99 static void process_sinc(ssnes_resampler_t *resamp, float *out_buffer) @@ -186,38 +183,16 @@ static void process_sinc(ssnes_resampler_t *resamp, float *out_buffer) unsigned phase = resamp->time >> PHASES_SHIFT; unsigned delta = (resamp->time >> SUBPHASES_SHIFT) & SUBPHASES_MASK; - float delta_f = (float)delta / SUBPHASES; + float delta_f = (float)delta; - const float *phase_table = resamp->phase_table[phase]; - const float *delta_table = resamp->delta_table[phase]; + const float *phase_table = resamp->phase_table[phase][PHASE_INDEX]; + const float *delta_table = resamp->phase_table[phase][DELTA_INDEX]; - float sinc_vals[SIDELOBES]; - for (unsigned i = 0; i < SIDELOBES; i++) - sinc_vals[i] = phase_table[i] + delta_f * delta_table[i]; - - // Older data. - for (unsigned i = 0; i < SIDELOBES; i++) + for (unsigned i = 0; i < TAPS; i++) { - sum_l += buffer_l[SIDELOBES - 1 - i] * sinc_vals[i]; - sum_r += buffer_r[SIDELOBES - 1 - i] * sinc_vals[i]; - } - - // Newer data. - unsigned reverse_phase = PHASES_WRAP - resamp->time; - phase = reverse_phase >> PHASES_SHIFT; - delta = (reverse_phase >> SUBPHASES_SHIFT) & SUBPHASES_MASK; - delta_f = (float)delta / SUBPHASES; - - phase_table = resamp->phase_table[phase]; - delta_table = resamp->delta_table[phase]; - - for (unsigned i = 0; i < SIDELOBES; i++) - sinc_vals[i] = phase_table[i] + delta_f * delta_table[i]; - - for (unsigned i = 0; i < SIDELOBES; i++) - { - sum_l += buffer_l[SIDELOBES + i] * sinc_vals[i]; - sum_r += buffer_r[SIDELOBES + i] * sinc_vals[i]; + float sinc_val = phase_table[i] + delta_f * delta_table[i]; + sum_l += buffer_l[i] * sinc_val; + sum_r += buffer_r[i] * sinc_val; } out_buffer[0] = sum_l;