From 08481e2a6885e738f2df0553534ddb3d51c63394 Mon Sep 17 00:00:00 2001 From: twinaphex Date: Wed, 9 Jun 2021 16:39:43 +0200 Subject: [PATCH] Rollback changes for now until we can get this compiling on MSVC 2005 on the commandline (it builds with the VS solution file but not CLI) --- .../audio/resampler/drivers/sinc_resampler.c | 816 +++++++++--------- 1 file changed, 389 insertions(+), 427 deletions(-) diff --git a/libretro-common/audio/resampler/drivers/sinc_resampler.c b/libretro-common/audio/resampler/drivers/sinc_resampler.c index b7bd8fc87c..2c3937e984 100644 --- a/libretro-common/audio/resampler/drivers/sinc_resampler.c +++ b/libretro-common/audio/resampler/drivers/sinc_resampler.c @@ -75,15 +75,16 @@ typedef struct rarch_sinc_resampler float *phase_table; float *buffer_l; float *buffer_r; + unsigned enable_avx; unsigned phase_bits; unsigned subphase_bits; unsigned subphase_mask; unsigned taps; unsigned ptr; - unsigned num_channels; uint32_t time; float subphase_mod; float kaiser_beta; + enum sinc_window window_type; } rarch_sinc_resampler_t; #if (defined(__ARM_NEON__) && !defined(DONT_WANT_ARM_OPTIMIZATIONS)) || defined(HAVE_NEON) @@ -153,89 +154,6 @@ static void resampler_sinc_process_neon(void *re_, struct resampler_data *data) #endif #if defined(__AVX__) -static void resampler_sinc_process_avx_kaiser(void *re_, struct resampler_data *data) -{ - rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_; - unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits); - - uint32_t ratio = phases / data->ratio; - const float *input = data->data_in; - float *output = data->data_out; - size_t frames = data->input_frames; - size_t out_frames = 0; - - while (frames) - { - while (frames && resamp->time >= phases) - { - /* Push in reverse to make filter more obvious. */ - if (!resamp->ptr) - resamp->ptr = resamp->taps; - resamp->ptr--; - - resamp->buffer_l[resamp->ptr + resamp->taps] = - resamp->buffer_l[resamp->ptr] = *input++; - - resamp->buffer_r[resamp->ptr + resamp->taps] = - resamp->buffer_r[resamp->ptr] = *input++; - - resamp->time -= phases; - frames--; - } - - { - const float *buffer_l = resamp->buffer_l + resamp->ptr; - const float *buffer_r = resamp->buffer_r + resamp->ptr; - unsigned taps = resamp->taps; - while (resamp->time < phases) - { - unsigned i; - unsigned phase = resamp->time >> resamp->subphase_bits; - - float *phase_table = resamp->phase_table + phase * taps * 2; - float *delta_table = phase_table + taps; - __m256 delta = _mm256_set1_ps((float) - (resamp->time & resamp->subphase_mask) * resamp->subphase_mod); - - __m256 sum_l = _mm256_setzero_ps(); - __m256 sum_r = _mm256_setzero_ps(); - - for (i = 0; i < taps; i += 8) - { - __m256 buf_l = _mm256_loadu_ps(buffer_l + i); - __m256 buf_r = _mm256_loadu_ps(buffer_r + i); - __m256 deltas = _mm256_load_ps(delta_table + i); - __m256 sinc = _mm256_add_ps(_mm256_load_ps((const float*)phase_table + i), - _mm256_mul_ps(deltas, delta)); - - sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc)); - sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc)); - } - - /* hadd on AVX is weird, and acts on low-lanes - * and high-lanes separately. */ - __m256 res_l = _mm256_hadd_ps(sum_l, sum_l); - __m256 res_r = _mm256_hadd_ps(sum_r, sum_r); - res_l = _mm256_hadd_ps(res_l, res_l); - res_r = _mm256_hadd_ps(res_r, res_r); - res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l); - res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r); - - /* This is optimized to mov %xmmN, [mem]. - * There doesn't seem to be any _mm256_store_ss intrinsic. */ - _mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0)); - _mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0)); - - output += 2; - out_frames++; - resamp->time += ratio; - } - } - } - - data->output_frames = out_frames; -} - static void resampler_sinc_process_avx(void *re_, struct resampler_data *data) { rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_; @@ -247,66 +165,140 @@ static void resampler_sinc_process_avx(void *re_, struct resampler_data *data) size_t frames = data->input_frames; size_t out_frames = 0; - while (frames) + if (resamp->window_type == SINC_WINDOW_KAISER) { - while (frames && resamp->time >= phases) + while (frames) { - /* Push in reverse to make filter more obvious. */ - if (!resamp->ptr) - resamp->ptr = resamp->taps; - resamp->ptr--; - - resamp->buffer_l[resamp->ptr + resamp->taps] = - resamp->buffer_l[resamp->ptr] = *input++; - - resamp->buffer_r[resamp->ptr + resamp->taps] = - resamp->buffer_r[resamp->ptr] = *input++; - - resamp->time -= phases; - frames--; - } - - { - const float *buffer_l = resamp->buffer_l + resamp->ptr; - const float *buffer_r = resamp->buffer_r + resamp->ptr; - unsigned taps = resamp->taps; - while (resamp->time < phases) + while (frames && resamp->time >= phases) { - unsigned i; - __m256 delta; - unsigned phase = resamp->time >> resamp->subphase_bits; - float *phase_table = resamp->phase_table + phase * taps; + /* Push in reverse to make filter more obvious. */ + if (!resamp->ptr) + resamp->ptr = resamp->taps; + resamp->ptr--; - __m256 sum_l = _mm256_setzero_ps(); - __m256 sum_r = _mm256_setzero_ps(); + resamp->buffer_l[resamp->ptr + resamp->taps] = + resamp->buffer_l[resamp->ptr] = *input++; - for (i = 0; i < taps; i += 8) + resamp->buffer_r[resamp->ptr + resamp->taps] = + resamp->buffer_r[resamp->ptr] = *input++; + + resamp->time -= phases; + frames--; + } + + { + const float *buffer_l = resamp->buffer_l + resamp->ptr; + const float *buffer_r = resamp->buffer_r + resamp->ptr; + unsigned taps = resamp->taps; + while (resamp->time < phases) { - __m256 buf_l = _mm256_loadu_ps(buffer_l + i); - __m256 buf_r = _mm256_loadu_ps(buffer_r + i); - __m256 sinc = _mm256_load_ps((const float*)phase_table + i); + unsigned i; + unsigned phase = resamp->time >> resamp->subphase_bits; - sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc)); - sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc)); + float *phase_table = resamp->phase_table + phase * taps * 2; + float *delta_table = phase_table + taps; + __m256 delta = _mm256_set1_ps((float) + (resamp->time & resamp->subphase_mask) * resamp->subphase_mod); + + __m256 sum_l = _mm256_setzero_ps(); + __m256 sum_r = _mm256_setzero_ps(); + + for (i = 0; i < taps; i += 8) + { + __m256 buf_l = _mm256_loadu_ps(buffer_l + i); + __m256 buf_r = _mm256_loadu_ps(buffer_r + i); + __m256 deltas = _mm256_load_ps(delta_table + i); + __m256 sinc = _mm256_add_ps(_mm256_load_ps((const float*)phase_table + i), + _mm256_mul_ps(deltas, delta)); + + sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc)); + sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc)); + } + + /* hadd on AVX is weird, and acts on low-lanes + * and high-lanes separately. */ + __m256 res_l = _mm256_hadd_ps(sum_l, sum_l); + __m256 res_r = _mm256_hadd_ps(sum_r, sum_r); + res_l = _mm256_hadd_ps(res_l, res_l); + res_r = _mm256_hadd_ps(res_r, res_r); + res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l); + res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r); + + /* This is optimized to mov %xmmN, [mem]. + * There doesn't seem to be any _mm256_store_ss intrinsic. */ + _mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0)); + _mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0)); + + output += 2; + out_frames++; + resamp->time += ratio; } + } + } + } + else + { + while (frames) + { + while (frames && resamp->time >= phases) + { + /* Push in reverse to make filter more obvious. */ + if (!resamp->ptr) + resamp->ptr = resamp->taps; + resamp->ptr--; - /* hadd on AVX is weird, and acts on low-lanes - * and high-lanes separately. */ - __m256 res_l = _mm256_hadd_ps(sum_l, sum_l); - __m256 res_r = _mm256_hadd_ps(sum_r, sum_r); - res_l = _mm256_hadd_ps(res_l, res_l); - res_r = _mm256_hadd_ps(res_r, res_r); - res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l); - res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r); + resamp->buffer_l[resamp->ptr + resamp->taps] = + resamp->buffer_l[resamp->ptr] = *input++; - /* This is optimized to mov %xmmN, [mem]. - * There doesn't seem to be any _mm256_store_ss intrinsic. */ - _mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0)); - _mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0)); + resamp->buffer_r[resamp->ptr + resamp->taps] = + resamp->buffer_r[resamp->ptr] = *input++; - output += 2; - out_frames++; - resamp->time += ratio; + resamp->time -= phases; + frames--; + } + + { + const float *buffer_l = resamp->buffer_l + resamp->ptr; + const float *buffer_r = resamp->buffer_r + resamp->ptr; + unsigned taps = resamp->taps; + while (resamp->time < phases) + { + unsigned i; + __m256 delta; + unsigned phase = resamp->time >> resamp->subphase_bits; + float *phase_table = resamp->phase_table + phase * taps; + + __m256 sum_l = _mm256_setzero_ps(); + __m256 sum_r = _mm256_setzero_ps(); + + for (i = 0; i < taps; i += 8) + { + __m256 buf_l = _mm256_loadu_ps(buffer_l + i); + __m256 buf_r = _mm256_loadu_ps(buffer_r + i); + __m256 sinc = _mm256_load_ps((const float*)phase_table + i); + + sum_l = _mm256_add_ps(sum_l, _mm256_mul_ps(buf_l, sinc)); + sum_r = _mm256_add_ps(sum_r, _mm256_mul_ps(buf_r, sinc)); + } + + /* hadd on AVX is weird, and acts on low-lanes + * and high-lanes separately. */ + __m256 res_l = _mm256_hadd_ps(sum_l, sum_l); + __m256 res_r = _mm256_hadd_ps(sum_r, sum_r); + res_l = _mm256_hadd_ps(res_l, res_l); + res_r = _mm256_hadd_ps(res_r, res_r); + res_l = _mm256_add_ps(_mm256_permute2f128_ps(res_l, res_l, 1), res_l); + res_r = _mm256_add_ps(_mm256_permute2f128_ps(res_r, res_r, 1), res_r); + + /* This is optimized to mov %xmmN, [mem]. + * There doesn't seem to be any _mm256_store_ss intrinsic. */ + _mm_store_ss(output + 0, _mm256_extractf128_ps(res_l, 0)); + _mm_store_ss(output + 1, _mm256_extractf128_ps(res_r, 0)); + + output += 2; + out_frames++; + resamp->time += ratio; + } } } } @@ -316,104 +308,6 @@ static void resampler_sinc_process_avx(void *re_, struct resampler_data *data) #endif #if defined(__SSE__) -static void resampler_sinc_process_sse_kaiser(void *re_, struct resampler_data *data) -{ - rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_; - unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits); - - uint32_t ratio = phases / data->ratio; - const float *input = data->data_in; - float *output = data->data_out; - size_t frames = data->input_frames; - size_t out_frames = 0; - - while (frames) - { - while (frames && resamp->time >= phases) - { - /* Push in reverse to make filter more obvious. */ - if (!resamp->ptr) - resamp->ptr = resamp->taps; - resamp->ptr--; - - resamp->buffer_l[resamp->ptr + resamp->taps] = - resamp->buffer_l[resamp->ptr] = *input++; - - resamp->buffer_r[resamp->ptr + resamp->taps] = - resamp->buffer_r[resamp->ptr] = *input++; - - resamp->time -= phases; - frames--; - } - - { - const float *buffer_l = resamp->buffer_l + resamp->ptr; - const float *buffer_r = resamp->buffer_r + resamp->ptr; - unsigned taps = resamp->taps; - while (resamp->time < phases) - { - unsigned i; -#if 0 - __m128 sum; -#endif - unsigned phase = resamp->time >> resamp->subphase_bits; - float *phase_table = resamp->phase_table + phase * taps * 2; - float *delta_table = phase_table + taps; - __m128 delta = _mm_set1_ps((float) - (resamp->time & resamp->subphase_mask) * resamp->subphase_mod); - - __m128 sum_l = _mm_setzero_ps(); - __m128 sum_r = _mm_setzero_ps(); - - for (i = 0; i < taps; i += 4) - { - __m128 buf_l = _mm_loadu_ps(buffer_l + i); - __m128 buf_r = _mm_loadu_ps(buffer_r + i); - __m128 deltas = _mm_load_ps(delta_table + i); - __m128 _sinc = _mm_add_ps(_mm_load_ps((const float*)phase_table + i), - _mm_mul_ps(deltas, delta)); - sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc)); - sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc)); - } - -#ifdef HAVE_GRIFFIN - /* Them annoying shuffles. - * sum_l = { l3, l2, l1, l0 } - * sum_r = { r3, r2, r1, r0 } - */ - sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, - _MM_SHUFFLE(1, 0, 1, 0)), - _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2))); - /* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 } - * sum = { R1, R0, L1, L0 } - */ - sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum); - /* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 } - * sum = { X, R, X, L } - */ - /* Store L */ - _mm_store_ss(output++, sum); - /* movehl { X, R, X, L } == { X, R, X, R } */ - _mm_store_ss(output++, _mm_movehl_ps(sum, sum)); -#else -#ifdef _MSC_VER - *(output++) = _mm_cvtss_f32(sum_l) + sum_l.m128_f32[1] + sum_l.m128_f32[2] + sum_l.m128_f32[3]; - *(output++) = _mm_cvtss_f32(sum_r) + sum_r.m128_f32[1] + sum_r.m128_f32[2] + sum_r.m128_f32[3]; -#else - *(output++) = _mm_cvtss_f32(sum_l) + sum_l[1] + sum_l[2] + sum_l[3]; - *(output++) = _mm_cvtss_f32(sum_r) + sum_r[1] + sum_r[2] + sum_r[3]; -#endif -#endif - - out_frames++; - resamp->time += ratio; - } - } - } - - data->output_frames = out_frames; -} - static void resampler_sinc_process_sse(void *re_, struct resampler_data *data) { rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_; @@ -425,81 +319,160 @@ static void resampler_sinc_process_sse(void *re_, struct resampler_data *data) size_t frames = data->input_frames; size_t out_frames = 0; - while (frames) + if (resamp->window_type == SINC_WINDOW_KAISER) { - while (frames && resamp->time >= phases) + while (frames) { - /* Push in reverse to make filter more obvious. */ - if (!resamp->ptr) - resamp->ptr = resamp->taps; - resamp->ptr--; - - resamp->buffer_l[resamp->ptr + resamp->taps] = - resamp->buffer_l[resamp->ptr] = *input++; - - resamp->buffer_r[resamp->ptr + resamp->taps] = - resamp->buffer_r[resamp->ptr] = *input++; - - resamp->time -= phases; - frames--; - } - - { - const float *buffer_l = resamp->buffer_l + resamp->ptr; - const float *buffer_r = resamp->buffer_r + resamp->ptr; - unsigned taps = resamp->taps; - while (resamp->time < phases) + while (frames && resamp->time >= phases) { - unsigned i; -#if 0 - __m128 sum; -#endif - unsigned phase = resamp->time >> resamp->subphase_bits; - float *phase_table = resamp->phase_table + phase * taps; + /* Push in reverse to make filter more obvious. */ + if (!resamp->ptr) + resamp->ptr = resamp->taps; + resamp->ptr--; - __m128 sum_l = _mm_setzero_ps(); - __m128 sum_r = _mm_setzero_ps(); + resamp->buffer_l[resamp->ptr + resamp->taps] = + resamp->buffer_l[resamp->ptr] = *input++; - for (i = 0; i < taps; i += 4) + resamp->buffer_r[resamp->ptr + resamp->taps] = + resamp->buffer_r[resamp->ptr] = *input++; + + resamp->time -= phases; + frames--; + } + + { + const float *buffer_l = resamp->buffer_l + resamp->ptr; + const float *buffer_r = resamp->buffer_r + resamp->ptr; + unsigned taps = resamp->taps; + while (resamp->time < phases) { - __m128 buf_l = _mm_loadu_ps(buffer_l + i); - __m128 buf_r = _mm_loadu_ps(buffer_r + i); - __m128 _sinc = _mm_load_ps((const float*)phase_table + i); - sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc)); - sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc)); + unsigned i; + __m128 sum; + unsigned phase = resamp->time >> resamp->subphase_bits; + float *phase_table = resamp->phase_table + phase * taps * 2; + float *delta_table = phase_table + taps; + __m128 delta = _mm_set1_ps((float) + (resamp->time & resamp->subphase_mask) * resamp->subphase_mod); + + __m128 sum_l = _mm_setzero_ps(); + __m128 sum_r = _mm_setzero_ps(); + + for (i = 0; i < taps; i += 4) + { + __m128 buf_l = _mm_loadu_ps(buffer_l + i); + __m128 buf_r = _mm_loadu_ps(buffer_r + i); + __m128 deltas = _mm_load_ps(delta_table + i); + __m128 _sinc = _mm_add_ps(_mm_load_ps((const float*)phase_table + i), + _mm_mul_ps(deltas, delta)); + sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc)); + sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc)); + } + + /* Them annoying shuffles. + * sum_l = { l3, l2, l1, l0 } + * sum_r = { r3, r2, r1, r0 } + */ + + sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, + _MM_SHUFFLE(1, 0, 1, 0)), + _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2))); + + /* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 } + * sum = { R1, R0, L1, L0 } + */ + + sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum); + + /* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 } + * sum = { X, R, X, L } + */ + + /* Store L */ + _mm_store_ss(output + 0, sum); + + /* movehl { X, R, X, L } == { X, R, X, R } */ + _mm_store_ss(output + 1, _mm_movehl_ps(sum, sum)); + + output += 2; + out_frames++; + resamp->time += ratio; } + } + } + } + else + { + while (frames) + { + while (frames && resamp->time >= phases) + { + /* Push in reverse to make filter more obvious. */ + if (!resamp->ptr) + resamp->ptr = resamp->taps; + resamp->ptr--; -#if 0 - /* Them annoying shuffles. - * sum_l = { l3, l2, l1, l0 } - * sum_r = { r3, r2, r1, r0 } - */ - sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, - _MM_SHUFFLE(1, 0, 1, 0)), - _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2))); - /* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 } - * sum = { R1, R0, L1, L0 } - */ - sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum); - /* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 } - * sum = { X, R, X, L } - */ - /* Store L */ - _mm_store_ss(output++, sum); - /* movehl { X, R, X, L } == { X, R, X, R } */ - _mm_store_ss(output++, _mm_movehl_ps(sum, sum)); -#else -#ifdef _MSC_VER - *(output++) = _mm_cvtss_f32(sum_l) + sum_l.m128_f32[1] + sum_l.m128_f32[2] + sum_l.m128_f32[3]; - *(output++) = _mm_cvtss_f32(sum_r) + sum_r.m128_f32[1] + sum_r.m128_f32[2] + sum_r.m128_f32[3]; -#else - *(output++) = _mm_cvtss_f32(sum_l) + sum_l[1] + sum_l[2] + sum_l[3]; - *(output++) = _mm_cvtss_f32(sum_r) + sum_r[1] + sum_r[2] + sum_r[3]; -#endif -#endif + resamp->buffer_l[resamp->ptr + resamp->taps] = + resamp->buffer_l[resamp->ptr] = *input++; - out_frames++; - resamp->time += ratio; + resamp->buffer_r[resamp->ptr + resamp->taps] = + resamp->buffer_r[resamp->ptr] = *input++; + + resamp->time -= phases; + frames--; + } + + { + const float *buffer_l = resamp->buffer_l + resamp->ptr; + const float *buffer_r = resamp->buffer_r + resamp->ptr; + unsigned taps = resamp->taps; + while (resamp->time < phases) + { + unsigned i; + __m128 sum; + unsigned phase = resamp->time >> resamp->subphase_bits; + float *phase_table = resamp->phase_table + phase * taps; + + __m128 sum_l = _mm_setzero_ps(); + __m128 sum_r = _mm_setzero_ps(); + + for (i = 0; i < taps; i += 4) + { + __m128 buf_l = _mm_loadu_ps(buffer_l + i); + __m128 buf_r = _mm_loadu_ps(buffer_r + i); + __m128 _sinc = _mm_load_ps((const float*)phase_table + i); + sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc)); + sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc)); + } + + /* Them annoying shuffles. + * sum_l = { l3, l2, l1, l0 } + * sum_r = { r3, r2, r1, r0 } + */ + + sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, + _MM_SHUFFLE(1, 0, 1, 0)), + _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2))); + + /* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 } + * sum = { R1, R0, L1, L0 } + */ + + sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum); + + /* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 } + * sum = { X, R, X, L } + */ + + /* Store L */ + _mm_store_ss(output + 0, sum); + + /* movehl { X, R, X, L } == { X, R, X, R } */ + _mm_store_ss(output + 1, _mm_movehl_ps(sum, sum)); + + output += 2; + out_frames++; + resamp->time += ratio; + } } } } @@ -508,72 +481,6 @@ static void resampler_sinc_process_sse(void *re_, struct resampler_data *data) } #endif -static void resampler_sinc_process_c_kaiser(void *re_, struct resampler_data *data) -{ - rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_; - unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits); - - uint32_t ratio = phases / data->ratio; - const float *input = data->data_in; - float *output = data->data_out; - size_t frames = data->input_frames; - size_t out_frames = 0; - - while (frames) - { - while (frames && resamp->time >= phases) - { - /* Push in reverse to make filter more obvious. */ - if (!resamp->ptr) - resamp->ptr = resamp->taps; - resamp->ptr--; - - resamp->buffer_l[resamp->ptr + resamp->taps] = - resamp->buffer_l[resamp->ptr] = *input++; - - resamp->buffer_r[resamp->ptr + resamp->taps] = - resamp->buffer_r[resamp->ptr] = *input++; - - resamp->time -= phases; - frames--; - } - - { - const float *buffer_l = resamp->buffer_l + resamp->ptr; - const float *buffer_r = resamp->buffer_r + resamp->ptr; - unsigned taps = resamp->taps; - while (resamp->time < phases) - { - unsigned i; - float sum_l = 0.0f; - float sum_r = 0.0f; - unsigned phase = resamp->time >> resamp->subphase_bits; - float *phase_table = resamp->phase_table + phase * taps * 2; - float *delta_table = phase_table + taps; - float delta = (float) - (resamp->time & resamp->subphase_mask) * resamp->subphase_mod; - - for (i = 0; i < taps; i++) - { - float sinc_val = phase_table[i] + delta_table[i] * delta; - - sum_l += buffer_l[i] * sinc_val; - sum_r += buffer_r[i] * sinc_val; - } - - *output++ = sum_l; - *output++ = sum_r; - - out_frames++; - resamp->time += ratio; - } - } - - } - - data->output_frames = out_frames; -} - static void resampler_sinc_process_c(void *re_, struct resampler_data *data) { rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_; @@ -585,53 +492,112 @@ static void resampler_sinc_process_c(void *re_, struct resampler_data *data) size_t frames = data->input_frames; size_t out_frames = 0; - while (frames) + if (resamp->window_type == SINC_WINDOW_KAISER) { - while (frames && resamp->time >= phases) + while (frames) { - /* Push in reverse to make filter more obvious. */ - if (!resamp->ptr) - resamp->ptr = resamp->taps; - resamp->ptr--; - - resamp->buffer_l[resamp->ptr + resamp->taps] = - resamp->buffer_l[resamp->ptr] = *input++; - - resamp->buffer_r[resamp->ptr + resamp->taps] = - resamp->buffer_r[resamp->ptr] = *input++; - - resamp->time -= phases; - frames--; - } - - { - const float *buffer_l = resamp->buffer_l + resamp->ptr; - const float *buffer_r = resamp->buffer_r + resamp->ptr; - unsigned taps = resamp->taps; - while (resamp->time < phases) + while (frames && resamp->time >= phases) { - unsigned i; - float sum_l = 0.0f; - float sum_r = 0.0f; - unsigned phase = resamp->time >> resamp->subphase_bits; - float *phase_table = resamp->phase_table + phase * taps; + /* Push in reverse to make filter more obvious. */ + if (!resamp->ptr) + resamp->ptr = resamp->taps; + resamp->ptr--; - for (i = 0; i < taps; i++) - { - float sinc_val = phase_table[i]; + resamp->buffer_l[resamp->ptr + resamp->taps] = + resamp->buffer_l[resamp->ptr] = *input++; - sum_l += buffer_l[i] * sinc_val; - sum_r += buffer_r[i] * sinc_val; - } + resamp->buffer_r[resamp->ptr + resamp->taps] = + resamp->buffer_r[resamp->ptr] = *input++; - *output++ = sum_l; - *output++ = sum_r; - - out_frames++; - resamp->time += ratio; + resamp->time -= phases; + frames--; } - } + { + const float *buffer_l = resamp->buffer_l + resamp->ptr; + const float *buffer_r = resamp->buffer_r + resamp->ptr; + unsigned taps = resamp->taps; + while (resamp->time < phases) + { + unsigned i; + float sum_l = 0.0f; + float sum_r = 0.0f; + unsigned phase = resamp->time >> resamp->subphase_bits; + float *phase_table = resamp->phase_table + phase * taps * 2; + float *delta_table = phase_table + taps; + float delta = (float) + (resamp->time & resamp->subphase_mask) * resamp->subphase_mod; + + for (i = 0; i < taps; i++) + { + float sinc_val = phase_table[i] + delta_table[i] * delta; + + sum_l += buffer_l[i] * sinc_val; + sum_r += buffer_r[i] * sinc_val; + } + + output[0] = sum_l; + output[1] = sum_r; + + output += 2; + out_frames++; + resamp->time += ratio; + } + } + + } + } + else + { + while (frames) + { + while (frames && resamp->time >= phases) + { + /* Push in reverse to make filter more obvious. */ + if (!resamp->ptr) + resamp->ptr = resamp->taps; + resamp->ptr--; + + resamp->buffer_l[resamp->ptr + resamp->taps] = + resamp->buffer_l[resamp->ptr] = *input++; + + resamp->buffer_r[resamp->ptr + resamp->taps] = + resamp->buffer_r[resamp->ptr] = *input++; + + resamp->time -= phases; + frames--; + } + + { + const float *buffer_l = resamp->buffer_l + resamp->ptr; + const float *buffer_r = resamp->buffer_r + resamp->ptr; + unsigned taps = resamp->taps; + while (resamp->time < phases) + { + unsigned i; + float sum_l = 0.0f; + float sum_r = 0.0f; + unsigned phase = resamp->time >> resamp->subphase_bits; + float *phase_table = resamp->phase_table + phase * taps; + + for (i = 0; i < taps; i++) + { + float sinc_val = phase_table[i]; + + sum_l += buffer_l[i] * sinc_val; + sum_r += buffer_r[i] * sinc_val; + } + + output[0] = sum_l; + output[1] = sum_r; + + output += 2; + out_frames++; + resamp->time += ratio; + } + } + + } } data->output_frames = out_frames; @@ -769,14 +735,14 @@ static void *resampler_sinc_new(const struct resampler_config *config, size_t phase_elems = 0; size_t elems = 0; unsigned sidelobes = 0; - unsigned enable_avx = 0; - enum sinc_window window_type = SINC_WINDOW_NONE; rarch_sinc_resampler_t *re = (rarch_sinc_resampler_t*) calloc(1, sizeof(*re)); if (!re) return NULL; + re->window_type = SINC_WINDOW_NONE; + switch (quality) { case RESAMPLER_QUALITY_LOWEST: @@ -784,32 +750,34 @@ static void *resampler_sinc_new(const struct resampler_config *config, sidelobes = 2; re->phase_bits = 12; re->subphase_bits = 10; - window_type = SINC_WINDOW_LANCZOS; + re->window_type = SINC_WINDOW_LANCZOS; + re->enable_avx = 0; break; case RESAMPLER_QUALITY_LOWER: cutoff = 0.98; sidelobes = 4; re->phase_bits = 12; re->subphase_bits = 10; - window_type = SINC_WINDOW_LANCZOS; + re->window_type = SINC_WINDOW_LANCZOS; + re->enable_avx = 0; break; case RESAMPLER_QUALITY_HIGHER: cutoff = 0.90; sidelobes = 32; re->phase_bits = 10; re->subphase_bits = 14; + re->window_type = SINC_WINDOW_KAISER; re->kaiser_beta = 10.5; - enable_avx = 1; - window_type = SINC_WINDOW_KAISER; + re->enable_avx = 1; break; case RESAMPLER_QUALITY_HIGHEST: cutoff = 0.962; sidelobes = 128; re->phase_bits = 10; re->subphase_bits = 14; + re->window_type = SINC_WINDOW_KAISER; re->kaiser_beta = 14.5; - enable_avx = 1; - window_type = SINC_WINDOW_KAISER; + re->enable_avx = 1; break; case RESAMPLER_QUALITY_NORMAL: case RESAMPLER_QUALITY_DONTCARE: @@ -817,14 +785,14 @@ static void *resampler_sinc_new(const struct resampler_config *config, sidelobes = 8; re->phase_bits = 8; re->subphase_bits = 16; + re->window_type = SINC_WINDOW_KAISER; re->kaiser_beta = 5.5; - window_type = SINC_WINDOW_KAISER; + re->enable_avx = 0; break; } re->subphase_mask = (1 << re->subphase_bits) - 1; re->subphase_mod = 1.0f / (1 << re->subphase_bits); - re->num_channels = 2; re->taps = sidelobes * 2; /* Downsampling, must lower cutoff, and extend number of @@ -837,7 +805,7 @@ static void *resampler_sinc_new(const struct resampler_config *config, /* Be SIMD-friendly. */ #if defined(__AVX__) - if (enable_avx) + if (re->enable_avx) re->taps = (re->taps + 7) & ~7; else #endif @@ -850,7 +818,7 @@ static void *resampler_sinc_new(const struct resampler_config *config, } phase_elems = ((1 << re->phase_bits) * re->taps); - if (window_type == SINC_WINDOW_KAISER) + if (re->window_type == SINC_WINDOW_KAISER) phase_elems = phase_elems * 2; elems = phase_elems + 4 * re->taps; @@ -864,7 +832,7 @@ static void *resampler_sinc_new(const struct resampler_config *config, re->buffer_l = re->main_buffer + phase_elems; re->buffer_r = re->buffer_l + 2 * re->taps; - switch (window_type) + switch (re->window_type) { case SINC_WINDOW_LANCZOS: sinc_init_table_lanczos(re, cutoff, re->phase_table, @@ -878,30 +846,24 @@ static void *resampler_sinc_new(const struct resampler_config *config, goto error; } - sinc_resampler.process = resampler_sinc_process_c; - if (window_type == SINC_WINDOW_KAISER) - sinc_resampler.process = resampler_sinc_process_c_kaiser; + sinc_resampler.process = resampler_sinc_process_c; - if (mask & RESAMPLER_SIMD_AVX && enable_avx) + if (mask & RESAMPLER_SIMD_AVX && re->enable_avx) { #if defined(__AVX__) - sinc_resampler.process = resampler_sinc_process_avx; - if (window_type == SINC_WINDOW_KAISER) - sinc_resampler.process = resampler_sinc_process_avx_kaiser; + sinc_resampler.process = resampler_sinc_process_avx; #endif } else if (mask & RESAMPLER_SIMD_SSE) { #if defined(__SSE__) - sinc_resampler.process = resampler_sinc_process_sse; - if (window_type == SINC_WINDOW_KAISER) - sinc_resampler.process = resampler_sinc_process_sse_kaiser; + sinc_resampler.process = resampler_sinc_process_sse; #endif } - else if (mask & RESAMPLER_SIMD_NEON && window_type != SINC_WINDOW_KAISER) + else if (mask & RESAMPLER_SIMD_NEON && re->window_type != SINC_WINDOW_KAISER) { #if defined(WANT_NEON) - sinc_resampler.process = resampler_sinc_process_neon; + sinc_resampler.process = resampler_sinc_process_neon; #endif }