(sinc) Add non-ASM ARM NEON intrinsics codepath for Sinc resampler

This commit is contained in:
twinaphex 2021-08-21 18:52:20 +02:00
parent b620325714
commit ca26e50050

View File

@ -85,16 +85,81 @@ typedef struct rarch_sinc_resampler
float kaiser_beta;
} rarch_sinc_resampler_t;
#if (defined(__ARM_NEON__) && !defined(DONT_WANT_ARM_ASM_OPTIMIZATIONS)) || defined(HAVE_NEON)
#if TARGET_OS_IPHONE
#else
#ifndef WANT_NEON
#define WANT_NEON
#endif
#endif
#endif
#if (defined(__ARM_NEON__) || defined(HAVE_NEON)
#ifdef DONT_WANT_ARM_ASM_OPTIMIZATIONS
#include <arm_neon.h>
#ifdef WANT_NEON
/* Assumes that taps >= 8, and that taps is a multiple of 8. */
static void resampler_sinc_process_neon_intrin(void *re_, struct resampler_data *data)
{
rarch_sinc_resampler_t *resamp = (rarch_sinc_resampler_t*)re_;
unsigned phases = 1 << (resamp->phase_bits + resamp->subphase_bits);
uint32_t ratio = phases / data->ratio;
const float *input = data->data_in;
float *output = data->data_out;
size_t frames = data->input_frames;
size_t out_frames = 0;
while (frames)
{
while (resamp->time >= phases)
{
/* Push in reverse to make filter more obvious. */
if (!resamp->ptr)
resamp->ptr = resamp->taps;
resamp->ptr--;
resamp->buffer_l[resamp->ptr + resamp->taps] =
resamp->buffer_l[resamp->ptr] = *input++;
resamp->buffer_r[resamp->ptr + resamp->taps] =
resamp->buffer_r[resamp->ptr] = *input++;
resamp->time -= phases;
frames--;
}
{
const float *buffer_l = resamp->buffer_l + resamp->ptr;
const float *buffer_r = resamp->buffer_r + resamp->ptr;
unsigned taps = resamp->taps;
while (resamp->time < phases)
{
int i;
unsigned phase = resamp->time >> resamp->subphase_bits;
const float *phase_table = resamp->phase_table + phase * taps;
float32x4_t p1 = {0, 0, 0, 0}, p2 = {0, 0, 0, 0};
float32x2_t p3, p4;
for (i = 0; i < taps; i += 8)
{
float32x4x2_t coeff8 = vld2q_f32(&phase_table[i]);
float32x4x2_t left8 = vld2q_f32(&buffer_l[i]);
float32x4x2_t right8 = vld2q_f32(&buffer_r[i]);
p1 = vmlaq_f32(p1, left8.val[0], coeff8.val[0]);
p2 = vmlaq_f32(p2, right8.val[0], coeff8.val[0]);
p1 = vmlaq_f32(p1, left8.val[1], coeff8.val[1]);
p2 = vmlaq_f32(p2, right8.val[1], coeff8.val[1]);
}
p3 = vadd_f32(vget_low_f32(p1), vget_high_f32(p1));
p4 = vadd_f32(vget_low_f32(p2), vget_high_f32(p2));
vst1_f32(output, vpadd_f32(p3, p4));
output += 2;
out_frames++;
resamp->time += ratio;
}
}
}
data->output_frames = out_frames;
}
#else
/* Assumes that taps >= 8, and that taps is a multiple of 8. */
void process_sinc_neon_asm(float *out, const float *left,
const float *right, const float *coeff, unsigned taps);
@ -150,6 +215,7 @@ static void resampler_sinc_process_neon(void *re_, struct resampler_data *data)
data->output_frames = out_frames;
}
#endif
#endif
#if defined(__AVX__)
static void resampler_sinc_process_avx_kaiser(void *re_, struct resampler_data *data)
@ -844,7 +910,7 @@ static void *resampler_sinc_new(const struct resampler_config *config,
else
#endif
{
#if defined(WANT_NEON)
#if (defined(__ARM_NEON__) || defined(HAVE_NEON)
re->taps = (re->taps + 7) & ~7;
#else
re->taps = (re->taps + 3) & ~3;
@ -902,8 +968,12 @@ static void *resampler_sinc_new(const struct resampler_config *config,
}
else if (mask & RESAMPLER_SIMD_NEON && window_type != SINC_WINDOW_KAISER)
{
#if defined(WANT_NEON)
#if (defined(__ARM_NEON__) || defined(HAVE_NEON)
#ifdef DONT_WANT_ARM_ASM_OPTIMIZATIONS
sinc_resampler.process = resampler_sinc_process_neon_intrin;
#else
sinc_resampler.process = resampler_sinc_process_neon;
#endif
#endif
}
@ -922,5 +992,3 @@ retro_resampler_t sinc_resampler = {
"sinc",
"sinc"
};
#undef WANT_NEON