Merge pull request #1035 from aliaspider/master

add an SSE optimized version of the CC resampler
This commit is contained in:
Twinaphex 2014-09-17 20:18:22 +02:00
commit 8f5e3c55ab
4 changed files with 297 additions and 38 deletions

View File

@ -28,13 +28,8 @@
#define RARCH_LOG(...) fprintf(stderr, __VA_ARGS__)
#endif
/* One audio frame made of two signed 16-bit samples. */
typedef struct audio_frame_int16
{
/* Presumably the left-channel sample -- TODO confirm against users of this type. */
int16_t l;
/* Presumably the right-channel sample -- TODO confirm against users of this type. */
int16_t r;
} audio_frame_int16_t;
#ifdef _MIPS_ARCH_ALLEGREX1
#ifdef _MIPS_ARCH_ALLEGREX
static void resampler_CC_process(void *re_, struct resampler_data *data)
{
(void)re_;
@ -121,7 +116,7 @@ static void resampler_CC_process(void *re_, struct resampler_data *data)
outp++;
}
/* The VFPU state is assumed to remain intact
/* The VFPU state is assumed to remain intact
* in-between calls to resampler_CC_process. */
done:
@ -151,6 +146,251 @@ static void *resampler_CC_init(double bandwidth_mod)
RARCH_LOG("\nConvoluted Cosine resampler (VFPU): \n");
return (void*)-1;
}
#elif defined(__SSE__)
/* Uses a fast polynomial approximation, since SSE has no native
 * support for trigonometric functions:
 * cc_int is approximated with P(X) = X - (3/4)*X^3 + (1/4)*X^5.
 */
#include <xmmintrin.h>
#ifndef CC_RESAMPLER_PRECISION
#define CC_RESAMPLER_PRECISION 1
#endif
/* State of the SSE CC resampler.
 * Allocated by resampler_CC_init() and passed back to the process/free
 * callbacks as an opaque void pointer. */
typedef struct rarch_CC_resampler
{
/* Two 4-float registers of state carried across process calls; the
 * workers load them on entry and store them back on exit.
 * Each register holds two 64-bit frames (frames are read and written
 * with _mm_loadl_pi / _mm_storel_pi). The downsampler uses them as
 * accumulators for pending output frames; the upsampler uses them as a
 * window of the most recent input frames. */
__m128 previous;
__m128 current;
/* Running position used to derive the kernel weights and to decide when
 * an output frame is emitted (downsample) or when the next input frame
 * is consumed (upsample). Starts at 0.0 or 2.0, see resampler_CC_init(). */
float distance;
/* Worker selected in resampler_CC_init(): either
 * resampler_CC_downsample or resampler_CC_upsample. */
void (*process)(void *re, struct resampler_data *data);
} rarch_CC_resampler_t;
static void resampler_CC_downsample(void *re_, struct resampler_data *data)
{
float ratio, b;
rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)re_;
audio_frame_float_t *inp = (audio_frame_float_t*)data->data_in;
audio_frame_float_t *inp_max = (audio_frame_float_t*)(inp + data->input_frames);
audio_frame_float_t *outp = (audio_frame_float_t*)data->data_out;
ratio = 1.0 / data->ratio;
b = data->ratio; /* cutoff frequency. */
__m128 vec_previous = _mm_loadu_ps((float*)&re->previous);
__m128 vec_current = _mm_loadu_ps((float*)&re->current);
while (inp != inp_max)
{
__m128 vec_ratio =
_mm_mul_ps(_mm_set_ps1(ratio), _mm_set_ps(3.0, 2.0, 1.0, 0.0));
__m128 vec_w = _mm_sub_ps(_mm_set_ps1(re->distance), vec_ratio);
__m128 vec_w1 = _mm_add_ps(vec_w , _mm_set_ps1(0.5));
__m128 vec_w2 = _mm_sub_ps(vec_w , _mm_set_ps1(0.5));
__m128 vec_b = _mm_set_ps1(b);
vec_w1 = _mm_mul_ps(vec_w1, vec_b);
vec_w2 = _mm_mul_ps(vec_w2, vec_b);
#if (CC_RESAMPLER_PRECISION > 0)
__m128 vec_ww1 = _mm_mul_ps(vec_w1, vec_w1);
__m128 vec_ww2 = _mm_mul_ps(vec_w2, vec_w2);
vec_ww1 = _mm_mul_ps(vec_ww1, _mm_sub_ps(_mm_set_ps1(3.0),vec_ww1));
vec_ww2 = _mm_mul_ps(vec_ww2, _mm_sub_ps(_mm_set_ps1(3.0),vec_ww2));
vec_ww1 = _mm_mul_ps(_mm_set_ps1(1.0/4.0), vec_ww1);
vec_ww2 = _mm_mul_ps(_mm_set_ps1(1.0/4.0), vec_ww2);
vec_w1 = _mm_mul_ps(vec_w1, _mm_sub_ps(_mm_set_ps1(1.0), vec_ww1));
vec_w2 = _mm_mul_ps(vec_w2, _mm_sub_ps(_mm_set_ps1(1.0), vec_ww2));
#endif
vec_w1 = _mm_min_ps(vec_w1, _mm_set_ps1( 0.5));
vec_w2 = _mm_min_ps(vec_w2, _mm_set_ps1( 0.5));
vec_w1 = _mm_max_ps(vec_w1, _mm_set_ps1(-0.5));
vec_w2 = _mm_max_ps(vec_w2, _mm_set_ps1(-0.5));
vec_w = _mm_sub_ps(vec_w1, vec_w2);
__m128 vec_w_previous =
_mm_shuffle_ps(vec_w,vec_w,_MM_SHUFFLE(1, 1, 0, 0));
__m128 vec_w_current =
_mm_shuffle_ps(vec_w,vec_w,_MM_SHUFFLE(3, 3, 2, 2));
__m128 vec_in = _mm_loadl_pi(_mm_setzero_ps(),(__m64*)inp);
vec_in = _mm_shuffle_ps(vec_in,vec_in,_MM_SHUFFLE(1, 0, 1, 0));
vec_previous =
_mm_add_ps(vec_previous, _mm_mul_ps(vec_in, vec_w_previous));
vec_current =
_mm_add_ps(vec_current, _mm_mul_ps(vec_in, vec_w_current));
re->distance++;
inp++;
if (re->distance > (ratio + 0.5))
{
_mm_storel_pi((__m64*)outp, vec_previous);
vec_previous =
_mm_shuffle_ps(vec_previous,vec_current,_MM_SHUFFLE(1, 0, 3, 2));
vec_current =
_mm_shuffle_ps(vec_current,_mm_setzero_ps(),_MM_SHUFFLE(1, 0, 3, 2));
re->distance -= ratio;
outp++;
}
}
_mm_storeu_ps((float*)&re->previous, vec_previous);
_mm_storeu_ps((float*)&re->current, vec_current);
data->output_frames = outp - (audio_frame_float_t*)data->data_out;
}
/* Fallback minimum macro, defined only when no `min` macro exists yet.
 * Each argument may be evaluated twice, so do not pass expressions
 * with side effects. */
#ifndef min
#define min(a, b) ((a) < (b) ? (a) : (b))
#endif
static void resampler_CC_upsample(void *re_, struct resampler_data *data)
{
float b, ratio;
rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)re_;
audio_frame_float_t *inp = (audio_frame_float_t*)data->data_in;
audio_frame_float_t *inp_max = (audio_frame_float_t*)(inp + data->input_frames);
audio_frame_float_t *outp = (audio_frame_float_t*)data->data_out;
b = min(data->ratio, 1.00); /* cutoff frequency. */
ratio = 1.0 / data->ratio;
__m128 vec_previous = _mm_loadu_ps((float*)&re->previous);
__m128 vec_current = _mm_loadu_ps((float*)&re->current);
while (inp != inp_max)
{
__m128 vec_in = _mm_loadl_pi(_mm_setzero_ps(),(__m64*)inp);
vec_previous =
_mm_shuffle_ps(vec_previous,vec_current,_MM_SHUFFLE(1, 0, 3, 2));
vec_current =
_mm_shuffle_ps(vec_current,vec_in,_MM_SHUFFLE(1, 0, 3, 2));
while (re->distance < 1.0)
{
__m128 vec_w =
_mm_add_ps(_mm_set_ps1(re->distance), _mm_set_ps(-2.0, -1.0, 0.0, 1.0));
__m128 vec_w1 = _mm_add_ps(vec_w , _mm_set_ps1(0.5));
__m128 vec_w2 = _mm_sub_ps(vec_w , _mm_set_ps1(0.5));
__m128 vec_b = _mm_set_ps1(b);
vec_w1 = _mm_mul_ps(vec_w1, vec_b);
vec_w2 = _mm_mul_ps(vec_w2, vec_b);
#if (CC_RESAMPLER_PRECISION > 0)
__m128 vec_ww1 = _mm_mul_ps(vec_w1, vec_w1);
__m128 vec_ww2 = _mm_mul_ps(vec_w2, vec_w2);
vec_ww1 = _mm_mul_ps(vec_ww1,_mm_sub_ps(_mm_set_ps1(3.0),vec_ww1));
vec_ww2 = _mm_mul_ps(vec_ww2,_mm_sub_ps(_mm_set_ps1(3.0),vec_ww2));
vec_ww1 = _mm_mul_ps(_mm_set_ps1(1.0 / 4.0), vec_ww1);
vec_ww2 = _mm_mul_ps(_mm_set_ps1(1.0 / 4.0), vec_ww2);
vec_w1 = _mm_mul_ps(vec_w1, _mm_sub_ps(_mm_set_ps1(1.0), vec_ww1));
vec_w2 = _mm_mul_ps(vec_w2, _mm_sub_ps(_mm_set_ps1(1.0), vec_ww2));
#endif
vec_w1 = _mm_min_ps(vec_w1, _mm_set_ps1( 0.5));
vec_w2 = _mm_min_ps(vec_w2, _mm_set_ps1( 0.5));
vec_w1 = _mm_max_ps(vec_w1, _mm_set_ps1(-0.5));
vec_w2 = _mm_max_ps(vec_w2, _mm_set_ps1(-0.5));
vec_w = _mm_sub_ps(vec_w1, vec_w2);
__m128 vec_w_previous =
_mm_shuffle_ps(vec_w,vec_w,_MM_SHUFFLE(1, 1, 0, 0));
__m128 vec_w_current =
_mm_shuffle_ps(vec_w,vec_w,_MM_SHUFFLE(3, 3, 2, 2));
__m128 vec_out = _mm_mul_ps(vec_previous, vec_w_previous);
vec_out = _mm_add_ps(vec_out, _mm_mul_ps(vec_current, vec_w_current));
vec_out =
_mm_add_ps(vec_out, _mm_shuffle_ps(vec_out,vec_out,_MM_SHUFFLE(3, 2, 3, 2)));
_mm_storel_pi((__m64*)outp,vec_out);
re->distance += ratio;
outp++;
}
re->distance -= 1.0;
inp++;
}
_mm_storeu_ps((float*)&re->previous, vec_previous);
_mm_storeu_ps((float*)&re->current, vec_current);
data->output_frames = outp - (audio_frame_float_t*)data->data_out;
}
/* Public process callback: forwards the call to the worker
 * (downsample or upsample) stored in the state by resampler_CC_init(). */
static void resampler_CC_process(void *re_, struct resampler_data *data)
{
   const rarch_CC_resampler_t *state = (const rarch_CC_resampler_t*)re_;

   state->process(re_, data);
}
/* Releases a resampler state obtained from resampler_CC_init().
 * Passing NULL is allowed: free(NULL) is a no-op, so no guard is needed. */
static void resampler_CC_free(void *re_)
{
   free(re_);
}
/* Allocates and initializes the SSE CC resampler state.
 *
 * bandwidth_mod : values below 0.75 select the downsampling worker
 *                 (initial distance 0.0); all other values select the
 *                 upsampling worker (initial distance 2.0).
 *
 * Returns the new state as an opaque pointer, or NULL if the allocation
 * fails. Release it with resampler_CC_free().
 */
static void *resampler_CC_init(double bandwidth_mod)
{
   rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)
      calloc(1, sizeof(rarch_CC_resampler_t));

   if (!re)
      return NULL;

   /* Clear both state registers once. Unaligned stores are used, matching
    * the loads/stores in the workers: the block comes from calloc(), which
    * is not guaranteed to be 16-byte aligned on every platform, so a plain
    * __m128 assignment (aligned store) could fault. */
   _mm_storeu_ps((float*)&re->previous, _mm_setzero_ps());
   _mm_storeu_ps((float*)&re->current,  _mm_setzero_ps());

   RARCH_LOG("Convoluted Cosine resampler (SSE) : ");

   /* variations of data->ratio around 0.75 are safer
    * than around 1.0 for both up/downsampler. */
   if (bandwidth_mod < 0.75)
   {
      RARCH_LOG("CC_downsample @%f \n", bandwidth_mod);
      re->process  = resampler_CC_downsample;
      re->distance = 0.0;
   }
   else
   {
      RARCH_LOG("CC_upsample @%f \n", bandwidth_mod);
      re->process  = resampler_CC_upsample;
      re->distance = 2.0;
   }

   return re;
}
#else
/* C reference version. Not optimized. */
@ -295,9 +535,9 @@ static void *resampler_CC_init(double bandwidth_mod)
RARCH_LOG("Convoluted Cosine resampler (C) : ");
/* variations of data->ratio around 0.75 are safer
/* variations of data->ratio around 0.75 are safer
* than around 1.0 for both up/downsampler. */
if (bandwidth_mod < 0.75)
if (bandwidth_mod < 0.75)
{
RARCH_LOG("CC_downsample @%f \n", bandwidth_mod);
re->process = resampler_CC_downsample;

View File

@ -11,69 +11,80 @@ TESTS := test-sinc-lowest \
test-cc \
test-snr-cc
CFLAGS += -O3 -ffast-math -g -Wall -pedantic -march=native -std=gnu99 -DRESAMPLER_TEST -DRARCH_DUMMY_LOG
CFLAGS += -O3 -ffast-math -g -Wall -pedantic -march=native -std=gnu99
CFLAGS += -DRESAMPLER_TEST -DRARCH_DUMMY_LOG
LDFLAGS += -lm
all: $(TESTS)
resampler-sinc.o: ../resampler.c
resampler-sinc.o: ../resamplers/resampler.c
$(CC) -c -o $@ $< $(CFLAGS)
resampler-cc.o: ../resampler.c
$(CC) -c -o $@ $< $(CFLAGS) -DHAVE_CC_RESAMPLER
resampler-cc.o: ../resamplers/resampler.c
$(CC) -c -o $@ $< $(CFLAGS) -DHAVE_CC_RESAMPLER -DRESAMPLER_IDENT='"CC"'
cc-resampler.o: ../cc_resampler.c
main-cc.o: main.c
$(CC) -c -o $@ $< $(CFLAGS) -DHAVE_CC_RESAMPLER -DRESAMPLER_IDENT='"CC"'
snr-cc.o: snr.c
$(CC) -c -o $@ $< $(CFLAGS) -DHAVE_CC_RESAMPLER -DRESAMPLER_IDENT='"CC"'
cc-resampler.o: ../resamplers/cc_resampler.c
$(CC) -c -o $@ $< $(CFLAGS)
sinc-lowest.o: ../sinc.c
sinc-lowest.o: ../resamplers/sinc.c
$(CC) -c -o $@ $< $(CFLAGS) -DSINC_LOWEST_QUALITY
sinc-lower.o: ../sinc.c
sinc-lower.o: ../resamplers/sinc.c
$(CC) -c -o $@ $< $(CFLAGS) -DSINC_LOWER_QUALITY
sinc.o: ../sinc.c
sinc.o: ../resamplers/sinc.c
$(CC) -c -o $@ $< $(CFLAGS)
sinc-higher.o: ../sinc.c
nearest.o: ../resamplers/nearest.c
$(CC) -c -o $@ $< $(CFLAGS)
sinc-higher.o: ../resamplers/sinc.c
$(CC) -c -o $@ $< $(CFLAGS) -DSINC_HIGHER_QUALITY
sinc-highest.o: ../sinc.c
sinc-highest.o: ../resamplers/sinc.c
$(CC) -c -o $@ $< $(CFLAGS) -DSINC_HIGHEST_QUALITY
test-sinc-lowest: sinc-lowest.o ../utils.o main.o resampler-sinc.o
test-sinc-lowest: sinc-lowest.o ../utils.o main.o resampler-sinc.o nearest.o
$(CC) -o $@ $^ $(LDFLAGS)
test-snr-sinc-lowest: sinc-lowest.o ../utils.o snr.o resampler-sinc.o
test-snr-sinc-lowest: sinc-lowest.o ../utils.o snr.o resampler-sinc.o nearest.o
$(CC) -o $@ $^ $(LDFLAGS)
test-sinc-lower: sinc-lower.o ../utils.o main.o resampler-sinc.o
test-sinc-lower: sinc-lower.o ../utils.o main.o resampler-sinc.o nearest.o
$(CC) -o $@ $^ $(LDFLAGS)
test-snr-sinc-lower: sinc-lower.o ../utils.o snr.o resampler-sinc.o
test-snr-sinc-lower: sinc-lower.o ../utils.o snr.o resampler-sinc.o nearest.o
$(CC) -o $@ $^ $(LDFLAGS)
test-sinc: sinc.o ../utils.o main.o resampler-sinc.o
test-sinc: sinc.o ../utils.o main.o resampler-sinc.o nearest.o
$(CC) -o $@ $^ $(LDFLAGS)
test-snr-sinc: sinc.o ../utils.o snr.o resampler-sinc.o
test-snr-sinc: sinc.o ../utils.o snr.o resampler-sinc.o nearest.o
$(CC) -o $@ $^ $(LDFLAGS)
test-sinc-higher: sinc-higher.o ../utils.o main.o resampler-sinc.o
test-sinc-higher: sinc-higher.o ../utils.o main.o resampler-sinc.o nearest.o
$(CC) -o $@ $^ $(LDFLAGS)
test-snr-sinc-higher: sinc-higher.o ../utils.o snr.o resampler-sinc.o
test-snr-sinc-higher: sinc-higher.o ../utils.o snr.o resampler-sinc.o nearest.o
$(CC) -o $@ $^ $(LDFLAGS)
test-sinc-highest: sinc-highest.o ../utils.o main.o resampler-sinc.o
test-sinc-highest: sinc-highest.o ../utils.o main.o resampler-sinc.o nearest.o
$(CC) -o $@ $^ $(LDFLAGS)
test-snr-sinc-highest: sinc-highest.o ../utils.o snr.o resampler-sinc.o
test-snr-sinc-highest: sinc-highest.o ../utils.o snr.o resampler-sinc.o nearest.o
$(CC) -o $@ $^ $(LDFLAGS)
test-cc: cc-resampler.o ../utils.o main.o resampler-cc.o sinc.o
test-cc: cc-resampler.o ../utils.o main-cc.o resampler-cc.o sinc.o nearest.o
$(CC) -o $@ $^ $(LDFLAGS)
test-snr-cc: cc-resampler.o ../utils.o snr.o resampler-cc.o sinc.o
test-snr-cc: cc-resampler.o ../utils.o snr-cc.o resampler-cc.o sinc.o nearest.o
$(CC) -o $@ $^ $(LDFLAGS)
%.o: %.c

View File

@ -1,6 +1,6 @@
/* RetroArch - A frontend for libretro.
* Copyright (C) 2010-2014 - Hans-Kristian Arntzen
*
*
* RetroArch is free software: you can redistribute it and/or modify it under the terms
* of the GNU General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
@ -16,12 +16,16 @@
// Resampler that reads raw S16NE/stereo from stdin and outputs to stdout in S16NE/stereo.
// Used for testing and performance benchmarking.
#include "../resampler.h"
#include "../resamplers/resampler.h"
#include "../utils.h"
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#ifndef RESAMPLER_IDENT
#define RESAMPLER_IDENT "sinc"
#endif
int main(int argc, char *argv[])
{
srand(time(NULL));
@ -56,7 +60,7 @@ int main(int argc, char *argv[])
const rarch_resampler_t *resampler = NULL;
void *re = NULL;
if (!rarch_resampler_realloc(&re, &resampler, NULL, out_rate / in_rate))
if (!rarch_resampler_realloc(&re, &resampler, RESAMPLER_IDENT, out_rate / in_rate))
{
fprintf(stderr, "Failed to allocate resampler ...\n");
return 1;

View File

@ -1,6 +1,6 @@
/* RetroArch - A frontend for libretro.
* Copyright (C) 2010-2014 - Hans-Kristian Arntzen
*
*
* RetroArch is free software: you can redistribute it and/or modify it under the terms
* of the GNU General Public License as published by the Free Software Found-
* ation, either version 3 of the License, or (at your option) any later version.
@ -13,7 +13,7 @@
* If not, see <http://www.gnu.org/licenses/>.
*/
#include "../resampler.h"
#include "../resamplers/resampler.h"
#include "../utils.h"
#include <stdio.h>
#include <stdlib.h>
@ -23,6 +23,10 @@
#include <assert.h>
#include <stdbool.h>
#ifndef RESAMPLER_IDENT
#define RESAMPLER_IDENT "sinc"
#endif
#undef min
#define min(a, b) (((a) < (b)) ? (a) : (b))
@ -63,7 +67,7 @@ static unsigned bitswap(unsigned i, unsigned range)
}
// When interleaving the butterfly buffer, addressing puts bits in reverse.
// [0, 1, 2, 3, 4, 5, 6, 7] => [0, 4, 2, 6, 1, 5, 3, 7]
// [0, 1, 2, 3, 4, 5, 6, 7] => [0, 4, 2, 6, 1, 5, 3, 7]
static void interleave(complex double *butterfly_buf, size_t samples)
{
unsigned range = bitrange(samples);
@ -269,7 +273,7 @@ int main(int argc, char *argv[])
void *re = NULL;
const rarch_resampler_t *resampler = NULL;
if (!rarch_resampler_realloc(&re, &resampler, NULL, ratio))
if (!rarch_resampler_realloc(&re, &resampler, RESAMPLER_IDENT, ratio))
return 1;
test_fft();