Patch summary (line structure reconstructed from a whitespace-collapsed copy):
  - audio/resamplers/cc_resampler.c: drop the local audio_frame_int16
    typedef, correct the _MIPS_ARCH_ALLEGREX guard and add an SSE
    implementation of the Convoluted Cosine resampler.
  - audio/test/{Makefile,main.c,snr.c}: follow the sources into
    audio/resamplers/ and select the resampler through RESAMPLER_IDENT.

Changes made in this revision of the patch:
  - SSE branch: the bare "#include" is restored to <xmmintrin.h>, the
    header that declares __m128 and the _mm_*_ps intrinsics used below.
  - snr.c licence header: the URL of the GPL notice is restored.
  - resampler_CC_init (SSE): the 4-iteration loop that re-zeroed the
    calloc()ed state through direct __m128 assignment is removed.
  - resampler_CC_free (SSE): the redundant NULL guard around free() is
    removed.
  - Short header comments added to the SSE functions; hunk headers of
    cc_resampler.c updated to the new line counts (+146,256 and +540,9).

Known losses that could not be recovered from the collapsed text
(NOTE(review): confirm against the target tree before applying):
  - indentation and blank context lines are inferred from the hunk counts;
  - '-'/'+' pairs that differed only in trailing whitespace are identical;
  - the system header names on the bare "#include" context lines of
    main.c and snr.c are missing and were left untouched.

diff --git a/audio/resamplers/cc_resampler.c b/audio/resamplers/cc_resampler.c
index 35a0488f1f..6910ec1a38 100644
--- a/audio/resamplers/cc_resampler.c
+++ b/audio/resamplers/cc_resampler.c
@@ -28,13 +28,8 @@
 #define RARCH_LOG(...) fprintf(stderr, __VA_ARGS__)
 #endif
 
-typedef struct audio_frame_int16
-{
-   int16_t l;
-   int16_t r;
-} audio_frame_int16_t;
 
-#ifdef _MIPS_ARCH_ALLEGREX1
+#ifdef _MIPS_ARCH_ALLEGREX
 static void resampler_CC_process(void *re_, struct resampler_data *data)
 {
    (void)re_;
@@ -121,7 +116,7 @@ static void resampler_CC_process(void *re_, struct resampler_data *data)
       outp++;
    }
 
-   /* The VFPU state is assumed to remain intact
+   /* The VFPU state is assumed to remain intact
     * in-between calls to resampler_CC_process. */
 
 done:
@@ -151,6 +146,256 @@ static void *resampler_CC_init(double bandwidth_mod)
    RARCH_LOG("\nConvoluted Cosine resampler (VFPU): \n");
    return (void*)-1;
 }
+#elif defined(__SSE__)
+
+/* uses a fast polynomial approximation
+ * since SSE lacks native support for trigonometric functions
+ * cc_int is approximated with P(X) = X - (3/4)*X^3 + (1/4)*X^5
+ */
+
+
+#include <xmmintrin.h>
+
+#ifndef CC_RESAMPLER_PRECISION
+#define CC_RESAMPLER_PRECISION 1
+#endif
+
+typedef struct rarch_CC_resampler
+{
+   __m128 previous;
+   __m128 current;
+
+   float distance;
+   void (*process)(void *re, struct resampler_data *data);
+} rarch_CC_resampler_t;
+
+
+/* Downsampling path: each input frame is weighted into four pending
+ * output frames (two held in 'previous', two in 'current'); the oldest
+ * one is written out whenever re->distance exceeds ratio + 0.5. */
+static void resampler_CC_downsample(void *re_, struct resampler_data *data)
+{
+   float ratio, b;
+   rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)re_;
+
+   audio_frame_float_t *inp = (audio_frame_float_t*)data->data_in;
+   audio_frame_float_t *inp_max = (audio_frame_float_t*)(inp + data->input_frames);
+   audio_frame_float_t *outp = (audio_frame_float_t*)data->data_out;
+
+   ratio = 1.0 / data->ratio;
+   b = data->ratio; /* cutoff frequency. */
+
+   __m128 vec_previous = _mm_loadu_ps((float*)&re->previous);
+   __m128 vec_current = _mm_loadu_ps((float*)&re->current);
+
+   while (inp != inp_max)
+   {
+      __m128 vec_ratio =
+         _mm_mul_ps(_mm_set_ps1(ratio), _mm_set_ps(3.0, 2.0, 1.0, 0.0));
+      __m128 vec_w = _mm_sub_ps(_mm_set_ps1(re->distance), vec_ratio);
+
+      __m128 vec_w1 = _mm_add_ps(vec_w , _mm_set_ps1(0.5));
+      __m128 vec_w2 = _mm_sub_ps(vec_w , _mm_set_ps1(0.5));
+
+      __m128 vec_b = _mm_set_ps1(b);
+      vec_w1 = _mm_mul_ps(vec_w1, vec_b);
+      vec_w2 = _mm_mul_ps(vec_w2, vec_b);
+
+#if (CC_RESAMPLER_PRECISION > 0)
+      __m128 vec_ww1 = _mm_mul_ps(vec_w1, vec_w1);
+      __m128 vec_ww2 = _mm_mul_ps(vec_w2, vec_w2);
+
+
+      vec_ww1 = _mm_mul_ps(vec_ww1, _mm_sub_ps(_mm_set_ps1(3.0),vec_ww1));
+      vec_ww2 = _mm_mul_ps(vec_ww2, _mm_sub_ps(_mm_set_ps1(3.0),vec_ww2));
+
+      vec_ww1 = _mm_mul_ps(_mm_set_ps1(1.0/4.0), vec_ww1);
+      vec_ww2 = _mm_mul_ps(_mm_set_ps1(1.0/4.0), vec_ww2);
+
+      vec_w1 = _mm_mul_ps(vec_w1, _mm_sub_ps(_mm_set_ps1(1.0), vec_ww1));
+      vec_w2 = _mm_mul_ps(vec_w2, _mm_sub_ps(_mm_set_ps1(1.0), vec_ww2));
+#endif
+
+      vec_w1 = _mm_min_ps(vec_w1, _mm_set_ps1( 0.5));
+      vec_w2 = _mm_min_ps(vec_w2, _mm_set_ps1( 0.5));
+      vec_w1 = _mm_max_ps(vec_w1, _mm_set_ps1(-0.5));
+      vec_w2 = _mm_max_ps(vec_w2, _mm_set_ps1(-0.5));
+
+      vec_w = _mm_sub_ps(vec_w1, vec_w2);
+
+      __m128 vec_w_previous =
+         _mm_shuffle_ps(vec_w,vec_w,_MM_SHUFFLE(1, 1, 0, 0));
+      __m128 vec_w_current =
+         _mm_shuffle_ps(vec_w,vec_w,_MM_SHUFFLE(3, 3, 2, 2));
+
+      __m128 vec_in = _mm_loadl_pi(_mm_setzero_ps(),(__m64*)inp);
+      vec_in = _mm_shuffle_ps(vec_in,vec_in,_MM_SHUFFLE(1, 0, 1, 0));
+
+      vec_previous =
+         _mm_add_ps(vec_previous, _mm_mul_ps(vec_in, vec_w_previous));
+      vec_current =
+         _mm_add_ps(vec_current, _mm_mul_ps(vec_in, vec_w_current));
+
+      re->distance++;
+      inp++;
+
+      if (re->distance > (ratio + 0.5))
+      {
+         _mm_storel_pi((__m64*)outp, vec_previous);
+         vec_previous =
+            _mm_shuffle_ps(vec_previous,vec_current,_MM_SHUFFLE(1, 0, 3, 2));
+         vec_current =
+            _mm_shuffle_ps(vec_current,_mm_setzero_ps(),_MM_SHUFFLE(1, 0, 3, 2));
+
+         re->distance -= ratio;
+         outp++;
+      }
+   }
+
+   _mm_storeu_ps((float*)&re->previous, vec_previous);
+   _mm_storeu_ps((float*)&re->current, vec_current);
+
+   data->output_frames = outp - (audio_frame_float_t*)data->data_out;
+}
+
+#ifndef min
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#endif
+
+/* Upsampling path: 'previous' and 'current' hold the four most recent
+ * input frames; every output frame is their weighted sum, and frames
+ * are produced while re->distance stays below 1.0. */
+static void resampler_CC_upsample(void *re_, struct resampler_data *data)
+{
+   float b, ratio;
+   rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)re_;
+
+   audio_frame_float_t *inp = (audio_frame_float_t*)data->data_in;
+   audio_frame_float_t *inp_max = (audio_frame_float_t*)(inp + data->input_frames);
+   audio_frame_float_t *outp = (audio_frame_float_t*)data->data_out;
+
+   b = min(data->ratio, 1.00); /* cutoff frequency. */
+   ratio = 1.0 / data->ratio;
+
+   __m128 vec_previous = _mm_loadu_ps((float*)&re->previous);
+   __m128 vec_current = _mm_loadu_ps((float*)&re->current);
+
+
+
+   while (inp != inp_max)
+   {
+      __m128 vec_in = _mm_loadl_pi(_mm_setzero_ps(),(__m64*)inp);
+      vec_previous =
+         _mm_shuffle_ps(vec_previous,vec_current,_MM_SHUFFLE(1, 0, 3, 2));
+      vec_current =
+         _mm_shuffle_ps(vec_current,vec_in,_MM_SHUFFLE(1, 0, 3, 2));
+
+      while (re->distance < 1.0)
+      {
+         __m128 vec_w =
+            _mm_add_ps(_mm_set_ps1(re->distance), _mm_set_ps(-2.0, -1.0, 0.0, 1.0));
+
+         __m128 vec_w1 = _mm_add_ps(vec_w , _mm_set_ps1(0.5));
+         __m128 vec_w2 = _mm_sub_ps(vec_w , _mm_set_ps1(0.5));
+
+         __m128 vec_b = _mm_set_ps1(b);
+         vec_w1 = _mm_mul_ps(vec_w1, vec_b);
+         vec_w2 = _mm_mul_ps(vec_w2, vec_b);
+
+#if (CC_RESAMPLER_PRECISION > 0)
+         __m128 vec_ww1 = _mm_mul_ps(vec_w1, vec_w1);
+         __m128 vec_ww2 = _mm_mul_ps(vec_w2, vec_w2);
+
+
+         vec_ww1 = _mm_mul_ps(vec_ww1,_mm_sub_ps(_mm_set_ps1(3.0),vec_ww1));
+         vec_ww2 = _mm_mul_ps(vec_ww2,_mm_sub_ps(_mm_set_ps1(3.0),vec_ww2));
+
+         vec_ww1 = _mm_mul_ps(_mm_set_ps1(1.0 / 4.0), vec_ww1);
+         vec_ww2 = _mm_mul_ps(_mm_set_ps1(1.0 / 4.0), vec_ww2);
+
+         vec_w1 = _mm_mul_ps(vec_w1, _mm_sub_ps(_mm_set_ps1(1.0), vec_ww1));
+         vec_w2 = _mm_mul_ps(vec_w2, _mm_sub_ps(_mm_set_ps1(1.0), vec_ww2));
+#endif
+
+         vec_w1 = _mm_min_ps(vec_w1, _mm_set_ps1( 0.5));
+         vec_w2 = _mm_min_ps(vec_w2, _mm_set_ps1( 0.5));
+         vec_w1 = _mm_max_ps(vec_w1, _mm_set_ps1(-0.5));
+         vec_w2 = _mm_max_ps(vec_w2, _mm_set_ps1(-0.5));
+
+         vec_w = _mm_sub_ps(vec_w1, vec_w2);
+
+         __m128 vec_w_previous =
+            _mm_shuffle_ps(vec_w,vec_w,_MM_SHUFFLE(1, 1, 0, 0));
+         __m128 vec_w_current =
+            _mm_shuffle_ps(vec_w,vec_w,_MM_SHUFFLE(3, 3, 2, 2));
+
+         __m128 vec_out = _mm_mul_ps(vec_previous, vec_w_previous);
+         vec_out = _mm_add_ps(vec_out, _mm_mul_ps(vec_current, vec_w_current));
+         vec_out =
+            _mm_add_ps(vec_out, _mm_shuffle_ps(vec_out,vec_out,_MM_SHUFFLE(3, 2, 3, 2)));
+
+         _mm_storel_pi((__m64*)outp,vec_out);
+
+         re->distance += ratio;
+         outp++;
+      }
+
+      re->distance -= 1.0;
+      inp++;
+   }
+
+   _mm_storeu_ps((float*)&re->previous, vec_previous);
+   _mm_storeu_ps((float*)&re->current, vec_current);
+
+   data->output_frames = outp - (audio_frame_float_t*)data->data_out;
+}
+
+
+/* Forwards to the path selected by resampler_CC_init(). */
+static void resampler_CC_process(void *re_, struct resampler_data *data)
+{
+   rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)re_;
+   re->process(re_, data);
+}
+
+/* Releases the state allocated by resampler_CC_init(); NULL is fine. */
+static void resampler_CC_free(void *re_)
+{
+   free(re_);
+}
+
+/* Allocates the SSE resampler state and picks the downsample path for
+ * bandwidth_mod < 0.75, the upsample path otherwise. NULL on failure. */
+static void *resampler_CC_init(double bandwidth_mod)
+{
+   /* calloc() already zero-fills the filter history. It is deliberately
+    * not rewritten through the __m128 members: calloc() is not guaranteed
+    * to return 16-byte aligned memory on every target, and a direct
+    * __m128 assignment may compile to an aligned store. */
+   rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)
+      calloc(1, sizeof(rarch_CC_resampler_t));
+   if (!re)
+      return NULL;
+
+   RARCH_LOG("Convoluted Cosine resampler (SSE) : ");
+
+   /* variations of data->ratio around 0.75 are safer
+    * than around 1.0 for both up/downsampler. */
+   if (bandwidth_mod < 0.75)
+   {
+      RARCH_LOG("CC_downsample @%f \n", bandwidth_mod);
+      re->process = resampler_CC_downsample;
+      re->distance = 0.0;
+   }
+   else
+   {
+      RARCH_LOG("CC_upsample @%f \n", bandwidth_mod);
+      re->process = resampler_CC_upsample;
+      re->distance = 2.0;
+   }
+
+   return re;
+}
 #else
 
 /* C reference version. Not optimized. */
@@ -295,9 +540,9 @@ static void *resampler_CC_init(double bandwidth_mod)
 
    RARCH_LOG("Convoluted Cosine resampler (C) : ");
 
-   /* variations of data->ratio around 0.75 are safer
+   /* variations of data->ratio around 0.75 are safer
     * than around 1.0 for both up/downsampler. */
-   if (bandwidth_mod < 0.75)
+   if (bandwidth_mod < 0.75)
    {
       RARCH_LOG("CC_downsample @%f \n", bandwidth_mod);
       re->process = resampler_CC_downsample;
diff --git a/audio/test/Makefile b/audio/test/Makefile
index a5838ab00f..197621189b 100644
--- a/audio/test/Makefile
+++ b/audio/test/Makefile
@@ -11,69 +11,80 @@ TESTS := test-sinc-lowest \
 	test-cc \
 	test-snr-cc
 
-CFLAGS += -O3 -ffast-math -g -Wall -pedantic -march=native -std=gnu99 -DRESAMPLER_TEST -DRARCH_DUMMY_LOG
+CFLAGS += -O3 -ffast-math -g -Wall -pedantic -march=native -std=gnu99
+CFLAGS += -DRESAMPLER_TEST -DRARCH_DUMMY_LOG
+
 LDFLAGS += -lm
 
 all: $(TESTS)
 
-resampler-sinc.o: ../resampler.c
+resampler-sinc.o: ../resamplers/resampler.c
 	$(CC) -c -o $@ $< $(CFLAGS)
 
-resampler-cc.o: ../resampler.c
-	$(CC) -c -o $@ $< $(CFLAGS) -DHAVE_CC_RESAMPLER
+resampler-cc.o: ../resamplers/resampler.c
+	$(CC) -c -o $@ $< $(CFLAGS) -DHAVE_CC_RESAMPLER -DRESAMPLER_IDENT='"CC"'
 
-cc-resampler.o: ../cc_resampler.c
+main-cc.o: main.c
+	$(CC) -c -o $@ $< $(CFLAGS) -DHAVE_CC_RESAMPLER -DRESAMPLER_IDENT='"CC"'
+
+snr-cc.o: snr.c
+	$(CC) -c -o $@ $< $(CFLAGS) -DHAVE_CC_RESAMPLER -DRESAMPLER_IDENT='"CC"'
+
+cc-resampler.o: ../resamplers/cc_resampler.c
 	$(CC) -c -o $@ $< $(CFLAGS)
 
-sinc-lowest.o: ../sinc.c
+sinc-lowest.o: ../resamplers/sinc.c
 	$(CC) -c -o $@ $< $(CFLAGS) -DSINC_LOWEST_QUALITY
 
-sinc-lower.o: ../sinc.c
+sinc-lower.o: ../resamplers/sinc.c
 	$(CC) -c -o $@ $< $(CFLAGS) -DSINC_LOWER_QUALITY
 
-sinc.o: ../sinc.c
+sinc.o: ../resamplers/sinc.c
 	$(CC) -c -o $@ $< $(CFLAGS)
 
-sinc-higher.o: ../sinc.c
+nearest.o: ../resamplers/nearest.c
+	$(CC) -c -o $@ $< $(CFLAGS)
+
+sinc-higher.o: ../resamplers/sinc.c
 	$(CC) -c -o $@ $< $(CFLAGS) -DSINC_HIGHER_QUALITY
 
-sinc-highest.o: ../sinc.c
+sinc-highest.o: ../resamplers/sinc.c
 	$(CC) -c -o $@ $< $(CFLAGS) -DSINC_HIGHEST_QUALITY
 
-test-sinc-lowest: sinc-lowest.o ../utils.o main.o resampler-sinc.o
+test-sinc-lowest: sinc-lowest.o ../utils.o main.o resampler-sinc.o nearest.o
 	$(CC) -o $@ $^ $(LDFLAGS)
 
-test-snr-sinc-lowest: sinc-lowest.o ../utils.o snr.o resampler-sinc.o
+test-snr-sinc-lowest: sinc-lowest.o ../utils.o snr.o resampler-sinc.o nearest.o
 	$(CC) -o $@ $^ $(LDFLAGS)
 
-test-sinc-lower: sinc-lower.o ../utils.o main.o resampler-sinc.o
+test-sinc-lower: sinc-lower.o ../utils.o main.o resampler-sinc.o nearest.o
 	$(CC) -o $@ $^ $(LDFLAGS)
 
-test-snr-sinc-lower: sinc-lower.o ../utils.o snr.o resampler-sinc.o
+test-snr-sinc-lower: sinc-lower.o ../utils.o snr.o resampler-sinc.o nearest.o
 	$(CC) -o $@ $^ $(LDFLAGS)
 
-test-sinc: sinc.o ../utils.o main.o resampler-sinc.o
+test-sinc: sinc.o ../utils.o main.o resampler-sinc.o nearest.o
 	$(CC) -o $@ $^ $(LDFLAGS)
 
-test-snr-sinc: sinc.o ../utils.o snr.o resampler-sinc.o
+test-snr-sinc: sinc.o ../utils.o snr.o resampler-sinc.o nearest.o
 	$(CC) -o $@ $^ $(LDFLAGS)
 
-test-sinc-higher: sinc-higher.o ../utils.o main.o resampler-sinc.o
+test-sinc-higher: sinc-higher.o ../utils.o main.o resampler-sinc.o nearest.o
 	$(CC) -o $@ $^ $(LDFLAGS)
 
-test-snr-sinc-higher: sinc-higher.o ../utils.o snr.o resampler-sinc.o
+test-snr-sinc-higher: sinc-higher.o ../utils.o snr.o resampler-sinc.o nearest.o
 	$(CC) -o $@ $^ $(LDFLAGS)
 
-test-sinc-highest: sinc-highest.o ../utils.o main.o resampler-sinc.o
+test-sinc-highest: sinc-highest.o ../utils.o main.o resampler-sinc.o nearest.o
 	$(CC) -o $@ $^ $(LDFLAGS)
 
-test-snr-sinc-highest: sinc-highest.o ../utils.o snr.o resampler-sinc.o
+test-snr-sinc-highest: sinc-highest.o ../utils.o snr.o resampler-sinc.o nearest.o
 	$(CC) -o $@ $^ $(LDFLAGS)
 
-test-cc: cc-resampler.o ../utils.o main.o resampler-cc.o sinc.o
+test-cc: cc-resampler.o ../utils.o main-cc.o resampler-cc.o sinc.o nearest.o
 	$(CC) -o $@ $^ $(LDFLAGS)
 
-test-snr-cc: cc-resampler.o ../utils.o snr.o resampler-cc.o sinc.o
+test-snr-cc: cc-resampler.o ../utils.o snr-cc.o resampler-cc.o sinc.o nearest.o
 	$(CC) -o $@ $^ $(LDFLAGS)
 
 %.o: %.c
diff --git a/audio/test/main.c b/audio/test/main.c
index 5930bdcb9f..fe04bebe15 100644
--- a/audio/test/main.c
+++ b/audio/test/main.c
@@ -1,6 +1,6 @@
 /* RetroArch - A frontend for libretro.
  * Copyright (C) 2010-2014 - Hans-Kristian Arntzen
- *
+ *
  * RetroArch is free software: you can redistribute it and/or modify it under the terms
  * of the GNU General Public License as published by the Free Software Found-
  * ation, either version 3 of the License, or (at your option) any later version.
@@ -16,12 +16,16 @@
 // Resampler that reads raw S16NE/stereo from stdin and outputs to stdout in S16NE/stereo.
 // Used for testing and performance benchmarking.
 
-#include "../resampler.h"
+#include "../resamplers/resampler.h"
 #include "../utils.h"
 #include
 #include
 #include
 
+#ifndef RESAMPLER_IDENT
+#define RESAMPLER_IDENT "sinc"
+#endif
+
 int main(int argc, char *argv[])
 {
    srand(time(NULL));
@@ -56,7 +60,7 @@ int main(int argc, char *argv[])
 
    const rarch_resampler_t *resampler = NULL;
    void *re = NULL;
-   if (!rarch_resampler_realloc(&re, &resampler, NULL, out_rate / in_rate))
+   if (!rarch_resampler_realloc(&re, &resampler, RESAMPLER_IDENT, out_rate / in_rate))
    {
       fprintf(stderr, "Failed to allocate resampler ...\n");
       return 1;
diff --git a/audio/test/snr.c b/audio/test/snr.c
index 994d83384d..565c417010 100644
--- a/audio/test/snr.c
+++ b/audio/test/snr.c
@@ -1,6 +1,6 @@
 /* RetroArch - A frontend for libretro.
  * Copyright (C) 2010-2014 - Hans-Kristian Arntzen
- *
+ *
  * RetroArch is free software: you can redistribute it and/or modify it under the terms
  * of the GNU General Public License as published by the Free Software Found-
  * ation, either version 3 of the License, or (at your option) any later version.
@@ -13,7 +13,7 @@
  * If not, see <http://www.gnu.org/licenses/>.
  */
 
-#include "../resampler.h"
+#include "../resamplers/resampler.h"
 #include "../utils.h"
 #include
 #include
@@ -23,6 +23,10 @@
 #include
 #include
 
+#ifndef RESAMPLER_IDENT
+#define RESAMPLER_IDENT "sinc"
+#endif
+
 #undef min
 #define min(a, b) (((a) < (b)) ? (a) : (b))
 
@@ -63,7 +67,7 @@ static unsigned bitswap(unsigned i, unsigned range)
 }
 
 // When interleaving the butterfly buffer, addressing puts bits in reverse.
-// [0, 1, 2, 3, 4, 5, 6, 7] => [0, 4, 2, 6, 1, 5, 3, 7]
+// [0, 1, 2, 3, 4, 5, 6, 7] => [0, 4, 2, 6, 1, 5, 3, 7]
 static void interleave(complex double *butterfly_buf, size_t samples)
 {
    unsigned range = bitrange(samples);
@@ -269,7 +273,7 @@ int main(int argc, char *argv[])
 
    void *re = NULL;
    const rarch_resampler_t *resampler = NULL;
-   if (!rarch_resampler_realloc(&re, &resampler, NULL, ratio))
+   if (!rarch_resampler_realloc(&re, &resampler, RESAMPLER_IDENT, ratio))
       return 1;
 
    test_fft();