diff --git a/driver.c b/driver.c index a4fafb86c7..2b93e9c66b 100644 --- a/driver.c +++ b/driver.c @@ -423,19 +423,32 @@ void uninit_audio(void) } #ifdef HAVE_DYLIB -static void init_filter(void) +static void deinit_filter(void) +{ + g_extern.filter.active = false; + + if (g_extern.filter.lib) + dylib_close(g_extern.filter.lib); + g_extern.filter.lib = NULL; + + free(g_extern.filter.buffer); + free(g_extern.filter.colormap); + free(g_extern.filter.scaler_out); + g_extern.filter.buffer = NULL; + g_extern.filter.colormap = NULL; + g_extern.filter.scaler_out = NULL; + + scaler_ctx_gen_reset(&g_extern.filter.scaler); + memset(&g_extern.filter.scaler, 0, sizeof(g_extern.filter.scaler)); +} + +static void init_filter(bool rgb32) { if (g_extern.filter.active) return; - if (*g_settings.video.filter_path == '\0') + if (!*g_settings.video.filter_path) return; - if (g_extern.system.pix_fmt != RETRO_PIXEL_FORMAT_0RGB1555) - { - RARCH_WARN("CPU filters only support 0RGB1555.\n"); - return; - } - RARCH_LOG("Loading bSNES filter from \"%s\"\n", g_settings.video.filter_path); g_extern.filter.lib = dylib_load(g_settings.video.filter_path); if (!g_extern.filter.lib) @@ -454,9 +467,7 @@ static void init_filter(void) if (!g_extern.filter.psize || !g_extern.filter.prender) { RARCH_ERR("Failed to find functions in filter...\n"); - dylib_close(g_extern.filter.lib); - g_extern.filter.lib = NULL; - return; + goto error; } g_extern.filter.active = true; @@ -473,12 +484,14 @@ static void init_filter(void) g_extern.filter.buffer = (uint32_t*)malloc(RARCH_SCALE_BASE * RARCH_SCALE_BASE * g_extern.filter.scale * g_extern.filter.scale * sizeof(uint32_t)); - rarch_assert(g_extern.filter.buffer); + if (!g_extern.filter.buffer) + goto error; g_extern.filter.pitch = RARCH_SCALE_BASE * g_extern.filter.scale * sizeof(uint32_t); g_extern.filter.colormap = (uint32_t*)malloc(0x10000 * sizeof(uint32_t)); - rarch_assert(g_extern.filter.colormap); + if (!g_extern.filter.colormap) + goto error; // Set up 
conversion map from 16-bit XRGB1555 to 32-bit ARGB. for (unsigned i = 0; i < 0x10000; i++) @@ -492,18 +505,23 @@ static void init_filter(void) b = (b << 3) | (b >> 2); g_extern.filter.colormap[i] = (r << 16) | (g << 8) | (b << 0); } -} -static void deinit_filter(void) -{ - if (!g_extern.filter.active) - return; + g_extern.filter.scaler_out = (uint16_t*)malloc(sizeof(uint16_t) * geom->max_width * geom->max_height); + if (!g_extern.filter.scaler_out) + goto error; - g_extern.filter.active = false; - dylib_close(g_extern.filter.lib); - g_extern.filter.lib = NULL; - free(g_extern.filter.buffer); - free(g_extern.filter.colormap); + g_extern.filter.scaler.scaler_type = SCALER_TYPE_POINT; + g_extern.filter.scaler.in_fmt = rgb32 ? SCALER_FMT_ARGB8888 : SCALER_FMT_RGB565; + g_extern.filter.scaler.out_fmt = SCALER_FMT_0RGB1555; + + if (!scaler_ctx_gen_filter(&g_extern.filter.scaler)) + goto error; + + return; + +error: + RARCH_ERR("CPU filter init failed.\n"); + deinit_filter(); } #endif @@ -542,13 +560,6 @@ static bool init_video_pixel_converter(unsigned size) { RARCH_WARN("0RGB1555 pixel format is deprecated, and will be slower. For 15/16-bit, RGB565 format is preferred.\n"); - // We'll tweak these values later, - // just set most of them to something sane to begin with. 
// Converts an image from RGB565 to 0RGB1555.
//
// For every 16-bit pixel, the red field and the upper five bits of green are
// moved down by one bit ((col >> 1) & 0x7fe0), which drops the least
// significant green bit and leaves bit 15 clear. The five blue bits
// (col & 0x1f) are copied unchanged.
//
// output_    - destination buffer, written as uint16_t pixels.
// input_     - source buffer, read as uint16_t pixels.
// width      - pixels per row.
// height     - number of rows.
// out_stride - destination pitch; halved to advance the uint16_t pointer,
//              i.e. it is given in bytes.
// in_stride  - source pitch, handled the same way as out_stride.
#if defined(__SSE2__)
void conv_rgb565_0rgb1555(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   const uint16_t *input = (const uint16_t*)input_;
   uint16_t *output = (uint16_t*)output_;

   // Columns below this bound still have a full 8-pixel vector available.
   int max_width = width - 7;

   const __m128i hi_mask = _mm_set1_epi16(0x7fe0);
   const __m128i lo_mask = _mm_set1_epi16(0x1f);

   for (int h = 0; h < height; h++, output += out_stride >> 1, input += in_stride >> 1)
   {
      int w;
      for (w = 0; w < max_width; w += 8)
      {
         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
         // Logical shift RIGHT, matching the scalar (col >> 1) below.
         __m128i hi = _mm_and_si128(_mm_srli_epi16(in, 1), hi_mask);
         __m128i lo = _mm_and_si128(in, lo_mask);
         _mm_storeu_si128((__m128i*)(output + w), _mm_or_si128(hi, lo));
      }

      // Scalar tail for the remaining (width % 8) pixels.
      for (; w < width; w++)
      {
         uint16_t col = input[w];
         uint16_t hi = (col >> 1) & 0x7fe0;
         uint16_t lo = col & 0x1f;
         output[w] = hi | lo;
      }
   }
}
#else
void conv_rgb565_0rgb1555(void *output_, const void *input_,
      int width, int height,
      int out_stride, int in_stride)
{
   const uint16_t *input = (const uint16_t*)input_;
   uint16_t *output = (uint16_t*)output_;

   for (int h = 0; h < height; h++, output += out_stride >> 1, input += in_stride >> 1)
   {
      for (int w = 0; w < width; w++)
      {
         uint16_t col = input[w];
         uint16_t hi = (col >> 1) & 0x7fe0;
         uint16_t lo = col & 0x1f;
         output[w] = hi | lo;
      }
   }
}

#endif
@@ static void video_frame(const void *data, unsigned width, unsigned height, size_ #ifdef HAVE_DYLIB if (g_extern.filter.active && data) { + struct scaler_ctx *scaler = &g_extern.filter.scaler; + scaler->in_width = scaler->out_width = width; + scaler->in_height = scaler->out_height = height; + scaler->in_stride = pitch; + scaler->out_stride = width * sizeof(uint16_t); + + scaler_ctx_scale(scaler, g_extern.filter.scaler_out, data); + unsigned owidth = width; unsigned oheight = height; g_extern.filter.psize(&owidth, &oheight); g_extern.filter.prender(g_extern.filter.colormap, g_extern.filter.buffer, - g_extern.filter.pitch, (const uint16_t*)data, pitch, width, height); + g_extern.filter.pitch, g_extern.filter.scaler_out, scaler->out_stride, width, height); #ifdef HAVE_FFMPEG if (g_extern.recording && g_settings.video.post_filter_record)