mirror of
https://github.com/libretro/RetroArch
synced 2025-04-17 02:43:03 +00:00
Optimize -> bgr24 conversion paths.
This commit is contained in:
parent
b9630e2a2c
commit
ebd5bc85d5
9
Makefile
9
Makefile
@ -130,6 +130,14 @@ ifeq ($(HAVE_SDL), 1)
|
||||
DEFINES += $(SDL_CFLAGS) $(BSD_LOCAL_INC)
|
||||
LIBS += $(SDL_LIBS)
|
||||
|
||||
ifeq ($(SCALER_NO_SIMD), 1)
|
||||
DEFINES += -DSCALER_NO_SIMD
|
||||
endif
|
||||
ifeq ($(SCALER_PERF), 1)
|
||||
DEFINES += -DSCALER_PERF
|
||||
LIBS += -lrt
|
||||
endif
|
||||
|
||||
ifeq ($(HAVE_X11), 1)
|
||||
LIBS += $(X11_LIBS)
|
||||
DEFINES += $(X11_CFLAGS)
|
||||
@ -298,6 +306,7 @@ clean:
|
||||
rm -f gfx/fonts/*.o
|
||||
rm -f gfx/context/*.o
|
||||
rm -f gfx/py_state/*.o
|
||||
rm -f gfx/scaler/*.o
|
||||
rm -f compat/*.o
|
||||
rm -f record/*.o
|
||||
rm -f input/*.o
|
||||
|
@ -218,6 +218,7 @@ clean:
|
||||
rm -f audio/xaudio-c/*.o
|
||||
rm -f compat/*.o
|
||||
rm -f conf/*.o
|
||||
rm -f gfx/scaler/*.o
|
||||
rm -f gfx/*.o
|
||||
rm -f gfx/context/*.o
|
||||
rm -f gfx/fonts/*.o
|
||||
|
@ -19,6 +19,10 @@
|
||||
#include <string.h>
|
||||
#include <stdio.h>
|
||||
|
||||
#ifdef SCALER_NO_SIMD
|
||||
#undef __SSE2__
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
@ -31,11 +35,11 @@ void conv_0rgb1555_argb8888(void *output_, const void *input_,
|
||||
const uint16_t *input = (const uint16_t*)input_;
|
||||
uint32_t *output = (uint32_t*)output_;
|
||||
|
||||
__m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
|
||||
__m128i pix_mask_gb = _mm_set1_epi16(0x1f << 5);
|
||||
__m128i mul15_mid = _mm_set1_epi16(0x4200);
|
||||
__m128i mul15_hi = _mm_set1_epi16(0x0210);
|
||||
__m128i a = _mm_set1_epi16(0x00ff);
|
||||
const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
|
||||
const __m128i pix_mask_gb = _mm_set1_epi16(0x1f << 5);
|
||||
const __m128i mul15_mid = _mm_set1_epi16(0x4200);
|
||||
const __m128i mul15_hi = _mm_set1_epi16(0x0210);
|
||||
const __m128i a = _mm_set1_epi16(0x00ff);
|
||||
|
||||
int max_width = width - 7;
|
||||
|
||||
@ -44,7 +48,7 @@ void conv_0rgb1555_argb8888(void *output_, const void *input_,
|
||||
int w;
|
||||
for (w = 0; w < max_width; w += 8)
|
||||
{
|
||||
__m128i in = _mm_loadu_si128((const __m128i*)(input + w));
|
||||
const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
|
||||
__m128i r = _mm_and_si128(in, pix_mask_r);
|
||||
__m128i g = _mm_and_si128(in, pix_mask_gb);
|
||||
__m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_gb);
|
||||
@ -105,6 +109,121 @@ void conv_0rgb1555_argb8888(void *output_, const void *input_,
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__)
|
||||
// :( TODO: Make this saner.
|
||||
static inline void store_bgr24_sse2(void *output, __m128i a, __m128i b, __m128i c, __m128i d)
|
||||
{
|
||||
const __m128i mask_0 = _mm_set_epi32(0, 0, 0, 0x00ffffff);
|
||||
const __m128i mask_1 = _mm_set_epi32(0, 0, 0x00ffffff, 0);
|
||||
const __m128i mask_2 = _mm_set_epi32(0, 0x00ffffff, 0, 0);
|
||||
const __m128i mask_3 = _mm_set_epi32(0x00ffffff, 0, 0, 0);
|
||||
|
||||
__m128i a0 = _mm_and_si128(a, mask_0);
|
||||
__m128i a1 = _mm_srli_si128(_mm_and_si128(a, mask_1), 1);
|
||||
__m128i a2 = _mm_srli_si128(_mm_and_si128(a, mask_2), 2);
|
||||
__m128i a3 = _mm_srli_si128(_mm_and_si128(a, mask_3), 3);
|
||||
__m128i a4 = _mm_slli_si128(_mm_and_si128(b, mask_0), 12);
|
||||
__m128i a5 = _mm_slli_si128(_mm_and_si128(b, mask_1), 11);
|
||||
|
||||
__m128i b0 = _mm_srli_si128(_mm_and_si128(b, mask_1), 5);
|
||||
__m128i b1 = _mm_srli_si128(_mm_and_si128(b, mask_2), 6);
|
||||
__m128i b2 = _mm_srli_si128(_mm_and_si128(b, mask_3), 7);
|
||||
__m128i b3 = _mm_slli_si128(_mm_and_si128(c, mask_0), 8);
|
||||
__m128i b4 = _mm_slli_si128(_mm_and_si128(c, mask_1), 7);
|
||||
__m128i b5 = _mm_slli_si128(_mm_and_si128(c, mask_2), 6);
|
||||
|
||||
__m128i c0 = _mm_srli_si128(_mm_and_si128(c, mask_2), 10);
|
||||
__m128i c1 = _mm_srli_si128(_mm_and_si128(c, mask_3), 11);
|
||||
__m128i c2 = _mm_slli_si128(_mm_and_si128(d, mask_0), 4);
|
||||
__m128i c3 = _mm_slli_si128(_mm_and_si128(d, mask_1), 3);
|
||||
__m128i c4 = _mm_slli_si128(_mm_and_si128(d, mask_2), 2);
|
||||
__m128i c5 = _mm_slli_si128(_mm_and_si128(d, mask_3), 1);
|
||||
|
||||
__m128i *out = output;
|
||||
|
||||
_mm_storeu_si128(out + 0,
|
||||
_mm_or_si128(a0, _mm_or_si128(a1, _mm_or_si128(a2, _mm_or_si128(a3, _mm_or_si128(a4, a5))))));
|
||||
|
||||
_mm_storeu_si128(out + 1,
|
||||
_mm_or_si128(b0, _mm_or_si128(b1, _mm_or_si128(b2, _mm_or_si128(b3, _mm_or_si128(b4, b5))))));
|
||||
|
||||
_mm_storeu_si128(out + 2,
|
||||
_mm_or_si128(c0, _mm_or_si128(c1, _mm_or_si128(c2, _mm_or_si128(c3, _mm_or_si128(c4, c5))))));
|
||||
}
|
||||
|
||||
void conv_0rgb1555_bgr24(void *output_, const void *input_,
|
||||
int width, int height,
|
||||
int out_stride, int in_stride)
|
||||
{
|
||||
const uint16_t *input = (const uint16_t*)input_;
|
||||
uint8_t *output = (uint8_t*)output_;
|
||||
|
||||
const __m128i pix_mask_r = _mm_set1_epi16(0x1f << 10);
|
||||
const __m128i pix_mask_gb = _mm_set1_epi16(0x1f << 5);
|
||||
const __m128i mul15_mid = _mm_set1_epi16(0x4200);
|
||||
const __m128i mul15_hi = _mm_set1_epi16(0x0210);
|
||||
const __m128i a = _mm_set1_epi16(0x00ff);
|
||||
|
||||
int max_width = width - 15;
|
||||
|
||||
for (int h = 0; h < height; h++, output += out_stride, input += in_stride >> 1)
|
||||
{
|
||||
uint8_t *out = output;
|
||||
|
||||
int w;
|
||||
for (w = 0; w < max_width; w += 16, out += 48)
|
||||
{
|
||||
const __m128i in0 = _mm_loadu_si128((const __m128i*)(input + w + 0));
|
||||
const __m128i in1 = _mm_loadu_si128((const __m128i*)(input + w + 8));
|
||||
__m128i r0 = _mm_and_si128(in0, pix_mask_r);
|
||||
__m128i r1 = _mm_and_si128(in1, pix_mask_r);
|
||||
__m128i g0 = _mm_and_si128(in0, pix_mask_gb);
|
||||
__m128i g1 = _mm_and_si128(in1, pix_mask_gb);
|
||||
__m128i b0 = _mm_and_si128(_mm_slli_epi16(in0, 5), pix_mask_gb);
|
||||
__m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_gb);
|
||||
|
||||
r0 = _mm_mulhi_epi16(r0, mul15_hi);
|
||||
r1 = _mm_mulhi_epi16(r1, mul15_hi);
|
||||
g0 = _mm_mulhi_epi16(g0, mul15_mid);
|
||||
g1 = _mm_mulhi_epi16(g1, mul15_mid);
|
||||
b0 = _mm_mulhi_epi16(b0, mul15_mid);
|
||||
b1 = _mm_mulhi_epi16(b1, mul15_mid);
|
||||
|
||||
__m128i res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
|
||||
__m128i res_lo_bg1 = _mm_unpacklo_epi8(b1, g1);
|
||||
__m128i res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
|
||||
__m128i res_hi_bg1 = _mm_unpackhi_epi8(b1, g1);
|
||||
__m128i res_lo_ra0 = _mm_unpacklo_epi8(r0, a);
|
||||
__m128i res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
|
||||
__m128i res_hi_ra0 = _mm_unpackhi_epi8(r0, a);
|
||||
__m128i res_hi_ra1 = _mm_unpackhi_epi8(r1, a);
|
||||
|
||||
__m128i res_lo0 = _mm_or_si128(res_lo_bg0, _mm_slli_si128(res_lo_ra0, 2));
|
||||
__m128i res_lo1 = _mm_or_si128(res_lo_bg1, _mm_slli_si128(res_lo_ra1, 2));
|
||||
__m128i res_hi0 = _mm_or_si128(res_hi_bg0, _mm_slli_si128(res_hi_ra0, 2));
|
||||
__m128i res_hi1 = _mm_or_si128(res_hi_bg1, _mm_slli_si128(res_hi_ra1, 2));
|
||||
|
||||
// Non-POT pixel sizes ftl :(
|
||||
store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
|
||||
}
|
||||
|
||||
for (; w < width; w++)
|
||||
{
|
||||
uint32_t col = input[w];
|
||||
uint32_t b = (col >> 0) & 0x1f;
|
||||
uint32_t g = (col >> 5) & 0x1f;
|
||||
uint32_t r = (col >> 10) & 0x1f;
|
||||
b = (b << 3) | (b >> 2);
|
||||
g = (g << 3) | (g >> 2);
|
||||
r = (r << 3) | (r >> 2);
|
||||
|
||||
*out++ = b;
|
||||
*out++ = g;
|
||||
*out++ = r;
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
void conv_0rgb1555_bgr24(void *output_, const void *input_,
|
||||
int width, int height,
|
||||
int out_stride, int in_stride)
|
||||
@ -131,6 +250,7 @@ void conv_0rgb1555_bgr24(void *output_, const void *input_,
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void conv_bgr24_argb8888(void *output_, const void *input_,
|
||||
int width, int height,
|
||||
@ -172,12 +292,46 @@ void conv_argb8888_0rgb1555(void *output_, const void *input_,
|
||||
}
|
||||
}
|
||||
|
||||
#if defined(__SSE2__)
|
||||
void conv_argb8888_bgr24(void *output_, const void *input_,
|
||||
int width, int height,
|
||||
int out_stride, int in_stride)
|
||||
{
|
||||
const uint32_t *input = (const uint32_t*)input_;
|
||||
uint8_t *output = (uint8_t*)output_;
|
||||
uint8_t *output = (uint8_t*)output_;
|
||||
|
||||
int max_width = width - 15;
|
||||
|
||||
for (int h = 0; h < height; h++, output += out_stride, input += in_stride >> 2)
|
||||
{
|
||||
uint8_t *out = output;
|
||||
int w;
|
||||
|
||||
for (w = 0; w < max_width; w += 16, out += 48)
|
||||
{
|
||||
store_bgr24_sse2(out,
|
||||
_mm_loadu_si128((const __m128i*)(input + w + 0)),
|
||||
_mm_loadu_si128((const __m128i*)(input + w + 4)),
|
||||
_mm_loadu_si128((const __m128i*)(input + w + 8)),
|
||||
_mm_loadu_si128((const __m128i*)(input + w + 12)));
|
||||
}
|
||||
|
||||
for (; w < width; w++)
|
||||
{
|
||||
uint32_t col = input[w];
|
||||
*out++ = (uint8_t)(col >> 0);
|
||||
*out++ = (uint8_t)(col >> 8);
|
||||
*out++ = (uint8_t)(col >> 16);
|
||||
}
|
||||
}
|
||||
}
|
||||
#else
|
||||
void conv_argb8888_bgr24(void *output_, const void *input_,
|
||||
int width, int height,
|
||||
int out_stride, int in_stride)
|
||||
{
|
||||
const uint32_t *input = (const uint32_t*)input_;
|
||||
uint8_t *output = (uint8_t*)output_;
|
||||
|
||||
for (int h = 0; h < height; h++, output += out_stride, input += in_stride >> 2)
|
||||
{
|
||||
@ -191,6 +345,7 @@ void conv_argb8888_bgr24(void *output_, const void *input_,
|
||||
}
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
void conv_copy(void *output_, const void *input_,
|
||||
int width, int height,
|
||||
|
@ -22,6 +22,10 @@
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
|
||||
#ifdef SCALER_PERF
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
// In case aligned allocs are needed later ...
|
||||
void *scaler_alloc(size_t elem_size, size_t size)
|
||||
{
|
||||
@ -157,6 +161,14 @@ bool scaler_ctx_gen_filter(struct scaler_ctx *ctx)
|
||||
|
||||
void scaler_ctx_gen_reset(struct scaler_ctx *ctx)
|
||||
{
|
||||
#ifdef SCALER_PERF
|
||||
if (ctx->elapsed_frames)
|
||||
fprintf(stderr, "[Scaler]: ms / frame: %.3f\n", ctx->elapsed_time_ms / ctx->elapsed_frames);
|
||||
|
||||
ctx->elapsed_time_ms = 0.0;
|
||||
ctx->elapsed_frames = 0;
|
||||
#endif
|
||||
|
||||
scaler_free(ctx->horiz.filter);
|
||||
scaler_free(ctx->horiz.filter_pos);
|
||||
scaler_free(ctx->vert.filter);
|
||||
@ -172,9 +184,14 @@ void scaler_ctx_gen_reset(struct scaler_ctx *ctx)
|
||||
memset(&ctx->output, 0, sizeof(ctx->output));
|
||||
}
|
||||
|
||||
void scaler_ctx_scale(const struct scaler_ctx *ctx,
|
||||
void scaler_ctx_scale(struct scaler_ctx *ctx,
|
||||
void *output, const void *input)
|
||||
{
|
||||
#ifdef SCALER_PERF
|
||||
struct timespec start_tv, end_tv;
|
||||
clock_gettime(CLOCK_MONOTONIC, &start_tv);
|
||||
#endif
|
||||
|
||||
if (ctx->unscaled)
|
||||
{
|
||||
ctx->direct_pixconv(output, input,
|
||||
@ -205,6 +222,12 @@ void scaler_ctx_scale(const struct scaler_ctx *ctx,
|
||||
else
|
||||
ctx->scaler_vert(ctx, output, ctx->out_stride);
|
||||
}
|
||||
|
||||
#ifdef SCALER_PERF
|
||||
clock_gettime(CLOCK_MONOTONIC, &end_tv);
|
||||
ctx->elapsed_time_ms += (end_tv.tv_sec - start_tv.tv_sec) * 1000.0 + (end_tv.tv_nsec - start_tv.tv_nsec) / 1000000.0;
|
||||
ctx->elapsed_frames++;
|
||||
#endif
|
||||
}
|
||||
|
||||
|
||||
|
@ -90,12 +90,17 @@ struct scaler_ctx
|
||||
uint32_t *frame;
|
||||
int stride;
|
||||
} output;
|
||||
|
||||
#ifdef SCALER_PERF
|
||||
double elapsed_time_ms;
|
||||
unsigned elapsed_frames;
|
||||
#endif
|
||||
};
|
||||
|
||||
bool scaler_ctx_gen_filter(struct scaler_ctx *ctx);
|
||||
void scaler_ctx_gen_reset(struct scaler_ctx *ctx);
|
||||
|
||||
void scaler_ctx_scale(const struct scaler_ctx *ctx,
|
||||
void scaler_ctx_scale(struct scaler_ctx *ctx,
|
||||
void *output, const void *input);
|
||||
|
||||
void *scaler_alloc(size_t elem_size, size_t size);
|
||||
|
@ -15,6 +15,10 @@
|
||||
|
||||
#include "scaler_int.h"
|
||||
|
||||
#ifdef SCALER_NO_SIMD
|
||||
#undef __SSE2__
|
||||
#endif
|
||||
|
||||
#if defined(__SSE2__)
|
||||
#include <emmintrin.h>
|
||||
#endif
|
||||
|
@ -40,6 +40,10 @@ extern "C" {
|
||||
#include "ffemu.h"
|
||||
#include <assert.h>
|
||||
|
||||
#ifdef FFEMU_PERF
|
||||
#include <time.h>
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_CONFIG_H
|
||||
#include "../config.h"
|
||||
#endif
|
||||
|
Loading…
x
Reference in New Issue
Block a user