Drop HAVE_PBO. Use custom conversion routines.

Measured custom 15->32 conversion to be up to 10x faster on my setup.
This commit is contained in:
Themaister 2012-10-02 00:58:43 +02:00
parent c54035e3f3
commit 5b0525d2e2
8 changed files with 51 additions and 129 deletions

View File

@ -144,6 +144,8 @@ endif
ifeq ($(HAVE_SDL), 1) ifeq ($(HAVE_SDL), 1)
OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
else ifeq ($(HAVE_OPENGL), 1)
OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
else ifeq ($(HAVE_FFMPEG), 1) else ifeq ($(HAVE_FFMPEG), 1)
OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
endif endif

View File

@ -75,6 +75,8 @@ endif
ifeq ($(HAVE_SDL), 1) ifeq ($(HAVE_SDL), 1)
OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
else ifeq ($(HAVE_OPENGL), 1)
OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
else ifeq ($(HAVE_FFMPEG), 1) else ifeq ($(HAVE_FFMPEG), 1)
OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o OBJ += gfx/scaler/scaler.o gfx/scaler/pixconv.o gfx/scaler/scaler_int.o gfx/scaler/filter.o
endif endif

157
gfx/gl.c
View File

@ -15,6 +15,7 @@
#include "../driver.h" #include "../driver.h"
#include "../benchmark.h" #include "../benchmark.h"
#include "scaler/scaler.h"
#include <stdint.h> #include <stdint.h>
#include "../libretro.h" #include "../libretro.h"
@ -835,20 +836,8 @@ static void gl_update_input_size(gl_t *gl, unsigned width, unsigned height, unsi
gl->tex_w * gl->tex_h * gl->tex_index * gl->base_size, gl->tex_w * gl->tex_h * gl->tex_index * gl->base_size,
gl->tex_w * gl->tex_h * gl->base_size, gl->tex_w * gl->tex_h * gl->base_size,
gl->empty_buf); gl->empty_buf);
#elif defined(HAVE_PBO)
pglBindBuffer(GL_PIXEL_UNPACK_BUFFER, gl->pbo);
glBufferSubData(GL_PIXEL_UNPACK_BUFFER,
0, gl->tex_w * gl->tex_h * gl->base_size, gl->empty_buf);
glPixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(gl->tex_w * gl->base_size));
glTexSubImage2D(GL_TEXTURE_2D,
0, 0, 0, gl->tex_w, gl->tex_h, gl->texture_type,
gl->texture_fmt, NULL);
pglBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
#else #else
glPixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(width * gl->base_size)); glPixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(width * sizeof(uint32_t)));
glTexSubImage2D(GL_TEXTURE_2D, glTexSubImage2D(GL_TEXTURE_2D,
0, 0, 0, gl->tex_w, gl->tex_h, gl->texture_type, 0, 0, 0, gl->tex_w, gl->tex_h, gl->texture_type,
@ -870,6 +859,28 @@ static void gl_update_input_size(gl_t *gl, unsigned width, unsigned height, unsi
} }
} }
// It is *much* faster (an order of magnitude on my setup) to use a custom SIMD-optimized conversion routine than to let GL do it :(
#if !defined(HAVE_PSGL)
// Converts a 0RGB1555 frame to ARGB8888 through the scaler context stored in gl->scaler.
//
// gl       - GL driver state; gl->scaler keeps the filter set up for the last-seen frame size.
// output   - Destination buffer; rows are written width * sizeof(uint32_t) bytes apart.
// input    - Source frame.
// width    - Frame width in pixels.
// height   - Frame height in pixels.
// in_pitch - Stride of the source frame, handed to the scaler as in_stride.
//
// Input and output dimensions are identical and the scaler type is point, so only the
// pixel format changes. The filter is regenerated only when the frame size changes.
static inline void gl_convert_frame_rgb15_32(gl_t *gl, void *output, const void *input, unsigned width, unsigned height, unsigned in_pitch)
{
   if (width != gl->scaler.in_width || height != gl->scaler.in_height)
   {
      gl->scaler.in_width    = width;
      gl->scaler.in_height   = height;
      gl->scaler.out_width   = width;
      gl->scaler.out_height  = height;
      gl->scaler.in_fmt      = SCALER_FMT_0RGB1555;
      gl->scaler.out_fmt     = SCALER_FMT_ARGB8888;
      gl->scaler.scaler_type = SCALER_TYPE_POINT;

      if (!scaler_ctx_gen_filter(&gl->scaler))
      {
         // NOTE(review): assumes a false return means the filter could not be set up.
         // Forget the cached size so the next frame retries, and skip the conversion
         // instead of running the scaler on a context that was never initialized.
         gl->scaler.in_width  = 0;
         gl->scaler.in_height = 0;
         return;
      }
   }

   // Strides are refreshed on every call; the pitch may change even when the size does not.
   gl->scaler.in_stride  = in_pitch;
   gl->scaler.out_stride = width * sizeof(uint32_t);
   scaler_ctx_scale(&gl->scaler, output, input);
}
#endif
#if defined(HAVE_PSGL) #if defined(HAVE_PSGL)
static inline void gl_copy_frame(gl_t *gl, const void *frame, unsigned width, unsigned height, unsigned pitch) static inline void gl_copy_frame(gl_t *gl, const void *frame, unsigned width, unsigned height, unsigned pitch)
{ {
@ -914,98 +925,29 @@ static void gl_init_textures(gl_t *gl)
} }
glBindTexture(GL_TEXTURE_2D, gl->texture[gl->tex_index]); glBindTexture(GL_TEXTURE_2D, gl->texture[gl->tex_index]);
} }
#elif defined(HAVE_PBO)
// Uploads one frame to the currently bound 2D texture by way of the pixel unpack buffer.
//
// gl     - GL driver state; gl->pbo is the buffer object used for the upload.
// frame  - Source frame.
// width  - Frame width in pixels.
// height - Frame height in rows.
// pitch  - Byte distance between consecutive source rows.
//
// Rows are copied one at a time so the data in the buffer is tightly packed
// (width * gl->base_size bytes per row) regardless of the source pitch.
static inline void gl_copy_frame(gl_t *gl, const void *frame, unsigned width, unsigned height, unsigned pitch)
{
   const uint8_t *frame_copy = (const uint8_t*)frame;
   size_t frame_copy_size = width * gl->base_size;

   pglBindBuffer(GL_PIXEL_UNPACK_BUFFER, gl->pbo);
   uint8_t *data = (uint8_t*)pglMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY);
   if (!data)
   {
      // Unbind before bailing out. While a buffer is bound to GL_PIXEL_UNPACK_BUFFER,
      // GL treats the pointer argument of texture uploads as an offset into that buffer,
      // so leaving it bound would break later uploads from client memory.
      pglBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
      return;
   }

   for (unsigned h = 0; h < height; h++, data += frame_copy_size, frame_copy += pitch)
      memcpy(data, frame_copy, frame_copy_size);

   pglUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);

   glPixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(width * gl->base_size));
   // NULL here means offset 0 into the bound unpack buffer, not "no data".
   glTexSubImage2D(GL_TEXTURE_2D,
         0, 0, 0, width, height, gl->texture_type,
         gl->texture_fmt, NULL);

   pglBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
}
// Clears the pixel unpack buffer (when it can be mapped) and creates the TEXTURES
// textures in gl->texture, each sized gl->tex_w x gl->tex_h with the wrap mode and
// filter taken from gl. The texture at gl->tex_index is left bound on return.
static void gl_init_textures(gl_t *gl)
{
   // A failed mapping is not an error here; the clear is simply skipped.
   void *mapped = pglMapBuffer(GL_PIXEL_UNPACK_BUFFER, GL_WRITE_ONLY);
   if (mapped != NULL)
   {
      memset(mapped, 0, gl->tex_w * gl->tex_h * gl->base_size);
      pglUnmapBuffer(GL_PIXEL_UNPACK_BUFFER);
   }

   glGenTextures(TEXTURES, gl->texture);

   unsigned idx = 0;
   while (idx < TEXTURES)
   {
      glBindTexture(GL_TEXTURE_2D, gl->texture[idx]);

      // Wrap mode on both axes.
      glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, gl->border_type);
      glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, gl->border_type);

      // Same filter for magnification and minification.
      glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, gl->tex_filter);
      glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, gl->tex_filter);

      // Define the texture storage.
      glTexImage2D(GL_TEXTURE_2D,
            0, RARCH_GL_INTERNAL_FORMAT, gl->tex_w, gl->tex_h, 0, gl->texture_type,
            gl->texture_fmt, NULL);

      idx++;
   }

   glBindTexture(GL_TEXTURE_2D, gl->texture[gl->tex_index]);
}
#else #else
static inline void gl_copy_frame(gl_t *gl, const void *frame, unsigned width, unsigned height, unsigned pitch) static inline void gl_copy_frame(gl_t *gl, const void *frame, unsigned width, unsigned height, unsigned pitch)
{ {
if (gl->base_size == 2) // ARGB1555 => ARGB8888, SIMD-style :D
#ifdef HAVE_OPENGLES2 // Have to perform pixel format conversions as well.
glPixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(width * sizeof(uint32_t))); // Always use 32-bit textures.
if (gl->base_size == 2) // ARGB1555 => ARGB8888
{ {
const uint16_t *src = (const uint16_t*)frame; glPixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(width * sizeof(uint32_t))); // Always use 32-bit textures.
uint32_t *dst = (uint32_t*)gl->conv_buffer; gl_convert_frame_rgb15_32(gl, gl->conv_buffer, frame, width, height, pitch);
unsigned pitch_width = pitch >> 1;
// GL_UNSIGNED_BYTE apparently means in byte order, so go with little endian for now (ARGB).
// We have to convert anyways, prefer something that is more likely to be a native format for the GPU.
for (unsigned h = 0; h < height; h++, dst += width, src += pitch_width)
{
for (unsigned w = 0; w < width; w++)
{
uint32_t col = src[w];
uint32_t r = (col >> 10) & 0x1f;
uint32_t g = (col >> 5) & 0x1f;
uint32_t b = (col >> 0) & 0x1f;
r = (r << 3) | (r >> 2);
g = (g << 3) | (g >> 2);
b = (b << 3) | (b >> 2);
dst[w] = (0xff << 24) | (r << 16) | (g << 8) | (b << 0);
}
}
glTexSubImage2D(GL_TEXTURE_2D, glTexSubImage2D(GL_TEXTURE_2D,
0, 0, 0, width, height, gl->texture_type, 0, 0, 0, width, height, gl->texture_type,
gl->texture_fmt, gl->conv_buffer); gl->texture_fmt, gl->conv_buffer);
} }
else else
{ {
#ifdef HAVE_OPENGLES2
// No GL_UNPACK_ROW_LENGTH ;(
unsigned pitch_width = pitch / gl->base_size; unsigned pitch_width = pitch / gl->base_size;
if (width == pitch_width) // Fast path :D if (width == pitch_width) // Happy path :D
{ {
glTexSubImage2D(GL_TEXTURE_2D, glTexSubImage2D(GL_TEXTURE_2D,
0, 0, 0, width, height, gl->texture_type, 0, 0, 0, width, height, gl->texture_type,
gl->texture_fmt, gl->conv_buffer); gl->texture_fmt, frame);
} }
else else // Probably slower path.
{ {
const uint32_t *src = (const uint32_t*)frame; const uint32_t *src = (const uint32_t*)frame;
for (unsigned h = 0; h < height; h++, src += pitch_width) for (unsigned h = 0; h < height; h++, src += pitch_width)
@ -1015,15 +957,17 @@ static inline void gl_copy_frame(gl_t *gl, const void *frame, unsigned width, un
gl->texture_fmt, src); gl->texture_fmt, src);
} }
} }
}
#else #else
glPixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(pitch)); glPixelStorei(GL_UNPACK_ALIGNMENT, get_alignment(pitch));
glPixelStorei(GL_UNPACK_ROW_LENGTH, pitch / gl->base_size); glPixelStorei(GL_UNPACK_ROW_LENGTH, pitch / gl->base_size);
glTexSubImage2D(GL_TEXTURE_2D,
0, 0, 0, width, height, gl->texture_type, glTexSubImage2D(GL_TEXTURE_2D,
gl->texture_fmt, frame); 0, 0, 0, width, height, gl->texture_type,
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0); gl->texture_fmt, frame);
glPixelStorei(GL_UNPACK_ROW_LENGTH, 0);
#endif #endif
}
} }
static void gl_init_textures(gl_t *gl) static void gl_init_textures(gl_t *gl)
@ -1197,9 +1141,6 @@ static void gl_free(void *data)
#if defined(HAVE_PSGL) #if defined(HAVE_PSGL)
glBindBuffer(GL_TEXTURE_REFERENCE_BUFFER_SCE, 0); glBindBuffer(GL_TEXTURE_REFERENCE_BUFFER_SCE, 0);
glDeleteBuffers(1, &gl->pbo); glDeleteBuffers(1, &gl->pbo);
#elif defined(HAVE_PBO)
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);
glDeleteBuffers(1, &gl->pbo);
#endif #endif
#ifdef HAVE_FBO #ifdef HAVE_FBO
@ -1253,15 +1194,6 @@ static bool resolve_extensions(gl_t *gl)
RARCH_LOG("[GL] Supported extensions: %s\n", ext); RARCH_LOG("[GL] Supported extensions: %s\n", ext);
#endif #endif
#if defined(HAVE_PBO)
RARCH_LOG("[GL]: Using PBOs.\n");
if (!gl_query_extension("GL_ARB_pixel_buffer_object"))
{
RARCH_ERR("[GL]: PBOs are enabled, but extension does not exist ...\n");
return false;
}
#endif
return true; return true;
} }
@ -1403,17 +1335,12 @@ static void *gl_init(const video_info_t *video, const input_driver_t **input, vo
glBindBuffer(GL_TEXTURE_REFERENCE_BUFFER_SCE, gl->pbo); glBindBuffer(GL_TEXTURE_REFERENCE_BUFFER_SCE, gl->pbo);
glBufferData(GL_TEXTURE_REFERENCE_BUFFER_SCE, glBufferData(GL_TEXTURE_REFERENCE_BUFFER_SCE,
gl->tex_w * gl->tex_h * gl->base_size * TEXTURES, NULL, GL_STREAM_DRAW); gl->tex_w * gl->tex_h * gl->base_size * TEXTURES, NULL, GL_STREAM_DRAW);
#elif defined(HAVE_PBO)
glGenBuffers(1, &gl->pbo);
glBindBuffer(GL_PIXEL_UNPACK_BUFFER, gl->pbo);
glBufferData(GL_PIXEL_UNPACK_BUFFER,
gl->tex_w * gl->tex_h * gl->base_size, NULL, GL_STREAM_DRAW);
#endif #endif
// Empty buffer that we use to clear out the texture with on res change. // Empty buffer that we use to clear out the texture with on res change.
gl->empty_buf = calloc(sizeof(uint32_t), gl->tex_w * gl->tex_h); gl->empty_buf = calloc(sizeof(uint32_t), gl->tex_w * gl->tex_h);
#ifdef HAVE_OPENGLES2 #if !defined(HAVE_PSGL)
gl->conv_buffer = calloc(sizeof(uint32_t), gl->tex_w * gl->tex_h); gl->conv_buffer = calloc(sizeof(uint32_t), gl->tex_w * gl->tex_h);
if (!gl->conv_buffer) if (!gl->conv_buffer)
{ {

View File

@ -20,6 +20,7 @@
#include "fonts/fonts.h" #include "fonts/fonts.h"
#include "math/matrix.h" #include "math/matrix.h"
#include "gfx_context.h" #include "gfx_context.h"
#include "scaler/scaler.h"
#ifdef HAVE_CONFIG_H #ifdef HAVE_CONFIG_H
#include "../config.h" #include "../config.h"
@ -171,7 +172,9 @@ typedef struct gl
GLuint tex_filter; GLuint tex_filter;
void *empty_buf; void *empty_buf;
void *conv_buffer; void *conv_buffer;
struct scaler_ctx scaler;
unsigned frame_count; unsigned frame_count;
@ -256,7 +259,7 @@ extern PFNGLACTIVETEXTUREPROC pglActiveTexture;
#define RARCH_GL_INTERNAL_FORMAT GL_RGBA #define RARCH_GL_INTERNAL_FORMAT GL_RGBA
#define RARCH_GL_TEXTURE_TYPE GL_BGRA #define RARCH_GL_TEXTURE_TYPE GL_BGRA
#define RARCH_GL_FORMAT32 GL_UNSIGNED_INT_8_8_8_8_REV #define RARCH_GL_FORMAT32 GL_UNSIGNED_INT_8_8_8_8_REV
#define RARCH_GL_FORMAT16 GL_UNSIGNED_SHORT_1_5_5_5_REV #define RARCH_GL_FORMAT16 GL_UNSIGNED_INT_8_8_8_8_REV
#endif #endif
// Platform specific workarounds/hacks. // Platform specific workarounds/hacks.

View File

@ -23,10 +23,6 @@
#include <math.h> #include <math.h>
#include "../../benchmark.h" #include "../../benchmark.h"
#ifdef SCALER_PERF
#include <time.h>
#endif
// In case aligned allocs are needed later ... // In case aligned allocs are needed later ...
void *scaler_alloc(size_t elem_size, size_t size) void *scaler_alloc(size_t elem_size, size_t size)
{ {

View File

@ -92,11 +92,6 @@ struct scaler_ctx
uint32_t *frame; uint32_t *frame;
int stride; int stride;
} output; } output;
#ifdef SCALER_PERF
double elapsed_time_ms;
unsigned elapsed_frames;
#endif
}; };
bool scaler_ctx_gen_filter(struct scaler_ctx *ctx); bool scaler_ctx_gen_filter(struct scaler_ctx *ctx);

View File

@ -80,10 +80,8 @@ fi
if [ "$OS" = Darwin ]; then if [ "$OS" = Darwin ]; then
check_lib FBO "-framework OpenGL" glFramebufferTexture2D check_lib FBO "-framework OpenGL" glFramebufferTexture2D
check_lib PBO "-framework OpenGL" glMapBuffer
else else
check_lib FBO -lGL glFramebufferTexture2D check_lib FBO -lGL glFramebufferTexture2D
check_lib PBO -lGL glMapBuffer
fi fi
check_pkgconf RSOUND rsound 1.1 check_pkgconf RSOUND rsound 1.1
@ -186,6 +184,6 @@ check_pkgconf PYTHON python3
add_define_make OS "$OS" add_define_make OS "$OS"
# Creates config.mk and config.h. # Creates config.mk and config.h.
VARS="ALSA OSS OSS_BSD OSS_LIB AL RSOUND ROAR JACK COREAUDIO PULSE SDL OPENGL GLES VG EGL KMS GBM DRM DYLIB GETOPT_LONG THREADS CG XML SDL_IMAGE LIBPNG DYNAMIC FFMPEG AVCODEC AVFORMAT AVUTIL CONFIGFILE FREETYPE XVIDEO X11 XEXT XF86VM NETPLAY NETWORK_CMD STDIN_CMD COMMAND SOCKET_LEGACY FBO PBO STRL PYTHON FFMPEG_ALLOC_CONTEXT3 FFMPEG_AVCODEC_OPEN2 FFMPEG_AVIO_OPEN FFMPEG_AVFORMAT_WRITE_HEADER FFMPEG_AVFORMAT_NEW_STREAM FFMPEG_AVCODEC_ENCODE_AUDIO2 FFMPEG_AVCODEC_ENCODE_VIDEO2 SINC FIXED_POINT BSV_MOVIE VIDEOCORE" VARS="ALSA OSS OSS_BSD OSS_LIB AL RSOUND ROAR JACK COREAUDIO PULSE SDL OPENGL GLES VG EGL KMS GBM DRM DYLIB GETOPT_LONG THREADS CG XML SDL_IMAGE LIBPNG DYNAMIC FFMPEG AVCODEC AVFORMAT AVUTIL CONFIGFILE FREETYPE XVIDEO X11 XEXT XF86VM NETPLAY NETWORK_CMD STDIN_CMD COMMAND SOCKET_LEGACY FBO STRL PYTHON FFMPEG_ALLOC_CONTEXT3 FFMPEG_AVCODEC_OPEN2 FFMPEG_AVIO_OPEN FFMPEG_AVFORMAT_WRITE_HEADER FFMPEG_AVFORMAT_NEW_STREAM FFMPEG_AVCODEC_ENCODE_AUDIO2 FFMPEG_AVCODEC_ENCODE_VIDEO2 SINC FIXED_POINT BSV_MOVIE VIDEOCORE"
create_config_make config.mk $VARS create_config_make config.mk $VARS
create_config_header config.h $VARS create_config_header config.h $VARS

View File

@ -15,7 +15,6 @@ HAVE_VG=auto # Enable OpenVG support
HAVE_CG=auto # Enable Cg shader support HAVE_CG=auto # Enable Cg shader support
HAVE_XML=auto # Enable bSNES-style XML shader support HAVE_XML=auto # Enable bSNES-style XML shader support
HAVE_FBO=auto # Enable render-to-texture (FBO) support HAVE_FBO=auto # Enable render-to-texture (FBO) support
HAVE_PBO=no # Enable pixel buffer object (PBO) support
HAVE_ALSA=auto # Enable ALSA support HAVE_ALSA=auto # Enable ALSA support
HAVE_OSS=auto # Enable OSS support HAVE_OSS=auto # Enable OSS support
HAVE_RSOUND=auto # Enable RSound support HAVE_RSOUND=auto # Enable RSound support