mirror of
https://github.com/libretro/RetroArch
synced 2025-02-13 21:40:27 +00:00
(gfx/scaler) Cleanups
This commit is contained in:
parent
f21bb4d0dc
commit
5b9a17dc8f
@ -47,7 +47,6 @@ void conv_rgb565_0rgb1555(void *output_, const void *input_,
|
|||||||
|
|
||||||
#if defined(__SSE2_)
|
#if defined(__SSE2_)
|
||||||
int max_width = width - 7;
|
int max_width = width - 7;
|
||||||
|
|
||||||
const __m128i hi_mask = _mm_set1_epi16(0x7fe0);
|
const __m128i hi_mask = _mm_set1_epi16(0x7fe0);
|
||||||
const __m128i lo_mask = _mm_set1_epi16(0x1f);
|
const __m128i lo_mask = _mm_set1_epi16(0x1f);
|
||||||
#endif
|
#endif
|
||||||
@ -445,7 +444,7 @@ void conv_0rgb1555_bgr24(void *output_, const void *input_,
|
|||||||
res_hi1 = _mm_or_si128(res_hi_bg1,
|
res_hi1 = _mm_or_si128(res_hi_bg1,
|
||||||
_mm_slli_si128(res_hi_ra1, 2));
|
_mm_slli_si128(res_hi_ra1, 2));
|
||||||
|
|
||||||
/* Non-POT pixel sizes ftl :( */
|
/* Non-POT pixel sizes for the loss */
|
||||||
store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
|
store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
@ -38,19 +38,28 @@
|
|||||||
/* ARGB8888 scaler is split in two:
|
/* ARGB8888 scaler is split in two:
|
||||||
*
|
*
|
||||||
* First, horizontal scaler is applied.
|
* First, horizontal scaler is applied.
|
||||||
* Here, all 8-bit channels are expanded to 16-bit. Values are then shifted 7 to left to occupy 15 bits.
|
* Here, all 8-bit channels are expanded to 16-bit. Values are then shifted 7
|
||||||
* The sign bit is kept empty as we have to do signed multiplication for the filter.
|
* bits to the left to occupy 15 bits.
|
||||||
* A mulhi [(a * b) >> 16] is applied which loses some precision, but is very efficient for SIMD.
|
*
|
||||||
|
* The sign bit is kept empty as we have to do signed multiplication for the
|
||||||
|
* filter.
|
||||||
|
*
|
||||||
|
* A mulhi [(a * b) >> 16] is applied which loses some precision, but is
|
||||||
|
* very efficient for SIMD.
|
||||||
* It is accurate enough for 8-bit purposes.
|
* It is accurate enough for 8-bit purposes.
|
||||||
*
|
*
|
||||||
* The fixed point 1.0 for filter is (1 << 14). After horizontal scale, the output is kept
|
* The fixed point 1.0 for filter is (1 << 14). After horizontal scale,
|
||||||
* with 16-bit channels, and will now have 13 bits of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2.
|
* the output is kept with 16-bit channels, and will now have 13 bits
|
||||||
|
* of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2.
|
||||||
*
|
*
|
||||||
* Vertical scaler takes the 13 bit channels, and performs the same mulhi steps.
|
* The vertical scaler takes the 13-bit channels and performs the
|
||||||
|
* same mulhi steps.
|
||||||
* Another 2 bits of precision are lost, leaving 11 bits.
|
* Another 2 bits of precision are lost, leaving 11 bits.
|
||||||
* Scaling is now complete. Channels are shifted right by 3, and saturated into 8-bit values.
|
* Scaling is now complete. Channels are shifted right by 3, and saturated
|
||||||
|
* into 8-bit values.
|
||||||
*
|
*
|
||||||
* The C version of scalers perform the exact same operations as the SIMD code for testing purposes.
|
* The C versions of the scalers perform the exact same operations as the
|
||||||
|
* SIMD code for testing purposes.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
|
void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
|
||||||
@ -61,9 +70,11 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
|
|||||||
|
|
||||||
const int16_t *filter_vert = ctx->vert.filter;
|
const int16_t *filter_vert = ctx->vert.filter;
|
||||||
|
|
||||||
for (h = 0; h < ctx->out_height; h++, filter_vert += ctx->vert.filter_stride, output += stride >> 2)
|
for (h = 0; h < ctx->out_height; h++,
|
||||||
|
filter_vert += ctx->vert.filter_stride, output += stride >> 2)
|
||||||
{
|
{
|
||||||
const uint64_t *input_base = input + ctx->vert.filter_pos[h] * (ctx->scaled.stride >> 3);
|
const uint64_t *input_base = input + ctx->vert.filter_pos[h]
|
||||||
|
* (ctx->scaled.stride >> 3);
|
||||||
|
|
||||||
for (w = 0; w < ctx->out_width; w++)
|
for (w = 0; w < ctx->out_width; w++)
|
||||||
{
|
{
|
||||||
@ -72,7 +83,8 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
|
|||||||
__m128i final;
|
__m128i final;
|
||||||
__m128i res = _mm_setzero_si128();
|
__m128i res = _mm_setzero_si128();
|
||||||
|
|
||||||
for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2, input_base_y += (ctx->scaled.stride >> 2))
|
for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2,
|
||||||
|
input_base_y += (ctx->scaled.stride >> 2))
|
||||||
{
|
{
|
||||||
__m128i coeff = _mm_set_epi64x(filter_vert[y + 1] * 0x0001000100010001ll, filter_vert[y + 0] * 0x0001000100010001ll);
|
__m128i coeff = _mm_set_epi64x(filter_vert[y + 1] * 0x0001000100010001ll, filter_vert[y + 0] * 0x0001000100010001ll);
|
||||||
__m128i col = _mm_set_epi64x(input_base_y[ctx->scaled.stride >> 3], input_base_y[0]);
|
__m128i col = _mm_set_epi64x(input_base_y[ctx->scaled.stride >> 3], input_base_y[0]);
|
||||||
@ -100,7 +112,8 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
|
|||||||
int16_t res_g = 0;
|
int16_t res_g = 0;
|
||||||
int16_t res_b = 0;
|
int16_t res_b = 0;
|
||||||
|
|
||||||
for (y = 0; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
|
for (y = 0; y < ctx->vert.filter_len; y++,
|
||||||
|
input_base_y += (ctx->scaled.stride >> 3))
|
||||||
{
|
{
|
||||||
uint64_t col = *input_base_y;
|
uint64_t col = *input_base_y;
|
||||||
|
|
||||||
@ -122,31 +135,29 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
|
|||||||
res_g >>= (7 - 2 - 2);
|
res_g >>= (7 - 2 - 2);
|
||||||
res_b >>= (7 - 2 - 2);
|
res_b >>= (7 - 2 - 2);
|
||||||
|
|
||||||
output[w] = (clamp_8bit(res_a) << 24) | (clamp_8bit(res_r) << 16) |
|
output[w] =
|
||||||
(clamp_8bit(res_g) << 8) | (clamp_8bit(res_b) << 0);
|
(clamp_8bit(res_a) << 24) |
|
||||||
|
(clamp_8bit(res_r) << 16) |
|
||||||
|
(clamp_8bit(res_g) << 8) |
|
||||||
|
(clamp_8bit(res_b) << 0);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#if !defined(__SSE2__)
|
|
||||||
static INLINE uint64_t build_argb64(uint16_t a, uint16_t r, uint16_t g, uint16_t b)
|
|
||||||
{
|
|
||||||
return ((uint64_t)a << 48) | ((uint64_t)r << 32) | ((uint64_t)g << 16) | ((uint64_t)b << 0);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)
|
void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)
|
||||||
{
|
{
|
||||||
int h, w, x;
|
int h, w, x;
|
||||||
const uint32_t *input = (uint32_t*)input_;
|
const uint32_t *input = (uint32_t*)input_;
|
||||||
uint64_t *output = ctx->scaled.frame;
|
uint64_t *output = ctx->scaled.frame;
|
||||||
|
|
||||||
for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2, output += ctx->scaled.stride >> 3)
|
for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2,
|
||||||
|
output += ctx->scaled.stride >> 3)
|
||||||
{
|
{
|
||||||
const int16_t *filter_horiz = ctx->horiz.filter;
|
const int16_t *filter_horiz = ctx->horiz.filter;
|
||||||
|
|
||||||
for (w = 0; w < ctx->scaled.width; w++, filter_horiz += ctx->horiz.filter_stride)
|
for (w = 0; w < ctx->scaled.width; w++,
|
||||||
|
filter_horiz += ctx->horiz.filter_stride)
|
||||||
{
|
{
|
||||||
const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w];
|
const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w];
|
||||||
#if defined(__SSE2__)
|
#if defined(__SSE2__)
|
||||||
@ -209,7 +220,11 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int
|
|||||||
res_b += (b * coeff) >> 16;
|
res_b += (b * coeff) >> 16;
|
||||||
}
|
}
|
||||||
|
|
||||||
output[w] = build_argb64(res_a, res_r, res_g, res_b);
|
output[w] = (
|
||||||
|
(uint64_t)res_a << 48) |
|
||||||
|
((uint64_t)res_r << 32) |
|
||||||
|
((uint64_t)res_g << 16) |
|
||||||
|
((uint64_t)res_b << 0);
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
Loading…
x
Reference in New Issue
Block a user