mirror of
https://github.com/libretro/RetroArch
synced 2025-02-05 06:40:07 +00:00
(gfx/scaler) Cleanups
This commit is contained in:
parent
f21bb4d0dc
commit
5b9a17dc8f
@ -46,8 +46,7 @@ void conv_rgb565_0rgb1555(void *output_, const void *input_,
|
||||
uint16_t *output = (uint16_t*)output_;
|
||||
|
||||
#if defined(__SSE2_)
|
||||
int max_width = width - 7;
|
||||
|
||||
int max_width = width - 7;
|
||||
const __m128i hi_mask = _mm_set1_epi16(0x7fe0);
|
||||
const __m128i lo_mask = _mm_set1_epi16(0x1f);
|
||||
#endif
|
||||
@ -115,7 +114,7 @@ void conv_0rgb1555_rgb565(void *output_, const void *input_,
|
||||
uint16_t rg = (col << 1) & ((0x1f << 11) | (0x1f << 6));
|
||||
uint16_t b = col & 0x1f;
|
||||
uint16_t glow = (col >> 4) & (1 << 5);
|
||||
output[w] = rg | b | glow;
|
||||
output[w] = rg | b | glow;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -175,14 +174,14 @@ void conv_0rgb1555_argb8888(void *output_, const void *input_,
|
||||
for (; w < width; w++)
|
||||
{
|
||||
uint32_t col = input[w];
|
||||
uint32_t r = (col >> 10) & 0x1f;
|
||||
uint32_t g = (col >> 5) & 0x1f;
|
||||
uint32_t b = (col >> 0) & 0x1f;
|
||||
r = (r << 3) | (r >> 2);
|
||||
g = (g << 3) | (g >> 2);
|
||||
b = (b << 3) | (b >> 2);
|
||||
uint32_t r = (col >> 10) & 0x1f;
|
||||
uint32_t g = (col >> 5) & 0x1f;
|
||||
uint32_t b = (col >> 0) & 0x1f;
|
||||
r = (r << 3) | (r >> 2);
|
||||
g = (g << 3) | (g >> 2);
|
||||
b = (b << 3) | (b >> 2);
|
||||
|
||||
output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
|
||||
output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -217,22 +216,22 @@ void conv_rgb565_argb8888(void *output_, const void *input_,
|
||||
__m128i res_lo, res_hi;
|
||||
__m128i res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
|
||||
const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
|
||||
__m128i r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
|
||||
__m128i g = _mm_and_si128(in, pix_mask_g);
|
||||
__m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);
|
||||
__m128i r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
|
||||
__m128i g = _mm_and_si128(in, pix_mask_g);
|
||||
__m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);
|
||||
|
||||
r = _mm_mulhi_epi16(r, mul16_r);
|
||||
g = _mm_mulhi_epi16(g, mul16_g);
|
||||
b = _mm_mulhi_epi16(b, mul16_b);
|
||||
r = _mm_mulhi_epi16(r, mul16_r);
|
||||
g = _mm_mulhi_epi16(g, mul16_g);
|
||||
b = _mm_mulhi_epi16(b, mul16_b);
|
||||
|
||||
res_lo_bg = _mm_unpacklo_epi8(b, g);
|
||||
res_hi_bg = _mm_unpackhi_epi8(b, g);
|
||||
res_lo_ra = _mm_unpacklo_epi8(r, a);
|
||||
res_hi_ra = _mm_unpackhi_epi8(r, a);
|
||||
res_lo_bg = _mm_unpacklo_epi8(b, g);
|
||||
res_hi_bg = _mm_unpackhi_epi8(b, g);
|
||||
res_lo_ra = _mm_unpacklo_epi8(r, a);
|
||||
res_hi_ra = _mm_unpackhi_epi8(r, a);
|
||||
|
||||
res_lo = _mm_or_si128(res_lo_bg,
|
||||
res_lo = _mm_or_si128(res_lo_bg,
|
||||
_mm_slli_si128(res_lo_ra, 2));
|
||||
res_hi = _mm_or_si128(res_hi_bg,
|
||||
res_hi = _mm_or_si128(res_hi_bg,
|
||||
_mm_slli_si128(res_hi_ra, 2));
|
||||
|
||||
_mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
|
||||
@ -243,14 +242,14 @@ void conv_rgb565_argb8888(void *output_, const void *input_,
|
||||
for (; w < width; w++)
|
||||
{
|
||||
uint32_t col = input[w];
|
||||
uint32_t r = (col >> 11) & 0x1f;
|
||||
uint32_t g = (col >> 5) & 0x3f;
|
||||
uint32_t b = (col >> 0) & 0x1f;
|
||||
r = (r << 3) | (r >> 2);
|
||||
g = (g << 2) | (g >> 4);
|
||||
b = (b << 3) | (b >> 2);
|
||||
uint32_t r = (col >> 11) & 0x1f;
|
||||
uint32_t g = (col >> 5) & 0x3f;
|
||||
uint32_t b = (col >> 0) & 0x1f;
|
||||
r = (r << 3) | (r >> 2);
|
||||
g = (g << 2) | (g >> 4);
|
||||
b = (b << 3) | (b >> 2);
|
||||
|
||||
output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
|
||||
output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -269,16 +268,16 @@ void conv_argb8888_rgba4444(void *output_, const void *input_,
|
||||
for (w = 0; w < width; w++)
|
||||
{
|
||||
uint32_t col = input[w];
|
||||
uint32_t r = (col >> 16) & 0xf;
|
||||
uint32_t g = (col >> 8) & 0xf;
|
||||
uint32_t b = (col) & 0xf;
|
||||
uint32_t a = (col >> 24) & 0xf;
|
||||
r = (r >> 4) | r;
|
||||
g = (g >> 4) | g;
|
||||
b = (b >> 4) | b;
|
||||
a = (a >> 4) | a;
|
||||
uint32_t r = (col >> 16) & 0xf;
|
||||
uint32_t g = (col >> 8) & 0xf;
|
||||
uint32_t b = (col) & 0xf;
|
||||
uint32_t a = (col >> 24) & 0xf;
|
||||
r = (r >> 4) | r;
|
||||
g = (g >> 4) | g;
|
||||
b = (b >> 4) | b;
|
||||
a = (a >> 4) | a;
|
||||
|
||||
output[w] = (r << 12) | (g << 8) | (b << 4) | a;
|
||||
output[w] = (r << 12) | (g << 8) | (b << 4) | a;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -297,16 +296,16 @@ void conv_rgba4444_argb8888(void *output_, const void *input_,
|
||||
for (w = 0; w < width; w++)
|
||||
{
|
||||
uint32_t col = input[w];
|
||||
uint32_t r = (col >> 12) & 0xf;
|
||||
uint32_t g = (col >> 8) & 0xf;
|
||||
uint32_t b = (col >> 4) & 0xf;
|
||||
uint32_t a = (col >> 0) & 0xf;
|
||||
r = (r << 4) | r;
|
||||
g = (g << 4) | g;
|
||||
b = (b << 4) | b;
|
||||
a = (a << 4) | a;
|
||||
uint32_t r = (col >> 12) & 0xf;
|
||||
uint32_t g = (col >> 8) & 0xf;
|
||||
uint32_t b = (col >> 4) & 0xf;
|
||||
uint32_t a = (col >> 0) & 0xf;
|
||||
r = (r << 4) | r;
|
||||
g = (g << 4) | g;
|
||||
b = (b << 4) | b;
|
||||
a = (a << 4) | a;
|
||||
|
||||
output[w] = (a << 24) | (r << 16) | (g << 8) | (b << 0);
|
||||
output[w] = (a << 24) | (r << 16) | (g << 8) | (b << 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -329,7 +328,7 @@ void conv_rgba4444_rgb565(void *output_, const void *input_,
|
||||
uint32_t g = (col >> 8) & 0xf;
|
||||
uint32_t b = (col >> 4) & 0xf;
|
||||
|
||||
output[w] = (r << 12) | (g << 7) | (b << 1);
|
||||
output[w] = (r << 12) | (g << 7) | (b << 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -420,32 +419,32 @@ void conv_0rgb1555_bgr24(void *output_, const void *input_,
|
||||
__m128i b0 = _mm_and_si128(_mm_slli_epi16(in0, 5), pix_mask_gb);
|
||||
__m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_gb);
|
||||
|
||||
r0 = _mm_mulhi_epi16(r0, mul15_hi);
|
||||
r1 = _mm_mulhi_epi16(r1, mul15_hi);
|
||||
g0 = _mm_mulhi_epi16(g0, mul15_mid);
|
||||
g1 = _mm_mulhi_epi16(g1, mul15_mid);
|
||||
b0 = _mm_mulhi_epi16(b0, mul15_mid);
|
||||
b1 = _mm_mulhi_epi16(b1, mul15_mid);
|
||||
r0 = _mm_mulhi_epi16(r0, mul15_hi);
|
||||
r1 = _mm_mulhi_epi16(r1, mul15_hi);
|
||||
g0 = _mm_mulhi_epi16(g0, mul15_mid);
|
||||
g1 = _mm_mulhi_epi16(g1, mul15_mid);
|
||||
b0 = _mm_mulhi_epi16(b0, mul15_mid);
|
||||
b1 = _mm_mulhi_epi16(b1, mul15_mid);
|
||||
|
||||
res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
|
||||
res_lo_bg1 = _mm_unpacklo_epi8(b1, g1);
|
||||
res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
|
||||
res_hi_bg1 = _mm_unpackhi_epi8(b1, g1);
|
||||
res_lo_ra0 = _mm_unpacklo_epi8(r0, a);
|
||||
res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
|
||||
res_hi_ra0 = _mm_unpackhi_epi8(r0, a);
|
||||
res_hi_ra1 = _mm_unpackhi_epi8(r1, a);
|
||||
res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
|
||||
res_lo_bg1 = _mm_unpacklo_epi8(b1, g1);
|
||||
res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
|
||||
res_hi_bg1 = _mm_unpackhi_epi8(b1, g1);
|
||||
res_lo_ra0 = _mm_unpacklo_epi8(r0, a);
|
||||
res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
|
||||
res_hi_ra0 = _mm_unpackhi_epi8(r0, a);
|
||||
res_hi_ra1 = _mm_unpackhi_epi8(r1, a);
|
||||
|
||||
res_lo0 = _mm_or_si128(res_lo_bg0,
|
||||
res_lo0 = _mm_or_si128(res_lo_bg0,
|
||||
_mm_slli_si128(res_lo_ra0, 2));
|
||||
res_lo1 = _mm_or_si128(res_lo_bg1,
|
||||
res_lo1 = _mm_or_si128(res_lo_bg1,
|
||||
_mm_slli_si128(res_lo_ra1, 2));
|
||||
res_hi0 = _mm_or_si128(res_hi_bg0,
|
||||
res_hi0 = _mm_or_si128(res_hi_bg0,
|
||||
_mm_slli_si128(res_hi_ra0, 2));
|
||||
res_hi1 = _mm_or_si128(res_hi_bg1,
|
||||
res_hi1 = _mm_or_si128(res_hi_bg1,
|
||||
_mm_slli_si128(res_hi_ra1, 2));
|
||||
|
||||
/* Non-POT pixel sizes ftl :( */
|
||||
/* Non-POT pixel sizes for the loss */
|
||||
store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
|
||||
}
|
||||
#endif
|
||||
@ -456,13 +455,13 @@ void conv_0rgb1555_bgr24(void *output_, const void *input_,
|
||||
uint32_t b = (col >> 0) & 0x1f;
|
||||
uint32_t g = (col >> 5) & 0x1f;
|
||||
uint32_t r = (col >> 10) & 0x1f;
|
||||
b = (b << 3) | (b >> 2);
|
||||
g = (g << 3) | (g >> 2);
|
||||
r = (r << 3) | (r >> 2);
|
||||
b = (b << 3) | (b >> 2);
|
||||
g = (g << 3) | (g >> 2);
|
||||
r = (r << 3) | (r >> 2);
|
||||
|
||||
*out++ = b;
|
||||
*out++ = g;
|
||||
*out++ = r;
|
||||
*out++ = b;
|
||||
*out++ = g;
|
||||
*out++ = r;
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -506,12 +505,12 @@ void conv_rgb565_bgr24(void *output_, const void *input_,
|
||||
__m128i g1 = _mm_and_si128(in1, pix_mask_g);
|
||||
__m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_b);
|
||||
|
||||
r0 = _mm_mulhi_epi16(r0, mul16_r);
|
||||
g0 = _mm_mulhi_epi16(g0, mul16_g);
|
||||
b0 = _mm_mulhi_epi16(b0, mul16_b);
|
||||
r1 = _mm_mulhi_epi16(r1, mul16_r);
|
||||
g1 = _mm_mulhi_epi16(g1, mul16_g);
|
||||
b1 = _mm_mulhi_epi16(b1, mul16_b);
|
||||
r0 = _mm_mulhi_epi16(r0, mul16_r);
|
||||
g0 = _mm_mulhi_epi16(g0, mul16_g);
|
||||
b0 = _mm_mulhi_epi16(b0, mul16_b);
|
||||
r1 = _mm_mulhi_epi16(r1, mul16_r);
|
||||
g1 = _mm_mulhi_epi16(g1, mul16_g);
|
||||
b1 = _mm_mulhi_epi16(b1, mul16_b);
|
||||
|
||||
res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
|
||||
res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
|
||||
@ -522,13 +521,13 @@ void conv_rgb565_bgr24(void *output_, const void *input_,
|
||||
res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
|
||||
res_hi_ra1 = _mm_unpackhi_epi8(r1, a);
|
||||
|
||||
res_lo0 = _mm_or_si128(res_lo_bg0,
|
||||
res_lo0 = _mm_or_si128(res_lo_bg0,
|
||||
_mm_slli_si128(res_lo_ra0, 2));
|
||||
res_hi0 = _mm_or_si128(res_hi_bg0,
|
||||
res_hi0 = _mm_or_si128(res_hi_bg0,
|
||||
_mm_slli_si128(res_hi_ra0, 2));
|
||||
res_lo1 = _mm_or_si128(res_lo_bg1,
|
||||
res_lo1 = _mm_or_si128(res_lo_bg1,
|
||||
_mm_slli_si128(res_lo_ra1, 2));
|
||||
res_hi1 = _mm_or_si128(res_hi_bg1,
|
||||
res_hi1 = _mm_or_si128(res_hi_bg1,
|
||||
_mm_slli_si128(res_hi_ra1, 2));
|
||||
|
||||
store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
|
||||
@ -569,7 +568,7 @@ void conv_bgr24_argb8888(void *output_, const void *input_,
|
||||
uint32_t b = *inp++;
|
||||
uint32_t g = *inp++;
|
||||
uint32_t r = *inp++;
|
||||
output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
|
||||
output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -588,10 +587,10 @@ void conv_argb8888_0rgb1555(void *output_, const void *input_,
|
||||
for (w = 0; w < width; w++)
|
||||
{
|
||||
uint32_t col = input[w];
|
||||
uint16_t r = (col >> 19) & 0x1f;
|
||||
uint16_t g = (col >> 11) & 0x1f;
|
||||
uint16_t b = (col >> 3) & 0x1f;
|
||||
output[w] = (r << 10) | (g << 5) | (b << 0);
|
||||
uint16_t r = (col >> 19) & 0x1f;
|
||||
uint16_t g = (col >> 11) & 0x1f;
|
||||
uint16_t b = (col >> 3) & 0x1f;
|
||||
output[w] = (r << 10) | (g << 5) | (b << 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -627,9 +626,9 @@ void conv_argb8888_bgr24(void *output_, const void *input_,
|
||||
for (; w < width; w++)
|
||||
{
|
||||
uint32_t col = input[w];
|
||||
*out++ = (uint8_t)(col >> 0);
|
||||
*out++ = (uint8_t)(col >> 8);
|
||||
*out++ = (uint8_t)(col >> 16);
|
||||
*out++ = (uint8_t)(col >> 0);
|
||||
*out++ = (uint8_t)(col >> 8);
|
||||
*out++ = (uint8_t)(col >> 16);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -648,7 +647,7 @@ void conv_argb8888_abgr8888(void *output_, const void *input_,
|
||||
for (w = 0; w < width; w++)
|
||||
{
|
||||
uint32_t col = input[w];
|
||||
output[w] = ((col << 16) & 0xff0000) |
|
||||
output[w] = ((col << 16) & 0xff0000) |
|
||||
((col >> 16) & 0xff) | (col & 0xff00ff00);
|
||||
}
|
||||
}
|
||||
@ -793,8 +792,8 @@ void conv_yuyv_argb8888(void *output_, const void *input_,
|
||||
uint8_t g1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
|
||||
uint8_t b1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT);
|
||||
|
||||
dst[0] = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0);
|
||||
dst[1] = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0);
|
||||
dst[0] = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0);
|
||||
dst[1] = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0);
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -809,7 +808,7 @@ void conv_copy(void *output_, const void *input_,
|
||||
uint8_t *output = (uint8_t*)output_;
|
||||
|
||||
if (abs(in_stride) < copy_len)
|
||||
copy_len = abs(in_stride);
|
||||
copy_len = abs(in_stride);
|
||||
|
||||
for (h = 0; h < height;
|
||||
h++, output += out_stride, input += in_stride)
|
||||
|
@ -198,8 +198,8 @@ bool scaler_gen_filter(struct scaler_ctx *ctx)
|
||||
x_pos = (1 << 15) * ctx->in_width / ctx->out_width - (1 << 15);
|
||||
y_pos = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15);
|
||||
|
||||
gen_filter_point_sub(&ctx->horiz, ctx->out_width, x_pos, x_step);
|
||||
gen_filter_point_sub(&ctx->vert, ctx->out_height, y_pos, y_step);
|
||||
gen_filter_point_sub(&ctx->horiz, ctx->out_width, x_pos, x_step);
|
||||
gen_filter_point_sub(&ctx->vert, ctx->out_height, y_pos, y_step);
|
||||
|
||||
ctx->scaler_special = scaler_argb8888_point_special;
|
||||
break;
|
||||
@ -208,8 +208,8 @@ bool scaler_gen_filter(struct scaler_ctx *ctx)
|
||||
x_pos = (1 << 15) * ctx->in_width / ctx->out_width - (1 << 15);
|
||||
y_pos = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15);
|
||||
|
||||
gen_filter_bilinear_sub(&ctx->horiz, ctx->out_width, x_pos, x_step);
|
||||
gen_filter_bilinear_sub(&ctx->vert, ctx->out_height, y_pos, y_step);
|
||||
gen_filter_bilinear_sub(&ctx->horiz, ctx->out_width, x_pos, x_step);
|
||||
gen_filter_bilinear_sub(&ctx->vert, ctx->out_height, y_pos, y_step);
|
||||
break;
|
||||
|
||||
case SCALER_TYPE_SINC:
|
||||
@ -231,7 +231,7 @@ bool scaler_gen_filter(struct scaler_ctx *ctx)
|
||||
|
||||
/* Makes sure that we never sample outside our rectangle. */
|
||||
fixup_filter_sub(&ctx->horiz, ctx->out_width, ctx->in_width);
|
||||
fixup_filter_sub(&ctx->vert, ctx->out_height, ctx->in_height);
|
||||
fixup_filter_sub(&ctx->vert, ctx->out_height, ctx->in_height);
|
||||
|
||||
return validate_filter(ctx);
|
||||
}
|
||||
|
@ -38,19 +38,28 @@
|
||||
/* ARGB8888 scaler is split in two:
|
||||
*
|
||||
* First, horizontal scaler is applied.
|
||||
* Here, all 8-bit channels are expanded to 16-bit. Values are then shifted left by 7 bits to occupy 15 bits.
|
||||
* The sign bit is kept empty as we have to do signed multiplication for the filter.
|
||||
* A mulhi [(a * b) >> 16] is applied which loses some precision, but is very efficient for SIMD.
|
||||
* Here, all 8-bit channels are expanded to 16-bit. Values are then shifted
|
||||
* left by 7 bits to occupy 15 bits.
|
||||
*
|
||||
* The sign bit is kept empty as we have to do signed multiplication for the
|
||||
* filter.
|
||||
*
|
||||
* A mulhi [(a * b) >> 16] is applied which loses some precision, but is
|
||||
* very efficient for SIMD.
|
||||
* It is accurate enough for 8-bit purposes.
|
||||
*
|
||||
* The fixed point 1.0 for filter is (1 << 14). After horizontal scale, the output is kept
|
||||
* with 16-bit channels, and will now have 13 bits of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2.
|
||||
* The fixed point 1.0 for filter is (1 << 14). After horizontal scale,
|
||||
* the output is kept with 16-bit channels, and will now have 13 bits
|
||||
* of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2.
|
||||
*
|
||||
* Vertical scaler takes the 13 bit channels, and performs the same mulhi steps.
|
||||
* Vertical scaler takes the 13 bit channels, and performs the
|
||||
* same mulhi steps.
|
||||
* Another 2 bits of precision are lost, which leaves 11 bits.
|
||||
* Scaling is now complete. Channels are shifted right by 3, and saturated into 8-bit values.
|
||||
* Scaling is now complete. Channels are shifted right by 3, and saturated
|
||||
* into 8-bit values.
|
||||
*
|
||||
* The C versions of the scalers perform the exact same operations as the SIMD code, for testing purposes.
|
||||
* The C versions of the scalers perform the exact same operations as the
|
||||
* SIMD code for testing purposes.
|
||||
*/
|
||||
|
||||
void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
|
||||
@ -61,9 +70,11 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
|
||||
|
||||
const int16_t *filter_vert = ctx->vert.filter;
|
||||
|
||||
for (h = 0; h < ctx->out_height; h++, filter_vert += ctx->vert.filter_stride, output += stride >> 2)
|
||||
for (h = 0; h < ctx->out_height; h++,
|
||||
filter_vert += ctx->vert.filter_stride, output += stride >> 2)
|
||||
{
|
||||
const uint64_t *input_base = input + ctx->vert.filter_pos[h] * (ctx->scaled.stride >> 3);
|
||||
const uint64_t *input_base = input + ctx->vert.filter_pos[h]
|
||||
* (ctx->scaled.stride >> 3);
|
||||
|
||||
for (w = 0; w < ctx->out_width; w++)
|
||||
{
|
||||
@ -72,12 +83,13 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
|
||||
__m128i final;
|
||||
__m128i res = _mm_setzero_si128();
|
||||
|
||||
for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2, input_base_y += (ctx->scaled.stride >> 2))
|
||||
for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2,
|
||||
input_base_y += (ctx->scaled.stride >> 2))
|
||||
{
|
||||
__m128i coeff = _mm_set_epi64x(filter_vert[y + 1] * 0x0001000100010001ll, filter_vert[y + 0] * 0x0001000100010001ll);
|
||||
__m128i col = _mm_set_epi64x(input_base_y[ctx->scaled.stride >> 3], input_base_y[0]);
|
||||
|
||||
res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
|
||||
res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
|
||||
}
|
||||
|
||||
for (; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
|
||||
@ -85,7 +97,7 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
|
||||
__m128i coeff = _mm_set_epi64x(0, filter_vert[y] * 0x0001000100010001ll);
|
||||
__m128i col = _mm_set_epi64x(0, input_base_y[0]);
|
||||
|
||||
res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
|
||||
res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
|
||||
}
|
||||
|
||||
res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
|
||||
@ -100,53 +112,52 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
|
||||
int16_t res_g = 0;
|
||||
int16_t res_b = 0;
|
||||
|
||||
for (y = 0; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
|
||||
for (y = 0; y < ctx->vert.filter_len; y++,
|
||||
input_base_y += (ctx->scaled.stride >> 3))
|
||||
{
|
||||
uint64_t col = *input_base_y;
|
||||
uint64_t col = *input_base_y;
|
||||
|
||||
int16_t a = (col >> 48) & 0xffff;
|
||||
int16_t r = (col >> 32) & 0xffff;
|
||||
int16_t g = (col >> 16) & 0xffff;
|
||||
int16_t b = (col >> 0) & 0xffff;
|
||||
int16_t a = (col >> 48) & 0xffff;
|
||||
int16_t r = (col >> 32) & 0xffff;
|
||||
int16_t g = (col >> 16) & 0xffff;
|
||||
int16_t b = (col >> 0) & 0xffff;
|
||||
|
||||
int16_t coeff = filter_vert[y];
|
||||
int16_t coeff = filter_vert[y];
|
||||
|
||||
res_a += (a * coeff) >> 16;
|
||||
res_r += (r * coeff) >> 16;
|
||||
res_g += (g * coeff) >> 16;
|
||||
res_b += (b * coeff) >> 16;
|
||||
res_a += (a * coeff) >> 16;
|
||||
res_r += (r * coeff) >> 16;
|
||||
res_g += (g * coeff) >> 16;
|
||||
res_b += (b * coeff) >> 16;
|
||||
}
|
||||
|
||||
res_a >>= (7 - 2 - 2);
|
||||
res_r >>= (7 - 2 - 2);
|
||||
res_g >>= (7 - 2 - 2);
|
||||
res_b >>= (7 - 2 - 2);
|
||||
res_a >>= (7 - 2 - 2);
|
||||
res_r >>= (7 - 2 - 2);
|
||||
res_g >>= (7 - 2 - 2);
|
||||
res_b >>= (7 - 2 - 2);
|
||||
|
||||
output[w] = (clamp_8bit(res_a) << 24) | (clamp_8bit(res_r) << 16) |
|
||||
(clamp_8bit(res_g) << 8) | (clamp_8bit(res_b) << 0);
|
||||
output[w] =
|
||||
(clamp_8bit(res_a) << 24) |
|
||||
(clamp_8bit(res_r) << 16) |
|
||||
(clamp_8bit(res_g) << 8) |
|
||||
(clamp_8bit(res_b) << 0);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#if !defined(__SSE2__)
|
||||
/* Packs four 16-bit channel values into one 64-bit word.
 * Layout, from most to least significant 16 bits: a, r, g, b
 * (a in bits 63-48, r in 47-32, g in 31-16, b in 15-0).
 * Each argument is widened to uint64_t before it is shifted, so the
 * shifts by 32 and 48 are performed in 64-bit arithmetic. */
static INLINE uint64_t build_argb64(uint16_t a, uint16_t r, uint16_t g, uint16_t b)
|
||||
{
|
||||
return ((uint64_t)a << 48) | ((uint64_t)r << 32) | ((uint64_t)g << 16) | ((uint64_t)b << 0);
|
||||
}
|
||||
#endif
|
||||
|
||||
void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)
|
||||
{
|
||||
int h, w, x;
|
||||
const uint32_t *input = (uint32_t*)input_;
|
||||
uint64_t *output = ctx->scaled.frame;
|
||||
|
||||
for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2, output += ctx->scaled.stride >> 3)
|
||||
for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2,
|
||||
output += ctx->scaled.stride >> 3)
|
||||
{
|
||||
const int16_t *filter_horiz = ctx->horiz.filter;
|
||||
|
||||
for (w = 0; w < ctx->scaled.width; w++, filter_horiz += ctx->horiz.filter_stride)
|
||||
for (w = 0; w < ctx->scaled.width; w++,
|
||||
filter_horiz += ctx->horiz.filter_stride)
|
||||
{
|
||||
const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w];
|
||||
#if defined(__SSE2__)
|
||||
@ -156,11 +167,11 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int
|
||||
{
|
||||
__m128i coeff = _mm_set_epi64x(filter_horiz[x + 1] * 0x0001000100010001ll, filter_horiz[x + 0] * 0x0001000100010001ll);
|
||||
|
||||
__m128i col = _mm_unpacklo_epi8(_mm_set_epi64x(0,
|
||||
__m128i col = _mm_unpacklo_epi8(_mm_set_epi64x(0,
|
||||
((uint64_t)input_base_x[x + 1] << 32) | input_base_x[x + 0]), _mm_setzero_si128());
|
||||
|
||||
col = _mm_slli_epi16(col, 7);
|
||||
res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
|
||||
col = _mm_slli_epi16(col, 7);
|
||||
res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
|
||||
}
|
||||
|
||||
for (; x < ctx->horiz.filter_len; x++)
|
||||
@ -168,14 +179,14 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int
|
||||
__m128i coeff = _mm_set_epi64x(0, filter_horiz[x] * 0x0001000100010001ll);
|
||||
__m128i col = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, 0, input_base_x[x]), _mm_setzero_si128());
|
||||
|
||||
col = _mm_slli_epi16(col, 7);
|
||||
res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
|
||||
col = _mm_slli_epi16(col, 7);
|
||||
res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
|
||||
}
|
||||
|
||||
res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
|
||||
res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
|
||||
|
||||
#ifdef __x86_64__
|
||||
output[w] = _mm_cvtsi128_si64(res);
|
||||
output[w] = _mm_cvtsi128_si64(res);
|
||||
#else /* 32-bit doesn't have si64. Do it in two steps. */
|
||||
union
|
||||
{
|
||||
@ -194,22 +205,26 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int
|
||||
|
||||
for (x = 0; x < ctx->horiz.filter_len; x++)
|
||||
{
|
||||
uint32_t col = input_base_x[x];
|
||||
uint32_t col = input_base_x[x];
|
||||
|
||||
int16_t a = (col >> (24 - 7)) & (0xff << 7);
|
||||
int16_t r = (col >> (16 - 7)) & (0xff << 7);
|
||||
int16_t g = (col >> ( 8 - 7)) & (0xff << 7);
|
||||
int16_t b = (col << ( 0 + 7)) & (0xff << 7);
|
||||
int16_t a = (col >> (24 - 7)) & (0xff << 7);
|
||||
int16_t r = (col >> (16 - 7)) & (0xff << 7);
|
||||
int16_t g = (col >> ( 8 - 7)) & (0xff << 7);
|
||||
int16_t b = (col << ( 0 + 7)) & (0xff << 7);
|
||||
|
||||
int16_t coeff = filter_horiz[x];
|
||||
int16_t coeff = filter_horiz[x];
|
||||
|
||||
res_a += (a * coeff) >> 16;
|
||||
res_r += (r * coeff) >> 16;
|
||||
res_g += (g * coeff) >> 16;
|
||||
res_b += (b * coeff) >> 16;
|
||||
res_a += (a * coeff) >> 16;
|
||||
res_r += (r * coeff) >> 16;
|
||||
res_g += (g * coeff) >> 16;
|
||||
res_b += (b * coeff) >> 16;
|
||||
}
|
||||
|
||||
output[w] = build_argb64(res_a, res_r, res_g, res_b);
|
||||
output[w] = (
|
||||
(uint64_t)res_a << 48) |
|
||||
((uint64_t)res_r << 32) |
|
||||
((uint64_t)res_g << 16) |
|
||||
((uint64_t)res_b << 0);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user