(gfx/scaler) Cleanups

twinaphex 2017-04-16 19:54:38 +02:00
parent f21bb4d0dc
commit 5b9a17dc8f
3 changed files with 171 additions and 157 deletions

View File

@@ -46,8 +46,7 @@ void conv_rgb565_0rgb1555(void *output_, const void *input_,
uint16_t *output = (uint16_t*)output_;
#if defined(__SSE2__)
int max_width = width - 7;
const __m128i hi_mask = _mm_set1_epi16(0x7fe0);
const __m128i lo_mask = _mm_set1_epi16(0x1f);
#endif
@@ -115,7 +114,7 @@ void conv_0rgb1555_rgb565(void *output_, const void *input_,
uint16_t rg = (col << 1) & ((0x1f << 11) | (0x1f << 6));
uint16_t b = col & 0x1f;
uint16_t glow = (col >> 4) & (1 << 5);
output[w] = rg | b | glow;
}
}
}
@@ -175,14 +174,14 @@ void conv_0rgb1555_argb8888(void *output_, const void *input_,
for (; w < width; w++)
{
uint32_t col = input[w];
uint32_t r = (col >> 10) & 0x1f;
uint32_t g = (col >> 5) & 0x1f;
uint32_t b = (col >> 0) & 0x1f;
r = (r << 3) | (r >> 2);
g = (g << 3) | (g >> 2);
b = (b << 3) | (b >> 2);
output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
}
}
}
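For reference: the scalar tail above widens each 5-bit channel with (r << 3) | (r >> 2), replicating the channel's top bits into the newly opened low bits. A small standalone program (illustrative only, not part of this patch) comparing that against the exact rescale x * 255 / 31:

#include <stdio.h>

int main(void)
{
   unsigned x;
   for (x = 0; x <= 31; x++)
   {
      unsigned expanded = (x << 3) | (x >> 2);  /* what the converter does     */
      unsigned exact    = (x * 255 + 15) / 31;  /* rounded reference expansion */
      printf("%2u -> %3u (exact %3u)\n", x, expanded, exact);
   }
   return 0;
}

The replicated value stays within one count of the exact expansion and maps 0 -> 0 and 31 -> 255 exactly, which is why this is the usual way to widen 15/16-bit color channels.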
@@ -217,22 +216,22 @@ void conv_rgb565_argb8888(void *output_, const void *input_,
__m128i res_lo, res_hi;
__m128i res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
__m128i r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
__m128i g = _mm_and_si128(in, pix_mask_g);
__m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);
r = _mm_mulhi_epi16(r, mul16_r);
g = _mm_mulhi_epi16(g, mul16_g);
b = _mm_mulhi_epi16(b, mul16_b);
res_lo_bg = _mm_unpacklo_epi8(b, g);
res_hi_bg = _mm_unpackhi_epi8(b, g);
res_lo_ra = _mm_unpacklo_epi8(r, a);
res_hi_ra = _mm_unpackhi_epi8(r, a);
res_lo = _mm_or_si128(res_lo_bg,
_mm_slli_si128(res_lo_ra, 2));
res_hi = _mm_or_si128(res_hi_bg,
_mm_slli_si128(res_hi_ra, 2));
_mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
@@ -243,14 +242,14 @@ void conv_rgb565_argb8888(void *output_, const void *input_,
for (; w < width; w++)
{
uint32_t col = input[w];
uint32_t r = (col >> 11) & 0x1f;
uint32_t g = (col >> 5) & 0x3f;
uint32_t b = (col >> 0) & 0x1f;
r = (r << 3) | (r >> 2);
g = (g << 2) | (g >> 4);
b = (b << 3) | (b >> 2);
output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
}
}
}
@@ -269,16 +268,16 @@ void conv_argb8888_rgba4444(void *output_, const void *input_,
for (w = 0; w < width; w++)
{
uint32_t col = input[w];
uint32_t r = (col >> 16) & 0xf;
uint32_t g = (col >> 8) & 0xf;
uint32_t b = (col) & 0xf;
uint32_t a = (col >> 24) & 0xf;
r = (r >> 4) | r;
g = (g >> 4) | g;
b = (b >> 4) | b;
a = (a >> 4) | a;
output[w] = (r << 12) | (g << 8) | (b << 4) | a;
}
}
}
@@ -297,16 +296,16 @@ void conv_rgba4444_argb8888(void *output_, const void *input_,
for (w = 0; w < width; w++)
{
uint32_t col = input[w];
uint32_t r = (col >> 12) & 0xf;
uint32_t g = (col >> 8) & 0xf;
uint32_t b = (col >> 4) & 0xf;
uint32_t a = (col >> 0) & 0xf;
r = (r << 4) | r;
g = (g << 4) | g;
b = (b << 4) | b;
a = (a << 4) | a;
output[w] = (a << 24) | (r << 16) | (g << 8) | (b << 0);
}
}
}
@@ -329,7 +328,7 @@ void conv_rgba4444_rgb565(void *output_, const void *input_,
uint32_t g = (col >> 8) & 0xf;
uint32_t b = (col >> 4) & 0xf;
output[w] = (r << 12) | (g << 7) | (b << 1);
}
}
}
@@ -420,32 +419,32 @@ void conv_0rgb1555_bgr24(void *output_, const void *input_,
__m128i b0 = _mm_and_si128(_mm_slli_epi16(in0, 5), pix_mask_gb);
__m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_gb);
r0 = _mm_mulhi_epi16(r0, mul15_hi);
r1 = _mm_mulhi_epi16(r1, mul15_hi);
g0 = _mm_mulhi_epi16(g0, mul15_mid);
g1 = _mm_mulhi_epi16(g1, mul15_mid);
b0 = _mm_mulhi_epi16(b0, mul15_mid);
b1 = _mm_mulhi_epi16(b1, mul15_mid);
res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
res_lo_bg1 = _mm_unpacklo_epi8(b1, g1);
res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
res_hi_bg1 = _mm_unpackhi_epi8(b1, g1);
res_lo_ra0 = _mm_unpacklo_epi8(r0, a);
res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
res_hi_ra0 = _mm_unpackhi_epi8(r0, a);
res_hi_ra1 = _mm_unpackhi_epi8(r1, a);
res_lo0 = _mm_or_si128(res_lo_bg0,
_mm_slli_si128(res_lo_ra0, 2));
res_lo1 = _mm_or_si128(res_lo_bg1,
_mm_slli_si128(res_lo_ra1, 2));
res_hi0 = _mm_or_si128(res_hi_bg0,
_mm_slli_si128(res_hi_ra0, 2));
res_hi1 = _mm_or_si128(res_hi_bg1,
_mm_slli_si128(res_hi_ra1, 2));
/* Non-POT pixel sizes for the loss */
store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
}
#endif
@@ -456,13 +455,13 @@ void conv_0rgb1555_bgr24(void *output_, const void *input_,
uint32_t b = (col >> 0) & 0x1f;
uint32_t g = (col >> 5) & 0x1f;
uint32_t r = (col >> 10) & 0x1f;
b = (b << 3) | (b >> 2);
g = (g << 3) | (g >> 2);
r = (r << 3) | (r >> 2);
*out++ = b;
*out++ = g;
*out++ = r;
}
}
}
@@ -506,12 +505,12 @@ void conv_rgb565_bgr24(void *output_, const void *input_,
__m128i g1 = _mm_and_si128(in1, pix_mask_g);
__m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_b);
r0 = _mm_mulhi_epi16(r0, mul16_r);
g0 = _mm_mulhi_epi16(g0, mul16_g);
b0 = _mm_mulhi_epi16(b0, mul16_b);
r1 = _mm_mulhi_epi16(r1, mul16_r);
g1 = _mm_mulhi_epi16(g1, mul16_g);
b1 = _mm_mulhi_epi16(b1, mul16_b);
res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
@@ -522,13 +521,13 @@ void conv_rgb565_bgr24(void *output_, const void *input_,
res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
res_hi_ra1 = _mm_unpackhi_epi8(r1, a);
res_lo0 = _mm_or_si128(res_lo_bg0,
_mm_slli_si128(res_lo_ra0, 2));
res_hi0 = _mm_or_si128(res_hi_bg0,
_mm_slli_si128(res_hi_ra0, 2));
res_lo1 = _mm_or_si128(res_lo_bg1,
_mm_slli_si128(res_lo_ra1, 2));
res_hi1 = _mm_or_si128(res_hi_bg1,
_mm_slli_si128(res_hi_ra1, 2));
store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
@@ -569,7 +568,7 @@ void conv_bgr24_argb8888(void *output_, const void *input_,
uint32_t b = *inp++;
uint32_t g = *inp++;
uint32_t r = *inp++;
output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
}
}
}
@@ -588,10 +587,10 @@ void conv_argb8888_0rgb1555(void *output_, const void *input_,
for (w = 0; w < width; w++)
{
uint32_t col = input[w];
uint16_t r = (col >> 19) & 0x1f;
uint16_t g = (col >> 11) & 0x1f;
uint16_t b = (col >> 3) & 0x1f;
output[w] = (r << 10) | (g << 5) | (b << 0);
}
}
}
@@ -627,9 +626,9 @@ void conv_argb8888_bgr24(void *output_, const void *input_,
for (; w < width; w++)
{
uint32_t col = input[w];
*out++ = (uint8_t)(col >> 0);
*out++ = (uint8_t)(col >> 8);
*out++ = (uint8_t)(col >> 16);
}
}
}
@@ -648,7 +647,7 @@ void conv_argb8888_abgr8888(void *output_, const void *input_,
for (w = 0; w < width; w++)
{
uint32_t col = input[w];
output[w] = ((col << 16) & 0xff0000) |
((col >> 16) & 0xff) | (col & 0xff00ff00);
}
}
@@ -793,8 +792,8 @@ void conv_yuyv_argb8888(void *output_, const void *input_,
uint8_t g1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
uint8_t b1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT);
dst[0] = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0);
dst[1] = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0);
}
}
}
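The r0/g0/b0 and r1/g1/b1 terms above are an integer YUV->RGB matrix multiply; the YUV_MAT_*, YUV_SHIFT and YUV_OFFSET constants are defined outside this hunk. Purely as an illustration of how such fixed-point constants can be derived, here is a sketch assuming BT.601 studio-swing input and a 6-bit shift (the actual constants in pixconv.c may differ):

#include <stdint.h>

#define SHIFT  6
#define ROUND  (1 << (SHIFT - 1))   /* rounding term, playing the role of a YUV_OFFSET */
#define FIX(v) ((int)((v) * (1 << SHIFT) + 0.5))

static uint8_t clamp8(int v)
{
   return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}

static void yuv_to_rgb(uint8_t y, uint8_t u, uint8_t v,
      uint8_t *r, uint8_t *g, uint8_t *b)
{
   int c = y - 16, d = u - 128, e = v - 128;   /* remove studio-swing offsets */
   *r = clamp8((FIX(1.164) * c + FIX(1.596) * e + ROUND) >> SHIFT);
   *g = clamp8((FIX(1.164) * c - FIX(0.392) * d - FIX(0.813) * e + ROUND) >> SHIFT);
   *b = clamp8((FIX(1.164) * c + FIX(2.017) * d + ROUND) >> SHIFT);
}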
@@ -809,7 +808,7 @@ void conv_copy(void *output_, const void *input_,
uint8_t *output = (uint8_t*)output_;
if (abs(in_stride) < copy_len)
copy_len = abs(in_stride);
for (h = 0; h < height;
h++, output += out_stride, input += in_stride)

View File

@@ -198,8 +198,8 @@ bool scaler_gen_filter(struct scaler_ctx *ctx)
x_pos = (1 << 15) * ctx->in_width / ctx->out_width - (1 << 15);
y_pos = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15);
gen_filter_point_sub(&ctx->horiz, ctx->out_width, x_pos, x_step);
gen_filter_point_sub(&ctx->vert, ctx->out_height, y_pos, y_step);
ctx->scaler_special = scaler_argb8888_point_special;
break;
@@ -208,8 +208,8 @@ bool scaler_gen_filter(struct scaler_ctx *ctx)
x_pos = (1 << 15) * ctx->in_width / ctx->out_width - (1 << 15);
y_pos = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15);
gen_filter_bilinear_sub(&ctx->horiz, ctx->out_width, x_pos, x_step);
gen_filter_bilinear_sub(&ctx->vert, ctx->out_height, y_pos, y_step);
break;
case SCALER_TYPE_SINC:
@@ -231,7 +231,7 @@ bool scaler_gen_filter(struct scaler_ctx *ctx)
/* Makes sure that we never sample outside our rectangle. */
fixup_filter_sub(&ctx->horiz, ctx->out_width, ctx->in_width);
fixup_filter_sub(&ctx->vert, ctx->out_height, ctx->in_height);
return validate_filter(ctx);
}
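For context: x_pos and y_pos above are 16.16 fixed-point source coordinates. (1 << 15) * in / out - (1 << 15) equals (in/out - 1) / 2 in pixel units, which centre-aligns the input and output grids, and the x_step/y_step values (computed outside this hunk) advance that coordinate once per output pixel. A hypothetical sketch of what a point-filter generator does with these numbers; the real gen_filter_point_sub() lives elsewhere in this file and may differ in detail:

#include <stdint.h>

static void point_sub_sketch(int16_t *filter, int *filter_pos,
      int out_len, int pos, int step)
{
   int i;
   for (i = 0; i < out_len; i++, pos += step)
   {
      filter_pos[i] = pos >> 16;  /* integer source pixel; the fraction is dropped */
      filter[i]     = 1 << 14;    /* single tap with fixed-point 1.0 weight        */
   }
}

Because the starting position can be negative when upscaling, filter_pos may initially point just outside the image; that is what the fixup_filter_sub() calls above clamp away ("never sample outside our rectangle").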

View File

@@ -38,19 +38,28 @@
/* ARGB8888 scaler is split in two:
*
* First, horizontal scaler is applied.
* Here, all 8-bit channels are expanded to 16-bit. Values are then shifted
* left by 7 to occupy 15 bits.
*
* The sign bit is kept empty as we have to do signed multiplication for the
* filter.
*
* A mulhi [(a * b) >> 16] is applied, which loses some precision but is
* very efficient for SIMD.
* It is accurate enough for 8-bit purposes.
*
* The fixed-point 1.0 for the filter is (1 << 14). After the horizontal scale,
* the output is kept in 16-bit channels and now has 13 bits of precision,
* as [(a * (1 << 14)) >> 16] is effectively a right shift by 2.
*
* The vertical scaler takes the 13-bit channels and performs the same
* mulhi steps.
* Another 2 bits of precision are lost, leaving 11 bits.
* Scaling is now complete. Channels are shifted right by 3 and saturated
* into 8-bit values.
*
* The C versions of the scalers perform the exact same operations as the
* SIMD code, for testing purposes.
*/
void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
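A quick way to sanity-check the bit budget described in the comment above is to push one channel through the same arithmetic with a one-tap filter whose weight is the fixed-point 1.0, i.e. (1 << 14). Standalone sketch, not part of the patch:

#include <stdio.h>

int main(void)
{
   int c     = 0xAB;                       /* 8-bit input channel                  */
   int wide  = c << 7;                     /* 15 bits, sign bit left clear         */
   int horiz = (wide  * (1 << 14)) >> 16;  /* mulhi by 1.0: 13 bits after >> 2     */
   int vert  = (horiz * (1 << 14)) >> 16;  /* second mulhi: down to 11 bits        */
   int out   = vert >> (7 - 2 - 2);        /* final >> 3 back to an 8-bit value    */

   printf("in=0x%02X out=0x%02X\n", c, out);  /* prints in=0xAB out=0xAB */
   return 0;
}

With a unity weight the round trip is exact; the precision loss the comment mentions comes from fractional tap weights and from summing multiple taps with saturating 16-bit adds.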
@@ -61,9 +70,11 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
const int16_t *filter_vert = ctx->vert.filter;
for (h = 0; h < ctx->out_height; h++,
filter_vert += ctx->vert.filter_stride, output += stride >> 2)
{
const uint64_t *input_base = input + ctx->vert.filter_pos[h]
* (ctx->scaled.stride >> 3);
for (w = 0; w < ctx->out_width; w++)
{
@@ -72,12 +83,13 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
__m128i final;
__m128i res = _mm_setzero_si128();
for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2,
input_base_y += (ctx->scaled.stride >> 2))
{
__m128i coeff = _mm_set_epi64x(filter_vert[y + 1] * 0x0001000100010001ll, filter_vert[y + 0] * 0x0001000100010001ll);
__m128i col = _mm_set_epi64x(input_base_y[ctx->scaled.stride >> 3], input_base_y[0]);
res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
}
for (; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
@@ -85,7 +97,7 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
__m128i coeff = _mm_set_epi64x(0, filter_vert[y] * 0x0001000100010001ll);
__m128i col = _mm_set_epi64x(0, input_base_y[0]);
res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
}
res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
@@ -100,53 +112,52 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
int16_t res_g = 0;
int16_t res_b = 0;
for (y = 0; y < ctx->vert.filter_len; y++,
input_base_y += (ctx->scaled.stride >> 3))
{
uint64_t col = *input_base_y;
int16_t a = (col >> 48) & 0xffff;
int16_t r = (col >> 32) & 0xffff;
int16_t g = (col >> 16) & 0xffff;
int16_t b = (col >> 0) & 0xffff;
int16_t coeff = filter_vert[y];
res_a += (a * coeff) >> 16;
res_r += (r * coeff) >> 16;
res_g += (g * coeff) >> 16;
res_b += (b * coeff) >> 16;
}
res_a >>= (7 - 2 - 2);
res_r >>= (7 - 2 - 2);
res_g >>= (7 - 2 - 2);
res_b >>= (7 - 2 - 2);
output[w] =
(clamp_8bit(res_a) << 24) |
(clamp_8bit(res_r) << 16) |
(clamp_8bit(res_g) << 8) |
(clamp_8bit(res_b) << 0);
#endif
}
}
}
#if !defined(__SSE2__)
static INLINE uint64_t build_argb64(uint16_t a, uint16_t r, uint16_t g, uint16_t b)
{
return ((uint64_t)a << 48) | ((uint64_t)r << 32) | ((uint64_t)g << 16) | ((uint64_t)b << 0);
}
#endif
void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)
{
int h, w, x;
const uint32_t *input = (uint32_t*)input_;
uint64_t *output = ctx->scaled.frame;
for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2,
output += ctx->scaled.stride >> 3)
{
const int16_t *filter_horiz = ctx->horiz.filter;
for (w = 0; w < ctx->scaled.width; w++,
filter_horiz += ctx->horiz.filter_stride)
{
const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w];
#if defined(__SSE2__)
@@ -156,11 +167,11 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int
{
__m128i coeff = _mm_set_epi64x(filter_horiz[x + 1] * 0x0001000100010001ll, filter_horiz[x + 0] * 0x0001000100010001ll);
__m128i col = _mm_unpacklo_epi8(_mm_set_epi64x(0,
((uint64_t)input_base_x[x + 1] << 32) | input_base_x[x + 0]), _mm_setzero_si128());
col = _mm_slli_epi16(col, 7);
res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
}
for (; x < ctx->horiz.filter_len; x++)
@@ -168,14 +179,14 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int
__m128i coeff = _mm_set_epi64x(0, filter_horiz[x] * 0x0001000100010001ll);
__m128i col = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, 0, input_base_x[x]), _mm_setzero_si128());
col = _mm_slli_epi16(col, 7);
res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
}
res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
#ifdef __x86_64__
output[w] = _mm_cvtsi128_si64(res);
#else /* 32-bit doesn't have si64. Do it in two steps. */
union
{
@@ -194,22 +205,26 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int
for (x = 0; x < ctx->horiz.filter_len; x++)
{
uint32_t col = input_base_x[x];
int16_t a = (col >> (24 - 7)) & (0xff << 7);
int16_t r = (col >> (16 - 7)) & (0xff << 7);
int16_t g = (col >> ( 8 - 7)) & (0xff << 7);
int16_t b = (col << ( 0 + 7)) & (0xff << 7);
int16_t coeff = filter_horiz[x];
res_a += (a * coeff) >> 16;
res_r += (r * coeff) >> 16;
res_g += (g * coeff) >> 16;
res_b += (b * coeff) >> 16;
}
output[w] = (
(uint64_t)res_a << 48) |
((uint64_t)res_r << 32) |
((uint64_t)res_g << 16) |
((uint64_t)res_b << 0);
#endif
}
}
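
Finally, a rough usage sketch of the scaler these three files implement, converting and upscaling an RGB565 frame to ARGB8888 with bilinear filtering. The function names, struct fields and enum values below are quoted from memory of gfx/scaler/scaler.h and should be checked against the header before use:

#include <stdint.h>
#include <stdbool.h>
#include <gfx/scaler/scaler.h>

static bool rescale_frame(uint32_t *dst, const uint16_t *src)
{
   struct scaler_ctx ctx = {0};

   ctx.in_width    = 320;
   ctx.in_height   = 240;
   ctx.in_stride   = 320 * (int)sizeof(uint16_t);
   ctx.in_fmt      = SCALER_FMT_RGB565;

   ctx.out_width   = 640;
   ctx.out_height  = 480;
   ctx.out_stride  = 640 * (int)sizeof(uint32_t);
   ctx.out_fmt     = SCALER_FMT_ARGB8888;

   ctx.scaler_type = SCALER_TYPE_BILINEAR;

   if (!scaler_ctx_gen_filter(&ctx))   /* builds and validates the filter tables shown above */
      return false;

   scaler_ctx_scale(&ctx, dst, src);   /* pixel conversion + horizontal + vertical passes */
   scaler_ctx_gen_reset(&ctx);         /* frees the filter tables and scratch frame */
   return true;
}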