From 5b9a17dc8fe3d9b15d62642774cc5156f97298b7 Mon Sep 17 00:00:00 2001 From: twinaphex Date: Sun, 16 Apr 2017 19:54:38 +0200 Subject: [PATCH] (gfx/scaler) Cleanups --- libretro-common/gfx/scaler/pixconv.c | 189 ++++++++++----------- libretro-common/gfx/scaler/scaler_filter.c | 10 +- libretro-common/gfx/scaler/scaler_int.c | 129 +++++++------- 3 files changed, 171 insertions(+), 157 deletions(-) diff --git a/libretro-common/gfx/scaler/pixconv.c b/libretro-common/gfx/scaler/pixconv.c index 09cf34cefb..7097952f82 100644 --- a/libretro-common/gfx/scaler/pixconv.c +++ b/libretro-common/gfx/scaler/pixconv.c @@ -46,8 +46,7 @@ void conv_rgb565_0rgb1555(void *output_, const void *input_, uint16_t *output = (uint16_t*)output_; #if defined(__SSE2_) - int max_width = width - 7; - + int max_width = width - 7; const __m128i hi_mask = _mm_set1_epi16(0x7fe0); const __m128i lo_mask = _mm_set1_epi16(0x1f); #endif @@ -115,7 +114,7 @@ void conv_0rgb1555_rgb565(void *output_, const void *input_, uint16_t rg = (col << 1) & ((0x1f << 11) | (0x1f << 6)); uint16_t b = col & 0x1f; uint16_t glow = (col >> 4) & (1 << 5); - output[w] = rg | b | glow; + output[w] = rg | b | glow; } } } @@ -175,14 +174,14 @@ void conv_0rgb1555_argb8888(void *output_, const void *input_, for (; w < width; w++) { uint32_t col = input[w]; - uint32_t r = (col >> 10) & 0x1f; - uint32_t g = (col >> 5) & 0x1f; - uint32_t b = (col >> 0) & 0x1f; - r = (r << 3) | (r >> 2); - g = (g << 3) | (g >> 2); - b = (b << 3) | (b >> 2); + uint32_t r = (col >> 10) & 0x1f; + uint32_t g = (col >> 5) & 0x1f; + uint32_t b = (col >> 0) & 0x1f; + r = (r << 3) | (r >> 2); + g = (g << 3) | (g >> 2); + b = (b << 3) | (b >> 2); - output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0); + output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0); } } } @@ -217,22 +216,22 @@ void conv_rgb565_argb8888(void *output_, const void *input_, __m128i res_lo, res_hi; __m128i res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra; const __m128i in = 
_mm_loadu_si128((const __m128i*)(input + w)); - __m128i r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r); - __m128i g = _mm_and_si128(in, pix_mask_g); - __m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b); + __m128i r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r); + __m128i g = _mm_and_si128(in, pix_mask_g); + __m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b); - r = _mm_mulhi_epi16(r, mul16_r); - g = _mm_mulhi_epi16(g, mul16_g); - b = _mm_mulhi_epi16(b, mul16_b); + r = _mm_mulhi_epi16(r, mul16_r); + g = _mm_mulhi_epi16(g, mul16_g); + b = _mm_mulhi_epi16(b, mul16_b); - res_lo_bg = _mm_unpacklo_epi8(b, g); - res_hi_bg = _mm_unpackhi_epi8(b, g); - res_lo_ra = _mm_unpacklo_epi8(r, a); - res_hi_ra = _mm_unpackhi_epi8(r, a); + res_lo_bg = _mm_unpacklo_epi8(b, g); + res_hi_bg = _mm_unpackhi_epi8(b, g); + res_lo_ra = _mm_unpacklo_epi8(r, a); + res_hi_ra = _mm_unpackhi_epi8(r, a); - res_lo = _mm_or_si128(res_lo_bg, + res_lo = _mm_or_si128(res_lo_bg, _mm_slli_si128(res_lo_ra, 2)); - res_hi = _mm_or_si128(res_hi_bg, + res_hi = _mm_or_si128(res_hi_bg, _mm_slli_si128(res_hi_ra, 2)); _mm_storeu_si128((__m128i*)(output + w + 0), res_lo); @@ -243,14 +242,14 @@ void conv_rgb565_argb8888(void *output_, const void *input_, for (; w < width; w++) { uint32_t col = input[w]; - uint32_t r = (col >> 11) & 0x1f; - uint32_t g = (col >> 5) & 0x3f; - uint32_t b = (col >> 0) & 0x1f; - r = (r << 3) | (r >> 2); - g = (g << 2) | (g >> 4); - b = (b << 3) | (b >> 2); + uint32_t r = (col >> 11) & 0x1f; + uint32_t g = (col >> 5) & 0x3f; + uint32_t b = (col >> 0) & 0x1f; + r = (r << 3) | (r >> 2); + g = (g << 2) | (g >> 4); + b = (b << 3) | (b >> 2); - output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0); + output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0); } } } @@ -269,16 +268,16 @@ void conv_argb8888_rgba4444(void *output_, const void *input_, for (w = 0; w < width; w++) { uint32_t col = input[w]; - uint32_t r = (col >> 16) & 0xf; - uint32_t g = (col >> 
8) & 0xf; - uint32_t b = (col) & 0xf; - uint32_t a = (col >> 24) & 0xf; - r = (r >> 4) | r; - g = (g >> 4) | g; - b = (b >> 4) | b; - a = (a >> 4) | a; + uint32_t r = (col >> 16) & 0xf; + uint32_t g = (col >> 8) & 0xf; + uint32_t b = (col) & 0xf; + uint32_t a = (col >> 24) & 0xf; + r = (r >> 4) | r; + g = (g >> 4) | g; + b = (b >> 4) | b; + a = (a >> 4) | a; - output[w] = (r << 12) | (g << 8) | (b << 4) | a; + output[w] = (r << 12) | (g << 8) | (b << 4) | a; } } } @@ -297,16 +296,16 @@ void conv_rgba4444_argb8888(void *output_, const void *input_, for (w = 0; w < width; w++) { uint32_t col = input[w]; - uint32_t r = (col >> 12) & 0xf; - uint32_t g = (col >> 8) & 0xf; - uint32_t b = (col >> 4) & 0xf; - uint32_t a = (col >> 0) & 0xf; - r = (r << 4) | r; - g = (g << 4) | g; - b = (b << 4) | b; - a = (a << 4) | a; + uint32_t r = (col >> 12) & 0xf; + uint32_t g = (col >> 8) & 0xf; + uint32_t b = (col >> 4) & 0xf; + uint32_t a = (col >> 0) & 0xf; + r = (r << 4) | r; + g = (g << 4) | g; + b = (b << 4) | b; + a = (a << 4) | a; - output[w] = (a << 24) | (r << 16) | (g << 8) | (b << 0); + output[w] = (a << 24) | (r << 16) | (g << 8) | (b << 0); } } } @@ -329,7 +328,7 @@ void conv_rgba4444_rgb565(void *output_, const void *input_, uint32_t g = (col >> 8) & 0xf; uint32_t b = (col >> 4) & 0xf; - output[w] = (r << 12) | (g << 7) | (b << 1); + output[w] = (r << 12) | (g << 7) | (b << 1); } } } @@ -420,32 +419,32 @@ void conv_0rgb1555_bgr24(void *output_, const void *input_, __m128i b0 = _mm_and_si128(_mm_slli_epi16(in0, 5), pix_mask_gb); __m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_gb); - r0 = _mm_mulhi_epi16(r0, mul15_hi); - r1 = _mm_mulhi_epi16(r1, mul15_hi); - g0 = _mm_mulhi_epi16(g0, mul15_mid); - g1 = _mm_mulhi_epi16(g1, mul15_mid); - b0 = _mm_mulhi_epi16(b0, mul15_mid); - b1 = _mm_mulhi_epi16(b1, mul15_mid); + r0 = _mm_mulhi_epi16(r0, mul15_hi); + r1 = _mm_mulhi_epi16(r1, mul15_hi); + g0 = _mm_mulhi_epi16(g0, mul15_mid); + g1 = _mm_mulhi_epi16(g1, mul15_mid); 
+ b0 = _mm_mulhi_epi16(b0, mul15_mid); + b1 = _mm_mulhi_epi16(b1, mul15_mid); - res_lo_bg0 = _mm_unpacklo_epi8(b0, g0); - res_lo_bg1 = _mm_unpacklo_epi8(b1, g1); - res_hi_bg0 = _mm_unpackhi_epi8(b0, g0); - res_hi_bg1 = _mm_unpackhi_epi8(b1, g1); - res_lo_ra0 = _mm_unpacklo_epi8(r0, a); - res_lo_ra1 = _mm_unpacklo_epi8(r1, a); - res_hi_ra0 = _mm_unpackhi_epi8(r0, a); - res_hi_ra1 = _mm_unpackhi_epi8(r1, a); + res_lo_bg0 = _mm_unpacklo_epi8(b0, g0); + res_lo_bg1 = _mm_unpacklo_epi8(b1, g1); + res_hi_bg0 = _mm_unpackhi_epi8(b0, g0); + res_hi_bg1 = _mm_unpackhi_epi8(b1, g1); + res_lo_ra0 = _mm_unpacklo_epi8(r0, a); + res_lo_ra1 = _mm_unpacklo_epi8(r1, a); + res_hi_ra0 = _mm_unpackhi_epi8(r0, a); + res_hi_ra1 = _mm_unpackhi_epi8(r1, a); - res_lo0 = _mm_or_si128(res_lo_bg0, + res_lo0 = _mm_or_si128(res_lo_bg0, _mm_slli_si128(res_lo_ra0, 2)); - res_lo1 = _mm_or_si128(res_lo_bg1, + res_lo1 = _mm_or_si128(res_lo_bg1, _mm_slli_si128(res_lo_ra1, 2)); - res_hi0 = _mm_or_si128(res_hi_bg0, + res_hi0 = _mm_or_si128(res_hi_bg0, _mm_slli_si128(res_hi_ra0, 2)); - res_hi1 = _mm_or_si128(res_hi_bg1, + res_hi1 = _mm_or_si128(res_hi_bg1, _mm_slli_si128(res_hi_ra1, 2)); - /* Non-POT pixel sizes ftl :( */ + /* Non-POT pixel sizes for the loss */ store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1); } #endif @@ -456,13 +455,13 @@ void conv_0rgb1555_bgr24(void *output_, const void *input_, uint32_t b = (col >> 0) & 0x1f; uint32_t g = (col >> 5) & 0x1f; uint32_t r = (col >> 10) & 0x1f; - b = (b << 3) | (b >> 2); - g = (g << 3) | (g >> 2); - r = (r << 3) | (r >> 2); + b = (b << 3) | (b >> 2); + g = (g << 3) | (g >> 2); + r = (r << 3) | (r >> 2); - *out++ = b; - *out++ = g; - *out++ = r; + *out++ = b; + *out++ = g; + *out++ = r; } } } @@ -506,12 +505,12 @@ void conv_rgb565_bgr24(void *output_, const void *input_, __m128i g1 = _mm_and_si128(in1, pix_mask_g); __m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_b); - r0 = _mm_mulhi_epi16(r0, mul16_r); - g0 = _mm_mulhi_epi16(g0, 
mul16_g); - b0 = _mm_mulhi_epi16(b0, mul16_b); - r1 = _mm_mulhi_epi16(r1, mul16_r); - g1 = _mm_mulhi_epi16(g1, mul16_g); - b1 = _mm_mulhi_epi16(b1, mul16_b); + r0 = _mm_mulhi_epi16(r0, mul16_r); + g0 = _mm_mulhi_epi16(g0, mul16_g); + b0 = _mm_mulhi_epi16(b0, mul16_b); + r1 = _mm_mulhi_epi16(r1, mul16_r); + g1 = _mm_mulhi_epi16(g1, mul16_g); + b1 = _mm_mulhi_epi16(b1, mul16_b); res_lo_bg0 = _mm_unpacklo_epi8(b0, g0); res_hi_bg0 = _mm_unpackhi_epi8(b0, g0); @@ -522,13 +521,13 @@ void conv_rgb565_bgr24(void *output_, const void *input_, res_lo_ra1 = _mm_unpacklo_epi8(r1, a); res_hi_ra1 = _mm_unpackhi_epi8(r1, a); - res_lo0 = _mm_or_si128(res_lo_bg0, + res_lo0 = _mm_or_si128(res_lo_bg0, _mm_slli_si128(res_lo_ra0, 2)); - res_hi0 = _mm_or_si128(res_hi_bg0, + res_hi0 = _mm_or_si128(res_hi_bg0, _mm_slli_si128(res_hi_ra0, 2)); - res_lo1 = _mm_or_si128(res_lo_bg1, + res_lo1 = _mm_or_si128(res_lo_bg1, _mm_slli_si128(res_lo_ra1, 2)); - res_hi1 = _mm_or_si128(res_hi_bg1, + res_hi1 = _mm_or_si128(res_hi_bg1, _mm_slli_si128(res_hi_ra1, 2)); store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1); @@ -569,7 +568,7 @@ void conv_bgr24_argb8888(void *output_, const void *input_, uint32_t b = *inp++; uint32_t g = *inp++; uint32_t r = *inp++; - output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0); + output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0); } } } @@ -588,10 +587,10 @@ void conv_argb8888_0rgb1555(void *output_, const void *input_, for (w = 0; w < width; w++) { uint32_t col = input[w]; - uint16_t r = (col >> 19) & 0x1f; - uint16_t g = (col >> 11) & 0x1f; - uint16_t b = (col >> 3) & 0x1f; - output[w] = (r << 10) | (g << 5) | (b << 0); + uint16_t r = (col >> 19) & 0x1f; + uint16_t g = (col >> 11) & 0x1f; + uint16_t b = (col >> 3) & 0x1f; + output[w] = (r << 10) | (g << 5) | (b << 0); } } } @@ -627,9 +626,9 @@ void conv_argb8888_bgr24(void *output_, const void *input_, for (; w < width; w++) { uint32_t col = input[w]; - *out++ = (uint8_t)(col >> 0); - *out++ = 
(uint8_t)(col >> 8); - *out++ = (uint8_t)(col >> 16); + *out++ = (uint8_t)(col >> 0); + *out++ = (uint8_t)(col >> 8); + *out++ = (uint8_t)(col >> 16); } } } @@ -648,7 +647,7 @@ void conv_argb8888_abgr8888(void *output_, const void *input_, for (w = 0; w < width; w++) { uint32_t col = input[w]; - output[w] = ((col << 16) & 0xff0000) | + output[w] = ((col << 16) & 0xff0000) | ((col >> 16) & 0xff) | (col & 0xff00ff00); } } @@ -793,8 +792,8 @@ void conv_yuyv_argb8888(void *output_, const void *input_, uint8_t g1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT); uint8_t b1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT); - dst[0] = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0); - dst[1] = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0); + dst[0] = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0); + dst[1] = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0); } } } @@ -809,7 +808,7 @@ void conv_copy(void *output_, const void *input_, uint8_t *output = (uint8_t*)output_; if (abs(in_stride) < copy_len) - copy_len = abs(in_stride); + copy_len = abs(in_stride); for (h = 0; h < height; h++, output += out_stride, input += in_stride) diff --git a/libretro-common/gfx/scaler/scaler_filter.c b/libretro-common/gfx/scaler/scaler_filter.c index 48bc23a2a3..56d14304bc 100644 --- a/libretro-common/gfx/scaler/scaler_filter.c +++ b/libretro-common/gfx/scaler/scaler_filter.c @@ -198,8 +198,8 @@ bool scaler_gen_filter(struct scaler_ctx *ctx) x_pos = (1 << 15) * ctx->in_width / ctx->out_width - (1 << 15); y_pos = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15); - gen_filter_point_sub(&ctx->horiz, ctx->out_width, x_pos, x_step); - gen_filter_point_sub(&ctx->vert, ctx->out_height, y_pos, y_step); + gen_filter_point_sub(&ctx->horiz, ctx->out_width, x_pos, x_step); + gen_filter_point_sub(&ctx->vert, ctx->out_height, y_pos, y_step); ctx->scaler_special = scaler_argb8888_point_special; break; @@ -208,8 +208,8 @@ 
bool scaler_gen_filter(struct scaler_ctx *ctx) x_pos = (1 << 15) * ctx->in_width / ctx->out_width - (1 << 15); y_pos = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15); - gen_filter_bilinear_sub(&ctx->horiz, ctx->out_width, x_pos, x_step); - gen_filter_bilinear_sub(&ctx->vert, ctx->out_height, y_pos, y_step); + gen_filter_bilinear_sub(&ctx->horiz, ctx->out_width, x_pos, x_step); + gen_filter_bilinear_sub(&ctx->vert, ctx->out_height, y_pos, y_step); break; case SCALER_TYPE_SINC: @@ -231,7 +231,7 @@ bool scaler_gen_filter(struct scaler_ctx *ctx) /* Makes sure that we never sample outside our rectangle. */ fixup_filter_sub(&ctx->horiz, ctx->out_width, ctx->in_width); - fixup_filter_sub(&ctx->vert, ctx->out_height, ctx->in_height); + fixup_filter_sub(&ctx->vert, ctx->out_height, ctx->in_height); return validate_filter(ctx); } diff --git a/libretro-common/gfx/scaler/scaler_int.c b/libretro-common/gfx/scaler/scaler_int.c index 2b189c6dd1..cc96042834 100644 --- a/libretro-common/gfx/scaler/scaler_int.c +++ b/libretro-common/gfx/scaler/scaler_int.c @@ -38,19 +38,28 @@ /* ARGB8888 scaler is split in two: * * First, horizontal scaler is applied. - * Here, all 8-bit channels are expanded to 16-bit. Values are then shifted 7 to left to occupy 15 bits. - * The sign bit is kept empty as we have to do signed multiplication for the filter. - * A mulhi [(a * b) >> 16] is applied which loses some precision, but is very efficient for SIMD. + * Here, all 8-bit channels are expanded to 16-bit. Values are then shifted 7 + * to left to occupy 15 bits. + * + * The sign bit is kept empty as we have to do signed multiplication for the + * filter. + * + * A mulhi [(a * b) >> 16] is applied which loses some precision, but is + * very efficient for SIMD. * It is accurate enough for 8-bit purposes. * - * The fixed point 1.0 for filter is (1 << 14). 
After horizontal scale, the output is kept - * with 16-bit channels, and will now have 13 bits of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2. + * The fixed point 1.0 for filter is (1 << 14). After horizontal scale, + * the output is kept with 16-bit channels, and will now have 13 bits + * of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2. * - * Vertical scaler takes the 13 bit channels, and performs the same mulhi steps. + * Vertical scaler takes the 13 bit channels, and performs the + * same mulhi steps. * Another 2 bits of precision is lost, which ends up as 11 bits. - * Scaling is now complete. Channels are shifted right by 3, and saturated into 8-bit values. + * Scaling is now complete. Channels are shifted right by 3, and saturated + * into 8-bit values. * - * The C version of scalers perform the exact same operations as the SIMD code for testing purposes. + * The C version of scalers perform the exact same operations as the + * SIMD code for testing purposes. 
*/ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride) @@ -61,9 +70,11 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid const int16_t *filter_vert = ctx->vert.filter; - for (h = 0; h < ctx->out_height; h++, filter_vert += ctx->vert.filter_stride, output += stride >> 2) + for (h = 0; h < ctx->out_height; h++, + filter_vert += ctx->vert.filter_stride, output += stride >> 2) { - const uint64_t *input_base = input + ctx->vert.filter_pos[h] * (ctx->scaled.stride >> 3); + const uint64_t *input_base = input + ctx->vert.filter_pos[h] + * (ctx->scaled.stride >> 3); for (w = 0; w < ctx->out_width; w++) { @@ -72,12 +83,13 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid __m128i final; __m128i res = _mm_setzero_si128(); - for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2, input_base_y += (ctx->scaled.stride >> 2)) + for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2, + input_base_y += (ctx->scaled.stride >> 2)) { __m128i coeff = _mm_set_epi64x(filter_vert[y + 1] * 0x0001000100010001ll, filter_vert[y + 0] * 0x0001000100010001ll); __m128i col = _mm_set_epi64x(input_base_y[ctx->scaled.stride >> 3], input_base_y[0]); - res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res); + res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res); } for (; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3)) @@ -85,7 +97,7 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid __m128i coeff = _mm_set_epi64x(0, filter_vert[y] * 0x0001000100010001ll); __m128i col = _mm_set_epi64x(0, input_base_y[0]); - res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res); + res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res); } res = _mm_adds_epi16(_mm_srli_si128(res, 8), res); @@ -100,53 +112,52 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid int16_t res_g = 0; int16_t res_b = 0; - for (y = 0; y < ctx->vert.filter_len; y++, 
input_base_y += (ctx->scaled.stride >> 3)) + for (y = 0; y < ctx->vert.filter_len; y++, + input_base_y += (ctx->scaled.stride >> 3)) { - uint64_t col = *input_base_y; + uint64_t col = *input_base_y; - int16_t a = (col >> 48) & 0xffff; - int16_t r = (col >> 32) & 0xffff; - int16_t g = (col >> 16) & 0xffff; - int16_t b = (col >> 0) & 0xffff; + int16_t a = (col >> 48) & 0xffff; + int16_t r = (col >> 32) & 0xffff; + int16_t g = (col >> 16) & 0xffff; + int16_t b = (col >> 0) & 0xffff; - int16_t coeff = filter_vert[y]; + int16_t coeff = filter_vert[y]; - res_a += (a * coeff) >> 16; - res_r += (r * coeff) >> 16; - res_g += (g * coeff) >> 16; - res_b += (b * coeff) >> 16; + res_a += (a * coeff) >> 16; + res_r += (r * coeff) >> 16; + res_g += (g * coeff) >> 16; + res_b += (b * coeff) >> 16; } - res_a >>= (7 - 2 - 2); - res_r >>= (7 - 2 - 2); - res_g >>= (7 - 2 - 2); - res_b >>= (7 - 2 - 2); + res_a >>= (7 - 2 - 2); + res_r >>= (7 - 2 - 2); + res_g >>= (7 - 2 - 2); + res_b >>= (7 - 2 - 2); - output[w] = (clamp_8bit(res_a) << 24) | (clamp_8bit(res_r) << 16) | - (clamp_8bit(res_g) << 8) | (clamp_8bit(res_b) << 0); + output[w] = + (clamp_8bit(res_a) << 24) | + (clamp_8bit(res_r) << 16) | + (clamp_8bit(res_g) << 8) | + (clamp_8bit(res_b) << 0); #endif } } } -#if !defined(__SSE2__) -static INLINE uint64_t build_argb64(uint16_t a, uint16_t r, uint16_t g, uint16_t b) -{ - return ((uint64_t)a << 48) | ((uint64_t)r << 32) | ((uint64_t)g << 16) | ((uint64_t)b << 0); -} -#endif - void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride) { int h, w, x; const uint32_t *input = (uint32_t*)input_; uint64_t *output = ctx->scaled.frame; - for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2, output += ctx->scaled.stride >> 3) + for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2, + output += ctx->scaled.stride >> 3) { const int16_t *filter_horiz = ctx->horiz.filter; - for (w = 0; w < ctx->scaled.width; w++, filter_horiz += 
ctx->horiz.filter_stride) + for (w = 0; w < ctx->scaled.width; w++, + filter_horiz += ctx->horiz.filter_stride) { const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w]; #if defined(__SSE2__) @@ -156,11 +167,11 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int { __m128i coeff = _mm_set_epi64x(filter_horiz[x + 1] * 0x0001000100010001ll, filter_horiz[x + 0] * 0x0001000100010001ll); - __m128i col = _mm_unpacklo_epi8(_mm_set_epi64x(0, + __m128i col = _mm_unpacklo_epi8(_mm_set_epi64x(0, ((uint64_t)input_base_x[x + 1] << 32) | input_base_x[x + 0]), _mm_setzero_si128()); - col = _mm_slli_epi16(col, 7); - res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res); + col = _mm_slli_epi16(col, 7); + res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res); } for (; x < ctx->horiz.filter_len; x++) @@ -168,14 +179,14 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int __m128i coeff = _mm_set_epi64x(0, filter_horiz[x] * 0x0001000100010001ll); __m128i col = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, 0, input_base_x[x]), _mm_setzero_si128()); - col = _mm_slli_epi16(col, 7); - res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res); + col = _mm_slli_epi16(col, 7); + res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res); } - res = _mm_adds_epi16(_mm_srli_si128(res, 8), res); + res = _mm_adds_epi16(_mm_srli_si128(res, 8), res); #ifdef __x86_64__ - output[w] = _mm_cvtsi128_si64(res); + output[w] = _mm_cvtsi128_si64(res); #else /* 32-bit doesn't have si64. Do it in two steps. 
*/ union { @@ -194,22 +205,26 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int for (x = 0; x < ctx->horiz.filter_len; x++) { - uint32_t col = input_base_x[x]; + uint32_t col = input_base_x[x]; - int16_t a = (col >> (24 - 7)) & (0xff << 7); - int16_t r = (col >> (16 - 7)) & (0xff << 7); - int16_t g = (col >> ( 8 - 7)) & (0xff << 7); - int16_t b = (col << ( 0 + 7)) & (0xff << 7); + int16_t a = (col >> (24 - 7)) & (0xff << 7); + int16_t r = (col >> (16 - 7)) & (0xff << 7); + int16_t g = (col >> ( 8 - 7)) & (0xff << 7); + int16_t b = (col << ( 0 + 7)) & (0xff << 7); - int16_t coeff = filter_horiz[x]; + int16_t coeff = filter_horiz[x]; - res_a += (a * coeff) >> 16; - res_r += (r * coeff) >> 16; - res_g += (g * coeff) >> 16; - res_b += (b * coeff) >> 16; + res_a += (a * coeff) >> 16; + res_r += (r * coeff) >> 16; + res_g += (g * coeff) >> 16; + res_b += (b * coeff) >> 16; } - output[w] = build_argb64(res_a, res_r, res_g, res_b); + output[w] = ( + (uint64_t)(uint16_t)res_a << 48) | + ((uint64_t)(uint16_t)res_r << 32) | + ((uint64_t)(uint16_t)res_g << 16) | + ((uint64_t)(uint16_t)res_b << 0); #endif } }