From 1922d898afff0ef9fe8dbfa083709fb28db6f2e8 Mon Sep 17 00:00:00 2001 From: Brad Parker Date: Sat, 16 Feb 2019 12:51:07 -0500 Subject: [PATCH] pixconv: add MMX version for conv_rgb565_argb8888 --- libretro-common/gfx/scaler/pixconv.c | 49 +++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/libretro-common/gfx/scaler/pixconv.c b/libretro-common/gfx/scaler/pixconv.c index c26e95c26d..46b2a33e8e 100644 --- a/libretro-common/gfx/scaler/pixconv.c +++ b/libretro-common/gfx/scaler/pixconv.c @@ -35,6 +35,8 @@ #if defined(__SSE2__) #include <emmintrin.h> +#elif defined(__MMX__) +#include <mmintrin.h> #endif void conv_rgb565_0rgb1555(void *output_, const void *input_, @@ -204,6 +206,16 @@ void conv_rgb565_argb8888(void *output_, const void *input_, const __m128i a = _mm_set1_epi16(0x00ff); int max_width = width - 7; +#elif defined(__MMX__) + const __m64 pix_mask_r = _mm_set1_pi16(0x1f << 10); + const __m64 pix_mask_g = _mm_set1_pi16(0x3f << 5); + const __m64 pix_mask_b = _mm_set1_pi16(0x1f << 5); + const __m64 mul16_r = _mm_set1_pi16(0x0210); + const __m64 mul16_g = _mm_set1_pi16(0x2080); + const __m64 mul16_b = _mm_set1_pi16(0x4200); + const __m64 a = _mm_set1_pi16(0x00ff); + + int max_width = width - 3; #endif for (h = 0; h < height; @@ -237,6 +249,35 @@ void conv_rgb565_argb8888(void *output_, const void *input_, _mm_storeu_si128((__m128i*)(output + w + 0), res_lo); _mm_storeu_si128((__m128i*)(output + w + 4), res_hi); } +#elif defined(__MMX__) + for (; w < max_width; w += 4) + { + __m64 res_lo, res_hi; + __m64 res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra; + const __m64 in = _mm_cvtsi64_m64(*((int64_t*)(input + w))); + __m64 r = _mm_and_si64(_mm_srli_pi16(in, 1), pix_mask_r); + __m64 g = _mm_and_si64(in, pix_mask_g); + __m64 b = _mm_and_si64(_mm_slli_pi16(in, 5), pix_mask_b); + + r = _mm_mulhi_pi16(r, mul16_r); + g = _mm_mulhi_pi16(g, mul16_g); + b = _mm_mulhi_pi16(b, mul16_b); + + res_lo_bg = _mm_unpacklo_pi8(b, g); + res_hi_bg = _mm_unpackhi_pi8(b, g); + 
res_lo_ra = _mm_unpacklo_pi8(r, a); + res_hi_ra = _mm_unpackhi_pi8(r, a); + + res_lo = _mm_or_si64(res_lo_bg, + _mm_slli_si64(res_lo_ra, 16)); + res_hi = _mm_or_si64(res_hi_bg, + _mm_slli_si64(res_hi_ra, 16)); + + *((int64_t*)(output + w + 0)) = _mm_cvtm64_si64(res_lo); + *((int64_t*)(output + w + 2)) = _mm_cvtm64_si64(res_hi); + } + + _mm_empty(); #endif for (; w < width; w++) @@ -284,18 +325,18 @@ void conv_rgb565_abgr8888(void *output_, const void *input_, __m128i r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r); __m128i g = _mm_and_si128(in, pix_mask_g); __m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b); - r = _mm_mulhi_epi16(r, mul16_r); + r = _mm_mulhi_epi16(r, mul16_r); g = _mm_mulhi_epi16(g, mul16_g); b = _mm_mulhi_epi16(b, mul16_b); - res_lo_bg = _mm_unpacklo_epi8(b, g); + res_lo_bg = _mm_unpacklo_epi8(b, g); res_hi_bg = _mm_unpackhi_epi8(b, g); res_lo_ra = _mm_unpacklo_epi8(r, a); res_hi_ra = _mm_unpackhi_epi8(r, a); - res_lo = _mm_or_si128(res_lo_bg, + res_lo = _mm_or_si128(res_lo_bg, _mm_slli_si128(res_lo_ra, 2)); res_hi = _mm_or_si128(res_hi_bg, _mm_slli_si128(res_hi_ra, 2)); - _mm_storeu_si128((__m128i*)(output + w + 0), res_lo); + _mm_storeu_si128((__m128i*)(output + w + 0), res_lo); _mm_storeu_si128((__m128i*)(output + w + 4), res_hi); } #endif