pixconv: add MMX version for conv_rgb565_argb8888

This commit is contained in:
Brad Parker 2019-02-16 12:51:07 -05:00
parent b617bb5d64
commit 1922d898af

View File

@ -35,6 +35,8 @@
#if defined(__SSE2__)
#include <emmintrin.h>
#elif defined(__MMX__)
#include <mmintrin.h>
#endif
void conv_rgb565_0rgb1555(void *output_, const void *input_,
@ -204,6 +206,16 @@ void conv_rgb565_argb8888(void *output_, const void *input_,
const __m128i a = _mm_set1_epi16(0x00ff);
int max_width = width - 7;
#elif defined(__MMX__)
const __m64 pix_mask_r = _mm_set1_pi16(0x1f << 10);
const __m64 pix_mask_g = _mm_set1_pi16(0x3f << 5);
const __m64 pix_mask_b = _mm_set1_pi16(0x1f << 5);
const __m64 mul16_r = _mm_set1_pi16(0x0210);
const __m64 mul16_g = _mm_set1_pi16(0x2080);
const __m64 mul16_b = _mm_set1_pi16(0x4200);
const __m64 a = _mm_set1_pi16(0x00ff);
int max_width = width - 3;
#endif
for (h = 0; h < height;
@ -237,6 +249,35 @@ void conv_rgb565_argb8888(void *output_, const void *input_,
_mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
_mm_storeu_si128((__m128i*)(output + w + 4), res_hi);
}
#elif defined(__MMX__)
for (; w < max_width; w += 4)
{
__m64 res_lo, res_hi;
__m64 res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
const __m64 in = _mm_cvtsi64_m64(*((int64_t*)(input + w)));
__m64 r = _mm_and_si64(_mm_srli_pi16(in, 1), pix_mask_r);
__m64 g = _mm_and_si64(in, pix_mask_g);
__m64 b = _mm_and_si64(_mm_slli_pi16(in, 5), pix_mask_b);
r = _mm_mulhi_pi16(r, mul16_r);
g = _mm_mulhi_pi16(g, mul16_g);
b = _mm_mulhi_pi16(b, mul16_b);
res_lo_bg = _mm_unpacklo_pi8(b, g);
res_hi_bg = _mm_unpackhi_pi8(b, g);
res_lo_ra = _mm_unpacklo_pi8(r, a);
res_hi_ra = _mm_unpackhi_pi8(r, a);
res_lo = _mm_or_si64(res_lo_bg,
_mm_slli_si64(res_lo_ra, 16));
res_hi = _mm_or_si64(res_hi_bg,
_mm_slli_si64(res_hi_ra, 16));
*((int64_t*)(output + w + 0)) = _mm_cvtm64_si64(res_lo);
*((int64_t*)(output + w + 2)) = _mm_cvtm64_si64(res_hi);
}
_mm_empty();
#endif
for (; w < width; w++)
@ -284,18 +325,18 @@ void conv_rgb565_abgr8888(void *output_, const void *input_,
__m128i r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
__m128i g = _mm_and_si128(in, pix_mask_g);
__m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);
r = _mm_mulhi_epi16(r, mul16_r);
r = _mm_mulhi_epi16(r, mul16_r);
g = _mm_mulhi_epi16(g, mul16_g);
b = _mm_mulhi_epi16(b, mul16_b);
res_lo_bg = _mm_unpacklo_epi8(b, g);
res_lo_bg = _mm_unpacklo_epi8(b, g);
res_hi_bg = _mm_unpackhi_epi8(b, g);
res_lo_ra = _mm_unpacklo_epi8(r, a);
res_hi_ra = _mm_unpackhi_epi8(r, a);
res_lo = _mm_or_si128(res_lo_bg,
res_lo = _mm_or_si128(res_lo_bg,
_mm_slli_si128(res_lo_ra, 2));
res_hi = _mm_or_si128(res_hi_bg,
_mm_slli_si128(res_hi_ra, 2));
_mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
_mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
_mm_storeu_si128((__m128i*)(output + w + 4), res_hi);
}
#endif