Mirror of https://github.com/libretro/RetroArch (synced 2025-02-15 18:39:55 +00:00)

(gfx/scaler) Cleanups

parent f21bb4d0dc
commit 5b9a17dc8f
@@ -46,8 +46,7 @@ void conv_rgb565_0rgb1555(void *output_, const void *input_,
   uint16_t *output = (uint16_t*)output_;

#if defined(__SSE2__)
   int max_width = width - 7;
-
   const __m128i hi_mask = _mm_set1_epi16(0x7fe0);
   const __m128i lo_mask = _mm_set1_epi16(0x1f);
#endif
@@ -115,7 +114,7 @@ void conv_0rgb1555_rgb565(void *output_, const void *input_,
         uint16_t rg = (col << 1) & ((0x1f << 11) | (0x1f << 6));
         uint16_t b = col & 0x1f;
         uint16_t glow = (col >> 4) & (1 << 5);
         output[w] = rg | b | glow;
      }
   }
}
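An aside on the hunk above: 0RGB1555 and RGB565 differ only in where green sits and how wide it is, so the conversion is a shift plus one replicated bit. R and G move up one position, B stays put, and the sixth green bit ("glow") is filled with the old green MSB so full-intensity green still maps to full intensity. A minimal, self-contained check of the same expressions (the helper name is invented for illustration; it is not part of pixconv.c):

#include <assert.h>
#include <stdint.h>

static uint16_t conv_one_1555_to_565(uint16_t col)
{
   uint16_t rg   = (col << 1) & ((0x1f << 11) | (0x1f << 6)); /* R, G shifted into their 565 slots */
   uint16_t b    = col & 0x1f;                                /* blue stays in place               */
   uint16_t glow = (col >> 4) & (1 << 5);                     /* replicate green's MSB as bit 5    */
   return rg | b | glow;
}

int main(void)
{
   /* White in 0RGB1555 (0x7fff) must become white in RGB565 (0xffff). */
   assert(conv_one_1555_to_565(0x7fff) == 0xffff);
   /* Black stays black. */
   assert(conv_one_1555_to_565(0x0000) == 0x0000);
   return 0;
}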
@@ -175,14 +174,14 @@ void conv_0rgb1555_argb8888(void *output_, const void *input_,
      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 10) & 0x1f;
         uint32_t g = (col >> 5) & 0x1f;
         uint32_t b = (col >> 0) & 0x1f;
         r = (r << 3) | (r >> 2);
         g = (g << 3) | (g >> 2);
         b = (b << 3) | (b >> 2);

         output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}
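The scalar tail above widens 5-bit channels to 8 bits with (x << 3) | (x >> 2) rather than a plain shift; shifting alone would map 31 to 248, while bit replication covers the full 0-255 range. A standalone sketch (expand5 is a made-up name for illustration; the real code inlines the expression):

#include <assert.h>
#include <stdint.h>

static uint8_t expand5(uint8_t x) /* x in [0, 31] */
{
   /* Replicating the top bits into the low bits makes
    * 0 -> 0x00 and 31 -> 0xff, so the whole 8-bit range is reached. */
   return (uint8_t)((x << 3) | (x >> 2));
}

int main(void)
{
   assert(expand5(0)  == 0x00);
   assert(expand5(31) == 0xff);
   return 0;
}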
@@ -217,22 +216,22 @@ void conv_rgb565_argb8888(void *output_, const void *input_,
         __m128i res_lo, res_hi;
         __m128i res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
         const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
         __m128i r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
         __m128i g = _mm_and_si128(in, pix_mask_g);
         __m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);

         r = _mm_mulhi_epi16(r, mul16_r);
         g = _mm_mulhi_epi16(g, mul16_g);
         b = _mm_mulhi_epi16(b, mul16_b);

         res_lo_bg = _mm_unpacklo_epi8(b, g);
         res_hi_bg = _mm_unpackhi_epi8(b, g);
         res_lo_ra = _mm_unpacklo_epi8(r, a);
         res_hi_ra = _mm_unpackhi_epi8(r, a);

         res_lo = _mm_or_si128(res_lo_bg,
               _mm_slli_si128(res_lo_ra, 2));
         res_hi = _mm_or_si128(res_hi_bg,
               _mm_slli_si128(res_hi_ra, 2));

         _mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
@@ -243,14 +242,14 @@ void conv_rgb565_argb8888(void *output_, const void *input_,
      for (; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 11) & 0x1f;
         uint32_t g = (col >> 5) & 0x3f;
         uint32_t b = (col >> 0) & 0x1f;
         r = (r << 3) | (r >> 2);
         g = (g << 2) | (g >> 4);
         b = (b << 3) | (b >> 2);

         output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}
@@ -269,16 +268,16 @@ void conv_argb8888_rgba4444(void *output_, const void *input_,
      for (w = 0; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 16) & 0xf;
         uint32_t g = (col >> 8) & 0xf;
         uint32_t b = (col) & 0xf;
         uint32_t a = (col >> 24) & 0xf;
         r = (r >> 4) | r;
         g = (g >> 4) | g;
         b = (b >> 4) | b;
         a = (a >> 4) | a;

         output[w] = (r << 12) | (g << 8) | (b << 4) | a;
      }
   }
}
@@ -297,16 +296,16 @@ void conv_rgba4444_argb8888(void *output_, const void *input_,
      for (w = 0; w < width; w++)
      {
         uint32_t col = input[w];
         uint32_t r = (col >> 12) & 0xf;
         uint32_t g = (col >> 8) & 0xf;
         uint32_t b = (col >> 4) & 0xf;
         uint32_t a = (col >> 0) & 0xf;
         r = (r << 4) | r;
         g = (g << 4) | g;
         b = (b << 4) | b;
         a = (a << 4) | a;

         output[w] = (a << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}
@@ -329,7 +328,7 @@ void conv_rgba4444_rgb565(void *output_, const void *input_,
         uint32_t g = (col >> 8) & 0xf;
         uint32_t b = (col >> 4) & 0xf;

         output[w] = (r << 12) | (g << 7) | (b << 1);
      }
   }
}
@@ -420,32 +419,32 @@ void conv_0rgb1555_bgr24(void *output_, const void *input_,
         __m128i b0 = _mm_and_si128(_mm_slli_epi16(in0, 5), pix_mask_gb);
         __m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_gb);

         r0 = _mm_mulhi_epi16(r0, mul15_hi);
         r1 = _mm_mulhi_epi16(r1, mul15_hi);
         g0 = _mm_mulhi_epi16(g0, mul15_mid);
         g1 = _mm_mulhi_epi16(g1, mul15_mid);
         b0 = _mm_mulhi_epi16(b0, mul15_mid);
         b1 = _mm_mulhi_epi16(b1, mul15_mid);

         res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
         res_lo_bg1 = _mm_unpacklo_epi8(b1, g1);
         res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
         res_hi_bg1 = _mm_unpackhi_epi8(b1, g1);
         res_lo_ra0 = _mm_unpacklo_epi8(r0, a);
         res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
         res_hi_ra0 = _mm_unpackhi_epi8(r0, a);
         res_hi_ra1 = _mm_unpackhi_epi8(r1, a);

         res_lo0 = _mm_or_si128(res_lo_bg0,
               _mm_slli_si128(res_lo_ra0, 2));
         res_lo1 = _mm_or_si128(res_lo_bg1,
               _mm_slli_si128(res_lo_ra1, 2));
         res_hi0 = _mm_or_si128(res_hi_bg0,
               _mm_slli_si128(res_hi_ra0, 2));
         res_hi1 = _mm_or_si128(res_hi_bg1,
               _mm_slli_si128(res_hi_ra1, 2));

-        /* Non-POT pixel sizes ftl :( */
+        /* Non-POT pixel sizes for the loss */
         store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
      }
#endif
@@ -456,13 +455,13 @@ void conv_0rgb1555_bgr24(void *output_, const void *input_,
         uint32_t b = (col >> 0) & 0x1f;
         uint32_t g = (col >> 5) & 0x1f;
         uint32_t r = (col >> 10) & 0x1f;
         b = (b << 3) | (b >> 2);
         g = (g << 3) | (g >> 2);
         r = (r << 3) | (r >> 2);

         *out++ = b;
         *out++ = g;
         *out++ = r;
      }
   }
}
@@ -506,12 +505,12 @@ void conv_rgb565_bgr24(void *output_, const void *input_,
         __m128i g1 = _mm_and_si128(in1, pix_mask_g);
         __m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_b);

         r0 = _mm_mulhi_epi16(r0, mul16_r);
         g0 = _mm_mulhi_epi16(g0, mul16_g);
         b0 = _mm_mulhi_epi16(b0, mul16_b);
         r1 = _mm_mulhi_epi16(r1, mul16_r);
         g1 = _mm_mulhi_epi16(g1, mul16_g);
         b1 = _mm_mulhi_epi16(b1, mul16_b);

         res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
         res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
@@ -522,13 +521,13 @@ void conv_rgb565_bgr24(void *output_, const void *input_,
         res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
         res_hi_ra1 = _mm_unpackhi_epi8(r1, a);

         res_lo0 = _mm_or_si128(res_lo_bg0,
               _mm_slli_si128(res_lo_ra0, 2));
         res_hi0 = _mm_or_si128(res_hi_bg0,
               _mm_slli_si128(res_hi_ra0, 2));
         res_lo1 = _mm_or_si128(res_lo_bg1,
               _mm_slli_si128(res_lo_ra1, 2));
         res_hi1 = _mm_or_si128(res_hi_bg1,
               _mm_slli_si128(res_hi_ra1, 2));

         store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
@@ -569,7 +568,7 @@ void conv_bgr24_argb8888(void *output_, const void *input_,
         uint32_t b = *inp++;
         uint32_t g = *inp++;
         uint32_t r = *inp++;
         output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
      }
   }
}
@@ -588,10 +587,10 @@ void conv_argb8888_0rgb1555(void *output_, const void *input_,
      for (w = 0; w < width; w++)
      {
         uint32_t col = input[w];
         uint16_t r = (col >> 19) & 0x1f;
         uint16_t g = (col >> 11) & 0x1f;
         uint16_t b = (col >> 3) & 0x1f;
         output[w] = (r << 10) | (g << 5) | (b << 0);
      }
   }
}
@@ -627,9 +626,9 @@ void conv_argb8888_bgr24(void *output_, const void *input_,
      for (; w < width; w++)
      {
         uint32_t col = input[w];
         *out++ = (uint8_t)(col >> 0);
         *out++ = (uint8_t)(col >> 8);
         *out++ = (uint8_t)(col >> 16);
      }
   }
}
@@ -648,7 +647,7 @@ void conv_argb8888_abgr8888(void *output_, const void *input_,
      for (w = 0; w < width; w++)
      {
         uint32_t col = input[w];
         output[w] = ((col << 16) & 0xff0000) |
            ((col >> 16) & 0xff) | (col & 0xff00ff00);
      }
   }
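The expression above is a pure R/B byte swap: the 0xff00ff00 mask keeps A and G in place while the remaining two bytes trade positions, which is exactly the ARGB8888 to ABGR8888 difference. A quick standalone check of the same masks and shifts (the function name is invented for illustration):

#include <assert.h>
#include <stdint.h>

static uint32_t argb_to_abgr(uint32_t col)
{
   return ((col << 16) & 0xff0000) |   /* old B (bits 0-7)   -> new bits 16-23 */
          ((col >> 16) & 0xff)     |   /* old R (bits 16-23) -> new bits 0-7   */
          (col & 0xff00ff00);          /* A and G stay where they are          */
}

int main(void)
{
   /* A=0x11, R=0x22, G=0x33, B=0x44 becomes A=0x11, B=0x44, G=0x33, R=0x22. */
   assert(argb_to_abgr(0x11223344u) == 0x11443322u);
   return 0;
}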
@@ -793,8 +792,8 @@ void conv_yuyv_argb8888(void *output_, const void *input_,
      uint8_t g1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
      uint8_t b1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_B * u + YUV_OFFSET) >> YUV_SHIFT);

      dst[0] = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0);
      dst[1] = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0);
   }
}
}
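For context on the hunk above: YUYV packs two horizontally adjacent pixels into four bytes (Y0 U Y1 V), so both output pixels share one U/V pair and only the luma differs, hence the paired r0/g0/b0 and r1/g1/b1 computations. A rough standalone sketch of the same idea, using assumed BT.601-style integer coefficients scaled by 64 rather than RetroArch's actual YUV_MAT_*/YUV_OFFSET/YUV_SHIFT constants:

#include <stdint.h>

/* Illustrative fixed-point YUV -> RGB only; the coefficients below are
 * approximate full-range BT.601 values, not the ones used in pixconv.c. */
static uint8_t clamp_u8(int v) { return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v); }

static void yuyv_pair_to_argb(const uint8_t yuyv[4], uint32_t dst[2])
{
   int u = yuyv[1] - 128, v = yuyv[3] - 128;
   for (int i = 0; i < 2; i++)
   {
      int y = yuyv[i * 2];                             /* Y0 for pixel 0, Y1 for pixel 1 */
      int r = clamp_u8(y + ((90 * v) >> 6));           /* ~1.402 * 64                    */
      int g = clamp_u8(y - ((22 * u + 46 * v) >> 6));  /* ~0.344 * 64, ~0.714 * 64       */
      int b = clamp_u8(y + ((113 * u) >> 6));          /* ~1.772 * 64                    */
      dst[i] = 0xff000000u | (uint32_t)(r << 16) | (uint32_t)(g << 8) | (uint32_t)b;
   }
}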
@@ -809,7 +808,7 @@ void conv_copy(void *output_, const void *input_,
   uint8_t *output = (uint8_t*)output_;

   if (abs(in_stride) < copy_len)
      copy_len = abs(in_stride);

   for (h = 0; h < height;
         h++, output += out_stride, input += in_stride)
@@ -198,8 +198,8 @@ bool scaler_gen_filter(struct scaler_ctx *ctx)
         x_pos = (1 << 15) * ctx->in_width / ctx->out_width - (1 << 15);
         y_pos = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15);

         gen_filter_point_sub(&ctx->horiz, ctx->out_width, x_pos, x_step);
         gen_filter_point_sub(&ctx->vert, ctx->out_height, y_pos, y_step);

         ctx->scaler_special = scaler_argb8888_point_special;
         break;
@@ -208,8 +208,8 @@ bool scaler_gen_filter(struct scaler_ctx *ctx)
         x_pos = (1 << 15) * ctx->in_width / ctx->out_width - (1 << 15);
         y_pos = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15);

         gen_filter_bilinear_sub(&ctx->horiz, ctx->out_width, x_pos, x_step);
         gen_filter_bilinear_sub(&ctx->vert, ctx->out_height, y_pos, y_step);
         break;

      case SCALER_TYPE_SINC:
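A note on the start position used in both hunks above: x_pos and y_pos appear to be kept in 1/2^15ths of a source pixel, so the formula evaluates to (in/out - 1) in that unit, which reads as the usual half-pixel-style centring offset when upscaling. A small worked example under that assumption (the widths are made up; x_step is computed elsewhere in scaler.c and is not shown here):

#include <stdio.h>

int main(void)
{
   int in_width  = 320;   /* hypothetical 2x upscale */
   int out_width = 640;
   int x_pos = (1 << 15) * in_width / out_width - (1 << 15);

   /* Prints -16384, i.e. -0.5 source pixels: the sampling grid starts half a
    * source pixel early so output samples are centred on the input. */
   printf("x_pos = %d (%.3f source pixels)\n", x_pos, x_pos / 32768.0);
   return 0;
}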
@@ -231,7 +231,7 @@ bool scaler_gen_filter(struct scaler_ctx *ctx)

   /* Makes sure that we never sample outside our rectangle. */
   fixup_filter_sub(&ctx->horiz, ctx->out_width, ctx->in_width);
   fixup_filter_sub(&ctx->vert, ctx->out_height, ctx->in_height);

   return validate_filter(ctx);
}
@@ -38,19 +38,28 @@
 /* ARGB8888 scaler is split in two:
  *
  * First, horizontal scaler is applied.
- * Here, all 8-bit channels are expanded to 16-bit. Values are then shifted 7 to left to occupy 15 bits.
- * The sign bit is kept empty as we have to do signed multiplication for the filter.
- * A mulhi [(a * b) >> 16] is applied which loses some precision, but is very efficient for SIMD.
+ * Here, all 8-bit channels are expanded to 16-bit. Values are then shifted 7
+ * to left to occupy 15 bits.
+ *
+ * The sign bit is kept empty as we have to do signed multiplication for the
+ * filter.
+ *
+ * A mulhi [(a * b) >> 16] is applied which loses some precision, but is
+ * very efficient for SIMD.
  * It is accurate enough for 8-bit purposes.
  *
- * The fixed point 1.0 for filter is (1 << 14). After horizontal scale, the output is kept
- * with 16-bit channels, and will now have 13 bits of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2.
+ * The fixed point 1.0 for filter is (1 << 14). After horizontal scale,
+ * the output is kept with 16-bit channels, and will now have 13 bits
+ * of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2.
  *
- * Vertical scaler takes the 13 bit channels, and performs the same mulhi steps.
+ * Vertical scaler takes the 13 bit channels, and performs the
+ * same mulhi steps.
  * Another 2 bits of precision is lost, which ends up as 11 bits.
- * Scaling is now complete. Channels are shifted right by 3, and saturated into 8-bit values.
+ * Scaling is now complete. Channels are shifted right by 3, and saturated
+ * into 8-bit values.
  *
- * The C version of scalers perform the exact same operations as the SIMD code for testing purposes.
+ * The C version of scalers perform the exact same operations as the
+ * SIMD code for testing purposes.
  */

void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
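The rewrapped comment above compresses a fair amount of fixed-point bookkeeping, so here is a standalone scalar sketch of the precision chain it describes (my own illustration, not code from the scaler): channels enter at 8 bits, are pre-shifted left by 7, each mulhi against the (1 << 14) fixed-point 1.0 costs two bits, and the final right shift by 3 lands back on 8 bits.

#include <stdint.h>

/* One channel pushed through a horizontal and a vertical pass with a
 * single 1.0 filter weight. mulhi() mimics _mm_mulhi_epi16 on one lane. */
static int16_t mulhi(int16_t a, int16_t b)
{
   return (int16_t)(((int32_t)a * b) >> 16);
}

static uint8_t clamp8(int v)
{
   return v < 0 ? 0 : (v > 255 ? 255 : (uint8_t)v);
}

int main(void)
{
   int16_t weight = 1 << 14;              /* fixed-point 1.0 for the filter      */
   int16_t c      = 200 << 7;             /* 8-bit value widened to 15 bits      */

   int16_t horiz  = mulhi(c, weight);     /* (x << 7) * 2^14 >> 16 == x << 5     */
   int16_t vert   = mulhi(horiz, weight); /* another >> 2: now x << 3 (11 bits)  */

   uint8_t out    = clamp8(vert >> 3);    /* back to 8 bits: 200 again           */
   return out == 200 ? 0 : 1;
}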
@@ -61,9 +70,11 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)

   const int16_t *filter_vert = ctx->vert.filter;

-  for (h = 0; h < ctx->out_height; h++, filter_vert += ctx->vert.filter_stride, output += stride >> 2)
+  for (h = 0; h < ctx->out_height; h++,
+        filter_vert += ctx->vert.filter_stride, output += stride >> 2)
   {
-     const uint64_t *input_base = input + ctx->vert.filter_pos[h] * (ctx->scaled.stride >> 3);
+     const uint64_t *input_base = input + ctx->vert.filter_pos[h]
+           * (ctx->scaled.stride >> 3);

      for (w = 0; w < ctx->out_width; w++)
      {
@@ -72,12 +83,13 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
         __m128i final;
         __m128i res = _mm_setzero_si128();

-        for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2, input_base_y += (ctx->scaled.stride >> 2))
+        for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2,
+              input_base_y += (ctx->scaled.stride >> 2))
         {
            __m128i coeff = _mm_set_epi64x(filter_vert[y + 1] * 0x0001000100010001ll, filter_vert[y + 0] * 0x0001000100010001ll);
            __m128i col = _mm_set_epi64x(input_base_y[ctx->scaled.stride >> 3], input_base_y[0]);

            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         for (; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
@@ -85,7 +97,7 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
            __m128i coeff = _mm_set_epi64x(0, filter_vert[y] * 0x0001000100010001ll);
            __m128i col = _mm_set_epi64x(0, input_base_y[0]);

            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
@@ -100,53 +112,52 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
         int16_t res_g = 0;
         int16_t res_b = 0;

-        for (y = 0; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
+        for (y = 0; y < ctx->vert.filter_len; y++,
+              input_base_y += (ctx->scaled.stride >> 3))
         {
            uint64_t col = *input_base_y;

            int16_t a = (col >> 48) & 0xffff;
            int16_t r = (col >> 32) & 0xffff;
            int16_t g = (col >> 16) & 0xffff;
            int16_t b = (col >> 0) & 0xffff;

            int16_t coeff = filter_vert[y];

            res_a += (a * coeff) >> 16;
            res_r += (r * coeff) >> 16;
            res_g += (g * coeff) >> 16;
            res_b += (b * coeff) >> 16;
         }

         res_a >>= (7 - 2 - 2);
         res_r >>= (7 - 2 - 2);
         res_g >>= (7 - 2 - 2);
         res_b >>= (7 - 2 - 2);

-        output[w] = (clamp_8bit(res_a) << 24) | (clamp_8bit(res_r) << 16) |
-              (clamp_8bit(res_g) << 8) | (clamp_8bit(res_b) << 0);
+        output[w] =
+              (clamp_8bit(res_a) << 24) |
+              (clamp_8bit(res_r) << 16) |
+              (clamp_8bit(res_g) << 8) |
+              (clamp_8bit(res_b) << 0);
#endif
      }
   }
}

-#if !defined(__SSE2__)
-static INLINE uint64_t build_argb64(uint16_t a, uint16_t r, uint16_t g, uint16_t b)
-{
-   return ((uint64_t)a << 48) | ((uint64_t)r << 32) | ((uint64_t)g << 16) | ((uint64_t)b << 0);
-}
-#endif
-
void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)
{
   int h, w, x;
   const uint32_t *input = (uint32_t*)input_;
   uint64_t *output = ctx->scaled.frame;

-  for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2, output += ctx->scaled.stride >> 3)
+  for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2,
+        output += ctx->scaled.stride >> 3)
   {
      const int16_t *filter_horiz = ctx->horiz.filter;

-     for (w = 0; w < ctx->scaled.width; w++, filter_horiz += ctx->horiz.filter_stride)
+     for (w = 0; w < ctx->scaled.width; w++,
+           filter_horiz += ctx->horiz.filter_stride)
      {
         const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w];
#if defined(__SSE2__)
@@ -156,11 +167,11 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)
         {
            __m128i coeff = _mm_set_epi64x(filter_horiz[x + 1] * 0x0001000100010001ll, filter_horiz[x + 0] * 0x0001000100010001ll);

            __m128i col = _mm_unpacklo_epi8(_mm_set_epi64x(0,
                  ((uint64_t)input_base_x[x + 1] << 32) | input_base_x[x + 0]), _mm_setzero_si128());

            col = _mm_slli_epi16(col, 7);
            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         for (; x < ctx->horiz.filter_len; x++)
@@ -168,14 +179,14 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)
            __m128i coeff = _mm_set_epi64x(0, filter_horiz[x] * 0x0001000100010001ll);
            __m128i col = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, 0, input_base_x[x]), _mm_setzero_si128());

            col = _mm_slli_epi16(col, 7);
            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
         }

         res = _mm_adds_epi16(_mm_srli_si128(res, 8), res);

#ifdef __x86_64__
         output[w] = _mm_cvtsi128_si64(res);
#else /* 32-bit doesn't have si64. Do it in two steps. */
         union
         {
@@ -194,22 +205,26 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)

         for (x = 0; x < ctx->horiz.filter_len; x++)
         {
            uint32_t col = input_base_x[x];

            int16_t a = (col >> (24 - 7)) & (0xff << 7);
            int16_t r = (col >> (16 - 7)) & (0xff << 7);
            int16_t g = (col >> ( 8 - 7)) & (0xff << 7);
            int16_t b = (col << ( 0 + 7)) & (0xff << 7);

            int16_t coeff = filter_horiz[x];

            res_a += (a * coeff) >> 16;
            res_r += (r * coeff) >> 16;
            res_g += (g * coeff) >> 16;
            res_b += (b * coeff) >> 16;
         }

-        output[w] = build_argb64(res_a, res_r, res_g, res_b);
+        output[w] = (
+              (uint64_t)res_a << 48) |
+              ((uint64_t)res_r << 32) |
+              ((uint64_t)res_g << 16) |
+              ((uint64_t)res_b << 0);
#endif
      }
   }