From 5b9a17dc8fe3d9b15d62642774cc5156f97298b7 Mon Sep 17 00:00:00 2001
From: twinaphex <libretro@gmail.com>
Date: Sun, 16 Apr 2017 19:54:38 +0200
Subject: [PATCH] (gfx/scaler) Cleanups

---
 libretro-common/gfx/scaler/pixconv.c       | 189 ++++++++++-----------
 libretro-common/gfx/scaler/scaler_filter.c |  10 +-
 libretro-common/gfx/scaler/scaler_int.c    | 129 +++++++-------
 3 files changed, 171 insertions(+), 157 deletions(-)

diff --git a/libretro-common/gfx/scaler/pixconv.c b/libretro-common/gfx/scaler/pixconv.c
index 09cf34cefb..7097952f82 100644
--- a/libretro-common/gfx/scaler/pixconv.c
+++ b/libretro-common/gfx/scaler/pixconv.c
@@ -46,8 +46,7 @@ void conv_rgb565_0rgb1555(void *output_, const void *input_,
    uint16_t *output = (uint16_t*)output_;
 
 #if defined(__SSE2_)
-   int max_width = width - 7;
-
+   int max_width           = width - 7;
    const __m128i hi_mask   = _mm_set1_epi16(0x7fe0);
    const __m128i lo_mask   = _mm_set1_epi16(0x1f);
 #endif
@@ -115,7 +114,7 @@ void conv_0rgb1555_rgb565(void *output_, const void *input_,
          uint16_t rg   = (col << 1) & ((0x1f << 11) | (0x1f << 6));
          uint16_t b    = col & 0x1f;
          uint16_t glow = (col >> 4) & (1 << 5);
-         output[w] = rg | b | glow;
+         output[w]     = rg | b | glow;
       }
    }
 }
@@ -175,14 +174,14 @@ void conv_0rgb1555_argb8888(void *output_, const void *input_,
       for (; w < width; w++)
       {
          uint32_t col = input[w];
-         uint32_t r = (col >> 10) & 0x1f;
-         uint32_t g = (col >>  5) & 0x1f;
-         uint32_t b = (col >>  0) & 0x1f;
-         r = (r << 3) | (r >> 2);
-         g = (g << 3) | (g >> 2);
-         b = (b << 3) | (b >> 2);
+         uint32_t r   = (col >> 10) & 0x1f;
+         uint32_t g   = (col >>  5) & 0x1f;
+         uint32_t b   = (col >>  0) & 0x1f;
+         r            = (r << 3) | (r >> 2);
+         g            = (g << 3) | (g >> 2);
+         b            = (b << 3) | (b >> 2);
 
-         output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
+         output[w]    = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
       }
    }
 }
@@ -217,22 +216,22 @@ void conv_rgb565_argb8888(void *output_, const void *input_,
          __m128i res_lo, res_hi;
          __m128i res_lo_bg, res_hi_bg, res_lo_ra, res_hi_ra;
          const __m128i in = _mm_loadu_si128((const __m128i*)(input + w));
-         __m128i r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
-         __m128i g = _mm_and_si128(in, pix_mask_g);
-         __m128i b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);
+         __m128i        r = _mm_and_si128(_mm_srli_epi16(in, 1), pix_mask_r);
+         __m128i        g = _mm_and_si128(in, pix_mask_g);
+         __m128i        b = _mm_and_si128(_mm_slli_epi16(in, 5), pix_mask_b);
 
-         r = _mm_mulhi_epi16(r, mul16_r);
-         g = _mm_mulhi_epi16(g, mul16_g);
-         b = _mm_mulhi_epi16(b, mul16_b);
+         r                = _mm_mulhi_epi16(r, mul16_r);
+         g                = _mm_mulhi_epi16(g, mul16_g);
+         b                = _mm_mulhi_epi16(b, mul16_b);
 
-         res_lo_bg = _mm_unpacklo_epi8(b, g);
-         res_hi_bg = _mm_unpackhi_epi8(b, g);
-         res_lo_ra = _mm_unpacklo_epi8(r, a);
-         res_hi_ra = _mm_unpackhi_epi8(r, a);
+         res_lo_bg        = _mm_unpacklo_epi8(b, g);
+         res_hi_bg        = _mm_unpackhi_epi8(b, g);
+         res_lo_ra        = _mm_unpacklo_epi8(r, a);
+         res_hi_ra        = _mm_unpackhi_epi8(r, a);
 
-         res_lo = _mm_or_si128(res_lo_bg,
+         res_lo           = _mm_or_si128(res_lo_bg,
                _mm_slli_si128(res_lo_ra, 2));
-         res_hi = _mm_or_si128(res_hi_bg,
+         res_hi           = _mm_or_si128(res_hi_bg,
                _mm_slli_si128(res_hi_ra, 2));
 
          _mm_storeu_si128((__m128i*)(output + w + 0), res_lo);
@@ -243,14 +242,14 @@ void conv_rgb565_argb8888(void *output_, const void *input_,
       for (; w < width; w++)
       {
          uint32_t col = input[w];
-         uint32_t r = (col >> 11) & 0x1f;
-         uint32_t g = (col >>  5) & 0x3f;
-         uint32_t b = (col >>  0) & 0x1f;
-         r = (r << 3) | (r >> 2);
-         g = (g << 2) | (g >> 4);
-         b = (b << 3) | (b >> 2);
+         uint32_t r   = (col >> 11) & 0x1f;
+         uint32_t g   = (col >>  5) & 0x3f;
+         uint32_t b   = (col >>  0) & 0x1f;
+         r            = (r << 3) | (r >> 2);
+         g            = (g << 2) | (g >> 4);
+         b            = (b << 3) | (b >> 2);
 
-         output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
+         output[w]    = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
       }
    }
 }
@@ -269,16 +268,16 @@ void conv_argb8888_rgba4444(void *output_, const void *input_,
       for (w = 0; w < width; w++)
       {
          uint32_t col = input[w];
-         uint32_t r = (col >> 16) & 0xf;
-         uint32_t g = (col >>  8) & 0xf;
-         uint32_t b = (col) & 0xf;
-         uint32_t a = (col >>  24) & 0xf;
-         r = (r >> 4) | r;
-         g = (g >> 4) | g;
-         b = (b >> 4) | b;
-         a = (a >> 4) | a;
+         uint32_t r   = (col >> 16) & 0xf;
+         uint32_t g   = (col >>  8) & 0xf;
+         uint32_t b   = (col) & 0xf;
+         uint32_t a   = (col >>  24) & 0xf;
+         r            = (r >> 4) | r;
+         g            = (g >> 4) | g;
+         b            = (b >> 4) | b;
+         a            = (a >> 4) | a;
 
-         output[w] = (r << 12) | (g << 8) | (b << 4) | a;
+         output[w]    = (r << 12) | (g << 8) | (b << 4) | a;
       }
    }
 }
@@ -297,16 +296,16 @@ void conv_rgba4444_argb8888(void *output_, const void *input_,
       for (w = 0; w < width; w++)
       {
          uint32_t col = input[w];
-         uint32_t r = (col >> 12) & 0xf;
-         uint32_t g = (col >>  8) & 0xf;
-         uint32_t b = (col >>  4) & 0xf;
-         uint32_t a = (col >>  0) & 0xf;
-         r = (r << 4) | r;
-         g = (g << 4) | g;
-         b = (b << 4) | b;
-         a = (a << 4) | a;
+         uint32_t r   = (col >> 12) & 0xf;
+         uint32_t g   = (col >>  8) & 0xf;
+         uint32_t b   = (col >>  4) & 0xf;
+         uint32_t a   = (col >>  0) & 0xf;
+         r            = (r << 4) | r;
+         g            = (g << 4) | g;
+         b            = (b << 4) | b;
+         a            = (a << 4) | a;
 
-         output[w] = (a << 24) | (r << 16) | (g << 8) | (b << 0);
+         output[w]    = (a << 24) | (r << 16) | (g << 8) | (b << 0);
       }
    }
 }
@@ -329,7 +328,7 @@ void conv_rgba4444_rgb565(void *output_, const void *input_,
          uint32_t g   = (col >>  8) & 0xf;
          uint32_t b   = (col >>  4) & 0xf;
 
-         output[w] = (r << 12) | (g << 7) | (b << 1);
+         output[w]    = (r << 12) | (g << 7) | (b << 1);
       }
    }
 }
@@ -420,32 +419,32 @@ void conv_0rgb1555_bgr24(void *output_, const void *input_,
          __m128i b0        = _mm_and_si128(_mm_slli_epi16(in0, 5), pix_mask_gb);
          __m128i b1        = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_gb);
 
-         r0 = _mm_mulhi_epi16(r0, mul15_hi);
-         r1 = _mm_mulhi_epi16(r1, mul15_hi);
-         g0 = _mm_mulhi_epi16(g0, mul15_mid);
-         g1 = _mm_mulhi_epi16(g1, mul15_mid);
-         b0 = _mm_mulhi_epi16(b0, mul15_mid);
-         b1 = _mm_mulhi_epi16(b1, mul15_mid);
+         r0                = _mm_mulhi_epi16(r0, mul15_hi);
+         r1                = _mm_mulhi_epi16(r1, mul15_hi);
+         g0                = _mm_mulhi_epi16(g0, mul15_mid);
+         g1                = _mm_mulhi_epi16(g1, mul15_mid);
+         b0                = _mm_mulhi_epi16(b0, mul15_mid);
+         b1                = _mm_mulhi_epi16(b1, mul15_mid);
 
-         res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
-         res_lo_bg1 = _mm_unpacklo_epi8(b1, g1);
-         res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
-         res_hi_bg1 = _mm_unpackhi_epi8(b1, g1);
-         res_lo_ra0 = _mm_unpacklo_epi8(r0, a);
-         res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
-         res_hi_ra0 = _mm_unpackhi_epi8(r0, a);
-         res_hi_ra1 = _mm_unpackhi_epi8(r1, a);
+         res_lo_bg0        = _mm_unpacklo_epi8(b0, g0);
+         res_lo_bg1        = _mm_unpacklo_epi8(b1, g1);
+         res_hi_bg0        = _mm_unpackhi_epi8(b0, g0);
+         res_hi_bg1        = _mm_unpackhi_epi8(b1, g1);
+         res_lo_ra0        = _mm_unpacklo_epi8(r0, a);
+         res_lo_ra1        = _mm_unpacklo_epi8(r1, a);
+         res_hi_ra0        = _mm_unpackhi_epi8(r0, a);
+         res_hi_ra1        = _mm_unpackhi_epi8(r1, a);
 
-         res_lo0 = _mm_or_si128(res_lo_bg0,
+         res_lo0           = _mm_or_si128(res_lo_bg0,
                _mm_slli_si128(res_lo_ra0, 2));
-         res_lo1 = _mm_or_si128(res_lo_bg1,
+         res_lo1           = _mm_or_si128(res_lo_bg1,
                _mm_slli_si128(res_lo_ra1, 2));
-         res_hi0 = _mm_or_si128(res_hi_bg0,
+         res_hi0           = _mm_or_si128(res_hi_bg0,
                _mm_slli_si128(res_hi_ra0, 2));
-         res_hi1 = _mm_or_si128(res_hi_bg1,
+         res_hi1           = _mm_or_si128(res_hi_bg1,
                _mm_slli_si128(res_hi_ra1, 2));
 
-         /* Non-POT pixel sizes ftl :( */
+         /* 24-bit pixels are not a power-of-two size; use the packing store helper. */
          store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
       }
 #endif
@@ -456,13 +455,13 @@ void conv_0rgb1555_bgr24(void *output_, const void *input_,
          uint32_t b   = (col >>  0) & 0x1f;
          uint32_t g   = (col >>  5) & 0x1f;
          uint32_t r   = (col >> 10) & 0x1f;
-         b = (b << 3) | (b >> 2);
-         g = (g << 3) | (g >> 2);
-         r = (r << 3) | (r >> 2);
+         b            = (b << 3) | (b >> 2);
+         g            = (g << 3) | (g >> 2);
+         r            = (r << 3) | (r >> 2);
 
-         *out++ = b;
-         *out++ = g;
-         *out++ = r;
+         *out++       = b;
+         *out++       = g;
+         *out++       = r;
       }
    }
 }
@@ -506,12 +505,12 @@ void conv_rgb565_bgr24(void *output_, const void *input_,
          __m128i g1 = _mm_and_si128(in1, pix_mask_g);
          __m128i b1 = _mm_and_si128(_mm_slli_epi16(in1, 5), pix_mask_b);
 
-         r0 = _mm_mulhi_epi16(r0, mul16_r);
-         g0 = _mm_mulhi_epi16(g0, mul16_g);
-         b0 = _mm_mulhi_epi16(b0, mul16_b);
-         r1 = _mm_mulhi_epi16(r1, mul16_r);
-         g1 = _mm_mulhi_epi16(g1, mul16_g);
-         b1 = _mm_mulhi_epi16(b1, mul16_b);
+         r0         = _mm_mulhi_epi16(r0, mul16_r);
+         g0         = _mm_mulhi_epi16(g0, mul16_g);
+         b0         = _mm_mulhi_epi16(b0, mul16_b);
+         r1         = _mm_mulhi_epi16(r1, mul16_r);
+         g1         = _mm_mulhi_epi16(g1, mul16_g);
+         b1         = _mm_mulhi_epi16(b1, mul16_b);
 
          res_lo_bg0 = _mm_unpacklo_epi8(b0, g0);
          res_hi_bg0 = _mm_unpackhi_epi8(b0, g0);
@@ -522,13 +521,13 @@ void conv_rgb565_bgr24(void *output_, const void *input_,
          res_lo_ra1 = _mm_unpacklo_epi8(r1, a);
          res_hi_ra1 = _mm_unpackhi_epi8(r1, a);
 
-         res_lo0 = _mm_or_si128(res_lo_bg0,
+         res_lo0    = _mm_or_si128(res_lo_bg0,
                _mm_slli_si128(res_lo_ra0, 2));
-         res_hi0 = _mm_or_si128(res_hi_bg0,
+         res_hi0    = _mm_or_si128(res_hi_bg0,
                _mm_slli_si128(res_hi_ra0, 2));
-         res_lo1 = _mm_or_si128(res_lo_bg1,
+         res_lo1    = _mm_or_si128(res_lo_bg1,
                _mm_slli_si128(res_lo_ra1, 2));
-         res_hi1 = _mm_or_si128(res_hi_bg1,
+         res_hi1    = _mm_or_si128(res_hi_bg1,
                _mm_slli_si128(res_hi_ra1, 2));
 
          store_bgr24_sse2(out, res_lo0, res_hi0, res_lo1, res_hi1);
@@ -569,7 +568,7 @@ void conv_bgr24_argb8888(void *output_, const void *input_,
          uint32_t b = *inp++;
          uint32_t g = *inp++;
          uint32_t r = *inp++;
-         output[w] = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
+         output[w]  = (0xffu << 24) | (r << 16) | (g << 8) | (b << 0);
       }
    }
 }
@@ -588,10 +587,10 @@ void conv_argb8888_0rgb1555(void *output_, const void *input_,
       for (w = 0; w < width; w++)
       {
          uint32_t col = input[w];
-         uint16_t r = (col >> 19) & 0x1f;
-         uint16_t g = (col >> 11) & 0x1f;
-         uint16_t b = (col >>  3) & 0x1f;
-         output[w] = (r << 10) | (g << 5) | (b << 0);
+         uint16_t r   = (col >> 19) & 0x1f;
+         uint16_t g   = (col >> 11) & 0x1f;
+         uint16_t b   = (col >>  3) & 0x1f;
+         output[w]    = (r << 10) | (g << 5) | (b << 0);
       }
    }
 }
@@ -627,9 +626,9 @@ void conv_argb8888_bgr24(void *output_, const void *input_,
       for (; w < width; w++)
       {
          uint32_t col = input[w];
-         *out++ = (uint8_t)(col >>  0);
-         *out++ = (uint8_t)(col >>  8);
-         *out++ = (uint8_t)(col >> 16);
+         *out++       = (uint8_t)(col >>  0);
+         *out++       = (uint8_t)(col >>  8);
+         *out++       = (uint8_t)(col >> 16);
       }
    }
 }
@@ -648,7 +647,7 @@ void conv_argb8888_abgr8888(void *output_, const void *input_,
       for (w = 0; w < width; w++)
       {
          uint32_t col = input[w];
-         output[w] = ((col << 16) & 0xff0000) | 
+         output[w]    = ((col << 16) & 0xff0000) | 
             ((col >> 16) & 0xff) | (col & 0xff00ff00);
       }
    }
@@ -793,8 +792,8 @@ void conv_yuyv_argb8888(void *output_, const void *input_,
          uint8_t g1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_G * u + YUV_MAT_V_G * v + YUV_OFFSET) >> YUV_SHIFT);
          uint8_t b1 = clamp_8bit((YUV_MAT_Y * _y1 + YUV_MAT_U_B * u                   + YUV_OFFSET) >> YUV_SHIFT);
 
-         dst[0] = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0);
-         dst[1] = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0);
+         dst[0]     = 0xff000000u | (r0 << 16) | (g0 << 8) | (b0 << 0);
+         dst[1]     = 0xff000000u | (r1 << 16) | (g1 << 8) | (b1 << 0);
       }
    }
 }
@@ -809,7 +808,7 @@ void conv_copy(void *output_, const void *input_,
    uint8_t *output      = (uint8_t*)output_;
 
    if (abs(in_stride) < copy_len)
-      copy_len = abs(in_stride);
+      copy_len          = abs(in_stride);
 
    for (h = 0; h < height;
          h++, output += out_stride, input += in_stride)
diff --git a/libretro-common/gfx/scaler/scaler_filter.c b/libretro-common/gfx/scaler/scaler_filter.c
index 48bc23a2a3..56d14304bc 100644
--- a/libretro-common/gfx/scaler/scaler_filter.c
+++ b/libretro-common/gfx/scaler/scaler_filter.c
@@ -198,8 +198,8 @@ bool scaler_gen_filter(struct scaler_ctx *ctx)
          x_pos  = (1 << 15) * ctx->in_width / ctx->out_width   - (1 << 15);
          y_pos  = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15);
 
-         gen_filter_point_sub(&ctx->horiz, ctx->out_width, x_pos, x_step);
-         gen_filter_point_sub(&ctx->vert, ctx->out_height, y_pos, y_step);
+         gen_filter_point_sub(&ctx->horiz, ctx->out_width,  x_pos, x_step);
+         gen_filter_point_sub(&ctx->vert,  ctx->out_height, y_pos, y_step);
 
          ctx->scaler_special = scaler_argb8888_point_special;
          break;
@@ -208,8 +208,8 @@ bool scaler_gen_filter(struct scaler_ctx *ctx)
          x_pos  = (1 << 15) * ctx->in_width / ctx->out_width   - (1 << 15);
          y_pos  = (1 << 15) * ctx->in_height / ctx->out_height - (1 << 15);
 
-         gen_filter_bilinear_sub(&ctx->horiz, ctx->out_width, x_pos, x_step);
-         gen_filter_bilinear_sub(&ctx->vert, ctx->out_height, y_pos, y_step);
+         gen_filter_bilinear_sub(&ctx->horiz, ctx->out_width,  x_pos, x_step);
+         gen_filter_bilinear_sub(&ctx->vert,  ctx->out_height, y_pos, y_step);
          break;
 
       case SCALER_TYPE_SINC:
@@ -231,7 +231,7 @@ bool scaler_gen_filter(struct scaler_ctx *ctx)
 
    /* Makes sure that we never sample outside our rectangle. */
    fixup_filter_sub(&ctx->horiz, ctx->out_width, ctx->in_width);
-   fixup_filter_sub(&ctx->vert, ctx->out_height, ctx->in_height);
+   fixup_filter_sub(&ctx->vert,  ctx->out_height, ctx->in_height);
 
    return validate_filter(ctx);
 }
diff --git a/libretro-common/gfx/scaler/scaler_int.c b/libretro-common/gfx/scaler/scaler_int.c
index 2b189c6dd1..cc96042834 100644
--- a/libretro-common/gfx/scaler/scaler_int.c
+++ b/libretro-common/gfx/scaler/scaler_int.c
@@ -38,19 +38,28 @@
 /* ARGB8888 scaler is split in two:
  *
  * First, horizontal scaler is applied.
- * Here, all 8-bit channels are expanded to 16-bit. Values are then shifted 7 to left to occupy 15 bits.
- * The sign bit is kept empty as we have to do signed multiplication for the filter.
- * A mulhi [(a * b) >> 16] is applied which loses some precision, but is very efficient for SIMD.
+ * Here, all 8-bit channels are expanded to 16-bit. Values are then shifted
+ * left by 7 to occupy 15 bits.
+ *
+ * The sign bit is kept empty as we have to do signed multiplication for the
+ * filter.
+ *
+ * A mulhi [(a * b) >> 16] is applied, which loses some precision but is
+ * very efficient for SIMD.
  * It is accurate enough for 8-bit purposes.
  *
- * The fixed point 1.0 for filter is (1 << 14). After horizontal scale, the output is kept
- * with 16-bit channels, and will now have 13 bits of precision as [(a * (1 << 14)) >> 16] is effectively a right shift by 2.
+ * The fixed-point 1.0 for the filter is (1 << 14). After horizontal scaling,
+ * the output is kept with 16-bit channels and now has 13 bits of
+ * precision, as [(a * (1 << 14)) >> 16] is effectively a right shift by 2.
  *
- * Vertical scaler takes the 13 bit channels, and performs the same mulhi steps.
+ * The vertical scaler takes the 13-bit channels and performs the
+ * same mulhi steps.
  * Another 2 bits of precision is lost, which ends up as 11 bits.
- * Scaling is now complete. Channels are shifted right by 3, and saturated into 8-bit values.
+ * Scaling is now complete. Channels are shifted right by 3 and saturated
+ * into 8-bit values.
  *
- * The C version of scalers perform the exact same operations as the SIMD code for testing purposes.
+ * The C versions of the scalers perform the exact same operations as the
+ * SIMD code for testing purposes.
  */
 
 void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int stride)
@@ -61,9 +70,11 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
 
    const int16_t *filter_vert = ctx->vert.filter;
 
-   for (h = 0; h < ctx->out_height; h++, filter_vert += ctx->vert.filter_stride, output += stride >> 2)
+   for (h = 0; h < ctx->out_height; h++, 
+         filter_vert += ctx->vert.filter_stride, output += stride >> 2)
    {
-      const uint64_t *input_base = input + ctx->vert.filter_pos[h] * (ctx->scaled.stride >> 3);
+      const uint64_t *input_base = input + ctx->vert.filter_pos[h] 
+         * (ctx->scaled.stride >> 3);
 
       for (w = 0; w < ctx->out_width; w++)
       {
@@ -72,12 +83,13 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
          __m128i final;
          __m128i res = _mm_setzero_si128();
 
-         for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2, input_base_y += (ctx->scaled.stride >> 2))
+         for (y = 0; (y + 1) < ctx->vert.filter_len; y += 2,
+               input_base_y += (ctx->scaled.stride >> 2))
          {
             __m128i coeff = _mm_set_epi64x(filter_vert[y + 1] * 0x0001000100010001ll, filter_vert[y + 0] * 0x0001000100010001ll);
             __m128i col   = _mm_set_epi64x(input_base_y[ctx->scaled.stride >> 3], input_base_y[0]);
 
-            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
+            res           = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
          }
 
          for (; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
@@ -85,7 +97,7 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
             __m128i coeff = _mm_set_epi64x(0, filter_vert[y] * 0x0001000100010001ll);
             __m128i col   = _mm_set_epi64x(0, input_base_y[0]);
 
-            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
+            res           = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
          }
 
          res       = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
@@ -100,53 +112,52 @@ void scaler_argb8888_vert(const struct scaler_ctx *ctx, void *output_, int strid
          int16_t res_g = 0;
          int16_t res_b = 0;
 
-         for (y = 0; y < ctx->vert.filter_len; y++, input_base_y += (ctx->scaled.stride >> 3))
+         for (y = 0; y < ctx->vert.filter_len; y++,
+               input_base_y += (ctx->scaled.stride >> 3))
          {
-            uint64_t col = *input_base_y;
+            uint64_t col   = *input_base_y;
 
-            int16_t a = (col >> 48) & 0xffff;
-            int16_t r = (col >> 32) & 0xffff;
-            int16_t g = (col >> 16) & 0xffff;
-            int16_t b = (col >>  0) & 0xffff;
+            int16_t a      = (col >> 48) & 0xffff;
+            int16_t r      = (col >> 32) & 0xffff;
+            int16_t g      = (col >> 16) & 0xffff;
+            int16_t b      = (col >>  0) & 0xffff;
 
-            int16_t coeff = filter_vert[y];
+            int16_t coeff  = filter_vert[y];
 
-            res_a += (a * coeff) >> 16;
-            res_r += (r * coeff) >> 16;
-            res_g += (g * coeff) >> 16;
-            res_b += (b * coeff) >> 16;
+            res_a         += (a * coeff) >> 16;
+            res_r         += (r * coeff) >> 16;
+            res_g         += (g * coeff) >> 16;
+            res_b         += (b * coeff) >> 16;
          }
 
-         res_a >>= (7 - 2 - 2);
-         res_r >>= (7 - 2 - 2);
-         res_g >>= (7 - 2 - 2);
-         res_b >>= (7 - 2 - 2);
+         res_a           >>= (7 - 2 - 2);
+         res_r           >>= (7 - 2 - 2);
+         res_g           >>= (7 - 2 - 2);
+         res_b           >>= (7 - 2 - 2);
 
-         output[w] = (clamp_8bit(res_a) << 24) | (clamp_8bit(res_r) << 16) | 
-            (clamp_8bit(res_g) << 8) | (clamp_8bit(res_b) << 0);
+         output[w]         = 
+            (clamp_8bit(res_a) << 24) |
+            (clamp_8bit(res_r) << 16) | 
+            (clamp_8bit(res_g) << 8)  |
+            (clamp_8bit(res_b) << 0);
 #endif
       }
    }
 }
 
-#if !defined(__SSE2__)
-static INLINE uint64_t build_argb64(uint16_t a, uint16_t r, uint16_t g, uint16_t b)
-{
-   return ((uint64_t)a << 48) | ((uint64_t)r << 32) | ((uint64_t)g << 16) | ((uint64_t)b << 0);
-}
-#endif
-
 void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int stride)
 {
    int h, w, x;
    const uint32_t *input = (uint32_t*)input_;
    uint64_t *output      = ctx->scaled.frame;
 
-   for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2, output += ctx->scaled.stride >> 3)
+   for (h = 0; h < ctx->scaled.height; h++, input += stride >> 2,
+         output += ctx->scaled.stride >> 3)
    {
       const int16_t *filter_horiz = ctx->horiz.filter;
 
-      for (w = 0; w < ctx->scaled.width; w++, filter_horiz += ctx->horiz.filter_stride)
+      for (w = 0; w < ctx->scaled.width; w++,
+            filter_horiz += ctx->horiz.filter_stride)
       {
          const uint32_t *input_base_x = input + ctx->horiz.filter_pos[w];
 #if defined(__SSE2__)
@@ -156,11 +167,11 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int
          {
             __m128i coeff = _mm_set_epi64x(filter_horiz[x + 1] * 0x0001000100010001ll, filter_horiz[x + 0] * 0x0001000100010001ll);
 
-            __m128i col = _mm_unpacklo_epi8(_mm_set_epi64x(0,
+            __m128i col   = _mm_unpacklo_epi8(_mm_set_epi64x(0,
                      ((uint64_t)input_base_x[x + 1] << 32) | input_base_x[x + 0]), _mm_setzero_si128());
 
-            col = _mm_slli_epi16(col, 7);
-            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
+            col           = _mm_slli_epi16(col, 7);
+            res           = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
          }
 
          for (; x < ctx->horiz.filter_len; x++)
@@ -168,14 +179,14 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int
             __m128i coeff = _mm_set_epi64x(0, filter_horiz[x] * 0x0001000100010001ll);
             __m128i col   = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, 0, input_base_x[x]), _mm_setzero_si128());
 
-            col = _mm_slli_epi16(col, 7);
-            res = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
+            col           = _mm_slli_epi16(col, 7);
+            res           = _mm_adds_epi16(_mm_mulhi_epi16(col, coeff), res);
          }
 
-         res       = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
+         res              = _mm_adds_epi16(_mm_srli_si128(res, 8), res);
 
 #ifdef __x86_64__
-         output[w] = _mm_cvtsi128_si64(res);
+         output[w]        = _mm_cvtsi128_si64(res);
 #else /* 32-bit doesn't have si64. Do it in two steps. */
          union
          {
@@ -194,22 +205,26 @@ void scaler_argb8888_horiz(const struct scaler_ctx *ctx, const void *input_, int
 
          for (x = 0; x < ctx->horiz.filter_len; x++)
          {
-            uint32_t col = input_base_x[x];
+            uint32_t col   = input_base_x[x];
 
-            int16_t a = (col >> (24 - 7)) & (0xff << 7);
-            int16_t r = (col >> (16 - 7)) & (0xff << 7);
-            int16_t g = (col >> ( 8 - 7)) & (0xff << 7);
-            int16_t b = (col << ( 0 + 7)) & (0xff << 7);
+            int16_t a      = (col >> (24 - 7)) & (0xff << 7);
+            int16_t r      = (col >> (16 - 7)) & (0xff << 7);
+            int16_t g      = (col >> ( 8 - 7)) & (0xff << 7);
+            int16_t b      = (col << ( 0 + 7)) & (0xff << 7);
 
-            int16_t coeff = filter_horiz[x];
+            int16_t coeff  = filter_horiz[x];
 
-            res_a += (a * coeff) >> 16;
-            res_r += (r * coeff) >> 16;
-            res_g += (g * coeff) >> 16;
-            res_b += (b * coeff) >> 16;
+            res_a         += (a * coeff) >> 16;
+            res_r         += (r * coeff) >> 16;
+            res_g         += (g * coeff) >> 16;
+            res_b         += (b * coeff) >> 16;
          }
 
-         output[w] = build_argb64(res_a, res_r, res_g, res_b);
+         output[w]         = (
+               (uint64_t)res_a  << 48)  | 
+               ((uint64_t)res_r << 32)  |
+               ((uint64_t)res_g << 16)  |
+               ((uint64_t)res_b << 0);
 #endif
       }
    }