mirror of
https://github.com/libretro/RetroArch
synced 2025-03-26 02:37:23 +00:00
Neat optimizations for SSE.
This commit is contained in:
parent
41cd6e21c3
commit
77a2723bb0
@ -34,21 +34,23 @@ static inline void audio_convert_float_to_s16_C(int16_t *out,
|
||||
static inline void audio_convert_s16_to_float_SSE2(float *out,
|
||||
const int16_t *in, unsigned samples)
|
||||
{
|
||||
// Not aligned? FML :(
|
||||
if (((uintptr_t)in & 7) || ((uintptr_t)out & 15))
|
||||
{
|
||||
audio_convert_s16_to_float_C(out, in, samples);
|
||||
return;
|
||||
}
|
||||
|
||||
__m128 factor = _mm_set1_ps(1.0f / 0x7fff);
|
||||
__m128 factor = _mm_set1_ps(1.0f / (0x7fff * 0x10000));
|
||||
unsigned i;
|
||||
for (i = 0; i + 4 <= samples; i += 4, in += 4, out += 4)
|
||||
for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
|
||||
{
|
||||
__m64 input = *(const __m64*)in;
|
||||
__m128 reg = _mm_cvtpi16_ps(input);
|
||||
__m128 res = _mm_mul_ps(reg, factor);
|
||||
_mm_store_ps(out, res);
|
||||
__m128i input = _mm_loadu_si128((const __m128i *)in);
|
||||
__m128i regs[2] = {
|
||||
_mm_unpacklo_epi16(_mm_setzero_si128(), input),
|
||||
_mm_unpackhi_epi16(_mm_setzero_si128(), input),
|
||||
};
|
||||
|
||||
__m128 output[2] = {
|
||||
_mm_mul_ps(_mm_cvtepi32_ps(regs[0]), factor),
|
||||
_mm_mul_ps(_mm_cvtepi32_ps(regs[1]), factor),
|
||||
};
|
||||
|
||||
_mm_storeu_ps(out + 0, output[0]);
|
||||
_mm_storeu_ps(out + 4, output[1]);
|
||||
}
|
||||
|
||||
audio_convert_s16_to_float_C(out, in, samples - i);
|
||||
@ -57,21 +59,17 @@ static inline void audio_convert_s16_to_float_SSE2(float *out,
|
||||
static inline void audio_convert_float_to_s16_SSE2(int16_t *out,
|
||||
const float *in, unsigned samples)
|
||||
{
|
||||
// Not aligned? FML :(
|
||||
if (((uintptr_t)in & 7) || ((uintptr_t)out & 15))
|
||||
{
|
||||
audio_convert_float_to_s16_C(out, in, samples);
|
||||
return;
|
||||
}
|
||||
|
||||
__m128 factor = _mm_set1_ps(0x7fff);
|
||||
__m128 factor = _mm_set1_ps((float)0x7fff);
|
||||
unsigned i;
|
||||
for (i = 0; i + 4 <= samples; i += 4, in += 4, out += 4)
|
||||
for (i = 0; i + 8 <= samples; i += 8, in += 8, out += 8)
|
||||
{
|
||||
__m128 input = _mm_load_ps(in);
|
||||
__m128 res = _mm_mul_ps(input, factor);
|
||||
__m64 output = _mm_cvtps_pi16(res);
|
||||
*(__m64*)out = output;
|
||||
__m128 input[2] = { _mm_loadu_ps(in + 0), _mm_loadu_ps(in + 4) };
|
||||
__m128 res[2] = { _mm_mul_ps(input[0], factor), _mm_mul_ps(input[1], factor) };
|
||||
|
||||
__m128i ints[2] = { _mm_cvtps_epi32(res[0]), _mm_cvtps_epi32(res[1]) };
|
||||
__m128i packed = _mm_packs_epi32(ints[0], ints[1]);
|
||||
|
||||
_mm_storeu_si128((__m128i *)out, packed);
|
||||
}
|
||||
|
||||
audio_convert_float_to_s16_C(out, in, samples - i);
|
||||
|
Loading…
x
Reference in New Issue
Block a user