330 lines
7.3 KiB
C
Raw Normal View History

2013-11-19 11:30:58 +01:00
#pragma once
2014-10-07 17:35:44 +04:00
#include <emmintrin.h>
2015-07-26 03:53:26 +03:00
// Supply a `thread_local` spelling where this header does not rely on the
// language keyword: MSVC with _MSC_VER <= 1800 maps it to __declspec(thread),
// Apple builds map it to __thread. Other toolchains are left untouched.
#if defined(_MSC_VER) && _MSC_VER <= 1800
#define thread_local __declspec(thread)
#elif __APPLE__
#define thread_local __thread
#endif
// never_inline: portable spelling of the "do not inline this function"
// attribute (__declspec(noinline) on MSVC, __attribute__((noinline)) elsewhere).
#if defined(_MSC_VER)
#define never_inline __declspec(noinline)
#else
#define never_inline __attribute__((noinline))
#endif
// safe_buffers: expands to MSVC's __declspec(safebuffers) and to nothing on
// every other compiler, so it can be placed on declarations unconditionally.
#if defined(_MSC_VER)
#define safe_buffers __declspec(safebuffers)
2014-07-14 23:15:30 +04:00
#else
#define safe_buffers
2014-07-14 23:15:30 +04:00
#endif
// force_inline: portable spelling of the "always inline" request
// (__forceinline on MSVC, __attribute__((always_inline)) elsewhere).
// Note that the non-MSVC expansion does not itself contain the `inline` keyword.
#if defined(_MSC_VER)
#define force_inline __forceinline
#else
#define force_inline __attribute__((always_inline))
#endif
// MSVC with _MSC_VER <= 1800: emulate the alignas(x) specifier through the
// CRT's _CRT_ALIGN(x) macro.
#if defined(_MSC_VER) && _MSC_VER <= 1800
#define alignas(x) _CRT_ALIGN(x)
#endif
2013-11-19 11:30:58 +01:00
#if defined(__GNUG__)
#include <stdlib.h>
#include <cstdint>
#ifndef __APPLE__
#include <malloc.h>
#endif
// GCC/Clang (__GNUG__) stand-ins for underscore-prefixed helper names used by
// the rest of the code base (presumably mirroring the MSVC/Win32 spellings):
// floating-point classification, 16/32/64-bit byte swaps, and an INFINITE
// constant with all 32 bits set.
#define _fpclass(x) std::fpclassify(x)
2013-11-19 11:30:58 +01:00
#define _byteswap_ushort(x) __builtin_bswap16(x)
#define _byteswap_ulong(x) __builtin_bswap32(x)
#define _byteswap_uint64(x) __builtin_bswap64(x)
#define INFINITE 0xFFFFFFFF
// Returns the upper 64 bits of the full 128-bit product of two unsigned
// 64-bit integers.
//
// a, b    - the factors.
// returns - bits 64..127 of a * b.
inline uint64_t __umulh(uint64_t a, uint64_t b)
{
	// Let the compiler widen the multiplication instead of using inline asm:
	// the x86 MUL instruction overwrites RAX as well as RDX, and an asm
	// statement that does not declare that write can be miscompiled. The
	// 128-bit integer type also keeps this helper independent of x86-64.
	return static_cast<uint64_t>((static_cast<unsigned __int128>(a) * b) >> 64);
}
// Returns the upper 64 bits of the full 128-bit product of two signed
// 64-bit integers.
//
// a, b    - the factors.
// returns - bits 64..127 of the sign-extended product a * b.
inline int64_t __mulh(int64_t a, int64_t b)
{
	// Let the compiler widen the multiplication instead of using inline asm:
	// the one-operand x86 IMUL instruction overwrites RAX as well as RDX, and
	// an asm statement that does not declare that write can be miscompiled.
	// GCC and Clang implement >> on a negative __int128 as an arithmetic shift.
	return static_cast<int64_t>((static_cast<__int128>(a) * b) >> 64);
}
2014-07-12 17:02:39 +10:00
// Apple builds: declare the clock ids, the clockid_t type and a
// clock_gettime() prototype. Only the declaration lives in this header.
#ifdef __APPLE__
// XXX only supports a single timer
#define TIMER_ABSTIME -1
/* The opengroup spec isn't clear on whether mapping REALTIME to CALENDAR
is appropriate or not.
http://pubs.opengroup.org/onlinepubs/009695299/basedefs/time.h.html */
#define CLOCK_REALTIME 1 // #define CALENDAR_CLOCK 1 from mach/clock_types.h
#define CLOCK_MONOTONIC 0 // #define SYSTEM_CLOCK 0
typedef int clockid_t;
/* the mach kernel uses struct mach_timespec, so struct timespec
is loaded from <sys/_types/_timespec.h> for compatibility */
// struct timespec { time_t tv_sec; long tv_nsec; };
int clock_gettime(clockid_t clk_id, struct timespec *tp);
2014-07-12 17:02:39 +10:00
#endif /* __APPLE__ */
2015-08-06 23:20:48 +03:00
#endif /* __GNUG__ */
#if defined(_MSC_VER)
2015-08-21 14:07:31 +03:00
// Unsigned 128-bit integer implementation.
//
// The value is stored as two 64-bit halves: `lo` holds bits 0..63 and `hi`
// holds bits 64..127. All arithmetic wraps modulo 2^128.
struct alignas(16) uint128_t
{
	uint64_t lo, hi;

	// Leaves both halves uninitialized unless the object is value-initialized.
	uint128_t() = default;

	// Zero-extends a 64-bit value (intentionally implicit).
	uint128_t(uint64_t l)
		: lo(l)
		, hi(0)
	{
	}

	// 128-bit addition; a carry out of the low half is propagated into `hi`.
	uint128_t operator +(const uint128_t& r) const
	{
		uint128_t value;
		value.lo = lo + r.lo;
		value.hi = hi + r.hi + (value.lo < r.lo ? 1 : 0);
		return value;
	}

	// Adds a 64-bit value, propagating the carry into `hi`.
	uint128_t operator +(uint64_t r) const
	{
		uint128_t value;
		value.lo = lo + r;
		value.hi = value.lo < r ? hi + 1 : hi;
		return value;
	}

	// 128-bit subtraction; a borrow from the low half is taken out of `hi`.
	uint128_t operator -(const uint128_t& r) const
	{
		uint128_t value;
		value.lo = lo - r.lo;
		value.hi = hi - r.hi - (lo < r.lo ? 1 : 0);
		return value;
	}

	// Subtracts a 64-bit value, propagating the borrow into `hi`.
	uint128_t operator -(uint64_t r) const
	{
		uint128_t value;
		value.lo = lo - r;
		value.hi = lo < r ? hi - 1 : hi;
		return value;
	}

	// Unary plus: returns a copy.
	uint128_t operator +() const
	{
		return *this;
	}

	// Two's complement negation (~x + 1).
	uint128_t operator -() const
	{
		uint128_t value;
		value.lo = ~lo + 1;
		// ~lo + 1 only carries into the high half when lo == 0
		value.hi = lo ? ~hi : ~hi + 1;
		return value;
	}

	// Pre-increment.
	uint128_t& operator ++()
	{
		if (!++lo) ++hi;
		return *this;
	}

	// Post-increment: returns the value held before the increment.
	uint128_t operator ++(int)
	{
		uint128_t value = *this;
		if (!++lo) ++hi;
		return value;
	}

	// Pre-decrement.
	uint128_t& operator --()
	{
		if (!lo--) hi--;
		return *this;
	}

	// Post-decrement: returns the value held before the decrement.
	uint128_t operator --(int)
	{
		uint128_t value = *this;
		if (!lo--) hi--;
		return value;
	}

	// Bitwise NOT.
	uint128_t operator ~() const
	{
		uint128_t value;
		value.lo = ~lo;
		value.hi = ~hi;
		return value;
	}

	// Bitwise AND.
	uint128_t operator &(const uint128_t& r) const
	{
		uint128_t value;
		value.lo = lo & r.lo;
		value.hi = hi & r.hi;
		return value;
	}

	// Bitwise OR.
	uint128_t operator |(const uint128_t& r) const
	{
		uint128_t value;
		value.lo = lo | r.lo;
		value.hi = hi | r.hi;
		return value;
	}

	// Bitwise XOR.
	uint128_t operator ^(const uint128_t& r) const
	{
		uint128_t value;
		value.lo = lo ^ r.lo;
		value.hi = hi ^ r.hi;
		return value;
	}

	// In-place 128-bit addition.
	uint128_t& operator +=(const uint128_t& r)
	{
		// Copy the operand first so that `x += x` (r aliasing *this) is correct
		const uint64_t r_lo = r.lo;
		const uint64_t r_hi = r.hi;
		lo += r_lo;
		hi += r_hi + (lo < r_lo ? 1 : 0);
		return *this;
	}

	// In-place addition of a 64-bit value.
	uint128_t& operator +=(uint64_t r)
	{
		hi = (lo += r) < r ? hi + 1 : hi;
		return *this;
	}

	// In-place 128-bit subtraction.
	uint128_t& operator -=(const uint128_t& r)
	{
		// Copy the operand first so that `x -= x` (r aliasing *this) is correct
		const uint64_t r_lo = r.lo;
		const uint64_t r_hi = r.hi;
		const uint64_t borrow = lo < r_lo ? 1 : 0;
		lo -= r_lo;
		hi -= r_hi + borrow;
		return *this;
	}

	// In-place bitwise AND.
	uint128_t& operator &=(const uint128_t& r)
	{
		lo &= r.lo;
		hi &= r.hi;
		return *this;
	}

	// In-place bitwise OR.
	uint128_t& operator |=(const uint128_t& r)
	{
		lo |= r.lo;
		hi |= r.hi;
		return *this;
	}

	// In-place bitwise XOR.
	uint128_t& operator ^=(const uint128_t& r)
	{
		lo ^= r.lo;
		hi ^= r.hi;
		return *this;
	}
};
using __uint128_t = uint128_t;
#endif
// Counts the leading zero bits of a 32-bit value.
// Returns 32 when `arg` is zero, otherwise the number of zero bits above the
// most significant set bit.
inline uint32_t cntlz32(uint32_t arg)
{
#if defined(_MSC_VER)
	unsigned long msb_index;
	// _BitScanReverse returns zero when no bit is set; otherwise msb_index is
	// the position of the highest set bit, and (index ^ 31) == 31 - index.
	return _BitScanReverse(&msb_index, arg) ? msb_index ^ 31 : 32;
#else
	// Guard the zero case before the builtin is reached
	if (!arg)
	{
		return 32;
	}

	// Count on the 64-bit widened value, then drop the 32 padding bits
	return __builtin_clzll((uint64_t)arg) - 32;
#endif
}
// Counts the leading zero bits of a 64-bit value.
// Returns 64 when `arg` is zero, otherwise the number of zero bits above the
// most significant set bit.
inline uint64_t cntlz64(uint64_t arg)
{
#if defined(_MSC_VER)
	unsigned long msb_index;
	// _BitScanReverse64 returns zero when no bit is set; otherwise msb_index
	// is the position of the highest set bit, and (index ^ 63) == 63 - index.
	return _BitScanReverse64(&msb_index, arg) ? msb_index ^ 63 : 64;
#else
	// Guard the zero case before the builtin is reached
	if (!arg)
	{
		return 64;
	}

	return __builtin_clzll(arg);
#endif
}
2014-10-07 17:35:44 +04:00
2014-10-08 18:26:08 +04:00
// Compare 16 packed unsigned bytes (greater than).
// Each result byte is 0xFF where A > B and 0x00 otherwise.
inline __m128i sse_cmpgt_epu8(__m128i A, __m128i B)
{
	// The compare intrinsic is signed; flipping the top bit of every byte
	// maps unsigned ordering onto signed ordering: (A xor 0x80) > (B xor 0x80)
	const __m128i top_bits = _mm_set1_epi32(0x80808080);
	const __m128i lhs = _mm_xor_si128(A, top_bits);
	const __m128i rhs = _mm_xor_si128(B, top_bits);
	return _mm_cmpgt_epi8(lhs, rhs);
}
2015-03-26 21:42:12 +03:00
// Compare 8 packed unsigned 16-bit integers (greater than).
// Each result lane is 0xFFFF where A > B and 0x0000 otherwise.
inline __m128i sse_cmpgt_epu16(__m128i A, __m128i B)
{
	// Flip the top bit of every 16-bit lane so the signed compare yields the
	// unsigned ordering
	const __m128i top_bits = _mm_set1_epi32(0x80008000);
	const __m128i lhs = _mm_xor_si128(A, top_bits);
	const __m128i rhs = _mm_xor_si128(B, top_bits);
	return _mm_cmpgt_epi16(lhs, rhs);
}
// Compare 4 packed unsigned 32-bit integers (greater than).
// Each result lane is 0xFFFFFFFF where A > B and 0 otherwise.
inline __m128i sse_cmpgt_epu32(__m128i A, __m128i B)
{
	// Flip the top bit of every 32-bit lane so the signed compare yields the
	// unsigned ordering
	const __m128i top_bits = _mm_set1_epi32(0x80000000);
	const __m128i lhs = _mm_xor_si128(A, top_bits);
	const __m128i rhs = _mm_xor_si128(B, top_bits);
	return _mm_cmpgt_epi32(lhs, rhs);
}
2015-03-29 14:00:10 +03:00
// Approximates 2^A for 4 packed single-precision floats.
//
// The input is clamped to [-127.4999961, 127.4999961] and split as
// A = n + f with integer n = floor(A + 0.5), so f lies in [-0.5, 0.5].
// 2^f is evaluated with a rational approximation (using the approximate
// reciprocal _mm_rcp_ps) and then scaled by 2^n, which is built directly from
// the float exponent bits.
inline __m128 sse_exp2_ps(__m128 A)
{
	// Clamp so that n + 127 below always stays inside the exponent range 0..254
	const __m128 x = _mm_max_ps(_mm_min_ps(A, _mm_set1_ps(127.4999961f)), _mm_set1_ps(-127.4999961f));
	const __m128 biased = _mm_add_ps(x, _mm_set1_ps(0.5f));

	// Integer part: truncate toward zero, then subtract 1 where the biased
	// value is not positive. The conversion must truncate (cvttps): the
	// rounding conversion (cvtps) lets n reach 128 or -128 at the clamp limits,
	// which turns the scale factor below into +Inf or -Inf.
	const __m128i not_positive = _mm_castps_si128(_mm_cmpnlt_ps(_mm_setzero_ps(), biased));
	const __m128i n = _mm_sub_epi32(_mm_cvttps_epi32(biased), _mm_and_si128(not_positive, _mm_set1_epi32(1)));

	// Fractional part and its square
	const __m128 f = _mm_sub_ps(x, _mm_cvtepi32_ps(n));
	const __m128 f2 = _mm_mul_ps(f, f);

	// p = f * P(f^2),  r = p / (Q(f^2) - p),  2^f ~= 1 + 2r
	const __m128 p = _mm_mul_ps(f, _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(f2, _mm_set1_ps(0.023093347705f)), _mm_set1_ps(20.20206567f)), f2), _mm_set1_ps(1513.906801f)));
	const __m128 q = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(233.1842117f), f2), _mm_set1_ps(4368.211667f));
	const __m128 r = _mm_mul_ps(p, _mm_rcp_ps(_mm_sub_ps(q, p)));
	const __m128 frac_pow = _mm_add_ps(_mm_add_ps(r, r), _mm_set1_ps(1.0f));

	// 2^n: place the biased exponent (n + 127) into bits 23..30 of a float
	const __m128 scale = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(n, _mm_set1_epi32(127)), 23));

	return _mm_mul_ps(frac_pow, scale);
}
// Approximates log2(A) for 4 packed single-precision floats.
//
// The input is clamped from below to the float with bit pattern 0x00800000,
// then split into its exponent field and a mantissa rebuilt as a value with
// the exponent bits of 1.0f. The mantissa term is evaluated through
// z = 2 * (m - 1) / (m + 1) and a rational correction in z^2, using the
// approximate reciprocal _mm_rcp_ps.
inline __m128 sse_log2_ps(__m128 A)
{
	const __m128 one = _mm_set1_ps(1.0f);
	const __m128 log2_e = _mm_set1_ps(1.442695040f);

	// Lower clamp
	const __m128 clamped = _mm_max_ps(A, _mm_castsi128_ps(_mm_set1_epi32(0x00800000)));

	// Drop the exponent field and substitute the exponent bits of 1.0f
	const __m128 mant_bits = _mm_and_ps(clamped, _mm_castsi128_ps(_mm_set1_epi32(0x807fffff)));
	const __m128 mant = _mm_or_ps(mant_bits, one);

	// z = 2 * (m - 1) / (m + 1), and z^2
	const __m128 inv_sum = _mm_rcp_ps(_mm_add_ps(mant, one));
	const __m128 ratio = _mm_mul_ps(_mm_sub_ps(mant, one), inv_sum);
	const __m128 z = _mm_add_ps(ratio, ratio);
	const __m128 z2 = _mm_mul_ps(z, z);

	// Numerator polynomial in z^2 (Horner form)
	__m128 num = _mm_mul_ps(_mm_set1_ps(-0.7895802789f), z2);
	num = _mm_add_ps(num, _mm_set1_ps(16.38666457f));
	num = _mm_mul_ps(num, z2);
	num = _mm_add_ps(num, _mm_set1_ps(-64.1409953f));

	// Denominator polynomial in z^2 (Horner form), then its reciprocal
	__m128 den = _mm_mul_ps(_mm_set1_ps(-35.67227983f), z2);
	den = _mm_add_ps(den, _mm_set1_ps(312.0937664f));
	den = _mm_mul_ps(den, z2);
	den = _mm_add_ps(den, _mm_set1_ps(-769.6919436f));
	const __m128 inv_den = _mm_rcp_ps(den);

	// Unbiased exponent of the clamped input, as float
	const __m128i exp_field = _mm_srli_epi32(_mm_castps_si128(clamped), 23);
	const __m128 exponent = _mm_cvtepi32_ps(_mm_sub_epi32(exp_field, _mm_set1_epi32(127)));

	// result = (z^2 * num / den * z) * log2(e) + (z * log2(e) + exponent)
	__m128 correction = _mm_mul_ps(z2, num);
	correction = _mm_mul_ps(correction, inv_den);
	correction = _mm_mul_ps(correction, z);
	correction = _mm_mul_ps(correction, log2_e);
	const __m128 linear = _mm_add_ps(_mm_mul_ps(z, log2_e), exponent);
	return _mm_add_ps(correction, linear);
}