From 56797b8841ab5a9a9d7a9ab82383567300361252 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Jos=C3=A9=20Garc=C3=ADa=20Garc=C3=ADa?= Date: Sun, 31 May 2020 17:18:53 +0200 Subject: [PATCH 1/3] Squashed 'deps/vitaGL/' changes from 9a6e4b3397..fb87308d15 fb87308d15 Added vglBindPackedAttribLocation. 7e933f6051 Added NO_DEBUG compile option. 28e8516718 Faster textures storing when format is the same as internal format. git-subtree-dir: deps/vitaGL git-subtree-split: fb87308d15a387d2549fb45d860b3d87ede8a0ca --- Makefile | 4 ++++ source/custom_shaders.c | 12 ++++++++---- source/textures.c | 9 ++++++--- source/utils/gpu_utils.c | 25 +++++++++++++++++-------- source/utils/gpu_utils.h | 2 +- source/vitaGL.h | 1 + 6 files changed, 37 insertions(+), 16 deletions(-) diff --git a/Makefile b/Makefile index f5efeb0449..e4b82ce5fa 100644 --- a/Makefile +++ b/Makefile @@ -17,6 +17,10 @@ AR = $(PREFIX)-gcc-ar CFLAGS = -g -Wl,-q -O2 -ffast-math -mtune=cortex-a9 -mfpu=neon -flto -ftree-vectorize -DSTB_DXT_IMPLEMENTATION ASFLAGS = $(CFLAGS) +ifeq ($(NO_DEBUG),1) +CFLAGS += -DSKIP_ERROR_HANDLING +endif + all: $(TARGET).a $(TARGET).a: $(OBJS) diff --git a/source/custom_shaders.c b/source/custom_shaders.c index d3d5beeb34..7ff792b2e4 100644 --- a/source/custom_shaders.c +++ b/source/custom_shaders.c @@ -357,8 +357,7 @@ void glUniformMatrix4fv(GLint location, GLsizei count, GLboolean transpose, cons * ------------------------------ */ -// Equivalent of glBindAttribLocation but for sceGxm architecture -void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type) { +void vglBindPackedAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type, GLuint offset) { // Grabbing passed program program *p = &progs[prog - 1]; SceGxmVertexAttribute *attributes = &p->attr[index]; @@ -369,7 +368,7 @@ void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const // Setting stream index and 
offset values attributes->streamIndex = index; - attributes->offset = 0; + attributes->offset = offset; // Detecting attribute format and size int bpe; @@ -396,7 +395,12 @@ void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const p->attr_num = index + 1; } -// Equivalent of glVertexAttribLocation but for sceGxm architecture +// Equivalent of glBindAttribLocation but for sceGxm architecture +void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type) { + vglBindPackedAttribLocation(prog, index, name, num, type, 0); +} + +// Equivalent of glVertexAttribPointer but for sceGxm architecture void vglVertexAttribPointer(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, GLuint count, const GLvoid *pointer) { #ifndef SKIP_ERROR_HANDLING // Error handling diff --git a/source/textures.c b/source/textures.c index 8533e38f0f..a16d58cfec 100644 --- a/source/textures.c +++ b/source/textures.c @@ -101,6 +101,7 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt SceGxmTextureFormat tex_format; uint8_t data_bpp = 0; + uint8_t fast_store = GL_FALSE; // Support for legacy GL1.0 internalFormat switch (internalFormat) { @@ -157,7 +158,8 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt switch (type) { case GL_UNSIGNED_BYTE: data_bpp = 3; - read_cb = readRGB; + if (internalFormat == GL_RGB) fast_store = GL_TRUE; + else read_cb = readRGB; break; default: error = GL_INVALID_ENUM; @@ -168,7 +170,8 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt switch (type) { case GL_UNSIGNED_BYTE: data_bpp = 4; - read_cb = readRGBA; + if (internalFormat == GL_RGBA) fast_store = GL_TRUE; + else read_cb = readRGBA; break; case GL_UNSIGNED_SHORT_5_5_5_1: data_bpp = 2; @@ -236,7 +239,7 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt tex->type = internalFormat; tex->write_cb 
= write_cb; if (level == 0) - if (tex->write_cb) gpu_alloc_texture(width, height, tex_format, data, tex, data_bpp, read_cb, write_cb); + if (tex->write_cb) gpu_alloc_texture(width, height, tex_format, data, tex, data_bpp, read_cb, write_cb, fast_store); else gpu_alloc_compressed_texture(width, height, tex_format, data, tex, data_bpp, read_cb); else { gpu_alloc_mipmaps(level, tex); diff --git a/source/utils/gpu_utils.c b/source/utils/gpu_utils.c index 17ff1fb82f..8497a8de7d 100644 --- a/source/utils/gpu_utils.c +++ b/source/utils/gpu_utils.c @@ -255,7 +255,7 @@ void gpu_free_texture(texture *tex) { tex->valid = 0; } -void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t)) { +void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t), uint8_t fast_store) { // If there's already a texture in passed texture object we first dealloc it if (tex->valid) gpu_free_texture(tex); @@ -274,13 +274,22 @@ void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const int i, j; uint8_t *src = (uint8_t *)data; uint8_t *dst; - for (i = 0; i < h; i++) { - dst = ((uint8_t *)texture_data) + (ALIGN(w, 8) * bpp) * i; - for (j = 0; j < w; j++) { - uint32_t clr = read_cb(src); - write_cb(dst, clr); - src += src_bpp; - dst += bpp; + if (fast_store) { // Internal Format and Data Format are the same, we can just use memcpy for better performance + uint32_t line_size = w * bpp; + for (i = 0; i < h; i++) { + dst = ((uint8_t *)texture_data) + (ALIGN(w, 8) * bpp) * i; + memcpy(dst, src, line_size); + src += line_size; + } + } else { // Different internal and data formats, we need to go with slower callbacks system + for (i = 0; i < h; i++) { + dst = ((uint8_t *)texture_data) + (ALIGN(w, 8) * bpp) * i; + for (j = 0; j < w; 
j++) { + uint32_t clr = read_cb(src); + write_cb(dst, clr); + src += src_bpp; + dst += bpp; + } } } } else diff --git a/source/utils/gpu_utils.h b/source/utils/gpu_utils.h index d5a93efc87..15deb02c5e 100644 --- a/source/utils/gpu_utils.h +++ b/source/utils/gpu_utils.h @@ -82,7 +82,7 @@ void gpu_pool_init(uint32_t temp_pool_size); int tex_format_to_bytespp(SceGxmTextureFormat format); // Alloc a texture -void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t)); +void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t), uint8_t fast_store); // Alloc a compresseed texture void gpu_alloc_compressed_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *)); diff --git a/source/vitaGL.h b/source/vitaGL.h index 4cfe1d5721..6498d8dc5b 100644 --- a/source/vitaGL.h +++ b/source/vitaGL.h @@ -378,6 +378,7 @@ void vglVertexPointerMapped(const GLvoid *pointer); // VGL_EXT_gxp_shaders extension implementation void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type); +void vglBindPackedAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type, GLuint offset); void vglVertexAttribPointer(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, GLuint count, const GLvoid *pointer); void vglVertexAttribPointerMapped(GLuint index, const GLvoid *pointer); From b2f61389ff5c99fe7d7b36b52294a5b284c9127a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Jos=C3=A9=20Garc=C3=ADa=20Garc=C3=ADa?= Date: Sun, 31 May 2020 17:24:11 +0200 Subject: [PATCH 2/3] [VITA] Remove math-neon --- deps/math-neon/.gitattributes | 17 - 
deps/math-neon/.gitignore | 26 -- deps/math-neon/Makefile | 29 -- deps/math-neon/README | 169 ---------- deps/math-neon/source/math_acosf.c | 67 ---- deps/math-neon/source/math_asinf.c | 183 ----------- deps/math-neon/source/math_atan2f.c | 170 ---------- deps/math-neon/source/math_atanf.c | 149 --------- deps/math-neon/source/math_ceilf.c | 71 ----- deps/math-neon/source/math_cosf.c | 50 --- deps/math-neon/source/math_coshf.c | 120 ------- deps/math-neon/source/math_expf.c | 135 -------- deps/math-neon/source/math_fabsf.c | 58 ---- deps/math-neon/source/math_floorf.c | 66 ---- deps/math-neon/source/math_fmodf.c | 100 ------ deps/math-neon/source/math_invsqrtf.c | 79 ----- deps/math-neon/source/math_ldexpf.c | 67 ---- deps/math-neon/source/math_log10f.c | 135 -------- deps/math-neon/source/math_logf.c | 135 -------- deps/math-neon/source/math_mat2.c | 95 ------ deps/math-neon/source/math_mat3.c | 131 -------- deps/math-neon/source/math_mat4.c | 144 --------- deps/math-neon/source/math_modf.c | 71 ----- deps/math-neon/source/math_neon.h | 435 -------------------------- deps/math-neon/source/math_powf.c | 182 ----------- deps/math-neon/source/math_runfast.c | 42 --- deps/math-neon/source/math_sincosf.c | 163 ---------- deps/math-neon/source/math_sinf.c | 128 -------- deps/math-neon/source/math_sinfv.c | 110 ------- deps/math-neon/source/math_sinhf.c | 120 ------- deps/math-neon/source/math_sqrtf.c | 105 ------- deps/math-neon/source/math_sqrtfv.c | 147 --------- deps/math-neon/source/math_tanf.c | 156 --------- deps/math-neon/source/math_tanhf.c | 95 ------ deps/math-neon/source/math_vec2.c | 118 ------- deps/math-neon/source/math_vec3.c | 172 ---------- deps/math-neon/source/math_vec4.c | 126 -------- 37 files changed, 4366 deletions(-) delete mode 100644 deps/math-neon/.gitattributes delete mode 100644 deps/math-neon/.gitignore delete mode 100644 deps/math-neon/Makefile delete mode 100644 deps/math-neon/README delete mode 100644 deps/math-neon/source/math_acosf.c 
delete mode 100644 deps/math-neon/source/math_asinf.c delete mode 100644 deps/math-neon/source/math_atan2f.c delete mode 100644 deps/math-neon/source/math_atanf.c delete mode 100644 deps/math-neon/source/math_ceilf.c delete mode 100644 deps/math-neon/source/math_cosf.c delete mode 100644 deps/math-neon/source/math_coshf.c delete mode 100644 deps/math-neon/source/math_expf.c delete mode 100644 deps/math-neon/source/math_fabsf.c delete mode 100644 deps/math-neon/source/math_floorf.c delete mode 100644 deps/math-neon/source/math_fmodf.c delete mode 100644 deps/math-neon/source/math_invsqrtf.c delete mode 100644 deps/math-neon/source/math_ldexpf.c delete mode 100644 deps/math-neon/source/math_log10f.c delete mode 100644 deps/math-neon/source/math_logf.c delete mode 100644 deps/math-neon/source/math_mat2.c delete mode 100644 deps/math-neon/source/math_mat3.c delete mode 100644 deps/math-neon/source/math_mat4.c delete mode 100644 deps/math-neon/source/math_modf.c delete mode 100644 deps/math-neon/source/math_neon.h delete mode 100644 deps/math-neon/source/math_powf.c delete mode 100644 deps/math-neon/source/math_runfast.c delete mode 100644 deps/math-neon/source/math_sincosf.c delete mode 100644 deps/math-neon/source/math_sinf.c delete mode 100644 deps/math-neon/source/math_sinfv.c delete mode 100644 deps/math-neon/source/math_sinhf.c delete mode 100644 deps/math-neon/source/math_sqrtf.c delete mode 100644 deps/math-neon/source/math_sqrtfv.c delete mode 100644 deps/math-neon/source/math_tanf.c delete mode 100644 deps/math-neon/source/math_tanhf.c delete mode 100644 deps/math-neon/source/math_vec2.c delete mode 100644 deps/math-neon/source/math_vec3.c delete mode 100644 deps/math-neon/source/math_vec4.c diff --git a/deps/math-neon/.gitattributes b/deps/math-neon/.gitattributes deleted file mode 100644 index bdb0cabc87..0000000000 --- a/deps/math-neon/.gitattributes +++ /dev/null @@ -1,17 +0,0 @@ -# Auto detect text files and perform LF normalization -* text=auto - -# 
Custom for Visual Studio -*.cs diff=csharp - -# Standard to msysgit -*.doc diff=astextplain -*.DOC diff=astextplain -*.docx diff=astextplain -*.DOCX diff=astextplain -*.dot diff=astextplain -*.DOT diff=astextplain -*.pdf diff=astextplain -*.PDF diff=astextplain -*.rtf diff=astextplain -*.RTF diff=astextplain diff --git a/deps/math-neon/.gitignore b/deps/math-neon/.gitignore deleted file mode 100644 index 6b55e9b64e..0000000000 --- a/deps/math-neon/.gitignore +++ /dev/null @@ -1,26 +0,0 @@ -*.o -*.a - -# Windows thumbnail cache files -Thumbs.db -ehthumbs.db -ehthumbs_vista.db - -# Folder config file -Desktop.ini - -# Recycle Bin used on file shares -$RECYCLE.BIN/ - -# Windows Installer files -*.cab -*.msi -*.msm -*.msp - -# Windows shortcuts -*.lnk - -# ========================= -# Operating System Files -# ========================= diff --git a/deps/math-neon/Makefile b/deps/math-neon/Makefile deleted file mode 100644 index 269d8cdd57..0000000000 --- a/deps/math-neon/Makefile +++ /dev/null @@ -1,29 +0,0 @@ -TARGET := libmathneon -SOURCES := source - -LIBS = -lc -lm -lSceGxm_stub -lSceDisplay_stub - -CFILES := $(foreach dir,$(SOURCES), $(wildcard $(dir)/*.c)) -CGFILES := $(foreach dir,$(SHADERS), $(wildcard $(dir)/*.cg)) -HEADERS := $(CGFILES:.cg=.h) -OBJS := $(CFILES:.c=.o) - -PREFIX = arm-vita-eabi -CC = $(PREFIX)-gcc -AR = $(PREFIX)-gcc-ar -CFLAGS = -g -Wl,-q -O2 -ffast-math -mtune=cortex-a9 -mfpu=neon -flto -ftree-vectorize -ASFLAGS = $(CFLAGS) - -all: $(TARGET).a - -$(TARGET).a: $(OBJS) - $(AR) -rc $@ $^ - -clean: - @rm -rf $(TARGET).a $(TARGET).elf $(OBJS) - -install: $(TARGET).a - @mkdir -p $(VITASDK)/$(PREFIX)/lib/ - cp $(TARGET).a $(VITASDK)/$(PREFIX)/lib/ - @mkdir -p $(VITASDK)/$(PREFIX)/include/ - cp source/math_neon.h $(VITASDK)/$(PREFIX)/include/ diff --git a/deps/math-neon/README b/deps/math-neon/README deleted file mode 100644 index 8740253a58..0000000000 --- a/deps/math-neon/README +++ /dev/null @@ -1,169 +0,0 @@ - -Library: MATH-NEON -By: Lachlan 
Tychsen-Smith -Licence: MIT (expat) -======================================================================================= -This project implements the cmath functions and some optimised matrix functions -with the aim of increasing the floating point performance of ARM Cortex A-8 -based platforms. As well as implementing the functions in ARM NEON assembly, -they sacrifice error checking and some accuracy to achieve better performance. - -Function Errors: -======================================================================================= -The measurement and characterisations of the inaccuracies present within these -functions is really a field within itself. For the benchmark i provide the -maximum absolute, maximum relative and root mean squared error compared to the -cmath implementations over the specified range. However these values can be -misleading, especially for functions which quickly go to infinity. So its always a -good idea to test it within your actual program. In general, this library will not -be as accurate as cmath, however for many functions it is close enough to be -negilible. - -Notes: -======================================================================================= -- The *_c functions are c implementations of the *_neon code. -- Like cmath, The errors present in the functions are very dependent on the - range which your operating in. So you should test them first. -- Look in the "math_neon.h" file for discriptions of the functions. In some - function files there are also notes on the specific implementation. -- The *_neon functions make certain assumptions about the location of arguments - that is incompatible with inlining. 
- -Contact: -======================================================================================= -Name: Lachlan Tychsen-Smith -Email: lachlan.ts@gmail.com - -PSVITA performances test results: - -RUNFAST: Enabled ------------------------------------------------------------------------------------------------------- -MATRIX FUNCTION TESTS ------------------------------------------------------------------------------------------------------- -matmul2_c = - |-7.16, 9.42| - |17.86, -10.70| -matmul2_neon = - |-7.16, 9.42| - |17.86, -10.70| -matmul2: c=183985 neon=87480 rate=2.10 -matvec2_c = |-7.16, 17.86| -matvec2_neon = |-7.16, 17.86| -matvec2: c=98178 neon=66040 rate=1.49 -matmul3_c = - |11.14, -0.78, -3.98| - |16.56, 17.96, 23.58| - |8.73, -0.18, 1.57| -matmul3_neon = - |11.14, -0.78, -3.98| - |16.56, 17.96, 23.58| - |8.73, -0.18, 1.57| -matmul3: c=551838 neon=340292 rate=1.62 -matvec3_c = |11.14, 16.56, 8.73| -matvec3_neon = |11.14, 16.56, 8.73| -matvec3: c=98178 neon=66040 rate=1.49 -matmul4_c = - |17.91, -23.96, 1.86, 16.53| - |4.10, -18.16, 4.17, 29.06| - |6.92, -1.60, 3.12, 27.81| - |-15.13, -7.46, -17.91, 22.49| -matmul4_neon = - |17.91, -23.96, 1.86, 16.53| - |4.10, -18.16, 4.17, 29.06| - |6.92, -1.60, 3.12, 27.81| - |-15.13, -7.46, -17.91, 22.49| -matmul4: c=1316131 neon=315444 rate=4.17 -matvec4_c = |17.91, 4.10, 6.92, -15.126419| -matvec4_neon = |17.91, 4.10, 6.92, -15.126419| -matvec4: c=98178 neon=66040 rate=1.49 - -dot2_c = 5.804099 -dot2_neon = 5.804099 -dot2: c=291526 neon=307025 rate=0.95 -normalize2_c = [0.97, 0.24] -normalize2_neon = [0.97, 0.24] -normalize2: c=1058588 neon=965696 rate=1.10 - -dot3_c = -0.817487 -dot3_neon = -0.817487 -dot3: c=322094 neon=444834 rate=0.72 -normalize3_c = [0.50, 0.12, -0.86] -normalize3_neon = [0.50, 0.12, -0.86] -normalize3: c=1257201 neon=1134375 rate=1.11 -cross3_c = [-13.16, -17.29, -10.19] -cross3_neon = [-13.16, -17.29, -10.19] -cross3: c=705298 neon=766477 rate=0.92 - -dot4_c = -7.880241 -dot4_neon = 
-7.880241 -dot4: c=414431 neon=506460 rate=0.82 -normalize4_c = [0.45, 0.11, -0.77, -0.44] -normalize4_neon = [0.45, 0.11, -0.77, -0.44] -normalize4: c=1410727 neon=1102802 rate=1.28 - ------------------------------------------------------------------------------------------------------- -CMATH FUNCTION TESTS ------------------------------------------------------------------------------------------------------- -Function Range Number ABS Max Error REL Max Error RMS Error Time Rate ------------------------------------------------------------------------------------------------------- -sinf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 647042739 x1.00 -sinf_c [-3.14, 3.14] 500000 7.75e-07 1.00e+02% 4.09e-07 646276691 x1.00 -sinf_neon [-3.14, 3.14] 500000 1.00e+00 1.00e+02% 7.07e-01 645546381 x1.00 -cosf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 644742077 x1.00 -cosf_c [-3.14, 3.14] 500000 7.75e-07 6.74e-01% 4.15e-07 643957358 x1.00 -cosf_neon [-3.14, 3.14] 500000 1.00e+00 1.00e+02% 7.06e-01 643211256 x1.00 -tanf [-0.79, 0.79] 500000 0.00e+00 0.00e+00% 0.00e+00 642444112 x1.00 -tanf_c [-0.79, 0.79] 500000 2.98e-06 7.94e-04% 1.31e-06 641628507 x1.00 -tanf_neon [-0.79, 0.79] 500000 1.00e+00 1.00e+02% nan 640740514 x1.00 -asinf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 639560380 x1.00 -asinf_c [-1.00, 1.00] 500000 5.54e-05 1.06e-02% nan 638453383 x1.00 -asinf_neon [-1.00, 1.00] 500000 1.57e+00 1.00e+02% 6.84e-01 637349653 x1.00 -acosf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 636078992 x1.00 -acosf_c [-1.00, 1.00] 500000 5.56e-05 6.46e-03% nan 634934201 x1.00 -acosf_neon [-1.00, 1.00] 500000 1.57e+00 1.02e+05% 6.84e-01 633793585 x1.00 -atanf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 632835241 x1.00 -atanf_c [-1.00, 1.00] 500000 1.67e-04 2.12e-02% 7.40e-05 632142823 x1.00 -atanf_neon [-1.00, 1.00] 500000 7.85e-01 0.00e+00% nan 631387330 x1.00 -sinhf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 630142014 x1.00 -sinhf_c [-3.14, 3.14] 
500000 1.91e-06 1.52e-01% 1.85e-07 628992714 x1.00 -sinhf_neon [-3.14, 3.14] 500000 1.15e+01 1.00e+02% 4.55e+00 627998454 x1.00 -coshf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 626869866 x1.00 -coshf_c [-3.14, 3.14] 500000 9.54e-07 2.38e-05% 1.64e-07 625829657 x1.00 -coshf_neon [-3.14, 3.14] 500000 1.06e+01 9.14e+01% 3.92e+00 624873969 x1.00 -tanhf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 623689093 x1.00 -tanhf_c [-3.14, 3.14] 500000 1.20e-05 2.48e-01% 5.48e-06 622547097 x1.00 -tanhf_neon [-3.14, 3.14] 500000 9.96e-01 1.00e+02% 8.26e-01 621506812 x1.00 -expf [0.00, 10.00] 500000 0.00e+00 0.00e+00% 0.00e+00 620497304 x1.00 -expf_c [0.00, 10.00] 500000 9.77e-03 6.15e-05% 1.64e-03 619569554 x1.00 -expf_neon [0.00, 10.00] 500000 2.20e+04 1.00e+02% 4.92e+03 618761400 x1.00 -logf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 617882765 x1.00 -logf_c [1.00, 1000.00] 500000 6.20e-06 1.62e-02% 9.83e-07 617087810 x1.00 -logf_neon [1.00, 1000.00] 500000 9.49e+01 inf% 9.39e+01 616388420 x1.00 -log10f [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 615405364 x1.00 -log10f_c [1.00, 1000.00] 500000 2.86e-06 6.68e-03% 4.79e-07 614442585 x1.00 -log10f_neon [1.00, 1000.00] 500000 4.12e+01 inf% 4.07e+01 613671782 x1.00 -floorf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 611113689 x1.00 -floorf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 608159325 x1.00 -floorf_neon [1.00, 1000.00] 5000000 2.00e+00 2.00e+02% 1.42e-02 604769008 x1.01 -ceilf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 601342443 x1.00 -ceilf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 598387998 x1.00 -ceilf_neon [1.00, 1000.00] 5000000 2.00e+00 1.00e+02% 1.02e+00 594959710 x1.01 -fabsf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 592068236 x1.00 -fabsf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 589808748 x1.00 -fabsf_neon [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 587712180 x1.01 -sqrtf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 
586496654 x1.00 -sqrtf_c [1.00, 1000.00] 500000 2.33e-04 1.06e-03% 8.69e-05 585470866 x1.00 -sqrtf_neon [1.00, 1000.00] 500000 0.00e+00 0.00e+00% nan 584594551 x1.00 -invsqrtf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 583492213 x1.00 -invsqrtf_c [1.00, 1000.00] 500000 4.35e-06 4.78e-04% 2.00e-07 582448164 x1.00 -invsqrtf_neon [1.00, 1000.00] 500000 0.00e+00 0.00e+00% nan 581642365 x1.00 -atan2f [0.10, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 83594269 x1.00 -atan2f_c [0.10, 10.00] 10000 1.73e-04 2.23e-02% 0.00e+00 85383651 x0.98 -atan2f_neon [0.10, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 87387055 x0.96 -powf [1.00, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 93430489 x1.00 -powf_c [1.00, 10.00] 10000 1.08e+05 4.37e-03% 0.00e+00 96726976 x0.97 -powf_neon [1.00, 10.00] 10000 9.97e+09 1.00e+02% 0.00e+00 100185753 x0.93 -fmodf [1.00, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 101653673 x1.00 -fmodf_c [1.00, 10.00] 10000 9.90e+00 8.06e-02% 0.00e+00 103177551 x0.99 -fmodf_neon [1.00, 10.00] 10000 9.99e+00 1.00e+02% 0.00e+00 104771240 x0.97 - diff --git a/deps/math-neon/source/math_acosf.c b/deps/math-neon/source/math_acosf.c deleted file mode 100644 index 59a22b2985..0000000000 --- a/deps/math-neon/source/math_acosf.c +++ /dev/null @@ -1,67 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "math.h" -#include "math_neon.h" - -/* -Test func : acosf(x) -Test Range: -1.0 < x < 1.0 -Peak Error: ~0.005% -RMS Error: ~0.001% -*/ - -const float __acosf_pi_2 = M_PI_2; - -float acosf_c(float x) -{ - return __acosf_pi_2 - asinf_c(x); -} - - -float acosf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asinf_neon_hfp(x); - asm volatile ( - "vdup.f32 d1, %0 \n\t" //d1 = {pi/2, pi/2}; - "vsub.f32 d0, d1, d0 \n\t" //d0 = d1 - d0; - ::"r"(__acosf_pi_2): - ); -#endif -} - -float acosf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vmov.f32 s0, r0 \n\t"); - acosf_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return acosf_c(x); -#endif -} - - - diff --git a/deps/math-neon/source/math_asinf.c b/deps/math-neon/source/math_asinf.c deleted file mode 100644 index 0ae8ef9b84..0000000000 --- a/deps/math-neon/source/math_asinf.c +++ /dev/null @@ -1,183 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or 
substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "math.h" -#include "math_neon.h" - -/* -Test func : asinf(x) -Test Range: -1.0 < x < 1.0 -Peak Error: ~0.005% -RMS Error: ~0.001% -*/ - - -const float __asinf_lut[4] = { - 0.105312459675071, //p7 - 0.169303418571894, //p3 - 0.051599985887214, //p5 - 0.999954835104825 //p1 -}; - -const float __asinf_pi_2 = M_PI_2; - -float asinf_c(float x) -{ - - float a, b, c, d, r, ax; - int m; - - union { - float f; - int i; - } xx; - - ax = fabs(x); - d = 0.5; - d = d - ax*0.5; - - //fast invsqrt approx - xx.f = d; - xx.i = 0x5F3759DF - (xx.i >> 1); //VRSQRTE - c = d * xx.f; - b = (3.0f - c * xx.f) * 0.5; //VRSQRTS - xx.f = xx.f * b; - c = d * xx.f; - b = (3.0f - c * xx.f) * 0.5; - xx.f = xx.f * b; - - //fast inverse approx - d = xx.f; - m = 0x3F800000 - (xx.i & 0x7F800000); - xx.i = xx.i + m; - xx.f = 1.41176471f - 0.47058824f * xx.f; - xx.i = xx.i + m; - b = 2.0 - xx.f * d; - xx.f = xx.f * b; - b = 2.0 - xx.f * d; - xx.f = xx.f * b; - - //if |x|>0.5 -> x = sqrt((1-x)/2) - xx.f = xx.f - ax; - a = (ax > 0.5f); - d = __asinf_pi_2 * a; - c = 1.0f - 3.0f * a; - ax = ax + xx.f * a; - - //polynomial evaluation - xx.f = ax * ax; - a = (__asinf_lut[0] * ax) * xx.f + (__asinf_lut[2] * ax); - b = (__asinf_lut[1] * ax) * xx.f + (__asinf_lut[3] * ax); - xx.f = xx.f * xx.f; - r = b + a * xx.f; - r = d + c * r; - - a = r + r; - b = (x < 0.0f); - r = r - a * b; - return r; -} - - -float asinf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ( - - "vdup.f32 d0, d0[0] \n\t" 
//d0 = {x, x}; - "vdup.f32 d4, %1 \n\t" //d4 = {pi/2, pi/2}; - "vmov.f32 d6, d0 \n\t" //d6 = d0; - "vabs.f32 d0, d0 \n\t" //d0 = fabs(d0) ; - - "vmov.f32 d5, #0.5 \n\t" //d5 = 0.5; - "vmls.f32 d5, d0, d5 \n\t" //d5 = d5 - d0*d5; - - //fast invsqrt approx - "vmov.f32 d1, d5 \n\t" //d1 = d5 - "vrsqrte.f32 d5, d5 \n\t" //d5 = ~ 1.0 / sqrt(d5) - "vmul.f32 d2, d5, d1 \n\t" //d2 = d5 * d1 - "vrsqrts.f32 d3, d2, d5 \n\t" //d3 = (3 - d5 * d2) / 2 - "vmul.f32 d5, d5, d3 \n\t" //d5 = d5 * d3 - "vmul.f32 d2, d5, d1 \n\t" //d2 = d5 * d1 - "vrsqrts.f32 d3, d2, d5 \n\t" //d3 = (3 - d5 * d3) / 2 - "vmul.f32 d5, d5, d3 \n\t" //d5 = d5 * d3 - - //fast reciporical approximation - "vrecpe.f32 d1, d5 \n\t" //d1 = ~ 1 / d5; - "vrecps.f32 d2, d1, d5 \n\t" //d2 = 2.0 - d1 * d5; - "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; - "vrecps.f32 d2, d1, d5 \n\t" //d2 = 2.0 - d1 * d5; - "vmul.f32 d5, d1, d2 \n\t" //d5 = d1 * d2; - - //if |x| > 0.5 -> ax = sqrt((1-ax)/2), r = pi/2 - "vsub.f32 d5, d0, d5 \n\t" //d5 = d0 - d5; - "vmov.f32 d2, #0.5 \n\t" //d2 = 0.5; - "vcgt.f32 d3, d0, d2 \n\t" //d3 = (d0 > d2); - "vmov.f32 d1, #3.0 \n\t" //d5 = 3.0; - "vshr.u32 d3, #31 \n\t" //d3 = d3 >> 31; - "vmov.f32 d16, #1.0 \n\t" //d16 = 1.0; - "vcvt.f32.u32 d3, d3 \n\t" //d3 = (float) d3; - "vmls.f32 d0, d5, d3[0] \n\t" //d0 = d0 - d5 * d3[0]; - "vmul.f32 d7, d4, d3[0] \n\t" //d7 = d5 * d4; - "vmls.f32 d16, d1, d3[0] \n\t" //d16 = d16 - d1 * d3; - - //polynomial: - "vmul.f32 d2, d0, d0 \n\t" //d2 = d0*d0 = {ax^2, ax^2} - "vld1.32 {d4, d5}, [%0] \n\t" //d4 = {p7, p3}, d5 = {p5, p1} - "vmul.f32 d3, d2, d2 \n\t" //d3 = d2*d2 = {x^4, x^4} - "vmul.f32 q0, q2, d0[0] \n\t" //q0 = q2 * d0[0] = {p7x, p3x, p5x, p1x} - "vmla.f32 d1, d0, d2[0] \n\t" //d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3} - "vmla.f32 d1, d3, d1[0] \n\t" //d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7} - - "vmla.f32 d7, d1, d16 \n\t" //d7 = d7 + d1*d16 - - "vadd.f32 d2, d7, d7 \n\t" //d2 = d7 + d7 - "vclt.f32 d3, d6, #0 \n\t" //d3 = (d6 < 
0) - "vshr.u32 d3, #31 \n\t" //d3 = d3 >> 31; - "vcvt.f32.u32 d3, d3 \n\t" //d3 = (float) d3 - "vmls.f32 d7, d2, d3[0] \n\t" //d7 = d7 - d2 * d3[0]; - - "vmov.f32 s0, s15 \n\t" //s0 = s3 - - :: "r"(__asinf_lut), "r"(__asinf_pi_2) - : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" - ); -#endif -} - - -float asinf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vmov.f32 s0, r0 \n\t"); - asinf_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return asinf_c(x); -#endif -} - - - - diff --git a/deps/math-neon/source/math_atan2f.c b/deps/math-neon/source/math_atan2f.c deleted file mode 100644 index d076a04c04..0000000000 --- a/deps/math-neon/source/math_atan2f.c +++ /dev/null @@ -1,170 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -#include "math.h" -#include "math_neon.h" - -const float __atan2f_lut[4] = { - -0.0443265554792128, //p7 - -0.3258083974640975, //p3 - +0.1555786518463281, //p5 - +0.9997878412794807 //p1 -}; - -const float __atan2f_pi_2 = M_PI_2; - -float atan2f_c(float y, float x) -{ - float a, b, c, r, xx; - int m; - union { - float f; - int i; - } xinv; - - //fast inverse approximation (2x newton) - xx = fabs(x); - xinv.f = xx; - m = 0x3F800000 - (xinv.i & 0x7F800000); - xinv.i = xinv.i + m; - xinv.f = 1.41176471f - 0.47058824f * xinv.f; - xinv.i = xinv.i + m; - b = 2.0 - xinv.f * xx; - xinv.f = xinv.f * b; - b = 2.0 - xinv.f * xx; - xinv.f = xinv.f * b; - - c = fabs(y * xinv.f); - - //fast inverse approximation (2x newton) - xinv.f = c; - m = 0x3F800000 - (xinv.i & 0x7F800000); - xinv.i = xinv.i + m; - xinv.f = 1.41176471f - 0.47058824f * xinv.f; - xinv.i = xinv.i + m; - b = 2.0 - xinv.f * c; - xinv.f = xinv.f * b; - b = 2.0 - xinv.f * c; - xinv.f = xinv.f * b; - - //if |x| > 1.0 -> ax = -1/ax, r = pi/2 - xinv.f = xinv.f + c; - a = (c > 1.0f); - c = c - a * xinv.f; - r = a * __atan2f_pi_2; - - //polynomial evaluation - xx = c * c; - a = (__atan2f_lut[0] * c) * xx + (__atan2f_lut[2] * c); - b = (__atan2f_lut[1] * c) * xx + (__atan2f_lut[3] * c); - xx = xx * xx; - r = r + a * xx; - r = r + b; - - //determine quadrant and test for small x. 
- b = M_PI; - b = b - 2.0f * r; - r = r + (x < 0.0f) * b; - b = (fabs(x) < 0.000001f); - c = !b; - r = c * r; - r = r + __atan2f_pi_2 * b; - b = r + r; - r = r - (y < 0.0f) * b; - - return r; -} - -float atan2f_neon_hfp(float y, float x) -{ -#ifdef __MATH_NEON - asm volatile ( - - "vdup.f32 d17, d0[1] \n\t" //d17 = {x, x}; - "vdup.f32 d16, d0[0] \n\t" //d16 = {y, y}; - - //1.0 / x - "vrecpe.f32 d18, d17 \n\t" //d16 = ~ 1 / d1; - "vrecps.f32 d19, d18, d17 \n\t" //d17 = 2.0 - d16 * d1; - "vmul.f32 d18, d18, d19 \n\t" //d16 = d16 * d17; - "vrecps.f32 d19, d18, d17 \n\t" //d17 = 2.0 - d16 * d1; - "vmul.f32 d18, d18, d19 \n\t" //d16 = d16 * d17; - - //y * (1.0 /x) - "vmul.f32 d0, d16, d18 \n\t" //d0 = d16 * d18; - - - "vdup.f32 d4, %1 \n\t" //d4 = {pi/2, pi/2}; - "vmov.f32 d6, d0 \n\t" //d6 = d0; - "vabs.f32 d0, d0 \n\t" //d0 = fabs(d0) ; - - //fast reciporical approximation - "vrecpe.f32 d1, d0 \n\t" //d1 = ~ 1 / d0; - "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; - "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; - "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; - "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; - - //if |x| > 1.0 -> ax = 1/ax, r = pi/2 - "vadd.f32 d1, d1, d0 \n\t" //d1 = d1 + d0; - "vmov.f32 d2, #1.0 \n\t" //d2 = 1.0; - "vcgt.f32 d3, d0, d2 \n\t" //d3 = (d0 > d2); - "vcvt.f32.u32 d3, d3 \n\t" //d3 = (float) d3; - "vmls.f32 d0, d1, d3 \n\t" //d0 = d0 - d1 * d3; - "vmul.f32 d7, d3, d4 \n\t" //d7 = d3 * d4; - - //polynomial: - "vmul.f32 d2, d0, d0 \n\t" //d2 = d0*d0 = {ax^2, ax^2} - "vld1.32 {d4, d5}, [%0] \n\t" //d4 = {p7, p3}, d5 = {p5, p1} - "vmul.f32 d3, d2, d2 \n\t" //d3 = d2*d2 = {x^4, x^4} - "vmul.f32 q0, q2, d0[0] \n\t" //q0 = q2 * d0[0] = {p7x, p3x, p5x, p1x} - "vmla.f32 d1, d0, d2[0] \n\t" //d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3} - "vmla.f32 d1, d3, d1[0] \n\t" //d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7} - "vadd.f32 d1, d1, d7 \n\t" //d1 = d1 + d7 - - "vadd.f32 d2, d1, d1 \n\t" //d2 = d1 + d1 - "vclt.f32 d3, d6, #0 \n\t" 
//d3 = (d6 < 0) - "vcvt.f32.u32 d3, d3 \n\t" //d3 = (float) d3 - "vmls.f32 d1, d3, d2 \n\t" //d1 = d1 - d2 * d3; - - "vmov.f32 s0, s3 \n\t" //s0 = s3 - - :: "r"(__atan2f_lut), "r"(__atan2f_pi_2) - : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" - ); -#endif -} - - -float atan2f_neon_sfp(float x, float y) -{ -#ifdef __MATH_NEON - asm volatile ("vmov.f32 s0, r0 \n\t"); - asm volatile ("vmov.f32 s1, r1 \n\t"); - atan2f_neon_hfp(x, y); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return atan2f_c(y, x); -#endif -}; diff --git a/deps/math-neon/source/math_atanf.c b/deps/math-neon/source/math_atanf.c deleted file mode 100644 index c983756dd2..0000000000 --- a/deps/math-neon/source/math_atanf.c +++ /dev/null @@ -1,149 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -#include "math.h" -#include "math_neon.h" - -const float __atanf_lut[4] = { - -0.0443265554792128, //p7 - -0.3258083974640975, //p3 - +0.1555786518463281, //p5 - +0.9997878412794807 //p1 -}; - -const float __atanf_pi_2 = M_PI_2; - -float atanf_c(float x) -{ - - float a, b, r, xx; - int m; - - union { - float f; - int i; - } xinv, ax; - - ax.f = fabs(x); - - //fast inverse approximation (2x newton) - xinv.f = ax.f; - m = 0x3F800000 - (xinv.i & 0x7F800000); - xinv.i = xinv.i + m; - xinv.f = 1.41176471f - 0.47058824f * xinv.f; - xinv.i = xinv.i + m; - b = 2.0 - xinv.f * ax.f; - xinv.f = xinv.f * b; - b = 2.0 - xinv.f * ax.f; - xinv.f = xinv.f * b; - - //if |x| > 1.0 -> ax = -1/ax, r = pi/2 - xinv.f = xinv.f + ax.f; - a = (ax.f > 1.0f); - ax.f = ax.f - a * xinv.f; - r = a * __atanf_pi_2; - - //polynomial evaluation - xx = ax.f * ax.f; - a = (__atanf_lut[0] * ax.f) * xx + (__atanf_lut[2] * ax.f); - b = (__atanf_lut[1] * ax.f) * xx + (__atanf_lut[3] * ax.f); - xx = xx * xx; - b = b + a * xx; - r = r + b; - - //if x < 0 -> r = -r - a = 2 * r; - b = (x < 0.0f); - r = r - a * b; - - return r; -} - - -float atanf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ( - - "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x}; - - "vdup.f32 d4, %1 \n\t" //d4 = {pi/2, pi/2}; - "vmov.f32 d6, d0 \n\t" //d6 = d0; - "vabs.f32 d0, d0 \n\t" //d0 = fabs(d0) ; - - //fast reciporical approximation - "vrecpe.f32 d1, d0 \n\t" //d1 = ~ 1 / d0; - "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; - "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; - "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; - "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; - - - //if |x| > 1.0 -> ax = -1/ax, r = pi/2 - "vadd.f32 d1, d1, d0 \n\t" //d1 = d1 + d0; - "vmov.f32 d2, #1.0 \n\t" //d2 = 1.0; - "vcgt.f32 d3, d0, d2 \n\t" //d3 = (d0 > d2); - "vshr.u32 d3, #31 \n\t" //d3 = (d0 > d2); - "vcvt.f32.u32 d3, d3 \n\t" //d5 = (float) d3; - "vmls.f32 d0, d1, d3[0] \n\t" //d0 = d0 - d1 * d3[0]; - "vmul.f32 d7, d4, d3[0] \n\t" //d7 = 
d5 * d4; - - //polynomial: - "vmul.f32 d2, d0, d0 \n\t" //d2 = d0*d0 = {ax^2, ax^2} - "vld1.32 {d4, d5}, [%0] \n\t" //d4 = {p7, p3}, d5 = {p5, p1} - "vmul.f32 d3, d2, d2 \n\t" //d3 = d2*d2 = {x^4, x^4} - "vmul.f32 q0, q2, d0[0] \n\t" //q0 = q2 * d0[0] = {p7x, p3x, p5x, p1x} - "vmla.f32 d1, d0, d2[0] \n\t" //d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3} - "vmla.f32 d1, d3, d1[0] \n\t" //d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7} - "vadd.f32 d1, d1, d7 \n\t" //d1 = d1 + d7 - - "vadd.f32 d2, d1, d1 \n\t" //d2 = d1 + d1 - "vclt.f32 d3, d6, #0 \n\t" //d3 = (d6 < 0) - "vshr.u32 d3, #31 \n\t" //d3 = (d0 > d2); - "vcvt.f32.u32 d3, d3 \n\t" //d3 = (float) d3 - "vmls.f32 d1, d3, d2 \n\t" //d1 = d1 - d2 * d3; - - "vmov.f32 s0, s3 \n\t" //s0 = s3 - - :: "r"(__atanf_lut), "r"(__atanf_pi_2) - : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" - ); - -#endif -} - - -float atanf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vdup.f32 d0, r0 \n\t"); - atanf_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return atanf_c(x); -#endif -}; - - - diff --git a/deps/math-neon/source/math_ceilf.c b/deps/math-neon/source/math_ceilf.c deleted file mode 100644 index 1432efee73..0000000000 --- a/deps/math-neon/source/math_ceilf.c +++ /dev/null @@ -1,71 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* -Assumes the floating point value |x| < 2147483648 -*/ - -#include "math.h" -#include "math_neon.h" - -float ceilf_c(float x) -{ - int n; - float r; - n = (int) x; - r = (float) n; - r = r + (x > r); - return r; -} - -float ceilf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ( - - "vcvt.s32.f32 d1, d0 \n\t" //d1 = (int) d0; - "vcvt.f32.s32 d1, d1 \n\t" //d1 = (float) d1; - "vcgt.f32 d0, d0, d1 \n\t" //d0 = (d0 > d1); - "vshr.u32 d0, #31 \n\t" //d0 = d0 >> 31; - "vcvt.f32.u32 d0, d0 \n\t" //d0 = (float) d0; - "vadd.f32 d0, d1, d0 \n\t" //d0 = d1 + d0; - - ::: "d0", "d1" - ); - -#endif -} - -float ceilf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vmov.f32 s0, r0 \n\t"); - ceilf_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return ceilf_c(x); -#endif -}; - - diff --git a/deps/math-neon/source/math_cosf.c b/deps/math-neon/source/math_cosf.c deleted file mode 100644 index cb14498069..0000000000 --- a/deps/math-neon/source/math_cosf.c +++ /dev/null @@ -1,50 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, 
subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "math_neon.h" - -float cosf_c(float x) -{ - return sinf_c(x + M_PI_2); -} - -float cosf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - float xx = x + M_PI_2; - return sinf_neon_hfp(xx); -#endif -} - -float cosf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vdup.f32 d0, r0 \n\t"); - cosf_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return cosf_c(x); -#endif -}; - diff --git a/deps/math-neon/source/math_coshf.c b/deps/math-neon/source/math_coshf.c deleted file mode 100644 index a779b6a7be..0000000000 --- a/deps/math-neon/source/math_coshf.c +++ /dev/null @@ -1,120 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "math.h" -#include "math_neon.h" - -const float __coshf_rng[2] = { - 1.442695041f, - 0.693147180f -}; - -const float __coshf_lut[16] = { - 0.00019578093328483123, //p7 - 0.00019578093328483123, //p7 - 0.0014122663401803872, //p6 - 0.0014122663401803872, //p6 - 0.008336936973260111, //p5 - 0.008336936973260111, //p5 - 0.04165989275009526, //p4 - 0.04165989275009526, //p4 - 0.16666570253074878, //p3 - 0.16666570253074878, //p3 - 0.5000006143673624, //p2 - 0.5000006143673624, //p2 - 1.000000059694879, //p1 - 1.000000059694879, //p1 - 0.9999999916728642, //p0 - 0.9999999916728642 //p0 -}; - - -float coshf_c(float x) -{ - float a, b, xx; - xx = -x; - a = expf_c(x); - b = expf_c(xx); - a = a * 0.5f; - a = a + 0.5f * b; - return a; -} - - -float coshf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ( - "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x} - "fnegs s1, s1 \n\t" //s1 = -s1 - - //Range Reduction: - "vld1.32 d2, [%0] \n\t" //d2 = {invrange, range} - "vld1.32 {d16, d17}, [%1]! \n\t" - "vmul.f32 d6, d0, d2[0] \n\t" //d6 = d0 * d2[0] - "vcvt.s32.f32 d6, d6 \n\t" //d6 = (int) d6 - "vld1.32 {d18}, [%1]! \n\t" - "vcvt.f32.s32 d1, d6 \n\t" //d1 = (float) d6 - "vld1.32 {d19}, [%1]! \n\t" - "vmls.f32 d0, d1, d2[1] \n\t" //d0 = d0 - d1 * d2[1] - "vld1.32 {d20}, [%1]! \n\t" - - //polynomial: - "vmla.f32 d17, d16, d0 \n\t" //d17 = d17 + d16 * d0; - "vld1.32 {d21}, [%1]! \n\t" - "vmla.f32 d18, d17, d0 \n\t" //d18 = d18 + d17 * d0; - "vld1.32 {d22}, [%1]! 
\n\t" - "vmla.f32 d19, d18, d0 \n\t" //d19 = d19 + d18 * d0; - "vld1.32 {d23}, [%1]! \n\t" - "vmla.f32 d20, d19, d0 \n\t" //d20 = d20 + d19 * d0; - "vmla.f32 d21, d20, d0 \n\t" //d21 = d21 + d20 * d0; - "vmla.f32 d22, d21, d0 \n\t" //d22 = d22 + d21 * d0; - "vmla.f32 d23, d22, d0 \n\t" //d23 = d23 + d22 * d0; - - //multiply by 2 ^ m - "vshl.i32 d6, d6, #23 \n\t" //d6 = d6 << 23 - "vadd.i32 d0, d23, d6 \n\t" //d0 = d22 + d6 - - "vdup.f32 d2, d0[1] \n\t" //d2 = s1 - "vmov.f32 d1, #0.5 \n\t" //d1 = 0.5 - "vadd.f32 d0, d0, d2 \n\t" //d0 = d0 + d2 - "vmul.f32 d0, d1 \n\t" //d0 = d0 * d1 - - :: "r"(__coshf_rng), "r"(__coshf_lut) - : "d0", "d1", "q1", "q2", "d6" - ); - -#endif -} - -float coshf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vmov.f32 s0, r0 \n\t"); - coshf_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return coshf_c(x); -#endif -}; diff --git a/deps/math-neon/source/math_expf.c b/deps/math-neon/source/math_expf.c deleted file mode 100644 index 011b9495bd..0000000000 --- a/deps/math-neon/source/math_expf.c +++ /dev/null @@ -1,135 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* -Based on: - - e ^ x = (1+m) * (2^n) - x = log(1+m) + n * log(2) - n = (int) (x * 1.0 / log(2)) - (1+m) = e ^ (x - n * log(2)) - (1+m) = Poly(x - n * log(2)) - - where Poly(x) is the Minimax approximation of e ^ x over the - range [-Log(2), Log(2)] - -Test func : expf(x) -Test Range: 0 < x < 50 -Peak Error: ~0.00024% -RMS Error: ~0.00007% -*/ - -#include "math.h" -#include "math_neon.h" - -const float __expf_rng[2] = { - 1.442695041f, - 0.693147180f -}; - -const float __expf_lut[8] = { - 0.9999999916728642, //p0 - 0.04165989275009526, //p4 - 0.5000006143673624, //p2 - 0.0014122663401803872, //p6 - 1.000000059694879, //p1 - 0.008336936973260111, //p5 - 0.16666570253074878, //p3 - 0.00019578093328483123 //p7 -}; - -float expf_c(float x) -{ - float a, b, c, d, xx; - int m; - - union { - float f; - int i; - } r; - - //Range Reduction: - m = (int) (x * __expf_rng[0]); - x = x - ((float) m) * __expf_rng[1]; - - //Taylor Polynomial (Estrins) - a = (__expf_lut[4] * x) + (__expf_lut[0]); - b = (__expf_lut[6] * x) + (__expf_lut[2]); - c = (__expf_lut[5] * x) + (__expf_lut[1]); - d = (__expf_lut[7] * x) + (__expf_lut[3]); - xx = x * x; - a = a + b * xx; - c = c + d * xx; - xx = xx* xx; - r.f = a + c * xx; - - //multiply by 2 ^ m - m = m << 23; - r.i = r.i + m; - - return r.f; -} - -float expf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ( - "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x} - - //Range Reduction: - "vld1.32 d2, [%0] \n\t" //d2 = {invrange, range} - "vmul.f32 d6, d0, d2[0] \n\t" //d6 = d0 * d2[0] - "vcvt.s32.f32 d6, d6 \n\t" //d6 = (int) d6 - "vcvt.f32.s32 d1, d6 \n\t" //d1 = (float) d6 - "vmls.f32 d0, d1, d2[1] \n\t" //d0 = d0 - d1 * d2[1] - - //polynomial: - "vmul.f32 d1, d0, d0 \n\t" //d1 
= d0*d0 = {x^2, x^2} - "vld1.32 {d2, d3, d4, d5}, [%1] \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ; - "vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0] - "vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0] - "vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4} - "vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1] - - //multiply by 2 ^ m - "vshl.i32 d6, d6, #23 \n\t" //d6 = d6 << 23 - "vadd.i32 d0, d2, d6 \n\t" //d0 = d2 + d6 - - :: "r"(__expf_rng), "r"(__expf_lut) - : "d0", "d1", "q1", "q2", "d6" - ); -#endif -} - -float expf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vmov.f32 s0, r0 \n\t"); - expf_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return expf_c(x); -#endif -}; - diff --git a/deps/math-neon/source/math_fabsf.c b/deps/math-neon/source/math_fabsf.c deleted file mode 100644 index c22244704f..0000000000 --- a/deps/math-neon/source/math_fabsf.c +++ /dev/null @@ -1,58 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "math_neon.h" - - -float fabsf_c(float x) -{ - union { - int i; - float f; - } xx; - - xx.f = x; - xx.i = xx.i & 0x7FFFFFFF; - return xx.f; -} - -float fabsf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ( - "fabss s0, s0 \n\t" //s0 = fabs(s0) - ); -#endif -} - -float fabsf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ( - "bic r0, r0, #0x80000000 \n\t" //r0 = r0 & ~(1 << 31) - ); -#else - return fabsf_c(x); -#endif -} diff --git a/deps/math-neon/source/math_floorf.c b/deps/math-neon/source/math_floorf.c deleted file mode 100644 index 091709140e..0000000000 --- a/deps/math-neon/source/math_floorf.c +++ /dev/null @@ -1,66 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* -Assumes the floating point value |x| < 2147483648 -*/ - -#include "math.h" -#include "math_neon.h" - -float floorf_c(float x) -{ - int n; - float r; - n = (int) x; - r = (float) n; - r = r - (r > x); - return r; -} - -float floorf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ( - "vcvt.s32.f32 d1, d0 \n\t" //d1 = (int) d0; - "vcvt.f32.s32 d1, d1 \n\t" //d1 = (float) d1; - "vcgt.f32 d0, d1, d0 \n\t" //d0 = (d1 > d0); - "vshr.u32 d0, #31 \n\t" //d0 = d0 >> 31; - "vcvt.f32.u32 d0, d0 \n\t" //d0 = (float) d0; - "vsub.f32 d0, d1, d0 \n\t" //d0 = d1 - d0; - ::: "d0", "d1" - ); -#endif -} - -float floorf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vmov.f32 s0, r0 \n\t"); - floorf_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return floorf_c(x); -#endif -}; diff --git a/deps/math-neon/source/math_fmodf.c b/deps/math-neon/source/math_fmodf.c deleted file mode 100644 index 86af55da34..0000000000 --- a/deps/math-neon/source/math_fmodf.c +++ /dev/null @@ -1,100 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* -Assumes the floating point value |x / y| < 2,147,483,648 -*/ - -#include "math_neon.h" - -float fmodf_c(float x, float y) -{ - int n; - union { - float f; - int i; - } yinv; - float a; - - //fast reciporical approximation (4x Newton) - yinv.f = y; - n = 0x3F800000 - (yinv.i & 0x7F800000); - yinv.i = yinv.i + n; - yinv.f = 1.41176471f - 0.47058824f * yinv.f; - yinv.i = yinv.i + n; - a = 2.0 - yinv.f * y; - yinv.f = yinv.f * a; - a = 2.0 - yinv.f * y; - yinv.f = yinv.f * a; - a = 2.0 - yinv.f * y; - yinv.f = yinv.f * a; - a = 2.0 - yinv.f * y; - yinv.f = yinv.f * a; - - n = (int)(x * yinv.f); - x = x - ((float)n) * y; - return x; -} - - -float fmodf_neon_hfp(float x, float y) -{ -#ifdef __MATH_NEON - asm volatile ( - "vdup.f32 d1, d0[1] \n\t" //d1[0] = y - "vdup.f32 d0, d0[0] \n\t" //d1[0] = y - - //fast reciporical approximation - "vrecpe.f32 d2, d1 \n\t" //d2 = ~1.0 / d1 - "vrecps.f32 d3, d2, d1 \n\t" //d3 = 2.0 - d2 * d1; - "vmul.f32 d2, d2, d3 \n\t" //d2 = d2 * d3; - "vrecps.f32 d3, d2, d1 \n\t" //d3 = 2.0 - d2 * d1; - "vmul.f32 d2, d2, d3 \n\t" //d2 = d2 * d3; - "vrecps.f32 d3, d2, d1 \n\t" //d3 = 2.0 - d2 * d1; - "vmul.f32 d2, d2, d3 \n\t" //d2 = d2 * d3; - "vrecps.f32 d3, d2, d1 \n\t" //d3 = 2.0 - d2 * d1; - "vmul.f32 d2, d2, d3 \n\t" //d2 = d2 * d3; - - "vmul.f32 d2, d2, d0 \n\t" //d2 = d2 * d0; - "vcvt.s32.f32 d2, d2 \n\t" //d2 = (int) d2; - "vcvt.f32.s32 d2, d2 \n\t" //d2 = (float) d2; - "vmls.f32 d0, d1, d2 \n\t" //d0 = d0 - d1 * d2; - - ::: "d0", "d1", "d2", "d3" - ); -#endif -} - - 
-float fmodf_neon_sfp(float x, float y) -{ -#ifdef __MATH_NEON - asm volatile ("vmov.f32 s0, r0 \n\t"); - asm volatile ("vmov.f32 s1, r1 \n\t"); - fmodf_neon_hfp(x, y); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return fmodf_c(x,y); -#endif -}; diff --git a/deps/math-neon/source/math_invsqrtf.c b/deps/math-neon/source/math_invsqrtf.c deleted file mode 100644 index c4d2b1d52a..0000000000 --- a/deps/math-neon/source/math_invsqrtf.c +++ /dev/null @@ -1,79 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -#include "math.h" -#include "math_neon.h" - -float invsqrtf_c(float x) -{ - - float b, c; - union { - float f; - int i; - } a; - - //fast invsqrt approx - a.f = x; - a.i = 0x5F3759DF - (a.i >> 1); //VRSQRTE - c = x * a.f; - b = (3.0f - c * a.f) * 0.5; //VRSQRTS - a.f = a.f * b; - c = x * a.f; - b = (3.0f - c * a.f) * 0.5; - a.f = a.f * b; - - return a.f; -} - -float invsqrtf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ( - - "vmov.f32 d1, d0 \n\t" //d1 = d0 - "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) - "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 - "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2 - "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 - "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 - "vrsqrts.f32 d3, d2, d0 \n\t" //d4 = (3 - d0 * d3) / 2 - "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d4 - - ::: "d0", "d1", "d2", "d3" - ); -#endif -} - -float invsqrtf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vmov.f32 s0, r0 \n\t"); - invsqrtf_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return invsqrtf_c(x); -#endif -}; - diff --git a/deps/math-neon/source/math_ldexpf.c b/deps/math-neon/source/math_ldexpf.c deleted file mode 100644 index 673158958f..0000000000 --- a/deps/math-neon/source/math_ldexpf.c +++ /dev/null @@ -1,67 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "math.h" -#include "math_neon.h" - -float ldexpf_c(float m, int e) -{ - union { - float f; - int i; - } r; - r.f = m; - r.i += (e << 23); - return r.f; -} - -float ldexpf_neon_hfp(float m, int e) -{ -#ifdef __MATH_NEON - float r; - asm volatile ( - "lsl r0, r0, #23 \n\t" //r0 = r0 << 23 - "vdup.i32 d1, r0 \n\t" //d1 = {r0, r0} - "vadd.i32 d0, d0, d1 \n\t" //d0 = d0 + d1 - ::: "d0", "d1" - ); -#endif -} - -float ldexpf_neon_sfp(float m, int e) -{ -#ifdef __MATH_NEON - float r; - asm volatile ( - "lsl r1, r1, #23 \n\t" //r1 = r1 << 23 - "vdup.f32 d0, r0 \n\t" //d0 = {r0, r0} - "vdup.i32 d1, r1 \n\t" //d1 = {r1, r1} - "vadd.i32 d0, d0, d1 \n\t" //d0 = d0 + d1 - "vmov.f32 r0, s0 \n\t" //r0 = s0 - ::: "d0", "d1" - ); -#else - return ldexpf_c(m,e); -#endif -} diff --git a/deps/math-neon/source/math_log10f.c b/deps/math-neon/source/math_log10f.c deleted file mode 100644 index f68912f0fe..0000000000 --- a/deps/math-neon/source/math_log10f.c +++ /dev/null @@ -1,135 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: 
- -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* -Based on: - - log10(x) = log10((1+m) * (2^n)) - log(x) = n * log10(2) + log10(1 + m) - log(1+m) = Poly(1+m) - - where Poly(x) is the Minimax approximation of log10(x) over the - range [1, 2] - -Test func : log10f(x) -Test Range: 1 < x < 10000 -Peak Error: ~0.000040% -RMS Error: ~0.000008% -*/ - -#include "math.h" -#include "math_neon.h" - -const float __log10f_rng = 0.3010299957f; - -const float __log10f_lut[8] = { - -0.99697286229624, //p0 - -1.07301643912502, //p4 - -2.46980061535534, //p2 - -0.07176870463131, //p6 - 2.247870219989470, //p1 - 0.366547581117400, //p5 - 1.991005185100089, //p3 - 0.006135635201050, //p7 -}; - -float log10f_c(float x) -{ - float a, b, c, d, xx; - int m; - - union { - float f; - int i; - } r; - - //extract exponent - r.f = x; - m = (r.i >> 23); - m = m - 127; - r.i = r.i - (m << 23); - - //Taylor Polynomial (Estrins) - xx = r.f * r.f; - a = (__log10f_lut[4] * r.f) + (__log10f_lut[0]); - b = (__log10f_lut[6] * r.f) + (__log10f_lut[2]); - c = (__log10f_lut[5] * r.f) + (__log10f_lut[1]); - d = (__log10f_lut[7] * r.f) + (__log10f_lut[3]); - a = a + b * xx; - c = c + d * xx; - xx = xx * xx; - r.f = a + c * xx; - - //add exponent - r.f = r.f + ((float) m) * __log10f_rng; - - return r.f; -} - -float log10f_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ( - - "vdup.f32 d0, d0[0] \n\t" //d0 = {x,x}; - - //extract exponent - "vmov.i32 
d2, #127 \n\t" //d2 = 127; - "vshr.u32 d6, d0, #23 \n\t" //d6 = d0 >> 23; - "vsub.i32 d6, d6, d2 \n\t" //d6 = d6 - d2; - "vshl.u32 d1, d6, #23 \n\t" //d1 = d6 << 23; - "vsub.i32 d0, d0, d1 \n\t" //d0 = d0 + d1; - - //polynomial: - "vmul.f32 d1, d0, d0 \n\t" //d1 = d0*d0 = {x^2, x^2} - "vld1.32 {d2, d3, d4, d5}, [%1] \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ; - "vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0] - "vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0] - "vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4} - "vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1] - - //add exponent - "vdup.32 d7, %0 \n\t" //d7 = {rng, rng} - "vcvt.f32.s32 d6, d6 \n\t" //d6 = (float) d6 - "vmla.f32 d2, d6, d7 \n\t" //d2 = d2 + d6 * d7 - - "vmov.f32 s0, s4 \n\t" //s0 = s4 - - :: "r"(__log10f_rng), "r"(__log10f_lut) - : "d0", "d1", "q1", "q2", "d6", "d7" - ); -#endif -} - - -float log10f_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vmov.f32 s0, r0 \n\t"); - log10f_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return log10f_c(x); -#endif -}; diff --git a/deps/math-neon/source/math_logf.c b/deps/math-neon/source/math_logf.c deleted file mode 100644 index 61761363e5..0000000000 --- a/deps/math-neon/source/math_logf.c +++ /dev/null @@ -1,135 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* -Based on: - - log(x) = log((1+m) * (2^n)) - log(x) = n * log(2) + log(1 + m) - log(1+m) = Poly(1+m) - - where Poly(x) is the Minimax approximation of log(x) over the - range [1, 2] - -Test func : logf(x) -Test Range: 1 < x < 10000 -Peak Error: ~0.000601% -RMS Error: ~0.000005% -*/ - -#include "math.h" -#include "math_neon.h" - -const float __logf_rng = 0.693147180f; - -const float __logf_lut[8] = { - -2.295614848256274, //p0 - -2.470711633419806, //p4 - -5.686926051100417, //p2 - -0.165253547131978, //p6 - +5.175912446351073, //p1 - +0.844006986174912, //p5 - +4.584458825456749, //p3 - +0.014127821926000 //p7 -}; - -float logf_c(float x) -{ - float a, b, c, d, xx; - int m; - - union { - float f; - int i; - } r; - - //extract exponent - r.f = x; - m = (r.i >> 23); - m = m - 127; - r.i = r.i - (m << 23); - - //Taylor Polynomial (Estrins) - xx = r.f * r.f; - a = (__logf_lut[4] * r.f) + (__logf_lut[0]); - b = (__logf_lut[6] * r.f) + (__logf_lut[2]); - c = (__logf_lut[5] * r.f) + (__logf_lut[1]); - d = (__logf_lut[7] * r.f) + (__logf_lut[3]); - a = a + b * xx; - c = c + d * xx; - xx = xx * xx; - r.f = a + c * xx; - - //add exponent - r.f = r.f + ((float) m) * __logf_rng; - - return r.f; -} - -float logf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ( - - "vdup.f32 d0, d0[0] \n\t" //d0 = {x,x}; - - //extract exponent - "vmov.i32 d2, #127 \n\t" //d2 = 127; - "vshr.u32 d6, d0, #23 \n\t" //d6 = d0 >> 23; - "vsub.i32 d6, d6, d2 \n\t" //d6 = d6 - d2; - "vshl.u32 d1, d6, #23 \n\t" //d1 = d6 << 
23; - "vsub.i32 d0, d0, d1 \n\t" //d0 = d0 + d1; - - //polynomial: - "vmul.f32 d1, d0, d0 \n\t" //d1 = d0*d0 = {x^2, x^2} - "vld1.32 {d2, d3, d4, d5}, [%1] \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ; - "vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0] - "vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0] - "vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4} - "vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1] - - //add exponent - "vdup.32 d7, %0 \n\t" //d7 = {rng, rng} - "vcvt.f32.s32 d6, d6 \n\t" //d6 = (float) d6 - "vmla.f32 d2, d6, d7 \n\t" //d2 = d2 + d6 * d7 - - "vmov.f32 s0, s4 \n\t" //s0 = s4 - - :: "r"(__logf_rng), "r"(__logf_lut) - : "d0", "d1", "q1", "q2", "d6", "d7" - ); -#endif -} - -float logf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vmov.f32 s0, r0 \n\t"); - logf_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return logf_c(x); -#endif -}; - diff --git a/deps/math-neon/source/math_mat2.c b/deps/math-neon/source/math_mat2.c deleted file mode 100644 index 0baad4b771..0000000000 --- a/deps/math-neon/source/math_mat2.c +++ /dev/null @@ -1,95 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* -Matrices are specified in column major format: - -| a c | -| b d | - -therefore m[2] = c -*/ - -#include "math_neon.h" - -//matrix matrix multipication. d = m0 * m1; -void -matmul2_c(float m0[4], float m1[4], float d[4]) -{ - d[0] = m0[0]*m1[0] + m0[2]*m1[1]; - d[1] = m0[1]*m1[0] + m0[3]*m1[1]; - d[2] = m0[0]*m1[2] + m0[2]*m1[3]; - d[3] = m0[1]*m1[2] + m0[3]*m1[3]; -} - -void -matmul2_neon(float m0[4], float m1[4], float d[4]) -{ -#ifdef __MATH_NEON - asm volatile ( - "vld1.32 {d0, d1}, [%0] \n\t" //Q1 = m0 - "vld1.32 {d2, d3}, [%1] \n\t" //Q2 = m1 - - "vmul.f32 d4, d0, d2[0] \n\t" //D4 = D0*D2[0] - "vmul.f32 d5, d0, d3[0] \n\t" //D5 = D0*D3[0] - "vmla.f32 d4, d1, d2[1] \n\t" //D4 += D1*D2[1] - "vmla.f32 d5, d1, d3[1] \n\t" //D5 += D1*D3[1] - - "vst1.32 {d4, d5}, [%2] \n\t" //Q4 = m+12 - :: "r"(m0), "r"(m1), "r"(d) - : "q0", "q1", "q2", "memory" - ); -#else - matmul2_c(m0, m1, d); -#endif -} - - -//matrix vector multiplication. 
d = m * v -void -matvec2_c(float m[4], float v[2], float d[2]) -{ - d[0] = m[0]*v[0] + m[2]*v[1]; - d[1] = m[1]*v[0] + m[3]*v[1]; -} - -void -matvec2_neon(float m[4], float v[2], float d[2]) -{ -#ifdef __MATH_NEON - asm volatile ( - "vld1.32 d0, [%1] \n\t" //d0 = v - "vld1.32 {d1, d2}, [%0] \n\t" //Q1 = m - - "vmul.f32 d3, d1, d0[0] \n\t" //Q5 = Q1*d0[0] - "vmla.f32 d3, d2, d0[1] \n\t" //Q5 += Q1*d0[1] - - "vst1.32 d3, [%2] \n\t" //Q4 = m+12 - :: "r"(m), "r"(v), "r"(d) - : "d0", "d1", "d2","d3", "memory" - ); -#else - matvec2_c(m, v, d); -#endif -} diff --git a/deps/math-neon/source/math_mat3.c b/deps/math-neon/source/math_mat3.c deleted file mode 100644 index aae178e179..0000000000 --- a/deps/math-neon/source/math_mat3.c +++ /dev/null @@ -1,131 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -/* -Matrices are specified in row major format: - -| x0 x2 | -| x1 x3 | - -therefore m[2] = x2 - -*/ - -#include "math_neon.h" - -//matrix matrix multipication. d = m0 * m1; -void -matmul3_c(float m0[9], float m1[9], float d[9]) -{ - d[0] = m0[0]*m1[0] + m0[3]*m1[1] + m0[6]*m1[2]; - d[1] = m0[1]*m1[0] + m0[4]*m1[1] + m0[7]*m1[2]; - d[2] = m0[2]*m1[0] + m0[5]*m1[1] + m0[8]*m1[2]; - d[3] = m0[0]*m1[3] + m0[3]*m1[4] + m0[6]*m1[5]; - d[4] = m0[1]*m1[3] + m0[4]*m1[4] + m0[7]*m1[5]; - d[5] = m0[2]*m1[3] + m0[5]*m1[4] + m0[8]*m1[5]; - d[6] = m0[0]*m1[6] + m0[3]*m1[7] + m0[6]*m1[8]; - d[7] = m0[1]*m1[6] + m0[4]*m1[7] + m0[7]*m1[8]; - d[8] = m0[2]*m1[6] + m0[5]*m1[7] + m0[8]*m1[8]; -} - -void -matmul3_neon(float m0[9], float m1[9], float d[9]) -{ -#ifdef __MATH_NEON - asm volatile ( - "vld1.32 {d0, d1}, [%1]! \n\t" //q0 = m1 - "vld1.32 {d2, d3}, [%1]! \n\t" //q1 = m1+4 - "flds s8, [%1] \n\t" //q2 = m1+8 - - "vld1.32 {d6, d7}, [%0] \n\t" //q3[0] = m0 - "add %0, %0, #12 \n\t" //q3[0] = m0 - "vld1.32 {d8, d9}, [%0] \n\t" //q4[0] = m0+12 - "add %0, %0, #12 \n\t" //q3[0] = m0 - "vld1.32 {d10}, [%0] \n\t" //q5[0] = m0+24 - "add %0, %0, #8 \n\t" //q3[0] = m0 - "flds s22, [%0] \n\t" //q2 = m1+8 - - "vmul.f32 q6, q3, d0[0] \n\t" //q12 = q3 * d0[0] - "vmul.f32 q7, q3, d1[1] \n\t" //q13 = q3 * d2[0] - "vmul.f32 q8, q3, d3[0] \n\t" //q14 = q3 * d4[0] - "vmla.f32 q6, q4, d0[1] \n\t" //q12 = q9 * d0[1] - "vmla.f32 q7, q4, d2[0] \n\t" //q13 = q9 * d2[1] - "vmla.f32 q8, q4, d3[1] \n\t" //q14 = q9 * d4[1] - "vmla.f32 q6, q5, d1[0] \n\t" //q12 = q10 * d0[0] - "vmla.f32 q7, q5, d2[1] \n\t" //q13 = q10 * d2[0] - "vmla.f32 q8, q5, d4[0] \n\t" //q14 = q10 * d4[0] - - "vmov.f32 q0, q8 \n\t" //q14 = q10 * d4[0] - "vst1.32 {d12, d13}, [%2] \n\t" //d = q12 - "add %2, %2, #12 \n\t" //q3[0] = m0 - "vst1.32 {d14, d15}, [%2] \n\t" //d+4 = q13 - "add %2, %2, #12 \n\t" //q3[0] = m0 - "vst1.32 {d0}, [%2] \n\t" //d+8 = q14 - "add %2, %2, #8 \n\t" //q3[0] = m0 - "fsts s2, [%2] \n\t" //d = q12 - - : 
"+r"(m0), "+r"(m1), "+r"(d): - : "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "memory" - ); -#else - matmul3_c(m0, m1, d); -#endif -}; - -//matrix vector multiplication. d = m * v -void -matvec3_c(float m[9], float v[3], float d[3]) -{ - d[0] = m[0]*v[0] + m[3]*v[1] + m[6]*v[2]; - d[1] = m[1]*v[0] + m[4]*v[1] + m[7]*v[2]; - d[2] = m[2]*v[0] + m[5]*v[1] + m[8]*v[2]; -} - -void -matvec3_neon(float m[9], float v[3], float d[3]) -{ -#ifdef __MATH_NEON - int tmp; - asm volatile ( - "mov %3, #12 \n\t" //r3 = 12 - "vld1.32 {d0, d1}, [%1] \n\t" //Q0 = v - "vld1.32 {d2, d3}, [%0], %3 \n\t" //Q1 = m - "vld1.32 {d4, d5}, [%0], %3 \n\t" //Q2 = m+12 - "vld1.32 {d6, d7}, [%0], %3 \n\t" //Q3 = m+24 - - "vmul.f32 q9, q1, d0[0] \n\t" //Q9 = Q1*Q0[0] - "vmla.f32 q9, q2, d0[1] \n\t" //Q9 += Q2*Q0[1] - "vmla.f32 q9, q3, d1[0] \n\t" //Q9 += Q3*Q0[2] - "vmov.f32 q0, q9 \n\t" //Q0 = q9 - - "vst1.32 d0, [%2]! \n\t" //r2 = D24 - "fsts s2, [%2] \n\t" //r2 = D25[0] - - : "+r"(m), "+r"(v), "+r"(d), "+r"(tmp): - : "q0", "q9", "q10","q11", "q12", "q13", "memory" - ); -#else - matvec3_c(m, v, d); -#endif -} diff --git a/deps/math-neon/source/math_mat4.c b/deps/math-neon/source/math_mat4.c deleted file mode 100644 index 5bcf34b596..0000000000 --- a/deps/math-neon/source/math_mat4.c +++ /dev/null @@ -1,144 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* -Matrices are specified in row major format: - -| x0 x2 | -| x1 x3 | - -therefore m[2] = x2 - -*/ - -#include "math_neon.h" - -//matrix matrix multipication. d = m0 * m1; -void -matmul4_c(float m0[16], float m1[16], float d[16]) -{ - d[0] = m0[0]*m1[0] + m0[4]*m1[1] + m0[8]*m1[2] + m0[12]*m1[3]; - d[1] = m0[1]*m1[0] + m0[5]*m1[1] + m0[9]*m1[2] + m0[13]*m1[3]; - d[2] = m0[2]*m1[0] + m0[6]*m1[1] + m0[10]*m1[2] + m0[14]*m1[3]; - d[3] = m0[3]*m1[0] + m0[7]*m1[1] + m0[11]*m1[2] + m0[15]*m1[3]; - d[4] = m0[0]*m1[4] + m0[4]*m1[5] + m0[8]*m1[6] + m0[12]*m1[7]; - d[5] = m0[1]*m1[4] + m0[5]*m1[5] + m0[9]*m1[6] + m0[13]*m1[7]; - d[6] = m0[2]*m1[4] + m0[6]*m1[5] + m0[10]*m1[6] + m0[14]*m1[7]; - d[7] = m0[3]*m1[4] + m0[7]*m1[5] + m0[11]*m1[6] + m0[15]*m1[7]; - d[8] = m0[0]*m1[8] + m0[4]*m1[9] + m0[8]*m1[10] + m0[12]*m1[11]; - d[9] = m0[1]*m1[8] + m0[5]*m1[9] + m0[9]*m1[10] + m0[13]*m1[11]; - d[10] = m0[2]*m1[8] + m0[6]*m1[9] + m0[10]*m1[10] + m0[14]*m1[11]; - d[11] = m0[3]*m1[8] + m0[7]*m1[9] + m0[11]*m1[10] + m0[15]*m1[11]; - d[12] = m0[0]*m1[12] + m0[4]*m1[13] + m0[8]*m1[14] + m0[12]*m1[15]; - d[13] = m0[1]*m1[12] + m0[5]*m1[13] + m0[9]*m1[14] + m0[13]*m1[15]; - d[14] = m0[2]*m1[12] + m0[6]*m1[13] + m0[10]*m1[14] + m0[14]*m1[15]; - d[15] = m0[3]*m1[12] + m0[7]*m1[13] + m0[11]*m1[14] + m0[15]*m1[15]; -} - -void -matmul4_neon(float m0[16], float m1[16], float d[16]) -{ -#ifdef __MATH_NEON - asm volatile ( - "vld1.32 {d0, d1}, [%1]! \n\t" //q0 = m1 - "vld1.32 {d2, d3}, [%1]! 
\n\t" //q1 = m1+4 - "vld1.32 {d4, d5}, [%1]! \n\t" //q2 = m1+8 - "vld1.32 {d6, d7}, [%1] \n\t" //q3 = m1+12 - "vld1.32 {d16, d17}, [%0]! \n\t" //q8 = m0 - "vld1.32 {d18, d19}, [%0]! \n\t" //q9 = m0+4 - "vld1.32 {d20, d21}, [%0]! \n\t" //q10 = m0+8 - "vld1.32 {d22, d23}, [%0] \n\t" //q11 = m0+12 - - "vmul.f32 q12, q8, d0[0] \n\t" //q12 = q8 * d0[0] - "vmul.f32 q13, q8, d2[0] \n\t" //q13 = q8 * d2[0] - "vmul.f32 q14, q8, d4[0] \n\t" //q14 = q8 * d4[0] - "vmul.f32 q15, q8, d6[0] \n\t" //q15 = q8 * d6[0] - "vmla.f32 q12, q9, d0[1] \n\t" //q12 = q9 * d0[1] - "vmla.f32 q13, q9, d2[1] \n\t" //q13 = q9 * d2[1] - "vmla.f32 q14, q9, d4[1] \n\t" //q14 = q9 * d4[1] - "vmla.f32 q15, q9, d6[1] \n\t" //q15 = q9 * d6[1] - "vmla.f32 q12, q10, d1[0] \n\t" //q12 = q10 * d0[0] - "vmla.f32 q13, q10, d3[0] \n\t" //q13 = q10 * d2[0] - "vmla.f32 q14, q10, d5[0] \n\t" //q14 = q10 * d4[0] - "vmla.f32 q15, q10, d7[0] \n\t" //q15 = q10 * d6[0] - "vmla.f32 q12, q11, d1[1] \n\t" //q12 = q11 * d0[1] - "vmla.f32 q13, q11, d3[1] \n\t" //q13 = q11 * d2[1] - "vmla.f32 q14, q11, d5[1] \n\t" //q14 = q11 * d4[1] - "vmla.f32 q15, q11, d7[1] \n\t" //q15 = q11 * d6[1] - - "vst1.32 {d24, d25}, [%2]! \n\t" //d = q12 - "vst1.32 {d26, d27}, [%2]! \n\t" //d+4 = q13 - "vst1.32 {d28, d29}, [%2]! \n\t" //d+8 = q14 - "vst1.32 {d30, d31}, [%2] \n\t" //d+12 = q15 - - : "+r"(m0), "+r"(m1), "+r"(d) : - : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", - "memory" - ); -#else - matmul4_c(m0, m1, d); -#endif -} - - -//matrix vector multiplication. 
d = m * v -void -matvec4_c(float m[16], float v[4], float d[4]) -{ - d[0] = m[0]*v[0] + m[4]*v[1] + m[8]*v[2] + m[12]*v[3]; - d[1] = m[1]*v[0] + m[5]*v[1] + m[9]*v[2] + m[13]*v[3]; - d[2] = m[2]*v[0] + m[6]*v[1] + m[10]*v[2] + m[14]*v[3]; - d[3] = m[3]*v[0] + m[7]*v[1] + m[11]*v[2] + m[15]*v[3]; -} - -void -matvec4_neon(float m[16], float v[4], float d[4]) -{ -#ifdef __MATH_NEON - asm volatile ( - "vld1.32 {d0, d1}, [%1] \n\t" //Q0 = v - "vld1.32 {d18, d19}, [%0]! \n\t" //Q1 = m - "vld1.32 {d20, d21}, [%0]! \n\t" //Q2 = m+4 - "vld1.32 {d22, d23}, [%0]! \n\t" //Q3 = m+8 - "vld1.32 {d24, d25}, [%0]! \n\t" //Q4 = m+12 - - "vmul.f32 q13, q9, d0[0] \n\t" //Q5 = Q1*Q0[0] - "vmla.f32 q13, q10, d0[1] \n\t" //Q5 += Q1*Q0[1] - "vmla.f32 q13, q11, d1[0] \n\t" //Q5 += Q2*Q0[2] - "vmla.f32 q13, q12, d1[1] \n\t" //Q5 += Q3*Q0[3] - - "vst1.32 {d26, d27}, [%2] \n\t" //Q4 = m+12 - : - : "r"(m), "r"(v), "r"(d) - : "q0", "q9", "q10","q11", "q12", "q13", "memory" - ); -#else - matvec4_c(m, v, d); -#endif -} - - - - - diff --git a/deps/math-neon/source/math_modf.c b/deps/math-neon/source/math_modf.c deleted file mode 100644 index f3259710af..0000000000 --- a/deps/math-neon/source/math_modf.c +++ /dev/null @@ -1,71 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* -Assumes the floating point value |x| < 2,147,483,648 -*/ - -#include "math_neon.h" - -float modf_c(float x, int *i) -{ - int n; - n = (int)x; - *i = n; - x = x - (float)n; - return x; -} - - -float modf_neon_hfp(float x, int *i) -{ -#ifdef __MATH_NEON - asm volatile ( - "vcvt.s32.f32 d1, d0 \n\t" //d1 = (int) d0; - "vcvt.f32.s32 d2, d1 \n\t" //d2 = (float) d1; - "vsub.f32 d0, d0, d2 \n\t" //d0 = d0 - d2; - "vstr.i32 s2, [r0] \n\t" //[r0] = d1[0] - ::: "d0", "d1", "d2" - ); -#endif -} - - -float modf_neon_sfp(float x, int *i) -{ -#ifdef __MATH_NEON - asm volatile ( - "vdup.f32 d0, r0 \n\t" //d0 = {x, x} - "vcvt.s32.f32 d1, d0 \n\t" //d1 = (int) d0; - "vcvt.f32.s32 d2, d1 \n\t" //d2 = (float) d1; - "vsub.f32 d0, d0, d2 \n\t" //d0 = d0 - d2; - "vstr.i32 s2, [r1] \n\t" //[r0] = d1[0] - "vmov.f32 r0, s0 \n\t" //r0 = d0[0]; - ::: "d0", "d1", "d2" - ); - -#else - return modf_c(x, i); -#endif -} diff --git a/deps/math-neon/source/math_neon.h b/deps/math-neon/source/math_neon.h deleted file mode 100644 index 66635808d0..0000000000 --- a/deps/math-neon/source/math_neon.h +++ /dev/null @@ -1,435 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 
-copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#ifndef __MATH_NEON_H__ -#define __MATH_NEON_H__ - -#if !defined(__i386__) && defined(__arm__) -//if defined neon ASM routines are used, otherwise all calls to *_neon -//functions are rerouted to their equivalent *_c function. -#define __MATH_NEON - -//Default Floating Point value ABI: 0=softfp, 1=hardfp. Only effects *_neon routines. -//You can access the hardfp versions directly via the *_hard suffix. -//You can access the softfp versions directly via the *_soft suffix. 
-#define __MATH_FPABI 0 - -#endif - -#ifdef GCC -#define ALIGN(A) __attribute__ ((aligned (A)) -#else -#define ALIGN(A) -#endif - -#ifndef _MATH_H -#define M_PI 3.14159265358979323846 /* pi */ -#define M_PI_2 1.57079632679489661923 /* pi/2 */ -#define M_PI_4 0.78539816339744830962 /* pi/4 */ -#define M_E 2.7182818284590452354 /* e */ -#define M_LOG2E 1.4426950408889634074 /* log_2 e */ -#define M_LOG10E 0.43429448190325182765 /* log_10 e */ -#define M_LN2 0.69314718055994530942 /* log_e 2 */ -#define M_LN10 2.30258509299404568402 /* log_e 10 */ -#define M_1_PI 0.31830988618379067154 /* 1/pi */ -#define M_2_PI 0.63661977236758134308 /* 2/pi */ -#define M_2_SQRTPI 1.12837916709551257390 /* 2/sqrt(pi) */ -#define M_SQRT2 1.41421356237309504880 /* sqrt(2) */ -#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */ -#endif - -#if __MATH_FPABI == 1 -#define sinf_neon sinf_neon_hfp -#define cosf_neon cosf_neon_hfp -#define sincosf_neon sincosf_neon_hfp -#define tanf_neon tanf_neon_hfp -#define atanf_neon atanf_neon_hfp -#define atan2f_neon atan2f_neon_hfp -#define asinf_neon asinf_neon_hfp -#define acosf_neon acosf_neon_hfp -#define sinhf_neon sinhf_neon_hfp -#define coshf_neon coshf_neon_hfp -#define tanhf_neon tanhf_neon_hfp -#define expf_neon expf_neon_hfp -#define logf_neon logf_neon_hfp -#define log10f_neon log10f_neon_hfp -#define powf_neon powf_neon_hfp -#define floorf_neon floorf_neon_hfp -#define ceilf_neon ceilf_neon_hfp -#define fabsf_neon fabsf_neon_hfp -#define ldexpf_neon ldexpf_neon_hfp -#define frexpf_neon frexpf_neon_hfp -#define fmodf_neon fmodf_neon_hfp -#define modf_neon modf_neon_hfp -#define sqrtf_neon sqrtf_neon_hfp -#define invsqrtf_neon invsqrtf_neon_hfp -#else -#define sinf_neon sinf_neon_sfp -#define cosf_neon cosf_neon_sfp -#define sincosf_neon sincosf_neon_sfp -#define tanf_neon tanf_neon_sfp -#define atanf_neon atanf_neon_sfp -#define atan2f_neon atan2f_neon_sfp -#define asinf_neon asinf_neon_sfp -#define acosf_neon acosf_neon_sfp -#define 
sinhf_neon sinhf_neon_sfp -#define coshf_neon coshf_neon_sfp -#define tanhf_neon tanhf_neon_sfp -#define expf_neon expf_neon_sfp -#define logf_neon logf_neon_sfp -#define log10f_neon log10f_neon_sfp -#define powf_neon powf_neon_sfp -#define floorf_neon floorf_neon_sfp -#define ceilf_neon ceilf_neon_sfp -#define fabsf_neon fabsf_neon_sfp -#define ldexpf_neon ldexpf_neon_sfp -#define frexpf_neon frexpf_neon_sfp -#define fmodf_neon fmodf_neon_sfp -#define modf_neon modf_neon_sfp -#define sqrtf_neon sqrtf_neon_sfp -#define invsqrtf_neon invsqrtf_neon_sfp - -#define dot2_neon dot2_neon_sfp -#define dot3_neon dot3_neon_sfp -#define dot4_neon dot4_neon_sfp -#endif - -/* -function: enable_runfast - this function enables the floating point runfast mode on the - ARM Cortex A8. -*/ -void enable_runfast(); - - -float dot2_c(float v0[2], float v1[2]); -float dot2_neon(float v0[2], float v1[2]); -float dot3_c(float v0[3], float v1[3]); -float dot3_neon(float v0[3], float v1[3]); -float dot4_c(float v0[4], float v1[4]); -float dot4_neon(float v0[4], float v1[4]); - -void cross3_c(float v0[3], float v1[3], float d[3]); -void cross3_neon(float v0[3], float v1[3], float d[3]); - -void normalize2_c(float v[2], float d[2]); -void normalize2_neon(float v[2], float d[2]); -void normalize3_c(float v[3], float d[3]); -void normalize3_neon(float v[3], float d[3]); -void normalize4_c(float v[4], float d[4]); -void normalize4_neon(float v[4], float d[4]); - -/* -function: matmul2 -arguments: m0 2x2 matrix, m1 2x2 matrix -return: d 2x2 matrix -expression: d = m0 * m1 -*/ -void matmul2_c(float m0[4], float m1[4], float d[4]); -void matmul2_neon(float m0[4], float m1[4], float d[4]); - -/* -function: matmul3 -arguments: m0 3x3 matrix, m1 3x3 matrix -return: d 3x3 matrix -expression: d = m0 * m1 -*/ -void matmul3_c(float m0[9], float m1[9], float d[9]); -void matmul3_neon(float m0[9], float m1[9], float d[9]); - -/* -function: matmul4 -arguments: m0 4x4 matrix, m1 4x4 matrix -return: d 4x4 
matrix -expression: d = m0 * m1 -*/ -void matmul4_c(float m0[16], float m1[16], float d[16]); -void matmul4_neon(float m0[16], float m1[16], float d[16]); - -/* -function: matvec2 -arguments: m 2x2 matrix, v 2 element vector -return: d 2x2 matrix -expression: d = m * v -*/ -void matvec2_c(float m[4], float v[2], float d[2]); -void matvec2_neon(float m[4], float v[2], float d[2]); - -/* -function: matvec3 -arguments: m 3x3 matrix, v 3 element vector -return: d 3x3 matrix -expression: d = m * v -*/ -void matvec3_c(float m[9], float v[3], float d[3]); -void matvec3_neon(float m[9], float v[3], float d[3]); - -/* -function: matvec4 -arguments: m 4x4 matrix, v 4 element vector -return: d 4x4 matrix -expression: d = m * v -*/ -void matvec4_c(float m[16], float v[4], float d[4]); -void matvec4_neon(float m[16], float v[4], float d[4]); - -/* -function: sinf -arguments: x radians -return: the sine function evaluated at x radians. -expression: r = sin(x) -*/ -float sinf_c(float x); -float sinf_neon_hfp(float x); -float sinf_neon_sfp(float x); - -/* -function: cosf -arguments: x radians -return: the cosine function evaluated at x radians. -expression: r = cos(x) -notes: computed using cos(x) = sin(x + pi/2) -*/ -float cosf_c(float x); -float cosf_neon_hfp(float x); -float cosf_neon_sfp(float x); - -/* -function: sincosf -arguments: x radians, r[2] result array. -return: both the sine and the cosine evaluated at x radians. -expression: r = {sin(x), cos(x)} -notes: faster than evaluating seperately. -*/ -void sincosf_c(float x, float r[2]); -void sincosf_neon_hfp(float x, float r[2]); -void sincosf_neon_sfp(float x, float r[2]); - -/* -function: sinfv -return: the sine function evaluated at x[i] radians -expression: r[i] = sin(x[i]) -notes: faster than evaluating individually. - r and x can be the same memory location. -*/ -void sinfv_c(float *x, int n, float *r); -void sinfv_neon(float *x, int n, float *r); - -/* -function: tanf -return: the tangent evaluated at x radians. 
-expression: r = tan(x) -notes: computed using tan(x) = sin(x) / cos(x) -*/ -float tanf_c(float x); -float tanf_neon_hfp(float x); -float tanf_neon_sfp(float x); - -/* -function: atanf -return: the arctangent evaluated at x. -expression: r = atan(x) -*/ -float atanf_c(float x); -float atanf_neon_hfp(float x); -float atanf_neon_sfp(float x); - -/* -function: atanf -return: the arctangent evaluated at x. -expression: r = atan(x) -*/ -float atan2f_c(float y, float x); -float atan2f_neon_hfp(float y, float x); -float atan2f_neon_sfp(float y, float x); - -/* -function: asinf -return: the arcsine evaluated at x. -expression: r = asin(x) -*/ -float asinf_c(float x); -float asinf_neon_hfp(float x); -float asinf_neon_sfp(float x); - -/* -function: acosf -return: the arcsine evaluated at x. -expression: r = asin(x) -*/ -float acosf_c(float x); -float acosf_neon_hfp(float x); -float acosf_neon_sfp(float x); - -/* -function: sinhf -return: the arcsine evaluated at x. -expression: r = asin(x) -*/ -float sinhf_c(float x); -float sinhf_neon_hfp(float x); -float sinhf_neon_sfp(float x); - -/* -function: coshf -return: the arcsine evaluated at x. -expression: r = asin(x) -*/ -float coshf_c(float x); -float coshf_neon_hfp(float x); -float coshf_neon_sfp(float x); - -/* -function: tanhf -return: the arcsine evaluated at x. -expression: r = asin(x) -*/ -float tanhf_c(float x); -float tanhf_neon_hfp(float x); -float tanhf_neon_sfp(float x); - -/* -function: expf -return: the natural exponential evaluated at x. -expression: r = e ** x -*/ -float expf_c(float x); -float expf_neon_hfp(float x); -float expf_neon_sfp(float x); - -/* -function: logf -return: the value of the natural logarithm of x. -expression: r = ln(x) -notes: assumes x > 0 -*/ -float logf_c(float x); -float logf_neon_hfp(float x); -float logf_neon_sfp(float x); - -/* -function: log10f -return: the value of the power 10 logarithm of x. 
-expression: r = log10(x) -notes: assumes x > 0 -*/ -float log10f_c(float x); -float log10f_neon_hfp(float x); -float log10f_neon_sfp(float x); - -/* -function: powf -return: x raised to the power of n, x ** n. -expression: r = x ** y -notes: computed using e ** (y * ln(x)) -*/ -float powf_c(float x, float n); -float powf_neon_sfp(float x, float n); -float powf_neon_hfp(float x, float n); - -/* -function: floorf -return: x rounded down (towards negative infinity) to its nearest - integer value. -notes: assumes |x| < 2 ** 31 -*/ -float floorf_c(float x); -float floorf_neon_sfp(float x); -float floorf_neon_hfp(float x); - -/* -function: ceilf -return: x rounded up (towards positive infinity) to its nearest - integer value. -notes: assumes |x| < 2 ** 31 -*/ -float ceilf_c(float x); -float ceilf_neon_hfp(float x); -float ceilf_neon_sfp(float x); - -/* -function: fabsf -return: absolute vvalue of x -notes: assumes |x| < 2 ** 31 -*/ -float fabsf_c(float x); -float fabsf_neon_hfp(float x); -float fabsf_neon_sfp(float x); - -/* -function: ldexpf -return: the value of m multiplied by 2 to the power of e. 
-expression: r = m * (2 ** e) -*/ -float ldexpf_c(float m, int e); -float ldexpf_neon_hfp(float m, int e); -float ldexpf_neon_sfp(float m, int e); - -/* -function: frexpf -return: the exponent and mantissa of x -*/ -float frexpf_c(float x, int *e); -float frexpf_neon_hfp(float x, int *e); -float frexpf_neon_sfp(float x, int *e); - -/* -function: fmodf -return: the remainder of x divided by y, x % y -expression: r = x - floor(x / y) * y; -notes: assumes that |x / y| < 2 ** 31 -*/ -float fmodf_c(float x, float y); -float fmodf_neon_hfp(float x, float y); -float fmodf_neon_sfp(float x, float y); - -/* -function: modf -return: breaks x into the integer (i) and fractional part (return) -notes: assumes that |x| < 2 ** 31 -*/ -float modf_c(float x, int *i); -float modf_neon_hfp(float x, int *i); -float modf_neon_sfp(float x, int *i); - -/* -function: sqrtf -return: (x^0.5) -notes: -*/ -float sqrtf_c(float x); -float sqrtf_neon_hfp(float x); -float sqrtf_neon_sfp(float x); - - -/* -function: invsqrtf -return: 1.0f / (x^0.5) -notes: -*/ -float invsqrtf_c(float x); -float invsqrtf_neon_hfp(float x); -float invsqrtf_neon_sfp(float x); - -#endif diff --git a/deps/math-neon/source/math_powf.c b/deps/math-neon/source/math_powf.c deleted file mode 100644 index 6faed4eeac..0000000000 --- a/deps/math-neon/source/math_powf.c +++ /dev/null @@ -1,182 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or 
substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* -Based on x ^ n = exp(n * log(x)) - -Test func : powf(x, n) -Test Range: (1,1) < (x, n) < (10, 10) -Peak Error: ~0.0010% -RMS Error: ~0.0002% -*/ - -#include "math.h" -#include "math_neon.h" - -const float __powf_rng[2] = { - 1.442695041f, - 0.693147180f -}; - -const float __powf_lut[16] = { - -2.295614848256274, //p0 log - -2.470711633419806, //p4 - -5.686926051100417, //p2 - -0.165253547131978, //p6 - +5.175912446351073, //p1 - +0.844006986174912, //p5 - +4.584458825456749, //p3 - +0.014127821926000, //p7 - 0.9999999916728642, //p0 exp - 0.04165989275009526, //p4 - 0.5000006143673624, //p2 - 0.0014122663401803872, //p6 - 1.000000059694879, //p1 - 0.008336936973260111, //p5 - 0.16666570253074878, //p3 - 0.00019578093328483123 //p7 -}; - -float powf_c(float x, float n) -{ - float a, b, c, d, xx; - int m; - - union { - float f; - int i; - } r; - - //extract exponent - r.f = x; - m = (r.i >> 23); - m = m - 127; - r.i = r.i - (m << 23); - - //Taylor Polynomial (Estrins) - xx = r.f * r.f; - a = (__powf_lut[4] * r.f) + (__powf_lut[0]); - b = (__powf_lut[6] * r.f) + (__powf_lut[2]); - c = (__powf_lut[5] * r.f) + (__powf_lut[1]); - d = (__powf_lut[7] * r.f) + (__powf_lut[3]); - a = a + b * xx; - c = c + d * xx; - xx = xx * xx; - r.f = a + c * xx; - - //add exponent - r.f = r.f + ((float) m) * __powf_rng[1]; - - r.f = r.f * n; - - - //Range Reduction: - m = (int) (r.f * __powf_rng[0]); - r.f = r.f - ((float) m) * __powf_rng[1]; - - //Taylor Polynomial (Estrins) - a 
= (__powf_lut[12] * r.f) + (__powf_lut[8]); - b = (__powf_lut[14] * r.f) + (__powf_lut[10]); - c = (__powf_lut[13] * r.f) + (__powf_lut[9]); - d = (__powf_lut[15] * r.f) + (__powf_lut[11]); - xx = r.f * r.f; - a = a + b * xx; - c = c + d * xx; - xx = xx* xx; - r.f = a + c * xx; - - //multiply by 2 ^ m - m = m << 23; - r.i = r.i + m; - - return r.f; -} - -float powf_neon_hfp(float x, float n) -{ -#ifdef __MATH_NEON - asm volatile ( - - "vdup.f32 d16, d0[1] \n\t" //d16 = {y,y}; - "vdup.f32 d0, d0[0] \n\t" //d0 = {x,x}; - - //extract exponent - "vmov.i32 d2, #127 \n\t" //d2 = 127; - "vshr.u32 d6, d0, #23 \n\t" //d6 = d0 >> 23; - "vsub.i32 d6, d6, d2 \n\t" //d6 = d6 - d2; - "vshl.u32 d1, d6, #23 \n\t" //d1 = d6 << 23; - "vsub.i32 d0, d0, d1 \n\t" //d0 = d0 + d1; - - //polynomial: - "vmul.f32 d1, d0, d0 \n\t" //d1 = d0*d0 = {x^2, x^2} - "vld1.32 {d2, d3, d4, d5}, [%1]! \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ; - "vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0] - "vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0] - "vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4} - "vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1] - - //add exponent - "vld1.32 d7, [%0] \n\t" //d7 = {invrange, range} - "vcvt.f32.s32 d6, d6 \n\t" //d6 = (float) d6 - "vmla.f32 d2, d6, d7[1] \n\t" //d2 = d2 + d6 * d7[1] - - "vdup.f32 d0, d2[0] \n\t" //d0 = d2[0] - "vmul.f32 d0, d0, d16 \n\t" //d0 = d0 * d16 - - //Range Reduction: - "vmul.f32 d6, d0, d7[0] \n\t" //d6 = d0 * d7[0] - "vcvt.u32.f32 d6, d6 \n\t" //d6 = (int) d6 - "vcvt.f32.u32 d1, d6 \n\t" //d1 = (float) d6 - "vmls.f32 d0, d1, d7[1] \n\t" //d0 = d0 - d1 * d7[1] - - //polynomial: - "vmul.f32 d1, d0, d0 \n\t" //d1 = d0*d0 = {x^2, x^2} - "vld1.32 {d2, d3, d4, d5}, [%1] \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ; - "vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0] - "vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0] - "vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4} - "vmla.f32 d2, d1, d2[1] \n\t" 
//d2 = d2 + d1 * d2[1] - - //multiply by 2 ^ m - "vshl.i32 d6, d6, #23 \n\t" //d6 = d6 << 23 - "vadd.i32 d0, d2, d6 \n\t" //d0 = d2 + d6 - - - :: "r"(__powf_rng), "r"(__powf_lut) - : "d0", "d1", "d2","d3", "d4", "d5", "d6", "d7" - ); -#endif -} - -float powf_neon_sfp(float x, float n) -{ -#ifdef __MATH_NEON - asm volatile ("vmov.f32 s0, r0 \n\t"); - asm volatile ("vmov.f32 s1, r1 \n\t"); - powf_neon_hfp(x, n); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return powf_c(x, n); -#endif -}; diff --git a/deps/math-neon/source/math_runfast.c b/deps/math-neon/source/math_runfast.c deleted file mode 100644 index 0d06c0bfc8..0000000000 --- a/deps/math-neon/source/math_runfast.c +++ /dev/null @@ -1,42 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - - -void -enable_runfast() -{ -#ifdef __arm__ - static const unsigned int x = 0x04086060; - static const unsigned int y = 0x03000000; - int r; - asm volatile ( - "fmrx %0, fpscr \n\t" //r0 = FPSCR - "and %0, %0, %1 \n\t" //r0 = r0 & 0x04086060 - "orr %0, %0, %2 \n\t" //r0 = r0 | 0x03000000 - "fmxr fpscr, %0 \n\t" //FPSCR = r0 - : "=r"(r) - : "r"(x), "r"(y) - ); -#endif -} diff --git a/deps/math-neon/source/math_sincosf.c b/deps/math-neon/source/math_sincosf.c deleted file mode 100644 index 365826f8ff..0000000000 --- a/deps/math-neon/source/math_sincosf.c +++ /dev/null @@ -1,163 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -#include "math.h" -#include "math_neon.h" - -const float __sincosf_rng[2] = { - 2.0 / M_PI, - M_PI / 2.0 -}; - -const float __sincosf_lut[8] = { - -0.00018365f, //p7 - -0.00018365f, //p7 - +0.00830636f, //p5 - +0.00830636f, //p5 - -0.16664831f, //p3 - -0.16664831f, //p3 - +0.99999661f, //p1 - +0.99999661f, //p1 -}; - -void sincosf_c( float x, float r[2]) -{ - union { - float f; - int i; - } ax, bx; - - float y; - float a, b, c, d, xx, yy; - int m, n, o, p; - - y = x + __sincosf_rng[1]; - ax.f = fabsf(x); - bx.f = fabsf(y); - - //Range Reduction: - m = (int) (ax.f * __sincosf_rng[0]); - o = (int) (bx.f * __sincosf_rng[0]); - ax.f = ax.f - (((float)m) * __sincosf_rng[1]); - bx.f = bx.f - (((float)o) * __sincosf_rng[1]); - - //Test Quadrant - n = m & 1; - p = o & 1; - ax.f = ax.f - n * __sincosf_rng[1]; - bx.f = bx.f - p * __sincosf_rng[1]; - m = m >> 1; - o = o >> 1; - n = n ^ m; - p = p ^ o; - m = (x < 0.0); - o = (y < 0.0); - n = n ^ m; - p = p ^ o; - n = n << 31; - p = p << 31; - ax.i = ax.i ^ n; - bx.i = bx.i ^ p; - - //Taylor Polynomial - xx = ax.f * ax.f; - yy = bx.f * bx.f; - r[0] = __sincosf_lut[0]; - r[1] = __sincosf_lut[1]; - r[0] = r[0] * xx + __sincosf_lut[2]; - r[1] = r[1] * yy + __sincosf_lut[3]; - r[0] = r[0] * xx + __sincosf_lut[4]; - r[1] = r[1] * yy + __sincosf_lut[5]; - r[0] = r[0] * xx + __sincosf_lut[6]; - r[1] = r[1] * yy + __sincosf_lut[7]; - r[0] = r[0] * ax.f; - r[1] = r[1] * bx.f; - -} - -void sincosf_neon_hfp(float x, float r[2]) -{ -//HACK: Assumes for softfp that r1 = x, and for hardfp that s0 = x. 
-#ifdef __MATH_NEON - asm volatile ( - //{x, y} = {x, x + pi/2} - "vdup.f32 d1, d0[0] \n\t" //d1 = {x, x} - "vld1.32 d3, [%1] \n\t" //d3 = {invrange, range} - "vadd.f32 d0, d1, d3 \n\t" //d0 = d1 + d3 - "vmov.f32 s0, s2 \n\t" //d0[0] = d1[0] - "vabs.f32 d1, d0 \n\t" //d1 = {abs(x), abs(y)} - - //Range Reduction: - "vmul.f32 d2, d1, d3[0] \n\t" //d2 = d1 * d3[0] - "vcvt.u32.f32 d2, d2 \n\t" //d2 = (int) d2 - "vcvt.f32.u32 d4, d2 \n\t" //d4 = (float) d2 - "vmls.f32 d1, d4, d3[1] \n\t" //d1 = d1 - d4 * d3[1] - - //Checking Quadrant: - //ax = ax - (k&1) * M_PI_2 - "vmov.i32 d4, #1 \n\t" //d4 = 1 - "vand.i32 d4, d4, d2 \n\t" //d4 = d4 & d2 - "vcvt.f32.u32 d5, d4 \n\t" //d5 = (float) d4 - "vmls.f32 d1, d5, d3[1] \n\t" //d1 = d1 - d5 * d3[1] - - //ax = ax ^ ((k & 1) ^ (k >> 1) ^ (x < 0) << 31) - "vshr.u32 d3, d2, #1 \n\t" //d3 = d2 >> 1 - "veor.i32 d4, d4, d3 \n\t" //d4 = d4 ^ d3 - "vclt.f32 d3, d0, #0 \n\t" //d3 = (d0 < 0.0) - "veor.i32 d4, d4, d3 \n\t" //d4 = d4 ^ d3 - "vshl.i32 d4, d4, #31 \n\t" //d4 = d4 << 31 - "veor.i32 d0, d1, d4 \n\t" //d0 = d1 ^ d4 - - //polynomial: - "vldm %2!, {d2, d3} \n\t" //d2 = {p7, p7}, d3 = {p5, p5}, r3 += 4; - "vmul.f32 d1, d0, d0 \n\t" //d1 = d0 * d0 = {x^2, y^2} - "vldm %2!, {d4} \n\t" //d4 = {p3, p3}, r3 += 2; - "vmla.f32 d3, d2, d1 \n\t" //d3 = d3 + d2 * d1; - "vldm %2!, {d5} \n\t" //d5 = {p1, p1}, r3 += 2; - "vmla.f32 d4, d3, d1 \n\t" //d4 = d4 + d3 * d1; - "vmla.f32 d5, d4, d1 \n\t" //d5 = d5 + d4 * d1; - "vmul.f32 d5, d5, d0 \n\t" //d5 = d5 * d0; - - "vstm.f32 %0, {d5} \n\t" //r[0] = d5[0], r[1]=d5[1]; - - : "+r"(r) - : "r"(__sincosf_rng), "r"(__sincosf_lut) - : "d0", "d1", "d2", "d3", "d4", "d5" - ); -#else - sincosf_c(x, r); -#endif -} - -void sincosf_neon_sfp(float x, float r[2]) -{ -#ifdef __MATH_NEON - asm volatile ("vdup.f32 d0, r0 \n\t"); - sincosf_neon_hfp(x, r); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - sincosf_c(x, r); -#endif -}; - diff --git a/deps/math-neon/source/math_sinf.c 
b/deps/math-neon/source/math_sinf.c deleted file mode 100644 index 257f219672..0000000000 --- a/deps/math-neon/source/math_sinf.c +++ /dev/null @@ -1,128 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -#include -#include "math_neon.h" - -static const float __sinf_rng[2] = { - 2.0 / M_PI, - M_PI / 2.0 -} ALIGN(16); - -static const float __sinf_lut[4] = { - -0.00018365f, //p7 - -0.16664831f, //p3 - +0.00830636f, //p5 - +0.99999661f, //p1 -} ALIGN(16); - -float sinf_c(float x) -{ - union { - float f; - int i; - } ax; - - float r, a, b, xx; - int m, n; - - ax.f = fabsf(x); - - //Range Reduction: - m = (int) (ax.f * __sinf_rng[0]); - ax.f = ax.f - (((float)m) * __sinf_rng[1]); - - //Test Quadrant - n = m & 1; - ax.f = ax.f - n * __sinf_rng[1]; - m = m >> 1; - n = n ^ m; - m = (x < 0.0); - n = n ^ m; - n = n << 31; - ax.i = ax.i ^ n; - - //Taylor Polynomial (Estrins) - xx = ax.f * ax.f; - a = (__sinf_lut[0] * ax.f) * xx + (__sinf_lut[2] * ax.f); - b = (__sinf_lut[1] * ax.f) * xx + (__sinf_lut[3] * ax.f); - xx = xx * xx; - r = b + a * xx; - - return r; -} - -float sinf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ( - - "vld1.32 d3, [%0] \n\t" //d3 = {invrange, range} - "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x} - "vabs.f32 d1, d0 \n\t" //d1 = {ax, ax} - - "vmul.f32 d2, d1, d3[0] \n\t" //d2 = d1 * d3[0] - "vcvt.u32.f32 d2, d2 \n\t" //d2 = (int) d2 - "vmov.i32 d5, #1 \n\t" //d5 = 1 - "vcvt.f32.u32 d4, d2 \n\t" //d4 = (float) d2 - "vshr.u32 d7, d2, #1 \n\t" //d7 = d2 >> 1 - "vmls.f32 d1, d4, d3[1] \n\t" //d1 = d1 - d4 * d3[1] - - "vand.i32 d5, d2, d5 \n\t" //d5 = d2 & d5 - "vclt.f32 d18, d0, #0 \n\t" //d18 = (d0 < 0.0) - "vcvt.f32.u32 d6, d5 \n\t" //d6 = (float) d5 - "vmls.f32 d1, d6, d3[1] \n\t" //d1 = d1 - d6 * d3[1] - "veor.i32 d5, d5, d7 \n\t" //d5 = d5 ^ d7 - "vmul.f32 d2, d1, d1 \n\t" //d2 = d1*d1 = {x^2, x^2} - - "vld1.32 {d16, d17}, [%1] \n\t" //q8 = {p7, p3, p5, p1} - "veor.i32 d5, d5, d18 \n\t" //d5 = d5 ^ d18 - "vshl.i32 d5, d5, #31 \n\t" //d5 = d5 << 31 - "veor.i32 d1, d1, d5 \n\t" //d1 = d1 ^ d5 - - "vmul.f32 d3, d2, d2 \n\t" //d3 = d2*d2 = {x^4, x^4} - "vmul.f32 q0, q8, d1[0] \n\t" //q0 = q8 * d1[0] = {p7x, p3x, p5x, p1x} - "vmla.f32 d1, d0, 
d2[0] \n\t" //d1 = d1 + d0*d2 = {p5x + p7x^3, p1x + p3x^3} - "vmla.f32 d1, d3, d1[0] \n\t" //d1 = d1 + d3*d0 = {...., p1x + p3x^3 + p5x^5 + p7x^7} - - "vmov.f32 s0, s3 \n\t" //s0 = s3 - : - : "r"(__sinf_rng), "r"(__sinf_lut) - : "q0", "q1", "q2", "q3", "q8", "q9" - ); -#endif -} - -float sinf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vdup.f32 d0, r0 \n\t"); - sinf_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return sinf_c(x); -#endif - -}; - diff --git a/deps/math-neon/source/math_sinfv.c b/deps/math-neon/source/math_sinfv.c deleted file mode 100644 index 0dfc878170..0000000000 --- a/deps/math-neon/source/math_sinfv.c +++ /dev/null @@ -1,110 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -#include "math.h" -#include "math_neon.h" - -const float __sinfv_rng[2] = { - 2.0 / M_PI, - M_PI / 2.0, -}; - -const float __sinfv_lut[4] = { - -0.00018365f, //p7 - -0.16664831f, //p3 - +0.00830636f, //p5 - +0.99999661f, //p1 -}; - -void sinfv_c(float *x, int n, float *r) -{ - union { - float f; - int i; - } ax, bx; - - float aa, ab, ba, bb, axx, bxx; - int am, bm, an, bn; - - if (n & 0x1) { - *r++ = sinf_c(*x++); - n--; - } - - float rng0 = __sinfv_rng[0]; - float rng1 = __sinfv_rng[1]; - - while(n > 0){ - - float x0 = *x++; - float x1 = *x++; - - ax.f = fabsf(x0); - bx.f = fabsf(x1); - - //Range Reduction: - am = (int) (ax.f * rng0); - bm = (int) (bx.f * rng0); - - ax.f = ax.f - (((float)am) * rng1); - bx.f = bx.f - (((float)bm) * rng1); - - //Test Quadrant - an = am & 1; - bn = bm & 1; - ax.f = ax.f - an * rng1; - bx.f = bx.f - bn * rng1; - am = (am & 2) >> 1; - bm = (bm & 2) >> 1; - ax.i = ax.i ^ ((an ^ am ^ (x0 < 0)) << 31); - bx.i = bx.i ^ ((bn ^ bm ^ (x1 < 0)) << 31); - - //Taylor Polynomial (Estrins) - axx = ax.f * ax.f; - bxx = bx.f * bx.f; - aa = (__sinfv_lut[0] * ax.f) * axx + (__sinfv_lut[2] * ax.f); - ba = (__sinfv_lut[0] * bx.f) * bxx + (__sinfv_lut[2] * bx.f); - ab = (__sinfv_lut[1] * ax.f) * axx + (__sinfv_lut[3] * ax.f); - bb = (__sinfv_lut[1] * bx.f) * bxx + (__sinfv_lut[3] * bx.f); - axx = axx * axx; - bxx = bxx * bxx; - *r++ = ab + aa * axx; - *r++ = bb + ba * bxx; - n -= 2; - } - - -} - -void sinfv_neon(float *x, int n, float *r) -{ -#ifdef __MATH_NEON - asm volatile ("" - : - :"r"(x), "r"(n) - ); -#else - sinfv_c(x, n, r); -#endif -} diff --git a/deps/math-neon/source/math_sinhf.c b/deps/math-neon/source/math_sinhf.c deleted file mode 100644 index 820a490dae..0000000000 --- a/deps/math-neon/source/math_sinhf.c +++ /dev/null @@ -1,120 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and 
associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "math.h" -#include "math_neon.h" - -const float __sinhf_rng[2] = { - 1.442695041f, - 0.693147180f -}; - -const float __sinhf_lut[16] = { - 0.00019578093328483123, //p7 - 0.00019578093328483123, //p7 - 0.0014122663401803872, //p6 - 0.0014122663401803872, //p6 - 0.008336936973260111, //p5 - 0.008336936973260111, //p5 - 0.04165989275009526, //p4 - 0.04165989275009526, //p4 - 0.16666570253074878, //p3 - 0.16666570253074878, //p3 - 0.5000006143673624, //p2 - 0.5000006143673624, //p2 - 1.000000059694879, //p1 - 1.000000059694879, //p1 - 0.9999999916728642, //p0 - 0.9999999916728642 //p0 -}; - - -float sinhf_c(float x) -{ - float a, b, xx; - xx = -x; - a = expf_c(x); - b = expf_c(xx); - a = a - b; - a = a * 0.5f; - return a; -} - - -float sinhf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ( - "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x} - "fnegs s1, s1 \n\t" //s1 = -s1 - - //Range Reduction: - "vld1.32 d2, [%0] \n\t" //d2 = {invrange, range} - "vld1.32 {d16, d17}, [%1]! 
\n\t" - "vmul.f32 d6, d0, d2[0] \n\t" //d6 = d0 * d2[0] - "vcvt.s32.f32 d6, d6 \n\t" //d6 = (int) d6 - "vld1.32 {d18}, [%1]! \n\t" - "vcvt.f32.s32 d1, d6 \n\t" //d1 = (float) d6 - "vld1.32 {d19}, [%1]! \n\t" - "vmls.f32 d0, d1, d2[1] \n\t" //d0 = d0 - d1 * d2[1] - "vld1.32 {d20}, [%1]! \n\t" - - //polynomial: - "vmla.f32 d17, d16, d0 \n\t" //d17 = d17 + d16 * d0; - "vld1.32 {d21}, [%1]! \n\t" - "vmla.f32 d18, d17, d0 \n\t" //d18 = d18 + d17 * d0; - "vld1.32 {d22}, [%1]! \n\t" - "vmla.f32 d19, d18, d0 \n\t" //d19 = d19 + d18 * d0; - "vld1.32 {d23}, [%1]! \n\t" - "vmla.f32 d20, d19, d0 \n\t" //d20 = d20 + d19 * d0; - "vmla.f32 d21, d20, d0 \n\t" //d21 = d21 + d20 * d0; - "vmla.f32 d22, d21, d0 \n\t" //d22 = d22 + d21 * d0; - "vmla.f32 d23, d22, d0 \n\t" //d23 = d23 + d22 * d0; - - //multiply by 2 ^ m - "vshl.i32 d6, d6, #23 \n\t" //d6 = d6 << 23 - "vadd.i32 d0, d23, d6 \n\t" //d0 = d22 + d6 - - "vdup.f32 d2, d0[1] \n\t" //d2 = s1 - "vmov.f32 d1, #0.5 \n\t" //d1 = 0.5 - "vsub.f32 d0, d0, d2 \n\t" //d0 = d0 - d2 - "vmul.f32 d0, d1 \n\t" //d0 = d0 * d1 - - :: "r"(__sinhf_rng), "r"(__sinhf_lut) - : "d0", "d1", "q1", "q2", "d6" - ); - -#endif -} - -float sinhf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vmov.f32 s0, r0 \n\t"); - sinhf_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return sinhf_c(x); -#endif -}; diff --git a/deps/math-neon/source/math_sqrtf.c b/deps/math-neon/source/math_sqrtf.c deleted file mode 100644 index ee3f86bdbf..0000000000 --- a/deps/math-neon/source/math_sqrtf.c +++ /dev/null @@ -1,105 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, 
and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -/* -Test func : sqrtf(x) -Test Range: 0 < x < 1,000,000,000 -Peak Error: ~0.0010% -RMS Error: ~0.0005% -*/ - -#include "math.h" -#include "math_neon.h" - -float sqrtf_c(float x) -{ - - float b, c; - int m; - union { - float f; - int i; - } a; - - //fast invsqrt approx - a.f = x; - a.i = 0x5F3759DF - (a.i >> 1); //VRSQRTE - c = x * a.f; - b = (3.0f - c * a.f) * 0.5; //VRSQRTS - a.f = a.f * b; - c = x * a.f; - b = (3.0f - c * a.f) * 0.5; - a.f = a.f * b; - - //fast inverse approx - x = a.f; - m = 0x3F800000 - (a.i & 0x7F800000); - a.i = a.i + m; - a.f = 1.41176471f - 0.47058824f * a.f; - a.i = a.i + m; - b = 2.0 - a.f * x; - a.f = a.f * b; - b = 2.0 - a.f * x; - a.f = a.f * b; - - return a.f; -} - -float sqrtf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ( - - //fast invsqrt approx - "vmov.f32 d1, d0 \n\t" //d1 = d0 - "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) - "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 - "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2 - "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 - "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 - "vrsqrts.f32 d3, d2, d0 \n\t" //d4 = (3 - d0 * d3) / 2 - "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 - - //fast reciporical approximation - "vrecpe.f32 d1, d0 \n\t" //d1 = ~ 1 / d0; - "vrecps.f32 d2, d1, d0 
\n\t" //d2 = 2.0 - d1 * d0; - "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; - "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; - "vmul.f32 d0, d1, d2 \n\t" //d0 = d1 * d2; - - ::: "d0", "d1", "d2", "d3" - ); -#endif -} - -float sqrtf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vmov.f32 s0, r0 \n\t"); - sqrtf_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return sqrtf_c(x); -#endif -}; diff --git a/deps/math-neon/source/math_sqrtfv.c b/deps/math-neon/source/math_sqrtfv.c deleted file mode 100644 index c647403a28..0000000000 --- a/deps/math-neon/source/math_sqrtfv.c +++ /dev/null @@ -1,147 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ -/* -Test func : sqrtf(x) -Test Range: 0 < x < 1,000,000,000 -Peak Error: ~0.0010% -RMS Error: ~0.0005% -*/ - -#include "math.h" -#include "math_neon.h" - -void sqrtfv_c(float *x, int n, float *r) -{ - - float x0, x1; - float b0, b1, c0, c1; - int m0, m1; - union { - float f; - int i; - } a0, a1; - - - if (n & 0x1){ - *r++ = sqrtf_c(*x++); - n--; - } - - while(n > 0){ - - x0 = *x++; - x1 = *x++; - - //fast invsqrt approx - a0.f = x0; - a1.f = x1; - a0.i = 0x5F3759DF - (a0.i >> 1); //VRSQRTE - a1.i = 0x5F3759DF - (a1.i >> 1); //VRSQRTE - c0 = x0 * a0.f; - c1 = x1 * a1.f; - b0 = (3.0f - c0 * a0.f) * 0.5; //VRSQRTS - b1 = (3.0f - c1 * a1.f) * 0.5; //VRSQRTS - a0.f = a0.f * b0; - a1.f = a1.f * b1; - c0 = x0 * a0.f; - c1 = x1 * a1.f; - b0 = (3.0f - c0 * a0.f) * 0.5; //VRSQRTS - b1 = (3.0f - c1 * a1.f) * 0.5; //VRSQRTS - a0.f = a0.f * b0; - a1.f = a1.f * b1; - - //fast inverse approx - c0 = a0.f; - c0 = a1.f; - m0 = 0x3F800000 - (a0.i & 0x7F800000); - m1 = 0x3F800000 - (a1.i & 0x7F800000); - a0.i = a0.i + m0; - a1.i = a1.i + m1; - a0.f = 1.41176471f - 0.47058824f * a0.f; - a1.f = 1.41176471f - 0.47058824f * a1.f; - a0.i = a0.i + m0; - a1.i = a1.i + m1; - b0 = 2.0 - a0.f * c0; - b1 = 2.0 - a1.f * c1; - a0.f = a0.f * b0; - a1.f = a1.f * b1; - b0 = 2.0 - a0.f * c0; - b1 = 2.0 - a1.f * c1; - a0.f = a0.f * b0; - a1.f = a1.f * b1; - - *r++ = a0.f; - *r++ = a1.f; - n -= 2; - - } -} - -void sqrtfv_neon(float *x, int n, float *r) -{ -#if 0 - asm volatile ( - - "tst r1, #1 \n\t" //r1 & 1 - "beq 1f \n\t" // - - "vld1.32 d0[0], [r0]! \n\t" //s0 = *x++ - "mov ip, lr \n\t" //ip = lr - //"bl sqrtf_neon_hfp \n\t" //sqrtf_neon - "mov lr, ip \n\t" //lr = ip - "vst1.32 d0[0], [r2]! \n\t" //*r++ = r0 - "subs r1, r1, #1 \n\t" //r1 = r1 - 1; - "bxeq lr \n\t" // - - "1: \n\t" // - - "vld1.32 d0, [r0]! 
\n\t" //d0 = (*x[0], *x[1]), x+=2; - - //fast invsqrt approx - "vmov.f32 d1, d0 \n\t" //d1 = d0 - "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) - "vmul.f32 d2, d0, d1 \n\t" //d3 = d0 * d2 - "vrsqrts.f32 d3, d2, d0 \n\t" //d4 = (3 - d0 * d3) / 2 - "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d4 - "vmul.f32 d2, d0, d1 \n\t" //d3 = d0 * d2 - "vrsqrts.f32 d3, d2, d0 \n\t" //d4 = (3 - d0 * d3) / 2 - "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d4 - - //fast reciporical approximation - "vrecpe.f32 d1, d0 \n\t" //d1 = ~ 1 / d0; - "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; - "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; - "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; - "vmul.f32 d0, d1, d2 \n\t" //d0 = d1 * d2; - - "vst1.64 d0, [r2]! \n\t" //*r++ = d0; - "subs r1, r1, #2 \n\t" //n = n - 2; update flags - "bgt 1b \n\t" // - - ::: "d0", "d1", "d2", "d3" -); -#else - sqrtfv_c(x, n, r); -#endif -} diff --git a/deps/math-neon/source/math_tanf.c b/deps/math-neon/source/math_tanf.c deleted file mode 100644 index e87c1ffd1c..0000000000 --- a/deps/math-neon/source/math_tanf.c +++ /dev/null @@ -1,156 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "math.h" -#include "math_neon.h" - -const float __tanf_rng[2] = { - 2.0 / M_PI, - M_PI / 2.0 -}; - -const float __tanf_lut[4] = { - -0.00018365f, //p7 - -0.16664831f, //p3 - +0.00830636f, //p5 - +0.99999661f, //p1 -}; - -float tanf_c(float x){ - - union { - float f; - int i; - } ax, c; - - float r, a, b, xx, cc, cx; - int m; - - ax.f = fabsf(x); - - //Range Reduction: - m = (int) (ax.f * __tanf_rng[0]); - ax.f = ax.f - (((float)m) * __tanf_rng[1]); - - //Test Quadrant - ax.f = ax.f - (m & 1) * __tanf_rng[1]; - ax.i = ax.i ^ ((*(int*)&x) & 0x80000000); - - //Taylor Polynomial (Estrins) - xx = ax.f * ax.f; - a = (__tanf_lut[0] * ax.f) * xx + (__tanf_lut[2] * ax.f); - b = (__tanf_lut[1] * ax.f) * xx + (__tanf_lut[3] * ax.f); - xx = xx * xx; - r = b + a * xx; - - //cosine - c.f = 1.0 - r * r; - - //fast invsqrt approximation (2x newton iterations) - cc = c.f; - c.i = 0x5F3759DF - (c.i >> 1); //VRSQRTE - cx = cc * c.f; - a = (3.0f - cx * c.f) / 2; //VRSQRTS - c.f = c.f * a; - cx = cc * c.f; - a = (3.0f - cx * c.f) / 2; - c.f = c.f * a; - - r = r * c.f; - - return r; -} - - -float tanf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ( - - "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x} - "vabs.f32 d1, d0 \n\t" //d1 = {ax, ax} - - //Range Reduction: - "vld1.32 d3, [%0] \n\t" //d3 = {invrange, range} - "vmul.f32 d2, d1, d3[0] \n\t" //d2 = d1 * d3[0] - "vcvt.u32.f32 d2, d2 \n\t" //d2 = (int) d2 - "vcvt.f32.u32 d4, d2 \n\t" //d4 = (float) d2 - "vmls.f32 d1, d4, d3[1] \n\t" //d1 = d1 - d4 * d3[1] - - //Checking Quadrant: - //ax = ax - (k&1) * M_PI_2 - "vmov.i32 d4, #1 \n\t" //d4 = 1 - "vand.i32 d2, d2, d4 \n\t" //d2 = d2 & d4 - "vcvt.f32.u32 d2, d2 \n\t" //d2 = (float) d2 - "vmls.f32 d1, d2, d3[1] 
\n\t" //d1 = d1 - d2 * d3[1] - - //ax = ax ^ ( x.i & 0x800000000) - "vmov.i32 d4, #0x80000000 \n\t" //d4 = 0x80000000 - "vand.i32 d0, d0, d4 \n\t" //d0 = d0 & d4 - "veor.i32 d1, d1, d0 \n\t" //d1 = d1 ^ d0 - - //polynomial: - "vmul.f32 d2, d1, d1 \n\t" //d2 = d1*d1 = {x^2, x^2} - "vld1.32 {d4, d5}, [%1] \n\t" //d4 = {p7, p3}, d5 = {p5, p1} - "vmul.f32 d3, d2, d2 \n\t" //d3 = d2*d2 = {x^4, x^4} - "vmul.f32 q0, q2, d1[0] \n\t" //q0 = q2 * d1[0] = {p7x, p3x, p5x, p1x} - "vmla.f32 d1, d0, d2[0] \n\t" //d1 = d1 + d0*d2 = {p5x + p7x^3, p1x + p3x^3} - "vmla.f32 d1, d3, d1[0] \n\t" //d1 = d1 + d3*d0 = {..., p1x + p3x^3 + p5x^5 + p7x^7} - - //cosine - "vmov.f32 s1, #1.0 \n\t" //d0[1] = 1.0 - "vmls.f32 d0, d1, d1 \n\t" //d0 = {..., 1.0 - sx*sx} - - //invsqrt approx - "vmov.f32 d2, d0 \n\t" //d2 = d0 - "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) - "vmul.f32 d3, d0, d2 \n\t" //d3 = d0 * d2 - "vrsqrts.f32 d4, d3, d0 \n\t" //d4 = (3 - d0 * d3) / 2 - "vmul.f32 d0, d0, d4 \n\t" //d0 = d0 * d4 - "vmul.f32 d3, d0, d2 \n\t" //d3 = d0 * d2 - "vrsqrts.f32 d4, d3, d0 \n\t" //d4 = (3 - d0 * d3) / 2 - "vmul.f32 d0, d0, d4 \n\t" //d0 = d0 * d4 - - "vmul.f32 d0, d0, d1 \n\t" //d0 = d0 * d1 - - "vmov.f32 s0, s1 \n\t" //s0 = s1 - - :: "r"(__tanf_rng), "r"(__tanf_lut) - : "d0", "d1", "d2", "d3", "d4", "d5" - ); -#endif -} - - -float tanf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vdup.f32 d0, r0 \n\t"); - tanf_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return tanf_c(x); -#endif -}; - diff --git a/deps/math-neon/source/math_tanhf.c b/deps/math-neon/source/math_tanhf.c deleted file mode 100644 index 219655be4d..0000000000 --- a/deps/math-neon/source/math_tanhf.c +++ /dev/null @@ -1,95 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the 
Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "math.h" -#include "math_neon.h" - -/* -TanH = (e^x - e^-x) / (e^x + e^-x) -TanH = (e^x - e^-x)(e^x) / (e^x + e^-x)(e^x) -TanH = (e^2x - 1) / (e^2x + 1) - -*/ - -float tanhf_c(float x) -{ - float a, b, c; - int m; - union{ - float f; - int i; - } xx; - - x = 2.0f * x; - a = expf_c(x); - c = a + 1.0f; - - //reciporical approx. 
- xx.f = c; - m = 0x3F800000 - (xx.i & 0x7F800000); - xx.i = xx.i + m; - xx.f = 1.41176471f - 0.47058824f * xx.f; - xx.i = xx.i + m; - b = 2.0 - xx.f * c; - xx.f = xx.f * b; - b = 2.0 - xx.f * c; - xx.f = xx.f * b; - c = a - 1.0; - xx.f *= c; - return xx.f; -} - - -float tanhf_neon_hfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vadd.f32 d0, d0, d0 \n\t"); - expf_neon_hfp(x); - asm volatile ( - "vmov.f32 d2, #1.0 \n\t" - "vsub.f32 d3, d0, d2 \n\t" - "vadd.f32 d0, d0, d2 \n\t" - - "vrecpe.f32 d1, d0 \n\t" //d1 = ~ 1 / d0; - "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; - "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; - "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; - "vmul.f32 d0, d1, d2 \n\t" //d0 = d1 * d2; - "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3; - ::: "d0", "d1", "d2", "d3" - ); -#endif -} - -float tanhf_neon_sfp(float x) -{ -#ifdef __MATH_NEON - asm volatile ("vmov.f32 s0, r0 \n\t"); - tanhf_neon_hfp(x); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return tanhf_c(x); -#endif -}; - diff --git a/deps/math-neon/source/math_vec2.c b/deps/math-neon/source/math_vec2.c deleted file mode 100644 index d970c37676..0000000000 --- a/deps/math-neon/source/math_vec2.c +++ /dev/null @@ -1,118 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. 
- -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - - -#include "math_neon.h" - -//vec2 scalar product -float -dot2_c(float v0[2], float v1[2]) -{ - float r; - r = v0[0]*v1[0]; - r += v0[1]*v1[1]; - return r; -} - -void -normalize2_c(float v[2], float d[2]) -{ - float b, c, x; - union { - float f; - int i; - } a; - - x = v[0]*v[0]; - x += v[1]*v[1]; - - //fast invsqrt approx - a.f = x; - a.i = 0x5F3759DF - (a.i >> 1); //VRSQRTE - c = x * a.f; - b = (3.0f - c * a.f) * 0.5; //VRSQRTS - a.f = a.f * b; - c = x * a.f; - b = (3.0f - c * a.f) * 0.5; - a.f = a.f * b; - - d[0] = v[0]*a.f; - d[1] = v[1]*a.f; -} - -float -dot2_neon_hfp(float v0[2], float v1[2]) -{ -#ifdef __MATH_NEON - asm volatile ( - "vld1.32 {d2}, [%0] \n\t" //d2={x0,y0} - "vld1.32 {d4}, [%1] \n\t" //d4={x1,y1} - "vmul.f32 d0, d2, d4 \n\t" //d0 = d2*d4 - "vpadd.f32 d0, d0, d0 \n\t" //d0 = d[0] + d[1] - :: "r"(v0), "r"(v1) - : - ); -#endif -} - -float -dot2_neon_sfp(float v0[2], float v1[2]) -{ -#ifdef __MATH_NEON - dot2_neon_hfp(v0, v1); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return dot2_c(v0, v1); -#endif -}; - -void -normalize2_neon(float v[2], float d[2]) -{ -#ifdef __MATH_NEON - asm volatile ( - "vld1.32 d4, [%0] \n\t" //d4 = {x0,y0} - "vmul.f32 d0, d4, d4 \n\t" //d0 = d2*d2 - "vpadd.f32 d0, d0 \n\t" //d0 = d[0] + d[1] - - "vmov.f32 d1, d0 \n\t" //d1 = d0 - "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) - "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 - "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2 - "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 - "vmul.f32 d2, d0, d1 
\n\t" //d2 = d0 * d1 - "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2 - "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 - - "vmul.f32 d4, d4, d0[0] \n\t" //d4 = d4*d0[0] - "vst1.32 d4, [%1] \n\t" // - - :: "r"(v), "r"(d) - : "d0", "d1", "d2", "d3", "d4", "memory" - ); -#else - normalize2_c(v, d); -#endif -} - diff --git a/deps/math-neon/source/math_vec3.c b/deps/math-neon/source/math_vec3.c deleted file mode 100644 index 998ff2e4d5..0000000000 --- a/deps/math-neon/source/math_vec3.c +++ /dev/null @@ -1,172 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. 
-*/ - -#include "math_neon.h" - -//vec4 scalar product -float -dot3_c(float v0[3], float v1[3]) -{ - float r; - r = v0[0]*v1[0]; - r += v0[1]*v1[1]; - r += v0[2]*v1[2]; - return r; -} - -void -cross3_c(float v0[3], float v1[3], float d[3]) -{ - d[0] = v0[1]*v1[2] - v0[2]*v1[1]; - d[1] = v0[2]*v1[0] - v0[0]*v1[2]; - d[2] = v0[0]*v1[1] - v0[1]*v1[0]; -} - -void -normalize3_c(float v[3], float d[3]) -{ - float b, c, x; - union { - float f; - int i; - } a; - - x = v[0]*v[0]; - x += v[1]*v[1]; - x += v[2]*v[2]; - - //fast invsqrt approx - a.f = x; - a.i = 0x5F3759DF - (a.i >> 1); //VRSQRTE - c = x * a.f; - b = (3.0f - c * a.f) * 0.5; //VRSQRTS - a.f = a.f * b; - c = x * a.f; - b = (3.0f - c * a.f) * 0.5; - a.f = a.f * b; - - d[0] = v[0]*a.f; - d[1] = v[1]*a.f; - d[2] = v[2]*a.f; -} - - -float -dot3_neon_hfp(float v0[3], float v1[3]) -{ -#ifdef __MATH_NEON - asm volatile ( - "vld1.32 {d2}, [%0] \n\t" //d2={x0,y0} - "flds s6, [%0, #8] \n\t" //d3[0]={z0} - "vld1.32 {d4}, [%1] \n\t" //d4={x1,y1} - "flds s10, [%1, #8] \n\t" //d5[0]={z1} - - "vmul.f32 d0, d2, d4 \n\t" //d0= d2*d4 - "vpadd.f32 d0, d0, d0 \n\t" //d0 = d[0] + d[1] - "vmla.f32 d0, d3, d5 \n\t" //d0 = d0 + d3*d5 - :: "r"(v0), "r"(v1) - : "d0","d1","d2","d3","d4","d5" - ); -#endif -} - -float -dot3_neon_sfp(float v0[3], float v1[3]) -{ -#ifdef __MATH_NEON - dot3_neon_hfp(v0, v1); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return dot3_c(v0, v1); -#endif -}; - - -void cross3_neon(float v0[3], float v1[3], float d[3]) -{ -#ifdef __MATH_NEON - asm volatile ( - "flds s3, [%0] \n\t" //d1[1]={x0} - "add %0, %0, #4 \n\t" // - "vld1.32 {d0}, [%0] \n\t" //d0={y0,z0} - "vmov.f32 s2, s1 \n\t" //d1[0]={z0} - - "flds s5, [%1] \n\t" //d2[1]={x1} - "add %1, %1, #4 \n\t" // - "vld1.32 {d3}, [%1] \n\t" //d3={y1,z1} - "vmov.f32 s4, s7 \n\t" //d2[0]=d3[1] - - "vmul.f32 d4, d0, d2 \n\t" //d4=d0*d2 - "vmls.f32 d4, d1, d3 \n\t" //d4-=d1*d3 - - "vmul.f32 d5, d3, d1[1] \n\t" //d5=d3*d1[1] - "vmls.f32 d5, d0, d2[1] \n\t" 
//d5-=d0*d2[1] - - "vst1.32 d4, [%2] \n\t" // - "add %2, %2, #8 \n\t" // - "fsts s10, [%2] \n\t" // - - : "+r"(v0), "+r"(v1), "+r"(d): - : "d0", "d1", "d2", "d3", "d4", "d5", "memory" - ); -#else - cross3_c(v0,v1,d); -#endif -} - -void -normalize3_neon(float v[3], float d[3]) -{ -#ifdef __MATH_NEON - asm volatile ( - "vld1.32 {d4}, [%0] \n\t" //d4={x0,y0} - "flds s10, [%0, #8] \n\t" //d5[0]={z0} - - "vmul.f32 d0, d4, d4 \n\t" //d0= d4*d4 - "vpadd.f32 d0, d0 \n\t" //d0 = d[0] + d[1] - "vmla.f32 d0, d5, d5 \n\t" //d0 = d0 + d5*d5 - - "vmov.f32 d1, d0 \n\t" //d1 = d0 - "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) - "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 - "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2 - "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 - "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 - "vrsqrts.f32 d3, d2, d0 \n\t" //d4 = (3 - d0 * d3) / 2 - "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d4 - - "vmul.f32 q2, q2, d0[0] \n\t" //d0= d2*d4 - "vst1.32 {d4}, [%1] \n\t" // - "fsts s10, [%1, #8] \n\t" // - - :: "r"(v), "r"(d) - : "d0", "d1", "d2", "d3", "d4", "d5", "memory" - ); -#else - normalize3_c(v, d); -#endif - -} - - diff --git a/deps/math-neon/source/math_vec4.c b/deps/math-neon/source/math_vec4.c deleted file mode 100644 index 483fc57190..0000000000 --- a/deps/math-neon/source/math_vec4.c +++ /dev/null @@ -1,126 +0,0 @@ -/* -The MIT License (MIT) - -Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) - -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial 
portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -*/ - -#include "math_neon.h" - - -#ifdef __MATH_NEON -#include "arm_neon.h" -#endif - -//vec4 scalar product -float dot4_c(float v0[4], float v1[4]) -{ - float r; - r = v0[0]*v1[0]; - r += v0[1]*v1[1]; - r += v0[2]*v1[2]; - r += v0[3]*v1[3]; - return r; -} - -void normalize4_c(float v[4], float d[4]) -{ - float b, c, x; - union { - float f; - int i; - } a; - - x = v[0]*v[0]; - x += v[1]*v[1]; - x += v[2]*v[2]; - x += v[3]*v[3]; - - //fast invsqrt approx - a.f = x; - a.i = 0x5F3759DF - (a.i >> 1); //VRSQRTE - c = x * a.f; - b = (3.0f - c * a.f) * 0.5; //VRSQRTS - a.f = a.f * b; - c = x * a.f; - b = (3.0f - c * a.f) * 0.5; - a.f = a.f * b; - - d[0] = v[0]*a.f; - d[1] = v[1]*a.f; - d[2] = v[2]*a.f; - d[3] = v[3]*a.f; -} - -void normalize4_neon(float v[4], float d[4]) -{ -#ifdef __MATH_NEON - asm volatile ( - "vld1.32 {d4, d5}, [%0] \n\t" //d2={x0,y0}, d3={z0, w0} - "vmul.f32 d0, d4, d4 \n\t" //d0= d4*d4 - "vmla.f32 d0, d5, d5 \n\t" //d0 = d0 + d5*d5 - "vpadd.f32 d0, d0 \n\t" //d0 = d[0] + d[1] - - "vmov.f32 d1, d0 \n\t" //d1 = d0 - "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) - "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 - "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2 - "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 - "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 - "vrsqrts.f32 d3, d2, d0 \n\t" //d4 = (3 - d0 * d3) / 2 - "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d4 - - "vmul.f32 q2, q2, d0[0] \n\t" //d0= d2*d4 - "vst1.32 {d4, d5}, [%1] \n\t" //d2={x0,y0}, d3={z0, w0} - - :: 
"r"(v), "r"(d) - : "d0", "d1", "d2", "d3", "d4", "d5", "memory" - ); -#else - normalize4_c(v, d); -#endif - -} - - -float dot4_neon_hfp(float v0[4], float v1[4]) -{ -#ifdef __MATH_NEON - asm volatile ( - "vld1.32 {d2, d3}, [%0] \n\t" //d2={x0,y0}, d3={z0, w0} - "vld1.32 {d4, d5}, [%1] \n\t" //d4={x1,y1}, d5={z1, w1} - "vmul.f32 d0, d2, d4 \n\t" //d0= d2*d4 - "vmla.f32 d0, d3, d5 \n\t" //d0 = d0 + d3*d5 - "vpadd.f32 d0, d0 \n\t" //d0 = d[0] + d[1] - :: "r"(v0), "r"(v1) : - ); -#endif -} - -float dot4_neon_sfp(float v0[4], float v1[4]) -{ -#ifdef __MATH_NEON - dot4_neon_hfp(v0, v1); - asm volatile ("vmov.f32 r0, s0 \n\t"); -#else - return dot4_c(v0, v1); -#endif -}; - From af97efdc17a1215a63a9ee639cf9361876ded50d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Francisco=20Jos=C3=A9=20Garc=C3=ADa=20Garc=C3=ADa?= Date: Sun, 31 May 2020 17:24:53 +0200 Subject: [PATCH 3/3] Squashed 'deps/math-neon/' content from commit bf34c68a8e git-subtree-dir: deps/math-neon git-subtree-split: bf34c68a8e141f7e6f37040da9311b07f1bbe529 --- .gitattributes | 17 + .gitignore | 26 ++ Makefile | 29 ++ README | 168 ++++++++++ math_debug.c | 689 +++++++++++++++++++++++++++++++++++++++++ source/math_acosf.c | 67 ++++ source/math_asinf.c | 183 +++++++++++ source/math_atan2f.c | 170 ++++++++++ source/math_atanf.c | 149 +++++++++ source/math_ceilf.c | 71 +++++ source/math_cosf.c | 50 +++ source/math_coshf.c | 120 +++++++ source/math_expf.c | 135 ++++++++ source/math_fabsf.c | 58 ++++ source/math_floorf.c | 66 ++++ source/math_fmodf.c | 100 ++++++ source/math_invsqrtf.c | 79 +++++ source/math_ldexpf.c | 67 ++++ source/math_log10f.c | 135 ++++++++ source/math_logf.c | 135 ++++++++ source/math_mat2.c | 95 ++++++ source/math_mat3.c | 131 ++++++++ source/math_mat4.c | 144 +++++++++ source/math_modf.c | 71 +++++ source/math_neon.h | 439 ++++++++++++++++++++++++++ source/math_powf.c | 182 +++++++++++ source/math_runfast.c | 42 +++ source/math_sincosf.c | 163 ++++++++++ source/math_sinf.c | 128 ++++++++ 
source/math_sinfv.c | 110 +++++++ source/math_sinhf.c | 120 +++++++ source/math_sqrtf.c | 105 +++++++ source/math_sqrtfv.c | 147 +++++++++ source/math_tanf.c | 156 ++++++++++ source/math_tanhf.c | 95 ++++++ source/math_vec2.c | 118 +++++++ source/math_vec3.c | 172 ++++++++++ source/math_vec4.c | 126 ++++++++ 38 files changed, 5058 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 Makefile create mode 100644 README create mode 100644 math_debug.c create mode 100644 source/math_acosf.c create mode 100644 source/math_asinf.c create mode 100644 source/math_atan2f.c create mode 100644 source/math_atanf.c create mode 100644 source/math_ceilf.c create mode 100644 source/math_cosf.c create mode 100644 source/math_coshf.c create mode 100644 source/math_expf.c create mode 100644 source/math_fabsf.c create mode 100644 source/math_floorf.c create mode 100644 source/math_fmodf.c create mode 100644 source/math_invsqrtf.c create mode 100644 source/math_ldexpf.c create mode 100644 source/math_log10f.c create mode 100644 source/math_logf.c create mode 100644 source/math_mat2.c create mode 100644 source/math_mat3.c create mode 100644 source/math_mat4.c create mode 100644 source/math_modf.c create mode 100644 source/math_neon.h create mode 100644 source/math_powf.c create mode 100644 source/math_runfast.c create mode 100644 source/math_sincosf.c create mode 100644 source/math_sinf.c create mode 100644 source/math_sinfv.c create mode 100644 source/math_sinhf.c create mode 100644 source/math_sqrtf.c create mode 100644 source/math_sqrtfv.c create mode 100644 source/math_tanf.c create mode 100644 source/math_tanhf.c create mode 100644 source/math_vec2.c create mode 100644 source/math_vec3.c create mode 100644 source/math_vec4.c diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000000..bdb0cabc87 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,17 @@ +# Auto detect text files and perform LF normalization +* text=auto + 
+# Custom for Visual Studio +*.cs diff=csharp + +# Standard to msysgit +*.doc diff=astextplain +*.DOC diff=astextplain +*.docx diff=astextplain +*.DOCX diff=astextplain +*.dot diff=astextplain +*.DOT diff=astextplain +*.pdf diff=astextplain +*.PDF diff=astextplain +*.rtf diff=astextplain +*.RTF diff=astextplain diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000..6b55e9b64e --- /dev/null +++ b/.gitignore @@ -0,0 +1,26 @@ +*.o +*.a + +# Windows thumbnail cache files +Thumbs.db +ehthumbs.db +ehthumbs_vista.db + +# Folder config file +Desktop.ini + +# Recycle Bin used on file shares +$RECYCLE.BIN/ + +# Windows Installer files +*.cab +*.msi +*.msm +*.msp + +# Windows shortcuts +*.lnk + +# ========================= +# Operating System Files +# ========================= diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000..269d8cdd57 --- /dev/null +++ b/Makefile @@ -0,0 +1,29 @@ +TARGET := libmathneon +SOURCES := source + +LIBS = -lc -lm -lSceGxm_stub -lSceDisplay_stub + +CFILES := $(foreach dir,$(SOURCES), $(wildcard $(dir)/*.c)) +CGFILES := $(foreach dir,$(SHADERS), $(wildcard $(dir)/*.cg)) +HEADERS := $(CGFILES:.cg=.h) +OBJS := $(CFILES:.c=.o) + +PREFIX = arm-vita-eabi +CC = $(PREFIX)-gcc +AR = $(PREFIX)-gcc-ar +CFLAGS = -g -Wl,-q -O2 -ffast-math -mtune=cortex-a9 -mfpu=neon -flto -ftree-vectorize +ASFLAGS = $(CFLAGS) + +all: $(TARGET).a + +$(TARGET).a: $(OBJS) + $(AR) -rc $@ $^ + +clean: + @rm -rf $(TARGET).a $(TARGET).elf $(OBJS) + +install: $(TARGET).a + @mkdir -p $(VITASDK)/$(PREFIX)/lib/ + cp $(TARGET).a $(VITASDK)/$(PREFIX)/lib/ + @mkdir -p $(VITASDK)/$(PREFIX)/include/ + cp source/math_neon.h $(VITASDK)/$(PREFIX)/include/ diff --git a/README b/README new file mode 100644 index 0000000000..4f388e9374 --- /dev/null +++ b/README @@ -0,0 +1,168 @@ + +Library: MATH-NEON +By: Lachlan Tychsen-Smith +Licence: MIT (expat) +======================================================================================= +This project 
implements the cmath functions and some optimised matrix functions +with the aim of increasing the floating point performance of ARM Cortex A-8 +based platforms. As well as implementing the functions in ARM NEON assembly, +they sacrifice error checking and some accuracy to achieve better performance. + +Function Errors: +======================================================================================= +The measurement and characterisation of the inaccuracies present within these +functions is really a field in itself. For the benchmark I provide the +maximum absolute, maximum relative and root mean squared error compared to the +cmath implementations over the specified range. However, these values can be +misleading, especially for functions which quickly go to infinity. So it's always a +good idea to test it within your actual program. In general, this library will not +be as accurate as cmath; however, for many functions the difference is small enough to be +negligible. + +Notes: +======================================================================================= +- The *_c functions are C implementations of the *_neon code. +- Like cmath, the errors present in the functions are very dependent on the + range you're operating in. So you should test them first. +- Look in the "math_neon.h" file for descriptions of the functions. In some + function files there are also notes on the specific implementation. +- The *_neon functions make certain assumptions about the location of arguments + that are incompatible with inlining. 
+ +Contact: +======================================================================================= +Name: Lachlan Tychsen-Smith +Email: lachlan.ts@gmail.com + +PSVITA performances test results: + +RUNFAST: Disabled +------------------------------------------------------------------------------------------------------ +MATRIX FUNCTION TESTS +------------------------------------------------------------------------------------------------------ +matmul2_c = + |-14.56, 5.96| + |-15.35, 10.50| +matmul2_neon = + |-14.56, 5.96| + |-15.35, 10.50| +matmul2: c=174924 neon=64490 rate=2.71 +matvec2_c = |-14.56, -15.35| +matvec2_neon = |-14.56, -15.35| +matvec2: c=88957 neon=58337 rate=1.52 +matmul3_c = + |-21.39, -4.68, -1.74| + |-8.66, -8.97, 1.83| + |15.88, 0.30, -2.23| +matmul3_neon = + |-21.39, -4.68, -1.74| + |-8.66, -8.97, 1.83| + |15.88, 0.30, -2.23| +matmul3: c=552486 neon=297268 rate=1.86 +matvec3_c = |-21.39, -8.66, 15.88| +matvec3_neon = |-21.39, -8.66, 15.88| +matvec3: c=184104 neon=128780 rate=1.43 +matmul4_c = + |-13.65, -1.80, -12.92, 6.56| + |-10.21, 9.47, 2.73, 14.79| + |0.97, 11.69, -0.64, -12.87| + |20.06, 6.77, 35.61, -0.02| +matmul4_neon = + |-13.65, -1.80, -12.92, 6.56| + |-10.21, 9.47, 2.73, 14.79| + |0.97, 11.69, -0.64, -12.87| + |20.06, 6.77, 35.61, -0.02| +matmul4: c=1315568 neon=254227 rate=5.17 +matvec4_c = |-13.65, -10.21, 0.97, 20.058556| +matvec4_neon = |-13.65, -10.21, 0.97, 20.058556| +matvec4: c=331712 neon=147196 rate=2.25 + +dot2_c = -10.903330 +dot2_neon = -10.903330 +dot2: c=230295 neon=168799 rate=1.36 +normalize2_c = [-0.74, 0.67] +normalize2_neon = [-0.74, 0.67] +normalize2: c=950716 neon=965780 rate=0.98 + +dot3_c = -4.226746 +dot3_neon = -4.226746 +dot3: c=306957 neon=337316 rate=0.91 +normalize3_c = [-0.69, 0.62, -0.38] +normalize3_neon = [-0.69, 0.62, -0.38] +normalize3: c=1180950 neon=1134557 rate=1.04 +cross3_c = [-9.67, -19.39, -14.24] +cross3_neon = [-9.67, -19.39, -14.24] +cross3: c=659558 neon=766896 rate=0.86 + +dot4_c = 
2.782796 +dot4_neon = 2.782796 +dot4: c=414233 neon=276068 rate=1.50 +normalize4_c = [-0.59, 0.53, -0.32, -0.52] +normalize4_neon = [-0.59, 0.53, -0.32, -0.52] +normalize4: c=1364294 neon=1103327 rate=1.24 + +------------------------------------------------------------------------------------------------------ +CMATH FUNCTION TESTS +------------------------------------------------------------------------------------------------------ +Function Range Number ABS Max Error REL Max Error RMS Error Time Rate +------------------------------------------------------------------------------------------------------ +sinf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1394459996 x1.00 +sinf_c [-3.14, 3.14] 500000 7.75e-07 1.00e+02% 4.09e-07 1395128226 x1.00 +sinf_neon [-3.14, 3.14] 500000 8.34e-07 1.00e+02% 4.09e-07 1395853554 x1.00 +cosf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1396644271 x1.00 +cosf_c [-3.14, 3.14] 500000 7.75e-07 6.74e-01% 4.15e-07 1397360321 x1.00 +cosf_neon [-3.14, 3.14] 500000 8.34e-07 6.74e-01% 4.16e-07 1398126872 x1.00 +tanf [-0.79, 0.79] 500000 0.00e+00 0.00e+00% 0.00e+00 1398889596 x1.00 +tanf_c [-0.79, 0.79] 500000 2.98e-06 7.94e-04% 1.31e-06 1399704712 x1.00 +tanf_neon [-0.79, 0.79] 500000 1.91e-06 3.62e-04% 6.66e-07 1400612899 x1.00 +asinf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1401838993 x1.00 +asinf_c [-1.00, 1.00] 500000 5.54e-05 1.06e-02% nan 1402745512 x1.00 +asinf_neon [-1.00, 1.00] 500000 4.66e-05 8.90e-03% nan 1403967661 x1.00 +acosf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1405317842 x1.00 +acosf_c [-1.00, 1.00] 500000 5.56e-05 6.46e-03% nan 1406294753 x1.00 +acosf_neon [-1.00, 1.00] 500000 4.67e-05 6.35e-03% nan 1407598039 x1.00 +atanf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1408314869 x1.00 +atanf_c [-1.00, 1.00] 500000 1.67e-04 2.12e-02% 7.40e-05 1408872421 x1.00 +atanf_neon [-1.00, 1.00] 500000 1.67e-04 2.12e-02% 7.40e-05 1409736652 x1.00 +sinhf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 
1411101066 x1.00 +sinhf_c [-3.14, 3.14] 500000 1.91e-06 1.52e-01% 1.85e-07 1412173492 x1.00 +sinhf_neon [-3.14, 3.14] 500000 1.91e-06 1.52e-01% 1.90e-07 1413205410 x1.00 +coshf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1414417802 x1.00 +coshf_c [-3.14, 3.14] 500000 9.54e-07 2.38e-05% 1.64e-07 1415426083 x1.00 +coshf_neon [-3.14, 3.14] 500000 1.91e-06 2.22e-05% 1.68e-07 1416412636 x1.00 +tanhf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1417684273 x1.00 +tanhf_c [-3.14, 3.14] 500000 1.20e-05 2.48e-01% 5.48e-06 1418659628 x1.00 +tanhf_neon [-3.14, 3.14] 500000 2.38e-07 2.47e-01% 5.40e-08 1419650721 x1.00 +expf [0.00, 10.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1420706074 x1.00 +expf_c [0.00, 10.00] 500000 9.77e-03 6.15e-05% 1.64e-03 1421444150 x1.00 +expf_neon [0.00, 10.00] 500000 9.77e-03 6.58e-05% 1.64e-03 1422203499 x1.00 +logf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1423106698 x1.00 +logf_c [1.00, 1000.00] 500000 6.20e-06 1.62e-02% 9.83e-07 1423735174 x1.00 +logf_neon [1.00, 1000.00] 500000 7.63e-06 1.03e-02% 1.07e-06 1424434406 x1.00 +log10f [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1425516892 x1.00 +log10f_c [1.00, 1000.00] 500000 2.86e-06 6.68e-03% 4.79e-07 1426200368 x1.00 +log10f_neon [1.00, 1000.00] 500000 3.34e-06 6.68e-03% 4.84e-07 1426966844 x1.00 +floorf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1429081993 x1.00 +floorf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1430839273 x1.00 +floorf_neon [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1433474766 x1.00 +ceilf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1435602956 x1.00 +ceilf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1437403711 x1.00 +ceilf_neon [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1440044970 x1.00 +fabsf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1441265630 x1.00 +fabsf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1442491716 x1.00 +fabsf_neon [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 
1443680744 x1.00 +sqrtf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1444844144 x1.00 +sqrtf_c [1.00, 1000.00] 500000 2.33e-04 1.06e-03% 8.69e-05 1445710342 x1.00 +sqrtf_neon [1.00, 1000.00] 500000 7.63e-06 2.91e-05% 1.60e-06 1446544637 x1.00 +invsqrtf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1446995307 x1.00 +invsqrtf_c [1.00, 1000.00] 500000 4.35e-06 4.78e-04% 2.00e-07 1447471977 x1.00 +invsqrtf_neon [1.00, 1000.00] 500000 1.19e-07 2.12e-05% 4.81e-09 1447987675 x1.00 +atan2f [0.10, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 1449713108 x1.00 +atan2f_c [0.10, 10.00] 10000 1.73e-04 2.23e-02% 0.00e+00 1451276575 x1.00 +atan2f_neon [0.10, 10.00] 10000 1.67e-04 2.12e-02% 0.00e+00 1453093260 x1.00 +powf [1.00, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 1458606663 x1.00 +powf_c [1.00, 10.00] 10000 1.08e+05 4.37e-03% 0.00e+00 1461584933 x1.00 +powf_neon [1.00, 10.00] 10000 1.36e+05 5.88e-03% 0.00e+00 1464702743 x1.00 +fmodf [1.00, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 1466022029 x1.00 +fmodf_c [1.00, 10.00] 10000 9.90e+00 8.06e-02% 0.00e+00 1467403015 x1.00 +fmodf_neon [1.00, 10.00] 10000 9.97e+00 8.06e-02% 0.00e+00 1468767755 x1.00 diff --git a/math_debug.c b/math_debug.c new file mode 100644 index 0000000000..a5125a3a25 --- /dev/null +++ b/math_debug.c @@ -0,0 +1,689 @@ +/* +Math-NEON: Neon Optimised Math Library based on cmath +Contact: lachlan.ts@gmail.com +Copyright (C) 2009 Lachlan Tychsen - Smith aka Adventus + +This library is free software; you can redistribute it and/or +modify it under the terms of the GNU Lesser General Public +License as published by the Free Software Foundation; either +version 3 of the License, or (at your option) any later version. + +This library is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +Lesser General Public License for more details. 
+ +You should have received a copy of the GNU Lesser General Public +License along with this library; if not, write to the Free +Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ + + +#include +#include +#include +#include +#include +#include +#include +#ifdef WIN32 +#include +#else +#include +#include +#endif + +#define randf() (rand() / (RAND_MAX + 1.0f)) + +void LOG(const char *format, ...) { + __gnuc_va_list arg; + va_start(arg, format); + char msg[512]; + vsprintf(msg, format, arg); + va_end(arg); + + FILE *log = fopen("ux0:/data/mathneon.log", "a+"); + if (log != NULL) { + fwrite(msg, 1, strlen(msg), log); + fclose(log); + } +} + +struct test1_s { + const char* name; + float (*func)(float); //the function + float (*bench)(float); //the function to benchmark against. + float rng0, rng1; + int num; + float emaxabs; + float xmaxabs; + float emaxrel; + float xmaxrel; + float erms; + int time; //time to execute num functions; +}; + +struct test2_s { + const char* name; + float (*func)(float, float); //the function + float (*bench)(float, float); //the function to benchmark against. 
+ float rng0, rng1; + int num; + float emaxabs; + float xmaxabs; + float emaxrel; + float xmaxrel; + float erms; + int time; //time to execute num functions; +}; + + +float invsqrtf(float x){ + return (1.0f / sqrtf(x)); +} + +typedef struct test1_s test1_t; +typedef struct test2_s test2_t; + +test1_t test1[51] = +{ + {"sinf ", sinf, sinf, -M_PI, M_PI, 500000}, + {"sinf_c ", sinf_c, sinf, -M_PI, M_PI, 500000}, + {"sinf_neon ", sinf_neon, sinf, -M_PI, M_PI, 500000}, + + {"cosf ", cosf, cosf, -M_PI, M_PI, 500000}, + {"cosf_c ", cosf_c, cosf, -M_PI, M_PI, 500000}, + {"cosf_neon ", cosf_neon, cosf, -M_PI, M_PI, 500000}, + + {"tanf ", tanf, tanf, -M_PI_4, M_PI_4, 500000, 0, 0, 0}, + {"tanf_c ", tanf_c, tanf, -M_PI_4, M_PI_4, 500000, 0, 0, 0}, + {"tanf_neon ", tanf_neon, tanf, -M_PI_4, M_PI_4, 500000, 0, 0, 0}, + + {"asinf ", asinf, asinf, -1, 1, 500000, 0, 0, 0}, + {"asinf_c ", asinf_c, asinf, -1, 1, 500000, 0, 0, 0}, + {"asinf_neon ", asinf_neon, asinf, -1, 1, 500000, 0, 0, 0}, + + {"acosf ", acosf, acosf, -1, 1, 500000, 0, 0, 0}, + {"acosf_c ", acosf_c, acosf, -1, 1, 500000, 0, 0, 0}, + {"acosf_neon ", acosf_neon, acosf, -1, 1, 500000, 0, 0, 0}, + + {"atanf ", atanf, atanf, -1, 1, 500000, 0, 0, 0}, + {"atanf_c ", atanf_c, atanf, -1, 1, 500000, 0, 0, 0}, + {"atanf_neon ", atanf_neon, atanf, -1, 1, 500000, 0, 0, 0}, + + {"sinhf ", sinhf, sinhf, -M_PI, M_PI, 500000, 0, 0, 0}, + {"sinhf_c ", sinhf_c, sinhf, -M_PI, M_PI, 500000, 0, 0, 0}, + {"sinhf_neon ", sinhf_neon, sinhf, -M_PI, M_PI, 500000, 0, 0, 0}, + + {"coshf ", coshf, coshf, -M_PI, M_PI, 500000, 0, 0, 0}, + {"coshf_c ", coshf_c, coshf, -M_PI, M_PI, 500000, 0, 0, 0}, + {"coshf_neon ", coshf_neon, coshf, -M_PI, M_PI, 500000, 0, 0, 0}, + + {"tanhf ", tanhf, tanhf, -M_PI, M_PI, 500000, 0, 0, 0}, + {"tanhf_c ", tanhf_c, tanhf, -M_PI, M_PI, 500000, 0, 0, 0}, + {"tanhf_neon ", tanhf_neon, tanhf, -M_PI, M_PI, 500000, 0, 0, 0}, + + {"expf ", expf, expf, 0, 10, 500000, 0, 0, 0}, + {"expf_c ", expf_c, expf, 0, 10, 500000, 0, 
0, 0}, + {"expf_neon ", expf_neon, expf, 0, 10, 500000, 0, 0, 0}, + + {"logf ", logf, logf, 1, 1000, 500000, 0, 0, 0}, + {"logf_c ", logf_c, logf, 1, 1000, 500000, 0, 0, 0}, + {"logf_neon ", logf_neon, logf, 1, 1000, 500000, 0, 0, 0}, + + {"log10f ", log10f, log10f, 1, 1000, 500000, 0, 0, 0}, + {"log10f_c ", log10f_c, log10f, 1, 1000, 500000, 0, 0, 0}, + {"log10f_neon ", log10f_neon,log10f, 1, 1000, 500000, 0, 0, 0}, + + {"floorf ", floorf, floorf, 1, 1000, 5000000, 0, 0, 0}, + {"floorf_c ", floorf_c, floorf, 1, 1000, 5000000, 0, 0, 0}, + {"floorf_neon", floorf_neon,floorf, 1, 1000, 5000000, 0, 0, 0}, + + {"ceilf ", ceilf, ceilf, 1, 1000, 5000000, 0, 0, 0}, + {"ceilf_c ", ceilf_c, ceilf, 1, 1000, 5000000, 0, 0, 0}, + {"ceilf_neon", ceilf_neon, ceilf, 1, 1000, 5000000, 0, 0, 0}, + + {"fabsf ", fabsf, fabsf, 1, 1000, 5000000, 0, 0, 0}, + {"fabsf_c ", fabsf_c, fabsf, 1, 1000, 5000000, 0, 0, 0}, + {"fabsf_neon", fabsf_neon, fabsf, 1, 1000, 5000000, 0, 0, 0}, + + {"sqrtf ", sqrtf, sqrtf, 1, 1000, 500000, 0, 0, 0}, + {"sqrtf_c ", sqrtf_c, sqrtf, 1, 1000, 500000, 0, 0, 0}, + {"sqrtf_neon ", sqrtf_neon, sqrtf, 1, 1000, 500000, 0, 0, 0}, + + {"invsqrtf ", invsqrtf, invsqrtf, 1, 1000, 500000, 0, 0, 0}, + {"invsqrtf_c ", invsqrtf_c, invsqrtf, 1, 1000, 500000, 0, 0, 0}, + {"invsqrtf_neon ", invsqrtf_neon, invsqrtf, 1, 1000, 500000, 0, 0, 0}, +}; + +test2_t test2[9] = +{ + {"atan2f ", atan2f, atan2f, 0.1, 10, 10000, 0, 0, 0}, + {"atan2f_c ", atan2f_c, atan2f, 0.1, 10, 10000, 0, 0, 0}, + {"atan2f_neon ", atan2f_neon,atan2f, 0.1, 10, 10000, 0, 0, 0}, + + {"powf ", powf, powf, 1, 10, 10000, 0, 0, 0}, + {"powf_c ", powf_c, powf, 1, 10, 10000, 0, 0, 0}, + {"powf_neon ", powf_neon, powf, 1, 10, 10000, 0, 0, 0}, + + {"fmodf ", fmodf, fmodf, 1, 10, 10000, 0, 0, 0}, + {"fmodf_c ", fmodf_c, fmodf, 1, 10, 10000, 0, 0, 0}, + {"fmodf_neon ", fmodf_neon, fmodf, 1, 10, 10000, 0, 0, 0}, + +}; + + +void +test_mathfunc1(test1_t *tst) +{ + + float x; + float dx = (tst->rng1 - tst->rng0) / 
((float)tst->num); +#ifndef WIN32 + struct rusage ru; +#endif + + tst->emaxabs = tst->xmaxabs = 0; + tst->emaxrel = tst->xmaxrel = 0; + tst->erms = 0; + for(x = tst->rng0; x < tst->rng1 ; x += dx){ + float r = (tst->func)((float)x); + float rr = (tst->bench)((float)x); + float dr = fabs(r - rr); + float drr = dr * (100.0f / rr); + tst->erms += dr*dr; + if (dr > tst->emaxabs){ + tst->emaxabs = dr; + tst->xmaxabs = x; + } + if (drr > tst->emaxrel){ + tst->emaxrel = drr; + tst->xmaxrel = x; + } + } + tst->erms = sqrt(tst->erms / ((float) tst->num)); + +#ifdef WIN32 + tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000); +#else + tst->time = sceKernelGetSystemTimeWide(); +#endif + + for(x = tst->rng0; x < tst->rng1 ; x += dx){ + (tst->func)((float)x); + } + +#ifdef WIN32 + tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) - tst->time; +#else + tst->time = sceKernelGetSystemTimeWide(); +#endif + +} + +void +test_mathfunc2(test2_t *tst) +{ + float x, y; + float rng = tst->rng1 - tst->rng0; + float d = (rng * rng) / ((float) tst->num); +#ifndef WIN32 + struct rusage ru; +#endif + + tst->emaxabs = tst->xmaxabs = 0; + tst->emaxrel = tst->xmaxrel = 0; + for(y = (tst->rng0); y < (tst->rng1) ; y += d){ + for(x = (tst->rng0); x < (tst->rng1); x += d){ + float r = (tst->func)((float)x, y); + float rr = (tst->bench)((float)x, y); + float dr = fabs(r - rr); + float drr = dr * (100.0f / rr); + if (dr > tst->emaxabs){ + tst->emaxabs = dr; + tst->xmaxabs = x; + } + if (drr > tst->emaxrel && fabsf(rr) > 0.0001){ + tst->emaxrel = drr; + tst->xmaxrel = x; + } + } + } + +#ifdef WIN32 + tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) ; +#else + tst->time = sceKernelGetSystemTimeWide(); +#endif + + for(y = tst->rng0; y < tst->rng1 ; y += d){ + for(x = tst->rng0; x < tst->rng1 ; x += d){ + (tst->func)((float)x, (float)y); + } + } + +#ifdef WIN32 + tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) - tst->time; +#else + tst->time = sceKernelGetSystemTimeWide(); +#endif + +} 
+ +void test_vectorfunc() +{ + float v0[4], v1[4], d[4]; + + for(int i=0;i<4;i++) + { + v0[i] = 10*randf() - 5; + v1[i] = 10*randf() - 5; + d[i] = 10*randf() - 5; + } + + int testnum = 5000000; + struct rusage ru; + int v2t[3], v3t[3], v4t[3]; + float r; + + LOG("\n"); + + //dot 2 + v2t[0] = sceKernelGetSystemTimeWide(); + for(int i=0;i < testnum; i++) + { + r = dot2_c(v0, v1); + }; + v2t[1] = sceKernelGetSystemTimeWide(); + for(int i=0;i < testnum; i++) + { + r = dot2_neon(v0, v1); + }; + v2t[2] = sceKernelGetSystemTimeWide(); + + r = dot2_c(v0, v1); + LOG("dot2_c = %f\n", r); + r = dot2_neon(v0, v1); + LOG("dot2_neon = %f\n", r); + + LOG("dot2: c=%i \t neon=%i \t rate=%.2f \n", v2t[1] - v2t[0], v2t[2] - v2t[1], + (float)(v2t[1] - v2t[0]) / (float)(v2t[2] - v2t[1])); + + //normalize 2 + v2t[0] = sceKernelGetSystemTimeWide(); + for(int i=0;i < testnum; i++) + { + normalize2_c(v0, d); + }; + v2t[1] = sceKernelGetSystemTimeWide(); + for(int i=0;i < testnum; i++) + { + normalize2_neon(v0, d); + }; + v2t[2] = sceKernelGetSystemTimeWide(); + + + normalize2_c(v0, d); + LOG("normalize2_c = [%.2f, %.2f]\n", d[0], d[1]); + normalize2_neon(v0, d); + LOG("normalize2_neon = [%.2f, %.2f]\n", d[0], d[1]); + + LOG("normalize2: c=%i \t neon=%i \t rate=%.2f \n", v2t[1] - v2t[0], v2t[2] - v2t[1], + (float)(v2t[1] - v2t[0]) / (float)(v2t[2] - v2t[1])); + LOG("\n"); + + + //dot 3 + v3t[0] = sceKernelGetSystemTimeWide(); + for(int i=0;i < testnum; i++) + { + r = dot3_c(v0, v1); + }; + v3t[1] = sceKernelGetSystemTimeWide(); + for(int i=0;i < testnum; i++) + { + r = dot3_neon(v0, v1); + }; + v3t[2] = sceKernelGetSystemTimeWide(); + + r = dot3_c(v0, v1); + LOG("dot3_c = %f\n", r); + r = dot3_neon(v0, v1); + LOG("dot3_neon = %f\n", r); + + LOG("dot3: c=%i \t neon=%i \t rate=%.2f \n", v3t[1] - v3t[0], v3t[2] - v3t[1], + (float)(v3t[1] - v3t[0]) / (float)(v3t[2] - v3t[1])); + + //normalize 3 + v3t[0] = sceKernelGetSystemTimeWide(); + for(int i=0;i < testnum; i++) + { + normalize3_c(v0, d); + 
}; + v3t[1] = sceKernelGetSystemTimeWide(); + for(int i=0;i < testnum; i++) + { + normalize3_neon(v0, d); + }; + v3t[2] = sceKernelGetSystemTimeWide(); + + + normalize3_c(v0, d); + LOG("normalize3_c = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]); + normalize3_neon(v0, d); + LOG("normalize3_neon = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]); + + LOG("normalize3: c=%i \t neon=%i \t rate=%.2f \n", v3t[1] - v3t[0], v3t[2] - v3t[1], + (float)(v3t[1] - v3t[0]) / (float)(v3t[2] - v3t[1])); + + //cross 3 + v3t[0] = sceKernelGetSystemTimeWide(); + for(int i=0;i < testnum; i++) + { + cross3_c(v0, v1, d); + }; + v3t[1] = sceKernelGetSystemTimeWide(); + for(int i=0;i < testnum; i++) + { + cross3_neon(v0, v1, d); + }; + v3t[2] = sceKernelGetSystemTimeWide(); + + + cross3_c(v0, v1, d); + LOG("cross3_c = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]); + cross3_neon(v0, v1, d); + LOG("cross3_neon = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]); + + LOG("cross3: c=%i \t neon=%i \t rate=%.2f \n", v3t[1] - v3t[0], v3t[2] - v3t[1], + (float)(v3t[1] - v3t[0]) / (float)(v3t[2] - v3t[1])); + LOG("\n"); + + + //dot 4 + v4t[0] = sceKernelGetSystemTimeWide(); + for(int i=0;i < testnum; i++) + { + r = dot4_c(v0, v1); + }; + v4t[1] = sceKernelGetSystemTimeWide(); + for(int i=0;i < testnum; i++) + { + r = dot4_neon(v0, v1); + }; + v4t[2] = sceKernelGetSystemTimeWide(); + + r = dot4_c(v0, v1); + LOG("dot4_c = %f\n", r); + r = dot4_neon(v0, v1); + LOG("dot4_neon = %f\n", r); + + LOG("dot4: c=%i \t neon=%i \t rate=%.2f \n", v4t[1] - v4t[0], v4t[2] - v4t[1], + (float)(v4t[1] - v4t[0]) / (float)(v4t[2] - v4t[1])); + + //normalize 4 + v4t[0] = sceKernelGetSystemTimeWide(); + for(int i=0;i < testnum; i++) + { + normalize4_c(v0, d); + }; + v4t[1] = sceKernelGetSystemTimeWide(); + for(int i=0;i < testnum; i++) + { + normalize4_neon(v0, d); + }; + v4t[2] = sceKernelGetSystemTimeWide(); + + + normalize4_c(v0, d); + LOG("normalize4_c = [%.2f, %.2f, %.2f, %.2f]\n", d[0], d[1], d[2], d[3]); + normalize4_neon(v0, d); + 
LOG("normalize4_neon = [%.2f, %.2f, %.2f, %.2f]\n", d[0], d[1], d[2], d[3]); + + LOG("normalize4: c=%i \t neon=%i \t rate=%.2f \n", v4t[1] - v4t[0], v4t[2] - v4t[1], + (float)(v4t[1] - v4t[0]) / (float)(v4t[2] - v4t[1])); + LOG("\n"); + + +} + + + +void test_matrixfunc() +{ + float m0[16], m1[16], m2[16]; + int m2t[3], m3t[3], m4t[3]; + + int i; + int testnum = 1000000; + struct rusage ru; + + for(int i=0;i<16;i++) + { + m0[i] = 10.0f * randf() - 5.0f; + m1[i] = 10.0f * randf() - 5.0f; + m2[i] = 10.0f * randf() - 5.0f; + } + + + //matmul2 + m2t[0] = sceKernelGetSystemTimeWide(); + for(i = 0; i < testnum; i++){ + matmul2_c(m0, m1, m2); + } + m2t[1] = sceKernelGetSystemTimeWide(); + for(i = 0; i < testnum; i++){ + matmul2_neon(m0, m1, m2); + } + m2t[2] = sceKernelGetSystemTimeWide(); + + matmul2_c(m0, m1, m2); + LOG("matmul2_c = \n"); + LOG("\t\t\t|%.2f, %.2f|\n", m2[0], m2[2]); + LOG("\t\t\t|%.2f, %.2f|\n", m2[1], m2[3]); + + matmul2_neon(m0, m1, m2); + LOG("matmul2_neon = \n"); + LOG("\t\t\t|%.2f, %.2f|\n", m2[0], m2[2]); + LOG("\t\t\t|%.2f, %.2f|\n", m2[1], m2[3]); + + LOG("matmul2: c=%i \t neon=%i \t rate=%.2f \n", m2t[1] - m2t[0], m2t[2] - m2t[1], + (float)(m2t[1] - m2t[0]) / (float)(m2t[2] - m2t[1])); + + + //matvec2 + m2t[0] = sceKernelGetSystemTimeWide(); + for(i = 0; i < testnum; i++){ + matvec2_c(m0, m1, m2); + } + m2t[1] = sceKernelGetSystemTimeWide(); + for(i = 0; i < testnum; i++){ + matvec2_neon(m0, m1, m2); + } + m2t[2] = sceKernelGetSystemTimeWide(); + + memset(m2, 0, 4*sizeof(float)); + matvec2_c(m0, m1, m2); + LOG("matvec2_c = |%.2f, %.2f|\n", m2[0], m2[1]); + + memset(m2, 0, 4*sizeof(float)); + matvec2_neon(m0, m1, m2); + LOG("matvec2_neon = |%.2f, %.2f|\n", m2[0], m2[1]); + + LOG("matvec2: c=%i \t neon=%i \t rate=%.2f \n", m2t[1] - m2t[0], m2t[2] - m2t[1], + (float)(m2t[1] - m2t[0]) / (float)(m2t[2] - m2t[1])); + + //MAT3 + m3t[0] = sceKernelGetSystemTimeWide(); + for(i = 0; i < testnum; i++){ + matmul3_c(m0, m1, m2); + } + m3t[1] = 
sceKernelGetSystemTimeWide(); + for(i = 0; i < testnum; i++){ + matmul3_neon(m0, m1, m2); + } + m3t[2] = sceKernelGetSystemTimeWide(); + + memset(m2, 0, 9*sizeof(float)); + matmul3_c(m0, m1, m2); + LOG("matmul3_c =\n"); + LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[0], m2[3], m2[6]); + LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[1], m2[4], m2[7]); + LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[2], m2[5], m2[8]); + + memset(m2, 0, 9*sizeof(float)); + matmul3_neon(m0, m1, m2); + LOG("matmul3_neon =\n"); + LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[0], m2[3], m2[6]); + LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[1], m2[4], m2[7]); + LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[2], m2[5], m2[8]); + + LOG("matmul3: c=%i \t neon=%i \t rate=%.2f \n", m3t[1] - m3t[0], m3t[2] - m3t[1], + (float)(m3t[1] - m3t[0]) / (float)(m3t[2] - m3t[1])); + + //matvec3 + m3t[0] = sceKernelGetSystemTimeWide(); + for(i = 0; i < testnum; i++){ + matvec3_c(m0, m1, m2); + } + m3t[1] = sceKernelGetSystemTimeWide(); + for(i = 0; i < testnum; i++){ + matvec3_neon(m0, m1, m2); + } + m3t[2] = sceKernelGetSystemTimeWide(); + + memset(m2, 0, 4*sizeof(float)); + matvec3_c(m0, m1, m2); + LOG("matvec3_c = |%.2f, %.2f, %.2f|\n", m2[0], m2[1], m2[2]); + + memset(m2, 0, 4*sizeof(float)); + matvec3_neon(m0, m1, m2); + LOG("matvec3_neon = |%.2f, %.2f, %.2f|\n", m2[0], m2[1], m2[2]); + + LOG("matvec3: c=%i \t neon=%i \t rate=%.2f \n", m3t[1] - m3t[0], m3t[2] - m3t[1], + (float)(m3t[1] - m3t[0]) / (float)(m3t[2] - m3t[1])); + + //MAT4 + m4t[0] = sceKernelGetSystemTimeWide(); + for(i = 0; i < testnum; i++){ + matmul4_c(m0, m1, m2); + } + m4t[1] = sceKernelGetSystemTimeWide(); + for(i = 0; i < testnum; i++){ + matmul4_neon(m0, m1, m2); + } + m4t[2] = sceKernelGetSystemTimeWide(); + + memset(m2, 0, 16*sizeof(float)); + matmul4_c(m0, m1, m2); + LOG("matmul4_c =\n"); + LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[0], m2[4], m2[8], m2[12]); + LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[1], m2[5], m2[9], m2[13]); + LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[2], 
m2[6], m2[10], m2[14]); + LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[3], m2[7], m2[11], m2[15]); + + memset(m2, 0, 16*sizeof(float)); + matmul4_neon(m0, m1, m2); + LOG("matmul4_neon =\n"); + LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[0], m2[4], m2[8], m2[12]); + LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[1], m2[5], m2[9], m2[13]); + LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[2], m2[6], m2[10], m2[14]); + LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[3], m2[7], m2[11], m2[15]); + + LOG("matmul4: c=%i \t neon=%i \t rate=%.2f \n", m4t[1] - m4t[0], m4t[2] - m4t[1], + (float)(m4t[1] - m4t[0]) / (float)(m4t[2] - m4t[1])); + + //matvec4 + m4t[0] = sceKernelGetSystemTimeWide(); + for(i = 0; i < testnum; i++){ + matvec4_c(m0, m1, m2); + } + m4t[1] = sceKernelGetSystemTimeWide(); + for(i = 0; i < testnum; i++){ + matvec4_neon(m0, m1, m2); + } + m4t[2] = sceKernelGetSystemTimeWide(); + + memset(m2, 0, 4*sizeof(float)); + matvec4_c(m0, m1, m2); + LOG("matvec4_c = |%.2f, %.2f, %.2f, %f|\n", m2[0], m2[1], m2[2], m2[3]); + + memset(m2, 0, 4*sizeof(float)); + matvec4_neon(m0, m1, m2); + LOG("matvec4_neon = |%.2f, %.2f, %.2f, %f|\n", m2[0], m2[1], m2[2], m2[3]); + + LOG("matvec4: c=%i \t neon=%i \t rate=%.2f \n", m4t[1] - m4t[0], m4t[2] - m4t[1], + (float)(m4t[1] - m4t[0]) / (float)(m4t[2] - m4t[1])); + + +} + +int main(int argc, char** argv) +{ + + int i, ii; +#if 1 + LOG("RUNFAST: Disabled \n"); +#else + LOG("RUNFAST: Enabled \n"); + enable_runfast(); +#endif + srand(time(NULL)); + +#if 1 + //test single argument functions: + LOG("------------------------------------------------------------------------------------------------------\n"); + LOG("MATRIX FUNCTION TESTS \n"); + LOG("------------------------------------------------------------------------------------------------------\n"); + + test_matrixfunc(); + test_vectorfunc(); + + LOG("------------------------------------------------------------------------------------------------------\n"); + LOG("CMATH FUNCTION TESTS \n"); + 
LOG("------------------------------------------------------------------------------------------------------\n"); + LOG("Function\tRange\t\tNumber\tABS Max Error\tREL Max Error\tRMS Error\tTime\tRate\n"); + LOG("------------------------------------------------------------------------------------------------------\n"); + for(i = 0; i < 51; i++){ + test_mathfunc1(&test1[i]); + + ii = i - (i % 3); + LOG("%s\t", test1[i].name); + LOG("[%.2f, %.2f]\t", test1[i].rng0, test1[i].rng1); + LOG("%i\t", test1[i].num); + LOG("%.2e\t", test1[i].emaxabs); + LOG("%.2e%%\t", test1[i].emaxrel); + LOG("%.2e\t", test1[i].erms); + LOG("%i\t", test1[i].time); + LOG("x%.2f\t", (float)test1[ii].time / test1[i].time); + LOG("\n"); + } + for(i = 0; i < 9; i++){ + test_mathfunc2(&test2[i]); + + ii = i - (i % 3); + + LOG("%s\t", test2[i].name); + LOG("[%.2f, %.2f]\t", test2[i].rng0, test2[i].rng1); + LOG("%i\t", test2[i].num); + LOG("%.2e\t", test2[i].emaxabs); + LOG("%.2e%%\t", test2[i].emaxrel); + LOG("%.2e\t", test2[i].erms); + LOG("%i\t", test2[i].time); + LOG("x%.2f\t", (float)test2[ii].time / test2[i].time); + LOG("\n"); + } + +#else + + + float x = 0; + for(x = -M_PI_2; x < M_PI_2; x+= 0.01) + { + LOG("x=%.2f\t in=%.2f\t c=%.2f\t neon=%.2f \n", x, sinhf(x), sinhf_c(x), sinhf_neon(x)); + } + +#endif + + return 0; +} diff --git a/source/math_acosf.c b/source/math_acosf.c new file mode 100644 index 0000000000..59a22b2985 --- /dev/null +++ b/source/math_acosf.c @@ -0,0 +1,67 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following 
conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "math.h" +#include "math_neon.h" + +/* +Test func : acosf(x) +Test Range: -1.0 < x < 1.0 +Peak Error: ~0.005% +RMS Error: ~0.001% +*/ + +const float __acosf_pi_2 = M_PI_2; + +float acosf_c(float x) +{ + return __acosf_pi_2 - asinf_c(x); +} + + +float acosf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asinf_neon_hfp(x); + asm volatile ( + "vdup.f32 d1, %0 \n\t" //d1 = {pi/2, pi/2}; + "vsub.f32 d0, d1, d0 \n\t" //d0 = d1 - d0; + ::"r"(__acosf_pi_2): + ); +#endif +} + +float acosf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vmov.f32 s0, r0 \n\t"); + acosf_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return acosf_c(x); +#endif +} + + + diff --git a/source/math_asinf.c b/source/math_asinf.c new file mode 100644 index 0000000000..0ae8ef9b84 --- /dev/null +++ b/source/math_asinf.c @@ -0,0 +1,183 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + 
+The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "math.h" +#include "math_neon.h" + +/* +Test func : asinf(x) +Test Range: -1.0 < x < 1.0 +Peak Error: ~0.005% +RMS Error: ~0.001% +*/ + + +const float __asinf_lut[4] = { + 0.105312459675071, //p7 + 0.169303418571894, //p3 + 0.051599985887214, //p5 + 0.999954835104825 //p1 +}; + +const float __asinf_pi_2 = M_PI_2; + +float asinf_c(float x) +{ + + float a, b, c, d, r, ax; + int m; + + union { + float f; + int i; + } xx; + + ax = fabs(x); + d = 0.5; + d = d - ax*0.5; + + //fast invsqrt approx + xx.f = d; + xx.i = 0x5F3759DF - (xx.i >> 1); //VRSQRTE + c = d * xx.f; + b = (3.0f - c * xx.f) * 0.5; //VRSQRTS + xx.f = xx.f * b; + c = d * xx.f; + b = (3.0f - c * xx.f) * 0.5; + xx.f = xx.f * b; + + //fast inverse approx + d = xx.f; + m = 0x3F800000 - (xx.i & 0x7F800000); + xx.i = xx.i + m; + xx.f = 1.41176471f - 0.47058824f * xx.f; + xx.i = xx.i + m; + b = 2.0 - xx.f * d; + xx.f = xx.f * b; + b = 2.0 - xx.f * d; + xx.f = xx.f * b; + + //if |x|>0.5 -> x = sqrt((1-x)/2) + xx.f = xx.f - ax; + a = (ax > 0.5f); + d = __asinf_pi_2 * a; + c = 1.0f - 3.0f * a; + ax = ax + xx.f * a; + + //polynomial evaluation + xx.f = ax * ax; + a = (__asinf_lut[0] * ax) * xx.f + (__asinf_lut[2] * ax); + b = (__asinf_lut[1] * ax) * xx.f + (__asinf_lut[3] * ax); + xx.f = xx.f * xx.f; + r = b + a * xx.f; + r = d + c * r; + + a = r + r; + b = (x < 0.0f); + r = r - a * b; + return r; +} + + +float 
asinf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ( + + "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x}; + "vdup.f32 d4, %1 \n\t" //d4 = {pi/2, pi/2}; + "vmov.f32 d6, d0 \n\t" //d6 = d0; + "vabs.f32 d0, d0 \n\t" //d0 = fabs(d0) ; + + "vmov.f32 d5, #0.5 \n\t" //d5 = 0.5; + "vmls.f32 d5, d0, d5 \n\t" //d5 = d5 - d0*d5; + + //fast invsqrt approx + "vmov.f32 d1, d5 \n\t" //d1 = d5 + "vrsqrte.f32 d5, d5 \n\t" //d5 = ~ 1.0 / sqrt(d5) + "vmul.f32 d2, d5, d1 \n\t" //d2 = d5 * d1 + "vrsqrts.f32 d3, d2, d5 \n\t" //d3 = (3 - d5 * d2) / 2 + "vmul.f32 d5, d5, d3 \n\t" //d5 = d5 * d3 + "vmul.f32 d2, d5, d1 \n\t" //d2 = d5 * d1 + "vrsqrts.f32 d3, d2, d5 \n\t" //d3 = (3 - d5 * d3) / 2 + "vmul.f32 d5, d5, d3 \n\t" //d5 = d5 * d3 + + //fast reciporical approximation + "vrecpe.f32 d1, d5 \n\t" //d1 = ~ 1 / d5; + "vrecps.f32 d2, d1, d5 \n\t" //d2 = 2.0 - d1 * d5; + "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; + "vrecps.f32 d2, d1, d5 \n\t" //d2 = 2.0 - d1 * d5; + "vmul.f32 d5, d1, d2 \n\t" //d5 = d1 * d2; + + //if |x| > 0.5 -> ax = sqrt((1-ax)/2), r = pi/2 + "vsub.f32 d5, d0, d5 \n\t" //d5 = d0 - d5; + "vmov.f32 d2, #0.5 \n\t" //d2 = 0.5; + "vcgt.f32 d3, d0, d2 \n\t" //d3 = (d0 > d2); + "vmov.f32 d1, #3.0 \n\t" //d5 = 3.0; + "vshr.u32 d3, #31 \n\t" //d3 = d3 >> 31; + "vmov.f32 d16, #1.0 \n\t" //d16 = 1.0; + "vcvt.f32.u32 d3, d3 \n\t" //d3 = (float) d3; + "vmls.f32 d0, d5, d3[0] \n\t" //d0 = d0 - d5 * d3[0]; + "vmul.f32 d7, d4, d3[0] \n\t" //d7 = d5 * d4; + "vmls.f32 d16, d1, d3[0] \n\t" //d16 = d16 - d1 * d3; + + //polynomial: + "vmul.f32 d2, d0, d0 \n\t" //d2 = d0*d0 = {ax^2, ax^2} + "vld1.32 {d4, d5}, [%0] \n\t" //d4 = {p7, p3}, d5 = {p5, p1} + "vmul.f32 d3, d2, d2 \n\t" //d3 = d2*d2 = {x^4, x^4} + "vmul.f32 q0, q2, d0[0] \n\t" //q0 = q2 * d0[0] = {p7x, p3x, p5x, p1x} + "vmla.f32 d1, d0, d2[0] \n\t" //d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3} + "vmla.f32 d1, d3, d1[0] \n\t" //d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7} + + "vmla.f32 d7, d1, d16 \n\t" //d7 = d7 + 
d1*d16 + + "vadd.f32 d2, d7, d7 \n\t" //d2 = d7 + d7 + "vclt.f32 d3, d6, #0 \n\t" //d3 = (d6 < 0) + "vshr.u32 d3, #31 \n\t" //d3 = d3 >> 31; + "vcvt.f32.u32 d3, d3 \n\t" //d3 = (float) d3 + "vmls.f32 d7, d2, d3[0] \n\t" //d7 = d7 - d2 * d3[0]; + + "vmov.f32 s0, s15 \n\t" //s0 = s3 + + :: "r"(__asinf_lut), "r"(__asinf_pi_2) + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" + ); +#endif +} + + +float asinf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vmov.f32 s0, r0 \n\t"); + asinf_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return asinf_c(x); +#endif +} + + + + diff --git a/source/math_atan2f.c b/source/math_atan2f.c new file mode 100644 index 0000000000..d076a04c04 --- /dev/null +++ b/source/math_atan2f.c @@ -0,0 +1,170 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "math.h" +#include "math_neon.h" + +const float __atan2f_lut[4] = { + -0.0443265554792128, //p7 + -0.3258083974640975, //p3 + +0.1555786518463281, //p5 + +0.9997878412794807 //p1 +}; + +const float __atan2f_pi_2 = M_PI_2; + +float atan2f_c(float y, float x) +{ + float a, b, c, r, xx; + int m; + union { + float f; + int i; + } xinv; + + //fast inverse approximation (2x newton) + xx = fabs(x); + xinv.f = xx; + m = 0x3F800000 - (xinv.i & 0x7F800000); + xinv.i = xinv.i + m; + xinv.f = 1.41176471f - 0.47058824f * xinv.f; + xinv.i = xinv.i + m; + b = 2.0 - xinv.f * xx; + xinv.f = xinv.f * b; + b = 2.0 - xinv.f * xx; + xinv.f = xinv.f * b; + + c = fabs(y * xinv.f); + + //fast inverse approximation (2x newton) + xinv.f = c; + m = 0x3F800000 - (xinv.i & 0x7F800000); + xinv.i = xinv.i + m; + xinv.f = 1.41176471f - 0.47058824f * xinv.f; + xinv.i = xinv.i + m; + b = 2.0 - xinv.f * c; + xinv.f = xinv.f * b; + b = 2.0 - xinv.f * c; + xinv.f = xinv.f * b; + + //if |x| > 1.0 -> ax = -1/ax, r = pi/2 + xinv.f = xinv.f + c; + a = (c > 1.0f); + c = c - a * xinv.f; + r = a * __atan2f_pi_2; + + //polynomial evaluation + xx = c * c; + a = (__atan2f_lut[0] * c) * xx + (__atan2f_lut[2] * c); + b = (__atan2f_lut[1] * c) * xx + (__atan2f_lut[3] * c); + xx = xx * xx; + r = r + a * xx; + r = r + b; + + //determine quadrant and test for small x. 
+ b = M_PI; + b = b - 2.0f * r; + r = r + (x < 0.0f) * b; + b = (fabs(x) < 0.000001f); + c = !b; + r = c * r; + r = r + __atan2f_pi_2 * b; + b = r + r; + r = r - (y < 0.0f) * b; + + return r; +} + +float atan2f_neon_hfp(float y, float x) +{ +#ifdef __MATH_NEON + asm volatile ( + + "vdup.f32 d17, d0[1] \n\t" //d17 = {x, x}; + "vdup.f32 d16, d0[0] \n\t" //d16 = {y, y}; + + //1.0 / x + "vrecpe.f32 d18, d17 \n\t" //d16 = ~ 1 / d1; + "vrecps.f32 d19, d18, d17 \n\t" //d17 = 2.0 - d16 * d1; + "vmul.f32 d18, d18, d19 \n\t" //d16 = d16 * d17; + "vrecps.f32 d19, d18, d17 \n\t" //d17 = 2.0 - d16 * d1; + "vmul.f32 d18, d18, d19 \n\t" //d16 = d16 * d17; + + //y * (1.0 /x) + "vmul.f32 d0, d16, d18 \n\t" //d0 = d16 * d18; + + + "vdup.f32 d4, %1 \n\t" //d4 = {pi/2, pi/2}; + "vmov.f32 d6, d0 \n\t" //d6 = d0; + "vabs.f32 d0, d0 \n\t" //d0 = fabs(d0) ; + + //fast reciporical approximation + "vrecpe.f32 d1, d0 \n\t" //d1 = ~ 1 / d0; + "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; + "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; + "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; + "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; + + //if |x| > 1.0 -> ax = 1/ax, r = pi/2 + "vadd.f32 d1, d1, d0 \n\t" //d1 = d1 + d0; + "vmov.f32 d2, #1.0 \n\t" //d2 = 1.0; + "vcgt.f32 d3, d0, d2 \n\t" //d3 = (d0 > d2); + "vcvt.f32.u32 d3, d3 \n\t" //d3 = (float) d3; + "vmls.f32 d0, d1, d3 \n\t" //d0 = d0 - d1 * d3; + "vmul.f32 d7, d3, d4 \n\t" //d7 = d3 * d4; + + //polynomial: + "vmul.f32 d2, d0, d0 \n\t" //d2 = d0*d0 = {ax^2, ax^2} + "vld1.32 {d4, d5}, [%0] \n\t" //d4 = {p7, p3}, d5 = {p5, p1} + "vmul.f32 d3, d2, d2 \n\t" //d3 = d2*d2 = {x^4, x^4} + "vmul.f32 q0, q2, d0[0] \n\t" //q0 = q2 * d0[0] = {p7x, p3x, p5x, p1x} + "vmla.f32 d1, d0, d2[0] \n\t" //d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3} + "vmla.f32 d1, d3, d1[0] \n\t" //d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7} + "vadd.f32 d1, d1, d7 \n\t" //d1 = d1 + d7 + + "vadd.f32 d2, d1, d1 \n\t" //d2 = d1 + d1 + "vclt.f32 d3, d6, #0 \n\t" 
//d3 = (d6 < 0) + "vcvt.f32.u32 d3, d3 \n\t" //d3 = (float) d3 + "vmls.f32 d1, d3, d2 \n\t" //d1 = d1 - d2 * d3; + + "vmov.f32 s0, s3 \n\t" //s0 = s3 + + :: "r"(__atan2f_lut), "r"(__atan2f_pi_2) + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" + ); +#endif +} + + +float atan2f_neon_sfp(float x, float y) +{ +#ifdef __MATH_NEON + asm volatile ("vmov.f32 s0, r0 \n\t"); + asm volatile ("vmov.f32 s1, r1 \n\t"); + atan2f_neon_hfp(x, y); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return atan2f_c(y, x); +#endif +}; diff --git a/source/math_atanf.c b/source/math_atanf.c new file mode 100644 index 0000000000..c983756dd2 --- /dev/null +++ b/source/math_atanf.c @@ -0,0 +1,149 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "math.h" +#include "math_neon.h" + +const float __atanf_lut[4] = { + -0.0443265554792128, //p7 + -0.3258083974640975, //p3 + +0.1555786518463281, //p5 + +0.9997878412794807 //p1 +}; + +const float __atanf_pi_2 = M_PI_2; + +float atanf_c(float x) +{ + + float a, b, r, xx; + int m; + + union { + float f; + int i; + } xinv, ax; + + ax.f = fabs(x); + + //fast inverse approximation (2x newton) + xinv.f = ax.f; + m = 0x3F800000 - (xinv.i & 0x7F800000); + xinv.i = xinv.i + m; + xinv.f = 1.41176471f - 0.47058824f * xinv.f; + xinv.i = xinv.i + m; + b = 2.0 - xinv.f * ax.f; + xinv.f = xinv.f * b; + b = 2.0 - xinv.f * ax.f; + xinv.f = xinv.f * b; + + //if |x| > 1.0 -> ax = -1/ax, r = pi/2 + xinv.f = xinv.f + ax.f; + a = (ax.f > 1.0f); + ax.f = ax.f - a * xinv.f; + r = a * __atanf_pi_2; + + //polynomial evaluation + xx = ax.f * ax.f; + a = (__atanf_lut[0] * ax.f) * xx + (__atanf_lut[2] * ax.f); + b = (__atanf_lut[1] * ax.f) * xx + (__atanf_lut[3] * ax.f); + xx = xx * xx; + b = b + a * xx; + r = r + b; + + //if x < 0 -> r = -r + a = 2 * r; + b = (x < 0.0f); + r = r - a * b; + + return r; +} + + +float atanf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ( + + "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x}; + + "vdup.f32 d4, %1 \n\t" //d4 = {pi/2, pi/2}; + "vmov.f32 d6, d0 \n\t" //d6 = d0; + "vabs.f32 d0, d0 \n\t" //d0 = fabs(d0) ; + + //fast reciporical approximation + "vrecpe.f32 d1, d0 \n\t" //d1 = ~ 1 / d0; + "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; + "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; + "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; + "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; + + + //if |x| > 1.0 -> ax = -1/ax, r = pi/2 + "vadd.f32 d1, d1, d0 \n\t" //d1 = d1 + d0; + "vmov.f32 d2, #1.0 \n\t" //d2 = 1.0; + "vcgt.f32 d3, d0, d2 \n\t" //d3 = (d0 > d2); + "vshr.u32 d3, #31 \n\t" //d3 = (d0 > d2); + "vcvt.f32.u32 d3, d3 \n\t" //d5 = (float) d3; + "vmls.f32 d0, d1, d3[0] \n\t" //d0 = d0 - d1 * d3[0]; + "vmul.f32 d7, d4, d3[0] \n\t" //d7 = 
d5 * d4; + + //polynomial: + "vmul.f32 d2, d0, d0 \n\t" //d2 = d0*d0 = {ax^2, ax^2} + "vld1.32 {d4, d5}, [%0] \n\t" //d4 = {p7, p3}, d5 = {p5, p1} + "vmul.f32 d3, d2, d2 \n\t" //d3 = d2*d2 = {x^4, x^4} + "vmul.f32 q0, q2, d0[0] \n\t" //q0 = q2 * d0[0] = {p7x, p3x, p5x, p1x} + "vmla.f32 d1, d0, d2[0] \n\t" //d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3} + "vmla.f32 d1, d3, d1[0] \n\t" //d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7} + "vadd.f32 d1, d1, d7 \n\t" //d1 = d1 + d7 + + "vadd.f32 d2, d1, d1 \n\t" //d2 = d1 + d1 + "vclt.f32 d3, d6, #0 \n\t" //d3 = (d6 < 0) + "vshr.u32 d3, #31 \n\t" //d3 = (d0 > d2); + "vcvt.f32.u32 d3, d3 \n\t" //d3 = (float) d3 + "vmls.f32 d1, d3, d2 \n\t" //d1 = d1 - d2 * d3; + + "vmov.f32 s0, s3 \n\t" //s0 = s3 + + :: "r"(__atanf_lut), "r"(__atanf_pi_2) + : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7" + ); + +#endif +} + + +float atanf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vdup.f32 d0, r0 \n\t"); + atanf_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return atanf_c(x); +#endif +}; + + + diff --git a/source/math_ceilf.c b/source/math_ceilf.c new file mode 100644 index 0000000000..1432efee73 --- /dev/null +++ b/source/math_ceilf.c @@ -0,0 +1,71 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* +Assumes the floating point value |x| < 2147483648 +*/ + +#include "math.h" +#include "math_neon.h" + +float ceilf_c(float x) +{ + int n; + float r; + n = (int) x; + r = (float) n; + r = r + (x > r); + return r; +} + +float ceilf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ( + + "vcvt.s32.f32 d1, d0 \n\t" //d1 = (int) d0; + "vcvt.f32.s32 d1, d1 \n\t" //d1 = (float) d1; + "vcgt.f32 d0, d0, d1 \n\t" //d0 = (d0 > d1); + "vshr.u32 d0, #31 \n\t" //d0 = d0 >> 31; + "vcvt.f32.u32 d0, d0 \n\t" //d0 = (float) d0; + "vadd.f32 d0, d1, d0 \n\t" //d0 = d1 + d0; + + ::: "d0", "d1" + ); + +#endif +} + +float ceilf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vmov.f32 s0, r0 \n\t"); + ceilf_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return ceilf_c(x); +#endif +}; + + diff --git a/source/math_cosf.c b/source/math_cosf.c new file mode 100644 index 0000000000..cb14498069 --- /dev/null +++ b/source/math_cosf.c @@ -0,0 +1,50 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above 
copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "math_neon.h" + +float cosf_c(float x) +{ + return sinf_c(x + M_PI_2); +} + +float cosf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + float xx = x + M_PI_2; + return sinf_neon_hfp(xx); +#endif +} + +float cosf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vdup.f32 d0, r0 \n\t"); + cosf_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return cosf_c(x); +#endif +}; + diff --git a/source/math_coshf.c b/source/math_coshf.c new file mode 100644 index 0000000000..a779b6a7be --- /dev/null +++ b/source/math_coshf.c @@ -0,0 +1,120 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "math.h" +#include "math_neon.h" + +const float __coshf_rng[2] = { + 1.442695041f, + 0.693147180f +}; + +const float __coshf_lut[16] = { + 0.00019578093328483123, //p7 + 0.00019578093328483123, //p7 + 0.0014122663401803872, //p6 + 0.0014122663401803872, //p6 + 0.008336936973260111, //p5 + 0.008336936973260111, //p5 + 0.04165989275009526, //p4 + 0.04165989275009526, //p4 + 0.16666570253074878, //p3 + 0.16666570253074878, //p3 + 0.5000006143673624, //p2 + 0.5000006143673624, //p2 + 1.000000059694879, //p1 + 1.000000059694879, //p1 + 0.9999999916728642, //p0 + 0.9999999916728642 //p0 +}; + + +float coshf_c(float x) +{ + float a, b, xx; + xx = -x; + a = expf_c(x); + b = expf_c(xx); + a = a * 0.5f; + a = a + 0.5f * b; + return a; +} + + +float coshf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ( + "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x} + "fnegs s1, s1 \n\t" //s1 = -s1 + + //Range Reduction: + "vld1.32 d2, [%0] \n\t" //d2 = {invrange, range} + "vld1.32 {d16, d17}, [%1]! \n\t" //d16 = {p7, p7}, d17 = {p6, p6}. NOTE(review): the ! writeback modifies %1, which is declared as an input-only operand, and d16-d23 are not in the clobber list - confirm + "vmul.f32 d6, d0, d2[0] \n\t" //d6 = d0 * d2[0] + "vcvt.s32.f32 d6, d6 \n\t" //d6 = (int) d6 + "vld1.32 {d18}, [%1]! \n\t" + "vcvt.f32.s32 d1, d6 \n\t" //d1 = (float) d6 + "vld1.32 {d19}, [%1]! \n\t" + "vmls.f32 d0, d1, d2[1] \n\t" //d0 = d0 - d1 * d2[1] + "vld1.32 {d20}, [%1]! \n\t" + + //polynomial: + "vmla.f32 d17, d16, d0 \n\t" //d17 = d17 + d16 * d0; + "vld1.32 {d21}, [%1]! \n\t" + "vmla.f32 d18, d17, d0 \n\t" //d18 = d18 + d17 * d0; + "vld1.32 {d22}, [%1]!
\n\t" + "vmla.f32 d19, d18, d0 \n\t" //d19 = d19 + d18 * d0; + "vld1.32 {d23}, [%1]! \n\t" + "vmla.f32 d20, d19, d0 \n\t" //d20 = d20 + d19 * d0; + "vmla.f32 d21, d20, d0 \n\t" //d21 = d21 + d20 * d0; + "vmla.f32 d22, d21, d0 \n\t" //d22 = d22 + d21 * d0; + "vmla.f32 d23, d22, d0 \n\t" //d23 = d23 + d22 * d0; + + //multiply by 2 ^ m + "vshl.i32 d6, d6, #23 \n\t" //d6 = d6 << 23 + "vadd.i32 d0, d23, d6 \n\t" //d0 = d22 + d6 + + "vdup.f32 d2, d0[1] \n\t" //d2 = s1 + "vmov.f32 d1, #0.5 \n\t" //d1 = 0.5 + "vadd.f32 d0, d0, d2 \n\t" //d0 = d0 + d2 + "vmul.f32 d0, d1 \n\t" //d0 = d0 * d1 + + :: "r"(__coshf_rng), "r"(__coshf_lut) + : "d0", "d1", "q1", "q2", "d6" + ); + +#endif +} + +float coshf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vmov.f32 s0, r0 \n\t"); + coshf_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return coshf_c(x); +#endif +}; diff --git a/source/math_expf.c b/source/math_expf.c new file mode 100644 index 0000000000..011b9495bd --- /dev/null +++ b/source/math_expf.c @@ -0,0 +1,135 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* +Based on: + + e ^ x = (1+m) * (2^n) + x = log(1+m) + n * log(2) + n = (int) (x * 1.0 / log(2)) + (1+m) = e ^ (x - n * log(2)) + (1+m) = Poly(x - n * log(2)) + + where Poly(x) is the Minimax approximation of e ^ x over the + range [-Log(2), Log(2)] + +Test func : expf(x) +Test Range: 0 < x < 50 +Peak Error: ~0.00024% +RMS Error: ~0.00007% +*/ + +#include "math.h" +#include "math_neon.h" + +const float __expf_rng[2] = { + 1.442695041f, + 0.693147180f +}; + +const float __expf_lut[8] = { + 0.9999999916728642, //p0 + 0.04165989275009526, //p4 + 0.5000006143673624, //p2 + 0.0014122663401803872, //p6 + 1.000000059694879, //p1 + 0.008336936973260111, //p5 + 0.16666570253074878, //p3 + 0.00019578093328483123 //p7 +}; + +float expf_c(float x) +{ + float a, b, c, d, xx; + int m; + + union { + float f; + int i; + } r; + + //Range Reduction: + m = (int) (x * __expf_rng[0]); + x = x - ((float) m) * __expf_rng[1]; + + //Minimax polynomial p0..p7 (Estrin scheme) + a = (__expf_lut[4] * x) + (__expf_lut[0]); + b = (__expf_lut[6] * x) + (__expf_lut[2]); + c = (__expf_lut[5] * x) + (__expf_lut[1]); + d = (__expf_lut[7] * x) + (__expf_lut[3]); + xx = x * x; + a = a + b * xx; + c = c + d * xx; + xx = xx* xx; + r.f = a + c * xx; + + //multiply by 2 ^ m + m = m << 23; + r.i = r.i + m; + + return r.f; +} + +float expf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ( + "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x} + + //Range Reduction: + "vld1.32 d2, [%0] \n\t" //d2 = {invrange, range} + "vmul.f32 d6, d0, d2[0] \n\t" //d6 = d0 * d2[0] + "vcvt.s32.f32 d6, d6 \n\t" //d6 = (int) d6 + "vcvt.f32.s32 d1, d6 \n\t" //d1 = (float) d6 + "vmls.f32 d0, d1, d2[1] \n\t" //d0 = d0 - d1 * d2[1] + + //polynomial: + "vmul.f32 d1, d0, d0 \n\t" //d1
= d0*d0 = {x^2, x^2} + "vld1.32 {d2, d3, d4, d5}, [%1] \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ; + "vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0] + "vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0] + "vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4} + "vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1] + + //multiply by 2 ^ m + "vshl.i32 d6, d6, #23 \n\t" //d6 = d6 << 23 + "vadd.i32 d0, d2, d6 \n\t" //d0 = d2 + d6 + + :: "r"(__expf_rng), "r"(__expf_lut) + : "d0", "d1", "q1", "q2", "d6" + ); +#endif +} + +float expf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vmov.f32 s0, r0 \n\t"); + expf_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return expf_c(x); +#endif +}; + diff --git a/source/math_fabsf.c b/source/math_fabsf.c new file mode 100644 index 0000000000..c22244704f --- /dev/null +++ b/source/math_fabsf.c @@ -0,0 +1,58 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "math_neon.h" + + +float fabsf_c(float x) +{ + union { + int i; + float f; + } xx; + + xx.f = x; + xx.i = xx.i & 0x7FFFFFFF; + return xx.f; +} + +float fabsf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ( + "fabss s0, s0 \n\t" //s0 = fabs(s0) + ); +#endif +} + +float fabsf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ( + "bic r0, r0, #0x80000000 \n\t" //r0 = r0 & ~(1 << 31) + ); +#else + return fabsf_c(x); +#endif +} diff --git a/source/math_floorf.c b/source/math_floorf.c new file mode 100644 index 0000000000..091709140e --- /dev/null +++ b/source/math_floorf.c @@ -0,0 +1,66 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* +Assumes the floating point value |x| < 2147483648 +*/ + +#include "math.h" +#include "math_neon.h" + +float floorf_c(float x) +{ + int n; + float r; + n = (int) x; + r = (float) n; + r = r - (r > x); + return r; +} + +float floorf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ( + "vcvt.s32.f32 d1, d0 \n\t" //d1 = (int) d0; + "vcvt.f32.s32 d1, d1 \n\t" //d1 = (float) d1; + "vcgt.f32 d0, d1, d0 \n\t" //d0 = (d1 > d0); + "vshr.u32 d0, #31 \n\t" //d0 = d0 >> 31; + "vcvt.f32.u32 d0, d0 \n\t" //d0 = (float) d0; + "vsub.f32 d0, d1, d0 \n\t" //d0 = d1 - d0; + ::: "d0", "d1" + ); +#endif +} + +float floorf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vmov.f32 s0, r0 \n\t"); + floorf_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return floorf_c(x); +#endif +}; diff --git a/source/math_fmodf.c b/source/math_fmodf.c new file mode 100644 index 0000000000..86af55da34 --- /dev/null +++ b/source/math_fmodf.c @@ -0,0 +1,100 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* +Assumes the floating point value |x / y| < 2,147,483,648 +*/ + +#include "math_neon.h" + +float fmodf_c(float x, float y) +{ + int n; + union { + float f; + int i; + } yinv; + float a; + + //fast reciprocal approximation (4x Newton) + yinv.f = y; + n = 0x3F800000 - (yinv.i & 0x7F800000); + yinv.i = yinv.i + n; + yinv.f = 1.41176471f - 0.47058824f * yinv.f; + yinv.i = yinv.i + n; + a = 2.0 - yinv.f * y; + yinv.f = yinv.f * a; + a = 2.0 - yinv.f * y; + yinv.f = yinv.f * a; + a = 2.0 - yinv.f * y; + yinv.f = yinv.f * a; + a = 2.0 - yinv.f * y; + yinv.f = yinv.f * a; + + n = (int)(x * yinv.f); + x = x - ((float)n) * y; + return x; +} + + +float fmodf_neon_hfp(float x, float y) +{ +#ifdef __MATH_NEON + asm volatile ( + "vdup.f32 d1, d0[1] \n\t" //d1 = {s1, s1} = {y, y} + "vdup.f32 d0, d0[0] \n\t" //d0 = {s0, s0} = {x, x} + + //fast reciprocal approximation (vrecpe estimate + 4x vrecps refinement) + "vrecpe.f32 d2, d1 \n\t" //d2 = ~1.0 / d1 + "vrecps.f32 d3, d2, d1 \n\t" //d3 = 2.0 - d2 * d1; + "vmul.f32 d2, d2, d3 \n\t" //d2 = d2 * d3; + "vrecps.f32 d3, d2, d1 \n\t" //d3 = 2.0 - d2 * d1; + "vmul.f32 d2, d2, d3 \n\t" //d2 = d2 * d3; + "vrecps.f32 d3, d2, d1 \n\t" //d3 = 2.0 - d2 * d1; + "vmul.f32 d2, d2, d3 \n\t" //d2 = d2 * d3; + "vrecps.f32 d3, d2, d1 \n\t" //d3 = 2.0 - d2 * d1; + "vmul.f32 d2, d2, d3 \n\t" //d2 = d2 * d3; + + "vmul.f32 d2, d2, d0 \n\t" //d2 = d2 * d0; + "vcvt.s32.f32 d2, d2 \n\t" //d2 = (int) d2; + "vcvt.f32.s32 d2, d2 \n\t" //d2 = (float) d2; + "vmls.f32 d0, d1, d2 \n\t" //d0 = d0 - d1 * d2; + + ::: "d0", "d1", "d2", "d3" + ); +#endif +} + +
+float fmodf_neon_sfp(float x, float y) +{ +#ifdef __MATH_NEON + asm volatile ("vmov.f32 s0, r0 \n\t"); + asm volatile ("vmov.f32 s1, r1 \n\t"); + fmodf_neon_hfp(x, y); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return fmodf_c(x,y); +#endif +}; diff --git a/source/math_invsqrtf.c b/source/math_invsqrtf.c new file mode 100644 index 0000000000..c4d2b1d52a --- /dev/null +++ b/source/math_invsqrtf.c @@ -0,0 +1,79 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "math.h" +#include "math_neon.h" + +float invsqrtf_c(float x) +{ + + float b, c; + union { + float f; + int i; + } a; + + //fast invsqrt approx + a.f = x; + a.i = 0x5F3759DF - (a.i >> 1); //initial estimate (the NEON variant uses VRSQRTE here) + c = x * a.f; + b = (3.0f - c * a.f) * 0.5; //Newton step factor (the NEON variant uses VRSQRTS here) + a.f = a.f * b; + c = x * a.f; + b = (3.0f - c * a.f) * 0.5; + a.f = a.f * b; + + return a.f; +} + +float invsqrtf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ( + + "vmov.f32 d1, d0 \n\t" //d1 = d0 + "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) + "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 + "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2 + "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 + "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 + "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2 + "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 + + ::: "d0", "d1", "d2", "d3" + ); +#endif +} + +float invsqrtf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vmov.f32 s0, r0 \n\t"); + invsqrtf_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return invsqrtf_c(x); +#endif +}; + diff --git a/source/math_ldexpf.c b/source/math_ldexpf.c new file mode 100644 index 0000000000..673158958f --- /dev/null +++ b/source/math_ldexpf.c @@ -0,0 +1,67 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software.
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "math.h" +#include "math_neon.h" + +float ldexpf_c(float m, int e) +{ + union { + float f; + int i; + } r; + r.f = m; + r.i += (e << 23); + return r.f; +} + +float ldexpf_neon_hfp(float m, int e) +{ +#ifdef __MATH_NEON + float r; + asm volatile ( + "lsl r0, r0, #23 \n\t" //r0 = r0 << 23 + "vdup.i32 d1, r0 \n\t" //d1 = {r0, r0} + "vadd.i32 d0, d0, d1 \n\t" //d0 = d0 + d1 + ::: "d0", "d1" + ); +#endif +} + +float ldexpf_neon_sfp(float m, int e) +{ +#ifdef __MATH_NEON + float r; + asm volatile ( + "lsl r1, r1, #23 \n\t" //r1 = r1 << 23 + "vdup.f32 d0, r0 \n\t" //d0 = {r0, r0} + "vdup.i32 d1, r1 \n\t" //d1 = {r1, r1} + "vadd.i32 d0, d0, d1 \n\t" //d0 = d0 + d1 + "vmov.f32 r0, s0 \n\t" //r0 = s0 + ::: "d0", "d1" + ); +#else + return ldexpf_c(m,e); +#endif +} diff --git a/source/math_log10f.c b/source/math_log10f.c new file mode 100644 index 0000000000..f68912f0fe --- /dev/null +++ b/source/math_log10f.c @@ -0,0 +1,135 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this 
permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* +Based on: + + log10(x) = log10((1+m) * (2^n)) + log10(x) = n * log10(2) + log10(1 + m) + log10(1+m) = Poly(1+m) + + where Poly(x) is the Minimax approximation of log10(x) over the + range [1, 2] + +Test func : log10f(x) +Test Range: 1 < x < 10000 +Peak Error: ~0.000040% +RMS Error: ~0.000008% +*/ + +#include "math.h" +#include "math_neon.h" + +const float __log10f_rng = 0.3010299957f; + +const float __log10f_lut[8] = { + -0.99697286229624, //p0 + -1.07301643912502, //p4 + -2.46980061535534, //p2 + -0.07176870463131, //p6 + 2.247870219989470, //p1 + 0.366547581117400, //p5 + 1.991005185100089, //p3 + 0.006135635201050, //p7 +}; + +float log10f_c(float x) +{ + float a, b, c, d, xx; + int m; + + union { + float f; + int i; + } r; + + //extract exponent + r.f = x; + m = (r.i >> 23); + m = m - 127; + r.i = r.i - (m << 23); + + //Minimax polynomial p0..p7 (Estrin scheme) + xx = r.f * r.f; + a = (__log10f_lut[4] * r.f) + (__log10f_lut[0]); + b = (__log10f_lut[6] * r.f) + (__log10f_lut[2]); + c = (__log10f_lut[5] * r.f) + (__log10f_lut[1]); + d = (__log10f_lut[7] * r.f) + (__log10f_lut[3]); + a = a + b * xx; + c = c + d * xx; + xx = xx * xx; + r.f = a + c * xx; + + //add exponent + r.f = r.f + ((float) m) * __log10f_rng; + + return r.f; +} + +float log10f_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ( + + "vdup.f32 d0, d0[0] \n\t" //d0 = {x,x}; + + //extract exponent + "vmov.i32 d2, #127 \n\t" //d2 = 127; + "vshr.u32
d6, d0, #23 \n\t" //d6 = d0 >> 23; + "vsub.i32 d6, d6, d2 \n\t" //d6 = d6 - d2; + "vshl.u32 d1, d6, #23 \n\t" //d1 = d6 << 23; + "vsub.i32 d0, d0, d1 \n\t" //d0 = d0 + d1; + + //polynomial: + "vmul.f32 d1, d0, d0 \n\t" //d1 = d0*d0 = {x^2, x^2} + "vld1.32 {d2, d3, d4, d5}, [%1] \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ; + "vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0] + "vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0] + "vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4} + "vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1] + + //add exponent + "vdup.32 d7, %0 \n\t" //d7 = {rng, rng} + "vcvt.f32.s32 d6, d6 \n\t" //d6 = (float) d6 + "vmla.f32 d2, d6, d7 \n\t" //d2 = d2 + d6 * d7 + + "vmov.f32 s0, s4 \n\t" //s0 = s4 + + :: "r"(__log10f_rng), "r"(__log10f_lut) + : "d0", "d1", "q1", "q2", "d6", "d7" + ); +#endif +} + + +float log10f_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vmov.f32 s0, r0 \n\t"); + log10f_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return log10f_c(x); +#endif +}; diff --git a/source/math_logf.c b/source/math_logf.c new file mode 100644 index 0000000000..61761363e5 --- /dev/null +++ b/source/math_logf.c @@ -0,0 +1,135 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* +Based on: + + log(x) = log((1+m) * (2^n)) + log(x) = n * log(2) + log(1 + m) + log(1+m) = Poly(1+m) + + where Poly(x) is the Minimax approximation of log(x) over the + range [1, 2] + +Test func : logf(x) +Test Range: 1 < x < 10000 +Peak Error: ~0.000601% +RMS Error: ~0.000005% +*/ + +#include "math.h" +#include "math_neon.h" + +const float __logf_rng = 0.693147180f; + +const float __logf_lut[8] = { + -2.295614848256274, //p0 + -2.470711633419806, //p4 + -5.686926051100417, //p2 + -0.165253547131978, //p6 + +5.175912446351073, //p1 + +0.844006986174912, //p5 + +4.584458825456749, //p3 + +0.014127821926000 //p7 +}; + +float logf_c(float x) +{ + float a, b, c, d, xx; + int m; + + union { + float f; + int i; + } r; + + //extract exponent + r.f = x; + m = (r.i >> 23); + m = m - 127; + r.i = r.i - (m << 23); + + //Minimax polynomial p0..p7 (Estrin scheme) + xx = r.f * r.f; + a = (__logf_lut[4] * r.f) + (__logf_lut[0]); + b = (__logf_lut[6] * r.f) + (__logf_lut[2]); + c = (__logf_lut[5] * r.f) + (__logf_lut[1]); + d = (__logf_lut[7] * r.f) + (__logf_lut[3]); + a = a + b * xx; + c = c + d * xx; + xx = xx * xx; + r.f = a + c * xx; + + //add exponent + r.f = r.f + ((float) m) * __logf_rng; + + return r.f; +} + +float logf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ( + + "vdup.f32 d0, d0[0] \n\t" //d0 = {x,x}; + + //extract exponent + "vmov.i32 d2, #127 \n\t" //d2 = 127; + "vshr.u32 d6, d0, #23 \n\t" //d6 = d0 >> 23; + "vsub.i32 d6, d6, d2 \n\t" //d6 = d6 - d2; + "vshl.u32 d1, d6, #23 \n\t" //d1 = d6 <<
23; + "vsub.i32 d0, d0, d1 \n\t" //d0 = d0 - d1; + + //polynomial: + "vmul.f32 d1, d0, d0 \n\t" //d1 = d0*d0 = {x^2, x^2} + "vld1.32 {d2, d3, d4, d5}, [%1] \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ; + "vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0] + "vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0] + "vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4} + "vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1] + + //add exponent + "vdup.32 d7, %0 \n\t" //d7 = {rng, rng} + "vcvt.f32.s32 d6, d6 \n\t" //d6 = (float) d6 + "vmla.f32 d2, d6, d7 \n\t" //d2 = d2 + d6 * d7 + + "vmov.f32 s0, s4 \n\t" //s0 = s4 + + :: "r"(__logf_rng), "r"(__logf_lut) + : "d0", "d1", "q1", "q2", "d6", "d7" + ); +#endif +} + +float logf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vmov.f32 s0, r0 \n\t"); + logf_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return logf_c(x); +#endif +}; + diff --git a/source/math_mat2.c b/source/math_mat2.c new file mode 100644 index 0000000000..0baad4b771 --- /dev/null +++ b/source/math_mat2.c @@ -0,0 +1,95 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* +Matrices are specified in column major format: + +| a c | +| b d | + +therefore m[2] = c +*/ + +#include "math_neon.h" + +//matrix matrix multiplication. d = m0 * m1; +void +matmul2_c(float m0[4], float m1[4], float d[4]) +{ + d[0] = m0[0]*m1[0] + m0[2]*m1[1]; + d[1] = m0[1]*m1[0] + m0[3]*m1[1]; + d[2] = m0[0]*m1[2] + m0[2]*m1[3]; + d[3] = m0[1]*m1[2] + m0[3]*m1[3]; +} + +void +matmul2_neon(float m0[4], float m1[4], float d[4]) +{ +#ifdef __MATH_NEON + asm volatile ( + "vld1.32 {d0, d1}, [%0] \n\t" //Q0 = {D0, D1} = m0[0..3] + "vld1.32 {d2, d3}, [%1] \n\t" //Q1 = {D2, D3} = m1[0..3] + + "vmul.f32 d4, d0, d2[0] \n\t" //D4 = D0*D2[0] + "vmul.f32 d5, d0, d3[0] \n\t" //D5 = D0*D3[0] + "vmla.f32 d4, d1, d2[1] \n\t" //D4 += D1*D2[1] + "vmla.f32 d5, d1, d3[1] \n\t" //D5 += D1*D3[1] + + "vst1.32 {d4, d5}, [%2] \n\t" //d[0..3] = Q2 = {D4, D5} + :: "r"(m0), "r"(m1), "r"(d) + : "q0", "q1", "q2", "memory" + ); +#else + matmul2_c(m0, m1, d); +#endif +} + + +//matrix vector multiplication.
d = m * v +void +matvec2_c(float m[4], float v[2], float d[2]) +{ + d[0] = m[0]*v[0] + m[2]*v[1]; + d[1] = m[1]*v[0] + m[3]*v[1]; +} + +void +matvec2_neon(float m[4], float v[2], float d[2]) +{ +#ifdef __MATH_NEON + asm volatile ( + "vld1.32 d0, [%1] \n\t" //d0 = {v[0], v[1]} + "vld1.32 {d1, d2}, [%0] \n\t" //d1 = {m[0], m[1]}, d2 = {m[2], m[3]} + + "vmul.f32 d3, d1, d0[0] \n\t" //d3 = d1*d0[0] + "vmla.f32 d3, d2, d0[1] \n\t" //d3 += d2*d0[1] + + "vst1.32 d3, [%2] \n\t" //d[0..1] = d3 + :: "r"(m), "r"(v), "r"(d) + : "d0", "d1", "d2","d3", "memory" + ); +#else + matvec2_c(m, v, d); +#endif +} diff --git a/source/math_mat3.c b/source/math_mat3.c new file mode 100644 index 0000000000..aae178e179 --- /dev/null +++ b/source/math_mat3.c @@ -0,0 +1,131 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE.
+*/ + +/* +Matrices are specified in column major format (2x2 illustration): + +| x0 x2 | +| x1 x3 | + +therefore m[2] = x2 + +*/ + +#include "math_neon.h" + +//matrix matrix multiplication. d = m0 * m1; +void +matmul3_c(float m0[9], float m1[9], float d[9]) +{ + d[0] = m0[0]*m1[0] + m0[3]*m1[1] + m0[6]*m1[2]; + d[1] = m0[1]*m1[0] + m0[4]*m1[1] + m0[7]*m1[2]; + d[2] = m0[2]*m1[0] + m0[5]*m1[1] + m0[8]*m1[2]; + d[3] = m0[0]*m1[3] + m0[3]*m1[4] + m0[6]*m1[5]; + d[4] = m0[1]*m1[3] + m0[4]*m1[4] + m0[7]*m1[5]; + d[5] = m0[2]*m1[3] + m0[5]*m1[4] + m0[8]*m1[5]; + d[6] = m0[0]*m1[6] + m0[3]*m1[7] + m0[6]*m1[8]; + d[7] = m0[1]*m1[6] + m0[4]*m1[7] + m0[7]*m1[8]; + d[8] = m0[2]*m1[6] + m0[5]*m1[7] + m0[8]*m1[8]; +} + +void +matmul3_neon(float m0[9], float m1[9], float d[9]) +{ +#ifdef __MATH_NEON + asm volatile ( + "vld1.32 {d0, d1}, [%1]! \n\t" //q0 = m1[0..3] + "vld1.32 {d2, d3}, [%1]! \n\t" //q1 = m1[4..7] + "flds s8, [%1] \n\t" //s8 (d4[0]) = m1[8] + + "vld1.32 {d6, d7}, [%0] \n\t" //q3 = m0[0..3] + "add %0, %0, #12 \n\t" //m0 += 3 floats + "vld1.32 {d8, d9}, [%0] \n\t" //q4 = m0[3..6] + "add %0, %0, #12 \n\t" //m0 += 3 floats + "vld1.32 {d10}, [%0] \n\t" //d10 = m0[6..7] + "add %0, %0, #8 \n\t" //m0 += 2 floats + "flds s22, [%0] \n\t" //s22 (d11[0]) = m0[8] + + "vmul.f32 q6, q3, d0[0] \n\t" //q6 = q3 * m1[0] + "vmul.f32 q7, q3, d1[1] \n\t" //q7 = q3 * m1[3] + "vmul.f32 q8, q3, d3[0] \n\t" //q8 = q3 * m1[6] + "vmla.f32 q6, q4, d0[1] \n\t" //q6 += q4 * m1[1] + "vmla.f32 q7, q4, d2[0] \n\t" //q7 += q4 * m1[4] + "vmla.f32 q8, q4, d3[1] \n\t" //q8 += q4 * m1[7] + "vmla.f32 q6, q5, d1[0] \n\t" //q6 += q5 * m1[2] + "vmla.f32 q7, q5, d2[1] \n\t" //q7 += q5 * m1[5] + "vmla.f32 q8, q5, d4[0] \n\t" //q8 += q5 * m1[8] + + "vmov.f32 q0, q8 \n\t" //q0 = q8 + "vst1.32 {d12, d13}, [%2] \n\t" //d[0..3] = q6 + "add %2, %2, #12 \n\t" //d += 3 floats + "vst1.32 {d14, d15}, [%2] \n\t" //d[3..6] = q7 + "add %2, %2, #12 \n\t" //d += 3 floats + "vst1.32 {d0}, [%2] \n\t" //d[6..7] = d0 + "add %2, %2, #8 \n\t" //d += 2 floats + "fsts s2, [%2] \n\t" //d[8] = s2. NOTE(review): this asm also writes q0-q3 and q8, which the clobber list that follows (d8-d15 only) does not name - confirm + + :
"+r"(m0), "+r"(m1), "+r"(d): + : "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15", "memory" + ); +#else + matmul3_c(m0, m1, d); +#endif +}; + +//matrix vector multiplication. d = m * v +void +matvec3_c(float m[9], float v[3], float d[3]) +{ + d[0] = m[0]*v[0] + m[3]*v[1] + m[6]*v[2]; + d[1] = m[1]*v[0] + m[4]*v[1] + m[7]*v[2]; + d[2] = m[2]*v[0] + m[5]*v[1] + m[8]*v[2]; +} + +void +matvec3_neon(float m[9], float v[3], float d[3]) +{ +#ifdef __MATH_NEON + int tmp; + asm volatile ( + "mov %3, #12 \n\t" //r3 = 12 + "vld1.32 {d0, d1}, [%1] \n\t" //Q0 = v + "vld1.32 {d2, d3}, [%0], %3 \n\t" //Q1 = m + "vld1.32 {d4, d5}, [%0], %3 \n\t" //Q2 = m+12 + "vld1.32 {d6, d7}, [%0], %3 \n\t" //Q3 = m+24 + + "vmul.f32 q9, q1, d0[0] \n\t" //Q9 = Q1*Q0[0] + "vmla.f32 q9, q2, d0[1] \n\t" //Q9 += Q2*Q0[1] + "vmla.f32 q9, q3, d1[0] \n\t" //Q9 += Q3*Q0[2] + "vmov.f32 q0, q9 \n\t" //Q0 = q9 + + "vst1.32 d0, [%2]! \n\t" //r2 = D24 + "fsts s2, [%2] \n\t" //r2 = D25[0] + + : "+r"(m), "+r"(v), "+r"(d), "+r"(tmp): + : "q0", "q9", "q10","q11", "q12", "q13", "memory" + ); +#else + matvec3_c(m, v, d); +#endif +} diff --git a/source/math_mat4.c b/source/math_mat4.c new file mode 100644 index 0000000000..5bcf34b596 --- /dev/null +++ b/source/math_mat4.c @@ -0,0 +1,144 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* +Matrices are specified in row major format: + +| x0 x2 | +| x1 x3 | + +therefore m[2] = x2 + +*/ + +#include "math_neon.h" + +//matrix matrix multipication. d = m0 * m1; +void +matmul4_c(float m0[16], float m1[16], float d[16]) +{ + d[0] = m0[0]*m1[0] + m0[4]*m1[1] + m0[8]*m1[2] + m0[12]*m1[3]; + d[1] = m0[1]*m1[0] + m0[5]*m1[1] + m0[9]*m1[2] + m0[13]*m1[3]; + d[2] = m0[2]*m1[0] + m0[6]*m1[1] + m0[10]*m1[2] + m0[14]*m1[3]; + d[3] = m0[3]*m1[0] + m0[7]*m1[1] + m0[11]*m1[2] + m0[15]*m1[3]; + d[4] = m0[0]*m1[4] + m0[4]*m1[5] + m0[8]*m1[6] + m0[12]*m1[7]; + d[5] = m0[1]*m1[4] + m0[5]*m1[5] + m0[9]*m1[6] + m0[13]*m1[7]; + d[6] = m0[2]*m1[4] + m0[6]*m1[5] + m0[10]*m1[6] + m0[14]*m1[7]; + d[7] = m0[3]*m1[4] + m0[7]*m1[5] + m0[11]*m1[6] + m0[15]*m1[7]; + d[8] = m0[0]*m1[8] + m0[4]*m1[9] + m0[8]*m1[10] + m0[12]*m1[11]; + d[9] = m0[1]*m1[8] + m0[5]*m1[9] + m0[9]*m1[10] + m0[13]*m1[11]; + d[10] = m0[2]*m1[8] + m0[6]*m1[9] + m0[10]*m1[10] + m0[14]*m1[11]; + d[11] = m0[3]*m1[8] + m0[7]*m1[9] + m0[11]*m1[10] + m0[15]*m1[11]; + d[12] = m0[0]*m1[12] + m0[4]*m1[13] + m0[8]*m1[14] + m0[12]*m1[15]; + d[13] = m0[1]*m1[12] + m0[5]*m1[13] + m0[9]*m1[14] + m0[13]*m1[15]; + d[14] = m0[2]*m1[12] + m0[6]*m1[13] + m0[10]*m1[14] + m0[14]*m1[15]; + d[15] = m0[3]*m1[12] + m0[7]*m1[13] + m0[11]*m1[14] + m0[15]*m1[15]; +} + +void +matmul4_neon(float m0[16], float m1[16], float d[16]) +{ +#ifdef __MATH_NEON + asm volatile ( + "vld1.32 {d0, d1}, [%1]! \n\t" //q0 = m1 + "vld1.32 {d2, d3}, [%1]! 
\n\t" //q1 = m1+4 + "vld1.32 {d4, d5}, [%1]! \n\t" //q2 = m1+8 + "vld1.32 {d6, d7}, [%1] \n\t" //q3 = m1+12 + "vld1.32 {d16, d17}, [%0]! \n\t" //q8 = m0 + "vld1.32 {d18, d19}, [%0]! \n\t" //q9 = m0+4 + "vld1.32 {d20, d21}, [%0]! \n\t" //q10 = m0+8 + "vld1.32 {d22, d23}, [%0] \n\t" //q11 = m0+12 + + "vmul.f32 q12, q8, d0[0] \n\t" //q12 = q8 * d0[0] + "vmul.f32 q13, q8, d2[0] \n\t" //q13 = q8 * d2[0] + "vmul.f32 q14, q8, d4[0] \n\t" //q14 = q8 * d4[0] + "vmul.f32 q15, q8, d6[0] \n\t" //q15 = q8 * d6[0] + "vmla.f32 q12, q9, d0[1] \n\t" //q12 = q9 * d0[1] + "vmla.f32 q13, q9, d2[1] \n\t" //q13 = q9 * d2[1] + "vmla.f32 q14, q9, d4[1] \n\t" //q14 = q9 * d4[1] + "vmla.f32 q15, q9, d6[1] \n\t" //q15 = q9 * d6[1] + "vmla.f32 q12, q10, d1[0] \n\t" //q12 = q10 * d0[0] + "vmla.f32 q13, q10, d3[0] \n\t" //q13 = q10 * d2[0] + "vmla.f32 q14, q10, d5[0] \n\t" //q14 = q10 * d4[0] + "vmla.f32 q15, q10, d7[0] \n\t" //q15 = q10 * d6[0] + "vmla.f32 q12, q11, d1[1] \n\t" //q12 = q11 * d0[1] + "vmla.f32 q13, q11, d3[1] \n\t" //q13 = q11 * d2[1] + "vmla.f32 q14, q11, d5[1] \n\t" //q14 = q11 * d4[1] + "vmla.f32 q15, q11, d7[1] \n\t" //q15 = q11 * d6[1] + + "vst1.32 {d24, d25}, [%2]! \n\t" //d = q12 + "vst1.32 {d26, d27}, [%2]! \n\t" //d+4 = q13 + "vst1.32 {d28, d29}, [%2]! \n\t" //d+8 = q14 + "vst1.32 {d30, d31}, [%2] \n\t" //d+12 = q15 + + : "+r"(m0), "+r"(m1), "+r"(d) : + : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", + "memory" + ); +#else + matmul4_c(m0, m1, d); +#endif +} + + +//matrix vector multiplication. 
d = m * v +void +matvec4_c(float m[16], float v[4], float d[4]) +{ + d[0] = m[0]*v[0] + m[4]*v[1] + m[8]*v[2] + m[12]*v[3]; + d[1] = m[1]*v[0] + m[5]*v[1] + m[9]*v[2] + m[13]*v[3]; + d[2] = m[2]*v[0] + m[6]*v[1] + m[10]*v[2] + m[14]*v[3]; + d[3] = m[3]*v[0] + m[7]*v[1] + m[11]*v[2] + m[15]*v[3]; +} + +void +matvec4_neon(float m[16], float v[4], float d[4]) +{ +#ifdef __MATH_NEON + asm volatile ( + "vld1.32 {d0, d1}, [%1] \n\t" //Q0 = v + "vld1.32 {d18, d19}, [%0]! \n\t" //Q1 = m + "vld1.32 {d20, d21}, [%0]! \n\t" //Q2 = m+4 + "vld1.32 {d22, d23}, [%0]! \n\t" //Q3 = m+8 + "vld1.32 {d24, d25}, [%0]! \n\t" //Q4 = m+12 + + "vmul.f32 q13, q9, d0[0] \n\t" //Q5 = Q1*Q0[0] + "vmla.f32 q13, q10, d0[1] \n\t" //Q5 += Q1*Q0[1] + "vmla.f32 q13, q11, d1[0] \n\t" //Q5 += Q2*Q0[2] + "vmla.f32 q13, q12, d1[1] \n\t" //Q5 += Q3*Q0[3] + + "vst1.32 {d26, d27}, [%2] \n\t" //Q4 = m+12 + : + : "r"(m), "r"(v), "r"(d) + : "q0", "q9", "q10","q11", "q12", "q13", "memory" + ); +#else + matvec4_c(m, v, d); +#endif +} + + + + + diff --git a/source/math_modf.c b/source/math_modf.c new file mode 100644 index 0000000000..f3259710af --- /dev/null +++ b/source/math_modf.c @@ -0,0 +1,71 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* +Assumes the floating point value |x| < 2,147,483,648 +*/ + +#include "math_neon.h" + +float modf_c(float x, int *i) +{ + int n; + n = (int)x; + *i = n; + x = x - (float)n; + return x; +} + + +float modf_neon_hfp(float x, int *i) +{ +#ifdef __MATH_NEON + asm volatile ( + "vcvt.s32.f32 d1, d0 \n\t" //d1 = (int) d0; + "vcvt.f32.s32 d2, d1 \n\t" //d2 = (float) d1; + "vsub.f32 d0, d0, d2 \n\t" //d0 = d0 - d2; + "vstr.i32 s2, [r0] \n\t" //[r0] = d1[0] + ::: "d0", "d1", "d2" + ); +#endif +} + + +float modf_neon_sfp(float x, int *i) +{ +#ifdef __MATH_NEON + asm volatile ( + "vdup.f32 d0, r0 \n\t" //d0 = {x, x} + "vcvt.s32.f32 d1, d0 \n\t" //d1 = (int) d0; + "vcvt.f32.s32 d2, d1 \n\t" //d2 = (float) d1; + "vsub.f32 d0, d0, d2 \n\t" //d0 = d0 - d2; + "vstr.i32 s2, [r1] \n\t" //[r0] = d1[0] + "vmov.f32 r0, s0 \n\t" //r0 = d0[0]; + ::: "d0", "d1", "d2" + ); + +#else + return modf_c(x, i); +#endif +} diff --git a/source/math_neon.h b/source/math_neon.h new file mode 100644 index 0000000000..2db33acd87 --- /dev/null +++ b/source/math_neon.h @@ -0,0 +1,439 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to 
whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#ifndef __MATH_NEON_H__ +#define __MATH_NEON_H__ + +#if !defined(__i386__) && defined(__arm__) +//if defined neon ASM routines are used, otherwise all calls to *_neon +//functions are rerouted to their equivalent *_c function. +#define __MATH_NEON + +//Default Floating Point value ABI: 0=softfp, 1=hardfp. Only effects *_neon routines. +//You can access the hardfp versions directly via the *_hard suffix. +//You can access the softfp versions directly via the *_soft suffix. 
+#define __MATH_FPABI 1 + +#endif + +#ifdef GCC +#define ALIGN(A) __attribute__ ((aligned (A)) +#else +#define ALIGN(A) +#endif + +#ifndef _MATH_H +#define M_PI 3.14159265358979323846 /* pi */ +#define M_PI_2 1.57079632679489661923 /* pi/2 */ +#define M_PI_4 0.78539816339744830962 /* pi/4 */ +#define M_E 2.7182818284590452354 /* e */ +#define M_LOG2E 1.4426950408889634074 /* log_2 e */ +#define M_LOG10E 0.43429448190325182765 /* log_10 e */ +#define M_LN2 0.69314718055994530942 /* log_e 2 */ +#define M_LN10 2.30258509299404568402 /* log_e 10 */ +#define M_1_PI 0.31830988618379067154 /* 1/pi */ +#define M_2_PI 0.63661977236758134308 /* 2/pi */ +#define M_2_SQRTPI 1.12837916709551257390 /* 2/sqrt(pi) */ +#define M_SQRT2 1.41421356237309504880 /* sqrt(2) */ +#define M_SQRT1_2 0.70710678118654752440 /* 1/sqrt(2) */ +#endif + +#if __MATH_FPABI == 1 +#define sinf_neon sinf_neon_hfp +#define cosf_neon cosf_neon_hfp +#define sincosf_neon sincosf_neon_hfp +#define tanf_neon tanf_neon_hfp +#define atanf_neon atanf_neon_hfp +#define atan2f_neon atan2f_neon_hfp +#define asinf_neon asinf_neon_hfp +#define acosf_neon acosf_neon_hfp +#define sinhf_neon sinhf_neon_hfp +#define coshf_neon coshf_neon_hfp +#define tanhf_neon tanhf_neon_hfp +#define expf_neon expf_neon_hfp +#define logf_neon logf_neon_hfp +#define log10f_neon log10f_neon_hfp +#define powf_neon powf_neon_hfp +#define floorf_neon floorf_neon_hfp +#define ceilf_neon ceilf_neon_hfp +#define fabsf_neon fabsf_neon_hfp +#define ldexpf_neon ldexpf_neon_hfp +#define frexpf_neon frexpf_neon_hfp +#define fmodf_neon fmodf_neon_hfp +#define modf_neon modf_neon_hfp +#define sqrtf_neon sqrtf_neon_hfp +#define invsqrtf_neon invsqrtf_neon_hfp + +#define dot2_neon dot2_neon_hfp +#define dot3_neon dot3_neon_hfp +#define dot4_neon dot4_neon_hfp +#else +#define sinf_neon sinf_neon_sfp +#define cosf_neon cosf_neon_sfp +#define sincosf_neon sincosf_neon_sfp +#define tanf_neon tanf_neon_sfp +#define atanf_neon atanf_neon_sfp +#define 
atan2f_neon atan2f_neon_sfp +#define asinf_neon asinf_neon_sfp +#define acosf_neon acosf_neon_sfp +#define sinhf_neon sinhf_neon_sfp +#define coshf_neon coshf_neon_sfp +#define tanhf_neon tanhf_neon_sfp +#define expf_neon expf_neon_sfp +#define logf_neon logf_neon_sfp +#define log10f_neon log10f_neon_sfp +#define powf_neon powf_neon_sfp +#define floorf_neon floorf_neon_sfp +#define ceilf_neon ceilf_neon_sfp +#define fabsf_neon fabsf_neon_sfp +#define ldexpf_neon ldexpf_neon_sfp +#define frexpf_neon frexpf_neon_sfp +#define fmodf_neon fmodf_neon_sfp +#define modf_neon modf_neon_sfp +#define sqrtf_neon sqrtf_neon_sfp +#define invsqrtf_neon invsqrtf_neon_sfp + +#define dot2_neon dot2_neon_sfp +#define dot3_neon dot3_neon_sfp +#define dot4_neon dot4_neon_sfp +#endif + +/* +function: enable_runfast + this function enables the floating point runfast mode on the + ARM Cortex A8. +*/ +void enable_runfast(); + + +float dot2_c(float v0[2], float v1[2]); +float dot2_neon(float v0[2], float v1[2]); +float dot3_c(float v0[3], float v1[3]); +float dot3_neon(float v0[3], float v1[3]); +float dot4_c(float v0[4], float v1[4]); +float dot4_neon(float v0[4], float v1[4]); + +void cross3_c(float v0[3], float v1[3], float d[3]); +void cross3_neon(float v0[3], float v1[3], float d[3]); + +void normalize2_c(float v[2], float d[2]); +void normalize2_neon(float v[2], float d[2]); +void normalize3_c(float v[3], float d[3]); +void normalize3_neon(float v[3], float d[3]); +void normalize4_c(float v[4], float d[4]); +void normalize4_neon(float v[4], float d[4]); + +/* +function: matmul2 +arguments: m0 2x2 matrix, m1 2x2 matrix +return: d 2x2 matrix +expression: d = m0 * m1 +*/ +void matmul2_c(float m0[4], float m1[4], float d[4]); +void matmul2_neon(float m0[4], float m1[4], float d[4]); + +/* +function: matmul3 +arguments: m0 3x3 matrix, m1 3x3 matrix +return: d 3x3 matrix +expression: d = m0 * m1 +*/ +void matmul3_c(float m0[9], float m1[9], float d[9]); +void matmul3_neon(float m0[9], float 
m1[9], float d[9]); + +/* +function: matmul4 +arguments: m0 4x4 matrix, m1 4x4 matrix +return: d 4x4 matrix +expression: d = m0 * m1 +*/ +void matmul4_c(float m0[16], float m1[16], float d[16]); +void matmul4_neon(float m0[16], float m1[16], float d[16]); + +/* +function: matvec2 +arguments: m 2x2 matrix, v 2 element vector +return: d 2x2 matrix +expression: d = m * v +*/ +void matvec2_c(float m[4], float v[2], float d[2]); +void matvec2_neon(float m[4], float v[2], float d[2]); + +/* +function: matvec3 +arguments: m 3x3 matrix, v 3 element vector +return: d 3x3 matrix +expression: d = m * v +*/ +void matvec3_c(float m[9], float v[3], float d[3]); +void matvec3_neon(float m[9], float v[3], float d[3]); + +/* +function: matvec4 +arguments: m 4x4 matrix, v 4 element vector +return: d 4x4 matrix +expression: d = m * v +*/ +void matvec4_c(float m[16], float v[4], float d[4]); +void matvec4_neon(float m[16], float v[4], float d[4]); + +/* +function: sinf +arguments: x radians +return: the sine function evaluated at x radians. +expression: r = sin(x) +*/ +float sinf_c(float x); +float sinf_neon_hfp(float x); +float sinf_neon_sfp(float x); + +/* +function: cosf +arguments: x radians +return: the cosine function evaluated at x radians. +expression: r = cos(x) +notes: computed using cos(x) = sin(x + pi/2) +*/ +float cosf_c(float x); +float cosf_neon_hfp(float x); +float cosf_neon_sfp(float x); + +/* +function: sincosf +arguments: x radians, r[2] result array. +return: both the sine and the cosine evaluated at x radians. +expression: r = {sin(x), cos(x)} +notes: faster than evaluating seperately. +*/ +void sincosf_c(float x, float r[2]); +void sincosf_neon_hfp(float x, float r[2]); +void sincosf_neon_sfp(float x, float r[2]); + +/* +function: sinfv +return: the sine function evaluated at x[i] radians +expression: r[i] = sin(x[i]) +notes: faster than evaluating individually. + r and x can be the same memory location. 
+*/ +void sinfv_c(float *x, int n, float *r); +void sinfv_neon(float *x, int n, float *r); + +/* +function: tanf +return: the tangent evaluated at x radians. +expression: r = tan(x) +notes: computed using tan(x) = sin(x) / cos(x) +*/ +float tanf_c(float x); +float tanf_neon_hfp(float x); +float tanf_neon_sfp(float x); + +/* +function: atanf +return: the arctangent evaluated at x. +expression: r = atan(x) +*/ +float atanf_c(float x); +float atanf_neon_hfp(float x); +float atanf_neon_sfp(float x); + +/* +function: atanf +return: the arctangent evaluated at x. +expression: r = atan(x) +*/ +float atan2f_c(float y, float x); +float atan2f_neon_hfp(float y, float x); +float atan2f_neon_sfp(float y, float x); + +/* +function: asinf +return: the arcsine evaluated at x. +expression: r = asin(x) +*/ +float asinf_c(float x); +float asinf_neon_hfp(float x); +float asinf_neon_sfp(float x); + +/* +function: acosf +return: the arcsine evaluated at x. +expression: r = asin(x) +*/ +float acosf_c(float x); +float acosf_neon_hfp(float x); +float acosf_neon_sfp(float x); + +/* +function: sinhf +return: the arcsine evaluated at x. +expression: r = asin(x) +*/ +float sinhf_c(float x); +float sinhf_neon_hfp(float x); +float sinhf_neon_sfp(float x); + +/* +function: coshf +return: the arcsine evaluated at x. +expression: r = asin(x) +*/ +float coshf_c(float x); +float coshf_neon_hfp(float x); +float coshf_neon_sfp(float x); + +/* +function: tanhf +return: the arcsine evaluated at x. +expression: r = asin(x) +*/ +float tanhf_c(float x); +float tanhf_neon_hfp(float x); +float tanhf_neon_sfp(float x); + +/* +function: expf +return: the natural exponential evaluated at x. +expression: r = e ** x +*/ +float expf_c(float x); +float expf_neon_hfp(float x); +float expf_neon_sfp(float x); + +/* +function: logf +return: the value of the natural logarithm of x. 
+expression: r = ln(x) +notes: assumes x > 0 +*/ +float logf_c(float x); +float logf_neon_hfp(float x); +float logf_neon_sfp(float x); + +/* +function: log10f +return: the value of the power 10 logarithm of x. +expression: r = log10(x) +notes: assumes x > 0 +*/ +float log10f_c(float x); +float log10f_neon_hfp(float x); +float log10f_neon_sfp(float x); + +/* +function: powf +return: x raised to the power of n, x ** n. +expression: r = x ** y +notes: computed using e ** (y * ln(x)) +*/ +float powf_c(float x, float n); +float powf_neon_sfp(float x, float n); +float powf_neon_hfp(float x, float n); + +/* +function: floorf +return: x rounded down (towards negative infinity) to its nearest + integer value. +notes: assumes |x| < 2 ** 31 +*/ +float floorf_c(float x); +float floorf_neon_sfp(float x); +float floorf_neon_hfp(float x); + +/* +function: ceilf +return: x rounded up (towards positive infinity) to its nearest + integer value. +notes: assumes |x| < 2 ** 31 +*/ +float ceilf_c(float x); +float ceilf_neon_hfp(float x); +float ceilf_neon_sfp(float x); + +/* +function: fabsf +return: absolute vvalue of x +notes: assumes |x| < 2 ** 31 +*/ +float fabsf_c(float x); +float fabsf_neon_hfp(float x); +float fabsf_neon_sfp(float x); + +/* +function: ldexpf +return: the value of m multiplied by 2 to the power of e. 
+expression: r = m * (2 ** e) +*/ +float ldexpf_c(float m, int e); +float ldexpf_neon_hfp(float m, int e); +float ldexpf_neon_sfp(float m, int e); + +/* +function: frexpf +return: the exponent and mantissa of x +*/ +float frexpf_c(float x, int *e); +float frexpf_neon_hfp(float x, int *e); +float frexpf_neon_sfp(float x, int *e); + +/* +function: fmodf +return: the remainder of x divided by y, x % y +expression: r = x - floor(x / y) * y; +notes: assumes that |x / y| < 2 ** 31 +*/ +float fmodf_c(float x, float y); +float fmodf_neon_hfp(float x, float y); +float fmodf_neon_sfp(float x, float y); + +/* +function: modf +return: breaks x into the integer (i) and fractional part (return) +notes: assumes that |x| < 2 ** 31 +*/ +float modf_c(float x, int *i); +float modf_neon_hfp(float x, int *i); +float modf_neon_sfp(float x, int *i); + +/* +function: sqrtf +return: (x^0.5) +notes: +*/ +float sqrtf_c(float x); +float sqrtf_neon_hfp(float x); +float sqrtf_neon_sfp(float x); + + +/* +function: invsqrtf +return: 1.0f / (x^0.5) +notes: +*/ +float invsqrtf_c(float x); +float invsqrtf_neon_hfp(float x); +float invsqrtf_neon_sfp(float x); + +#endif diff --git a/source/math_powf.c b/source/math_powf.c new file mode 100644 index 0000000000..6faed4eeac --- /dev/null +++ b/source/math_powf.c @@ -0,0 +1,182 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* +Based on x ^ n = exp(n * log(x)) + +Test func : powf(x, n) +Test Range: (1,1) < (x, n) < (10, 10) +Peak Error: ~0.0010% +RMS Error: ~0.0002% +*/ + +#include "math.h" +#include "math_neon.h" + +const float __powf_rng[2] = { + 1.442695041f, + 0.693147180f +}; + +const float __powf_lut[16] = { + -2.295614848256274, //p0 log + -2.470711633419806, //p4 + -5.686926051100417, //p2 + -0.165253547131978, //p6 + +5.175912446351073, //p1 + +0.844006986174912, //p5 + +4.584458825456749, //p3 + +0.014127821926000, //p7 + 0.9999999916728642, //p0 exp + 0.04165989275009526, //p4 + 0.5000006143673624, //p2 + 0.0014122663401803872, //p6 + 1.000000059694879, //p1 + 0.008336936973260111, //p5 + 0.16666570253074878, //p3 + 0.00019578093328483123 //p7 +}; + +float powf_c(float x, float n) +{ + float a, b, c, d, xx; + int m; + + union { + float f; + int i; + } r; + + //extract exponent + r.f = x; + m = (r.i >> 23); + m = m - 127; + r.i = r.i - (m << 23); + + //Taylor Polynomial (Estrins) + xx = r.f * r.f; + a = (__powf_lut[4] * r.f) + (__powf_lut[0]); + b = (__powf_lut[6] * r.f) + (__powf_lut[2]); + c = (__powf_lut[5] * r.f) + (__powf_lut[1]); + d = (__powf_lut[7] * r.f) + (__powf_lut[3]); + a = a + b * xx; + c = c + d * xx; + xx = xx * xx; + r.f = a + c * xx; + + //add exponent + r.f = r.f + ((float) m) * __powf_rng[1]; + + r.f = r.f * n; + + + //Range Reduction: + m = (int) (r.f * __powf_rng[0]); + r.f = r.f - ((float) m) * __powf_rng[1]; + + //Taylor Polynomial (Estrins) + a = (__powf_lut[12] * r.f) + 
(__powf_lut[8]); + b = (__powf_lut[14] * r.f) + (__powf_lut[10]); + c = (__powf_lut[13] * r.f) + (__powf_lut[9]); + d = (__powf_lut[15] * r.f) + (__powf_lut[11]); + xx = r.f * r.f; + a = a + b * xx; + c = c + d * xx; + xx = xx* xx; + r.f = a + c * xx; + + //multiply by 2 ^ m + m = m << 23; + r.i = r.i + m; + + return r.f; +} + +float powf_neon_hfp(float x, float n) +{ +#ifdef __MATH_NEON + asm volatile ( + + "vdup.f32 d16, d0[1] \n\t" //d16 = {y,y}; + "vdup.f32 d0, d0[0] \n\t" //d0 = {x,x}; + + //extract exponent + "vmov.i32 d2, #127 \n\t" //d2 = 127; + "vshr.u32 d6, d0, #23 \n\t" //d6 = d0 >> 23; + "vsub.i32 d6, d6, d2 \n\t" //d6 = d6 - d2; + "vshl.u32 d1, d6, #23 \n\t" //d1 = d6 << 23; + "vsub.i32 d0, d0, d1 \n\t" //d0 = d0 + d1; + + //polynomial: + "vmul.f32 d1, d0, d0 \n\t" //d1 = d0*d0 = {x^2, x^2} + "vld1.32 {d2, d3, d4, d5}, [%1]! \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ; + "vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0] + "vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0] + "vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4} + "vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1] + + //add exponent + "vld1.32 d7, [%0] \n\t" //d7 = {invrange, range} + "vcvt.f32.s32 d6, d6 \n\t" //d6 = (float) d6 + "vmla.f32 d2, d6, d7[1] \n\t" //d2 = d2 + d6 * d7[1] + + "vdup.f32 d0, d2[0] \n\t" //d0 = d2[0] + "vmul.f32 d0, d0, d16 \n\t" //d0 = d0 * d16 + + //Range Reduction: + "vmul.f32 d6, d0, d7[0] \n\t" //d6 = d0 * d7[0] + "vcvt.u32.f32 d6, d6 \n\t" //d6 = (int) d6 + "vcvt.f32.u32 d1, d6 \n\t" //d1 = (float) d6 + "vmls.f32 d0, d1, d7[1] \n\t" //d0 = d0 - d1 * d7[1] + + //polynomial: + "vmul.f32 d1, d0, d0 \n\t" //d1 = d0*d0 = {x^2, x^2} + "vld1.32 {d2, d3, d4, d5}, [%1] \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ; + "vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0] + "vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0] + "vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4} + "vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1] + + 
//multiply by 2 ^ m + "vshl.i32 d6, d6, #23 \n\t" //d6 = d6 << 23 + "vadd.i32 d0, d2, d6 \n\t" //d0 = d2 + d6 + + + :: "r"(__powf_rng), "r"(__powf_lut) + : "d0", "d1", "d2","d3", "d4", "d5", "d6", "d7" + ); +#endif +} + +float powf_neon_sfp(float x, float n) +{ +#ifdef __MATH_NEON + asm volatile ("vmov.f32 s0, r0 \n\t"); + asm volatile ("vmov.f32 s1, r1 \n\t"); + powf_neon_hfp(x, n); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return powf_c(x, n); +#endif +}; diff --git a/source/math_runfast.c b/source/math_runfast.c new file mode 100644 index 0000000000..0d06c0bfc8 --- /dev/null +++ b/source/math_runfast.c @@ -0,0 +1,42 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + + +void +enable_runfast() +{ +#ifdef __arm__ + static const unsigned int x = 0x04086060; + static const unsigned int y = 0x03000000; + int r; + asm volatile ( + "fmrx %0, fpscr \n\t" //r0 = FPSCR + "and %0, %0, %1 \n\t" //r0 = r0 & 0x04086060 + "orr %0, %0, %2 \n\t" //r0 = r0 | 0x03000000 + "fmxr fpscr, %0 \n\t" //FPSCR = r0 + : "=r"(r) + : "r"(x), "r"(y) + ); +#endif +} diff --git a/source/math_sincosf.c b/source/math_sincosf.c new file mode 100644 index 0000000000..365826f8ff --- /dev/null +++ b/source/math_sincosf.c @@ -0,0 +1,163 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "math.h" +#include "math_neon.h" + +const float __sincosf_rng[2] = { + 2.0 / M_PI, + M_PI / 2.0 +}; + +const float __sincosf_lut[8] = { + -0.00018365f, //p7 + -0.00018365f, //p7 + +0.00830636f, //p5 + +0.00830636f, //p5 + -0.16664831f, //p3 + -0.16664831f, //p3 + +0.99999661f, //p1 + +0.99999661f, //p1 +}; + +void sincosf_c( float x, float r[2]) +{ + union { + float f; + int i; + } ax, bx; + + float y; + float a, b, c, d, xx, yy; + int m, n, o, p; + + y = x + __sincosf_rng[1]; + ax.f = fabsf(x); + bx.f = fabsf(y); + + //Range Reduction: + m = (int) (ax.f * __sincosf_rng[0]); + o = (int) (bx.f * __sincosf_rng[0]); + ax.f = ax.f - (((float)m) * __sincosf_rng[1]); + bx.f = bx.f - (((float)o) * __sincosf_rng[1]); + + //Test Quadrant + n = m & 1; + p = o & 1; + ax.f = ax.f - n * __sincosf_rng[1]; + bx.f = bx.f - p * __sincosf_rng[1]; + m = m >> 1; + o = o >> 1; + n = n ^ m; + p = p ^ o; + m = (x < 0.0); + o = (y < 0.0); + n = n ^ m; + p = p ^ o; + n = n << 31; + p = p << 31; + ax.i = ax.i ^ n; + bx.i = bx.i ^ p; + + //Taylor Polynomial + xx = ax.f * ax.f; + yy = bx.f * bx.f; + r[0] = __sincosf_lut[0]; + r[1] = __sincosf_lut[1]; + r[0] = r[0] * xx + __sincosf_lut[2]; + r[1] = r[1] * yy + __sincosf_lut[3]; + r[0] = r[0] * xx + __sincosf_lut[4]; + r[1] = r[1] * yy + __sincosf_lut[5]; + r[0] = r[0] * xx + __sincosf_lut[6]; + r[1] = r[1] * yy + __sincosf_lut[7]; + r[0] = r[0] * ax.f; + r[1] = r[1] * bx.f; + +} + +void sincosf_neon_hfp(float x, float r[2]) +{ +//HACK: Assumes for softfp that r1 = x, and for hardfp that s0 = x. 
+#ifdef __MATH_NEON + asm volatile ( + //{x, y} = {x, x + pi/2} + "vdup.f32 d1, d0[0] \n\t" //d1 = {x, x} + "vld1.32 d3, [%1] \n\t" //d3 = {invrange, range} + "vadd.f32 d0, d1, d3 \n\t" //d0 = d1 + d3 + "vmov.f32 s0, s2 \n\t" //d0[0] = d1[0] + "vabs.f32 d1, d0 \n\t" //d1 = {abs(x), abs(y)} + + //Range Reduction: + "vmul.f32 d2, d1, d3[0] \n\t" //d2 = d1 * d3[0] + "vcvt.u32.f32 d2, d2 \n\t" //d2 = (int) d2 + "vcvt.f32.u32 d4, d2 \n\t" //d4 = (float) d2 + "vmls.f32 d1, d4, d3[1] \n\t" //d1 = d1 - d4 * d3[1] + + //Checking Quadrant: + //ax = ax - (k&1) * M_PI_2 + "vmov.i32 d4, #1 \n\t" //d4 = 1 + "vand.i32 d4, d4, d2 \n\t" //d4 = d4 & d2 + "vcvt.f32.u32 d5, d4 \n\t" //d5 = (float) d4 + "vmls.f32 d1, d5, d3[1] \n\t" //d1 = d1 - d5 * d3[1] + + //ax = ax ^ ((k & 1) ^ (k >> 1) ^ (x < 0) << 31) + "vshr.u32 d3, d2, #1 \n\t" //d3 = d2 >> 1 + "veor.i32 d4, d4, d3 \n\t" //d4 = d4 ^ d3 + "vclt.f32 d3, d0, #0 \n\t" //d3 = (d0 < 0.0) + "veor.i32 d4, d4, d3 \n\t" //d4 = d4 ^ d3 + "vshl.i32 d4, d4, #31 \n\t" //d4 = d4 << 31 + "veor.i32 d0, d1, d4 \n\t" //d0 = d1 ^ d4 + + //polynomial: + "vldm %2!, {d2, d3} \n\t" //d2 = {p7, p7}, d3 = {p5, p5}, r3 += 4; + "vmul.f32 d1, d0, d0 \n\t" //d1 = d0 * d0 = {x^2, y^2} + "vldm %2!, {d4} \n\t" //d4 = {p3, p3}, r3 += 2; + "vmla.f32 d3, d2, d1 \n\t" //d3 = d3 + d2 * d1; + "vldm %2!, {d5} \n\t" //d5 = {p1, p1}, r3 += 2; + "vmla.f32 d4, d3, d1 \n\t" //d4 = d4 + d3 * d1; + "vmla.f32 d5, d4, d1 \n\t" //d5 = d5 + d4 * d1; + "vmul.f32 d5, d5, d0 \n\t" //d5 = d5 * d0; + + "vstm.f32 %0, {d5} \n\t" //r[0] = d5[0], r[1]=d5[1]; + + : "+r"(r) + : "r"(__sincosf_rng), "r"(__sincosf_lut) + : "d0", "d1", "d2", "d3", "d4", "d5" + ); +#else + sincosf_c(x, r); +#endif +} + +void sincosf_neon_sfp(float x, float r[2]) +{ +#ifdef __MATH_NEON + asm volatile ("vdup.f32 d0, r0 \n\t"); + sincosf_neon_hfp(x, r); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + sincosf_c(x, r); +#endif +}; + diff --git a/source/math_sinf.c b/source/math_sinf.c new file mode 100644 index 
0000000000..257f219672 --- /dev/null +++ b/source/math_sinf.c @@ -0,0 +1,128 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include +#include "math_neon.h" + +static const float __sinf_rng[2] = { + 2.0 / M_PI, + M_PI / 2.0 +} ALIGN(16); + +static const float __sinf_lut[4] = { + -0.00018365f, //p7 + -0.16664831f, //p3 + +0.00830636f, //p5 + +0.99999661f, //p1 +} ALIGN(16); + +float sinf_c(float x) +{ + union { + float f; + int i; + } ax; + + float r, a, b, xx; + int m, n; + + ax.f = fabsf(x); + + //Range Reduction: + m = (int) (ax.f * __sinf_rng[0]); + ax.f = ax.f - (((float)m) * __sinf_rng[1]); + + //Test Quadrant + n = m & 1; + ax.f = ax.f - n * __sinf_rng[1]; + m = m >> 1; + n = n ^ m; + m = (x < 0.0); + n = n ^ m; + n = n << 31; + ax.i = ax.i ^ n; + + //Taylor Polynomial (Estrins) + xx = ax.f * ax.f; + a = (__sinf_lut[0] * ax.f) * xx + (__sinf_lut[2] * ax.f); + b = (__sinf_lut[1] * ax.f) * xx + (__sinf_lut[3] * ax.f); + xx = xx * xx; + r = b + a * xx; + + return r; +} + +float sinf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ( + + "vld1.32 d3, [%0] \n\t" //d3 = {invrange, range} + "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x} + "vabs.f32 d1, d0 \n\t" //d1 = {ax, ax} + + "vmul.f32 d2, d1, d3[0] \n\t" //d2 = d1 * d3[0] + "vcvt.u32.f32 d2, d2 \n\t" //d2 = (int) d2 + "vmov.i32 d5, #1 \n\t" //d5 = 1 + "vcvt.f32.u32 d4, d2 \n\t" //d4 = (float) d2 + "vshr.u32 d7, d2, #1 \n\t" //d7 = d2 >> 1 + "vmls.f32 d1, d4, d3[1] \n\t" //d1 = d1 - d4 * d3[1] + + "vand.i32 d5, d2, d5 \n\t" //d5 = d2 & d5 + "vclt.f32 d18, d0, #0 \n\t" //d18 = (d0 < 0.0) + "vcvt.f32.u32 d6, d5 \n\t" //d6 = (float) d5 + "vmls.f32 d1, d6, d3[1] \n\t" //d1 = d1 - d6 * d3[1] + "veor.i32 d5, d5, d7 \n\t" //d5 = d5 ^ d7 + "vmul.f32 d2, d1, d1 \n\t" //d2 = d1*d1 = {x^2, x^2} + + "vld1.32 {d16, d17}, [%1] \n\t" //q8 = {p7, p3, p5, p1} + "veor.i32 d5, d5, d18 \n\t" //d5 = d5 ^ d18 + "vshl.i32 d5, d5, #31 \n\t" //d5 = d5 << 31 + "veor.i32 d1, d1, d5 \n\t" //d1 = d1 ^ d5 + + "vmul.f32 d3, d2, d2 \n\t" //d3 = d2*d2 = {x^4, x^4} + "vmul.f32 q0, q8, d1[0] \n\t" //q0 = q8 * d1[0] = {p7x, p3x, p5x, p1x} + "vmla.f32 d1, d0, 
d2[0] \n\t" //d1 = d1 + d0*d2 = {p5x + p7x^3, p1x + p3x^3} + "vmla.f32 d1, d3, d1[0] \n\t" //d1 = d1 + d3*d0 = {...., p1x + p3x^3 + p5x^5 + p7x^7} + + "vmov.f32 s0, s3 \n\t" //s0 = s3 + : + : "r"(__sinf_rng), "r"(__sinf_lut) + : "q0", "q1", "q2", "q3", "q8", "q9" + ); +#endif +} + +float sinf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vdup.f32 d0, r0 \n\t"); + sinf_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return sinf_c(x); +#endif + +}; + diff --git a/source/math_sinfv.c b/source/math_sinfv.c new file mode 100644 index 0000000000..0dfc878170 --- /dev/null +++ b/source/math_sinfv.c @@ -0,0 +1,110 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "math.h" +#include "math_neon.h" + +const float __sinfv_rng[2] = { + 2.0 / M_PI, + M_PI / 2.0, +}; + +const float __sinfv_lut[4] = { + -0.00018365f, //p7 + -0.16664831f, //p3 + +0.00830636f, //p5 + +0.99999661f, //p1 +}; + +void sinfv_c(float *x, int n, float *r) +{ + union { + float f; + int i; + } ax, bx; + + float aa, ab, ba, bb, axx, bxx; + int am, bm, an, bn; + + if (n & 0x1) { + *r++ = sinf_c(*x++); + n--; + } + + float rng0 = __sinfv_rng[0]; + float rng1 = __sinfv_rng[1]; + + while(n > 0){ + + float x0 = *x++; + float x1 = *x++; + + ax.f = fabsf(x0); + bx.f = fabsf(x1); + + //Range Reduction: + am = (int) (ax.f * rng0); + bm = (int) (bx.f * rng0); + + ax.f = ax.f - (((float)am) * rng1); + bx.f = bx.f - (((float)bm) * rng1); + + //Test Quadrant + an = am & 1; + bn = bm & 1; + ax.f = ax.f - an * rng1; + bx.f = bx.f - bn * rng1; + am = (am & 2) >> 1; + bm = (bm & 2) >> 1; + ax.i = ax.i ^ ((an ^ am ^ (x0 < 0)) << 31); + bx.i = bx.i ^ ((bn ^ bm ^ (x1 < 0)) << 31); + + //Taylor Polynomial (Estrins) + axx = ax.f * ax.f; + bxx = bx.f * bx.f; + aa = (__sinfv_lut[0] * ax.f) * axx + (__sinfv_lut[2] * ax.f); + ba = (__sinfv_lut[0] * bx.f) * bxx + (__sinfv_lut[2] * bx.f); + ab = (__sinfv_lut[1] * ax.f) * axx + (__sinfv_lut[3] * ax.f); + bb = (__sinfv_lut[1] * bx.f) * bxx + (__sinfv_lut[3] * bx.f); + axx = axx * axx; + bxx = bxx * bxx; + *r++ = ab + aa * axx; + *r++ = bb + ba * bxx; + n -= 2; + } + + +} + +void sinfv_neon(float *x, int n, float *r) +{ + /* + * There is no NEON kernel for this routine. An empty asm + * placeholder under __MATH_NEON would return without ever + * writing to r, so the portable C implementation is used + * in every configuration until a vectorised version is + * available. + */ + sinfv_c(x, n, r); +} diff --git a/source/math_sinhf.c b/source/math_sinhf.c new file mode 100644 index 0000000000..820a490dae --- /dev/null +++ b/source/math_sinhf.c @@ -0,0 +1,120 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to
deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "math.h" +#include "math_neon.h" + +const float __sinhf_rng[2] = { + 1.442695041f, + 0.693147180f +}; + +const float __sinhf_lut[16] = { + 0.00019578093328483123, //p7 + 0.00019578093328483123, //p7 + 0.0014122663401803872, //p6 + 0.0014122663401803872, //p6 + 0.008336936973260111, //p5 + 0.008336936973260111, //p5 + 0.04165989275009526, //p4 + 0.04165989275009526, //p4 + 0.16666570253074878, //p3 + 0.16666570253074878, //p3 + 0.5000006143673624, //p2 + 0.5000006143673624, //p2 + 1.000000059694879, //p1 + 1.000000059694879, //p1 + 0.9999999916728642, //p0 + 0.9999999916728642 //p0 +}; + + +float sinhf_c(float x) +{ + float a, b, xx; + xx = -x; + a = expf_c(x); + b = expf_c(xx); + a = a - b; + a = a * 0.5f; + return a; +} + + +float sinhf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ( + "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x} + "fnegs s1, s1 \n\t" //s1 = -s1 + + //Range Reduction: + "vld1.32 d2, [%0] \n\t" //d2 = {invrange, range} + "vld1.32 {d16, d17}, [%1]! 
\n\t" + "vmul.f32 d6, d0, d2[0] \n\t" //d6 = d0 * d2[0] + "vcvt.s32.f32 d6, d6 \n\t" //d6 = (int) d6 + "vld1.32 {d18}, [%1]! \n\t" + "vcvt.f32.s32 d1, d6 \n\t" //d1 = (float) d6 + "vld1.32 {d19}, [%1]! \n\t" + "vmls.f32 d0, d1, d2[1] \n\t" //d0 = d0 - d1 * d2[1] + "vld1.32 {d20}, [%1]! \n\t" + + //polynomial: + "vmla.f32 d17, d16, d0 \n\t" //d17 = d17 + d16 * d0; + "vld1.32 {d21}, [%1]! \n\t" + "vmla.f32 d18, d17, d0 \n\t" //d18 = d18 + d17 * d0; + "vld1.32 {d22}, [%1]! \n\t" + "vmla.f32 d19, d18, d0 \n\t" //d19 = d19 + d18 * d0; + "vld1.32 {d23}, [%1]! \n\t" + "vmla.f32 d20, d19, d0 \n\t" //d20 = d20 + d19 * d0; + "vmla.f32 d21, d20, d0 \n\t" //d21 = d21 + d20 * d0; + "vmla.f32 d22, d21, d0 \n\t" //d22 = d22 + d21 * d0; + "vmla.f32 d23, d22, d0 \n\t" //d23 = d23 + d22 * d0; + + //multiply by 2 ^ m + "vshl.i32 d6, d6, #23 \n\t" //d6 = d6 << 23 + "vadd.i32 d0, d23, d6 \n\t" //d0 = d23 + d6 + + "vdup.f32 d2, d0[1] \n\t" //d2 = s1 + "vmov.f32 d1, #0.5 \n\t" //d1 = 0.5 + "vsub.f32 d0, d0, d2 \n\t" //d0 = d0 - d2 + "vmul.f32 d0, d1 \n\t" //d0 = d0 * d1 + + :: "r"(__sinhf_rng), "r"(__sinhf_lut) //NOTE(review): %1 is advanced by the [%1]! loads although it is an input-only operand + : "d0", "d1", "q1", "q2", "d6", "q8", "q9", "q10", "q11" //q8-q11 = d16-d23, overwritten by the table loads and the polynomial + ); + +#endif +} + +float sinhf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vmov.f32 s0, r0 \n\t"); + sinhf_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return sinhf_c(x); +#endif +}; diff --git a/source/math_sqrtf.c b/source/math_sqrtf.c new file mode 100644 index 0000000000..ee3f86bdbf --- /dev/null +++ b/source/math_sqrtf.c @@ -0,0 +1,105 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +/* +Test func : sqrtf(x) +Test Range: 0 < x < 1,000,000,000 +Peak Error: ~0.0010% +RMS Error: ~0.0005% +*/ + +#include "math.h" +#include "math_neon.h" + +float sqrtf_c(float x) +{ + + float b, c; + int m; + union { + float f; + int i; + } a; + + //fast invsqrt approx + a.f = x; + a.i = 0x5F3759DF - (a.i >> 1); //VRSQRTE + c = x * a.f; + b = (3.0f - c * a.f) * 0.5; //VRSQRTS + a.f = a.f * b; + c = x * a.f; + b = (3.0f - c * a.f) * 0.5; + a.f = a.f * b; + + //fast inverse approx + x = a.f; + m = 0x3F800000 - (a.i & 0x7F800000); + a.i = a.i + m; + a.f = 1.41176471f - 0.47058824f * a.f; + a.i = a.i + m; + b = 2.0 - a.f * x; + a.f = a.f * b; + b = 2.0 - a.f * x; + a.f = a.f * b; + + return a.f; +} + +float sqrtf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ( + + //fast invsqrt approx + "vmov.f32 d1, d0 \n\t" //d1 = d0 + "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) + "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 + "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2 + "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 + "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 + "vrsqrts.f32 d3, d2, d0 \n\t" //d4 = (3 - d0 * d3) / 2 + "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 + + //fast reciporical approximation + "vrecpe.f32 d1, d0 \n\t" //d1 = ~ 1 / d0; + "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; + "vmul.f32 d1, d1, 
d2 \n\t" //d1 = d1 * d2; + "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; + "vmul.f32 d0, d1, d2 \n\t" //d0 = d1 * d2; + + ::: "d0", "d1", "d2", "d3" + ); +#endif +} + +float sqrtf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vmov.f32 s0, r0 \n\t"); + sqrtf_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return sqrtf_c(x); +#endif +}; diff --git a/source/math_sqrtfv.c b/source/math_sqrtfv.c new file mode 100644 index 0000000000..c657db5d34 --- /dev/null +++ b/source/math_sqrtfv.c @@ -0,0 +1,147 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ +/* +Test func : sqrtf(x) +Test Range: 0 < x < 1,000,000,000 +Peak Error: ~0.0010% +RMS Error: ~0.0005% +*/ + +#include "math.h" +#include "math_neon.h" + +void sqrtfv_c(float *x, int n, float *r) +{ + + float x0, x1; + float b0, b1, c0, c1; + int m0, m1; + union { + float f; + int i; + } a0, a1; + + + if (n & 0x1){ + *r++ = sqrtf_c(*x++); + n--; + } + + while(n > 0){ + + x0 = *x++; + x1 = *x++; + + //fast invsqrt approx + a0.f = x0; + a1.f = x1; + a0.i = 0x5F3759DF - (a0.i >> 1); //VRSQRTE + a1.i = 0x5F3759DF - (a1.i >> 1); //VRSQRTE + c0 = x0 * a0.f; + c1 = x1 * a1.f; + b0 = (3.0f - c0 * a0.f) * 0.5; //VRSQRTS + b1 = (3.0f - c1 * a1.f) * 0.5; //VRSQRTS + a0.f = a0.f * b0; + a1.f = a1.f * b1; + c0 = x0 * a0.f; + c1 = x1 * a1.f; + b0 = (3.0f - c0 * a0.f) * 0.5; //VRSQRTS + b1 = (3.0f - c1 * a1.f) * 0.5; //VRSQRTS + a0.f = a0.f * b0; + a1.f = a1.f * b1; + + //fast inverse approx + c0 = a0.f; + c0 = a1.f; + m0 = 0x3F800000 - (a0.i & 0x7F800000); + m1 = 0x3F800000 - (a1.i & 0x7F800000); + a0.i = a0.i + m0; + a1.i = a1.i + m1; + a0.f = 1.41176471f - 0.47058824f * a0.f; + a1.f = 1.41176471f - 0.47058824f * a1.f; + a0.i = a0.i + m0; + a1.i = a1.i + m1; + b0 = 2.0 - a0.f * c0; + b1 = 2.0 - a1.f * c1; + a0.f = a0.f * b0; + a1.f = a1.f * b1; + b0 = 2.0 - a0.f * c0; + b1 = 2.0 - a1.f * c1; + a0.f = a0.f * b0; + a1.f = a1.f * b1; + + *r++ = a0.f; + *r++ = a1.f; + n -= 2; + + } +} + +void sqrtfv_neon(float *x, int n, float *r) +{ +#ifdef __MATH_NEON + asm volatile ( + + "tst r1, #1 \n\t" //r1 & 1 + "beq 1f \n\t" // + + "vld1.32 d0[0], [r0]! \n\t" //s0 = *x++ + "mov ip, lr \n\t" //ip = lr + //"bl sqrtf_neon_hfp \n\t" //sqrtf_neon + "mov lr, ip \n\t" //lr = ip + "vst1.32 d0[0], [r2]! \n\t" //*r++ = r0 + "subs r1, r1, #1 \n\t" //r1 = r1 - 1; + "bxeq lr \n\t" // + + "1: \n\t" // + + "vld1.32 d0, [r0]! 
\n\t" //d0 = (*x[0], *x[1]), x+=2; + + //fast invsqrt approx + "vmov.f32 d1, d0 \n\t" //d1 = d0 + "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) + "vmul.f32 d2, d0, d1 \n\t" //d3 = d0 * d2 + "vrsqrts.f32 d3, d2, d0 \n\t" //d4 = (3 - d0 * d3) / 2 + "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d4 + "vmul.f32 d2, d0, d1 \n\t" //d3 = d0 * d2 + "vrsqrts.f32 d3, d2, d0 \n\t" //d4 = (3 - d0 * d3) / 2 + "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d4 + + //fast reciporical approximation + "vrecpe.f32 d1, d0 \n\t" //d1 = ~ 1 / d0; + "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; + "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; + "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; + "vmul.f32 d0, d1, d2 \n\t" //d0 = d1 * d2; + + "vst1.64 d0, [r2]! \n\t" //*r++ = d0; + "subs r1, r1, #2 \n\t" //n = n - 2; update flags + "bgt 1b \n\t" // + + ::: "d0", "d1", "d2", "d3" +); +#else + sqrtfv_c(x, n, r); +#endif +} diff --git a/source/math_tanf.c b/source/math_tanf.c new file mode 100644 index 0000000000..e87c1ffd1c --- /dev/null +++ b/source/math_tanf.c @@ -0,0 +1,156 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "math.h" +#include "math_neon.h" + +const float __tanf_rng[2] = { + 2.0 / M_PI, + M_PI / 2.0 +}; + +const float __tanf_lut[4] = { + -0.00018365f, //p7 + -0.16664831f, //p3 + +0.00830636f, //p5 + +0.99999661f, //p1 +}; + +float tanf_c(float x){ + + union { + float f; + int i; + } ax, c; + + float r, a, b, xx, cc, cx; + int m; + + ax.f = fabsf(x); + + //Range Reduction: + m = (int) (ax.f * __tanf_rng[0]); + ax.f = ax.f - (((float)m) * __tanf_rng[1]); + + //Test Quadrant + ax.f = ax.f - (m & 1) * __tanf_rng[1]; + c.f = x; ax.i = ax.i ^ (c.i & 0x80000000); //sign bit of x, read through the union instead of an aliasing pointer cast + + //Taylor Polynomial (Estrins) + xx = ax.f * ax.f; + a = (__tanf_lut[0] * ax.f) * xx + (__tanf_lut[2] * ax.f); + b = (__tanf_lut[1] * ax.f) * xx + (__tanf_lut[3] * ax.f); + xx = xx * xx; + r = b + a * xx; + + //cosine + c.f = 1.0 - r * r; + + //fast invsqrt approximation (2x newton iterations) + cc = c.f; + c.i = 0x5F3759DF - (c.i >> 1); //VRSQRTE + cx = cc * c.f; + a = (3.0f - cx * c.f) / 2; //VRSQRTS + c.f = c.f * a; + cx = cc * c.f; + a = (3.0f - cx * c.f) / 2; + c.f = c.f * a; + + r = r * c.f; + + return r; +} + + +float tanf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ( + + "vdup.f32 d0, d0[0] \n\t" //d0 = {x, x} + "vabs.f32 d1, d0 \n\t" //d1 = {ax, ax} + + //Range Reduction: + "vld1.32 d3, [%0] \n\t" //d3 = {invrange, range} + "vmul.f32 d2, d1, d3[0] \n\t" //d2 = d1 * d3[0] + "vcvt.u32.f32 d2, d2 \n\t" //d2 = (int) d2 + "vcvt.f32.u32 d4, d2 \n\t" //d4 = (float) d2 + "vmls.f32 d1, d4, d3[1] \n\t" //d1 = d1 - d4 * d3[1] + + //Checking Quadrant: + //ax = ax - (k&1) * M_PI_2 + "vmov.i32 d4, #1 \n\t" //d4 = 1 + "vand.i32 d2, d2, d4 \n\t" //d2 = d2 & d4 + "vcvt.f32.u32 d2, d2 \n\t" //d2 = (float) d2 + "vmls.f32 d1, d2, d3[1]
\n\t" //d1 = d1 - d2 * d3[1] + + //ax = ax ^ ( x.i & 0x800000000) + "vmov.i32 d4, #0x80000000 \n\t" //d4 = 0x80000000 + "vand.i32 d0, d0, d4 \n\t" //d0 = d0 & d4 + "veor.i32 d1, d1, d0 \n\t" //d1 = d1 ^ d0 + + //polynomial: + "vmul.f32 d2, d1, d1 \n\t" //d2 = d1*d1 = {x^2, x^2} + "vld1.32 {d4, d5}, [%1] \n\t" //d4 = {p7, p3}, d5 = {p5, p1} + "vmul.f32 d3, d2, d2 \n\t" //d3 = d2*d2 = {x^4, x^4} + "vmul.f32 q0, q2, d1[0] \n\t" //q0 = q2 * d1[0] = {p7x, p3x, p5x, p1x} + "vmla.f32 d1, d0, d2[0] \n\t" //d1 = d1 + d0*d2 = {p5x + p7x^3, p1x + p3x^3} + "vmla.f32 d1, d3, d1[0] \n\t" //d1 = d1 + d3*d0 = {..., p1x + p3x^3 + p5x^5 + p7x^7} + + //cosine + "vmov.f32 s1, #1.0 \n\t" //d0[1] = 1.0 + "vmls.f32 d0, d1, d1 \n\t" //d0 = {..., 1.0 - sx*sx} + + //invsqrt approx + "vmov.f32 d2, d0 \n\t" //d2 = d0 + "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) + "vmul.f32 d3, d0, d2 \n\t" //d3 = d0 * d2 + "vrsqrts.f32 d4, d3, d0 \n\t" //d4 = (3 - d0 * d3) / 2 + "vmul.f32 d0, d0, d4 \n\t" //d0 = d0 * d4 + "vmul.f32 d3, d0, d2 \n\t" //d3 = d0 * d2 + "vrsqrts.f32 d4, d3, d0 \n\t" //d4 = (3 - d0 * d3) / 2 + "vmul.f32 d0, d0, d4 \n\t" //d0 = d0 * d4 + + "vmul.f32 d0, d0, d1 \n\t" //d0 = d0 * d1 + + "vmov.f32 s0, s1 \n\t" //s0 = s1 + + :: "r"(__tanf_rng), "r"(__tanf_lut) + : "d0", "d1", "d2", "d3", "d4", "d5" + ); +#endif +} + + +float tanf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vdup.f32 d0, r0 \n\t"); + tanf_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return tanf_c(x); +#endif +}; + diff --git a/source/math_tanhf.c b/source/math_tanhf.c new file mode 100644 index 0000000000..219655be4d --- /dev/null +++ b/source/math_tanhf.c @@ -0,0 +1,95 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without 
limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "math.h" +#include "math_neon.h" + +/* +TanH = (e^x - e^-x) / (e^x + e^-x) +TanH = (e^x - e^-x)(e^x) / (e^x + e^-x)(e^x) +TanH = (e^2x - 1) / (e^2x + 1) + +*/ + +float tanhf_c(float x) +{ + float a, b, c; + int m; + union{ + float f; + int i; + } xx; + + x = 2.0f * x; + a = expf_c(x); + c = a + 1.0f; + + //reciporical approx. 
+ xx.f = c; + m = 0x3F800000 - (xx.i & 0x7F800000); + xx.i = xx.i + m; + xx.f = 1.41176471f - 0.47058824f * xx.f; + xx.i = xx.i + m; + b = 2.0 - xx.f * c; + xx.f = xx.f * b; + b = 2.0 - xx.f * c; + xx.f = xx.f * b; + c = a - 1.0; + xx.f *= c; + return xx.f; +} + + +float tanhf_neon_hfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vadd.f32 d0, d0, d0 \n\t"); + expf_neon_hfp(x); + asm volatile ( + "vmov.f32 d2, #1.0 \n\t" + "vsub.f32 d3, d0, d2 \n\t" + "vadd.f32 d0, d0, d2 \n\t" + + "vrecpe.f32 d1, d0 \n\t" //d1 = ~ 1 / d0; + "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; + "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2; + "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0; + "vmul.f32 d0, d1, d2 \n\t" //d0 = d1 * d2; + "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3; + ::: "d0", "d1", "d2", "d3" + ); +#endif +} + +float tanhf_neon_sfp(float x) +{ +#ifdef __MATH_NEON + asm volatile ("vmov.f32 s0, r0 \n\t"); + tanhf_neon_hfp(x); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return tanhf_c(x); +#endif +}; + diff --git a/source/math_vec2.c b/source/math_vec2.c new file mode 100644 index 0000000000..d970c37676 --- /dev/null +++ b/source/math_vec2.c @@ -0,0 +1,118 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + + +#include "math_neon.h" + +//vec2 scalar product +float +dot2_c(float v0[2], float v1[2]) +{ + float r; + r = v0[0]*v1[0]; + r += v0[1]*v1[1]; + return r; +} + +void +normalize2_c(float v[2], float d[2]) +{ + float b, c, x; + union { + float f; + int i; + } a; + + x = v[0]*v[0]; + x += v[1]*v[1]; + + //fast invsqrt approx + a.f = x; + a.i = 0x5F3759DF - (a.i >> 1); //VRSQRTE + c = x * a.f; + b = (3.0f - c * a.f) * 0.5; //VRSQRTS + a.f = a.f * b; + c = x * a.f; + b = (3.0f - c * a.f) * 0.5; + a.f = a.f * b; + + d[0] = v[0]*a.f; + d[1] = v[1]*a.f; +} + +float +dot2_neon_hfp(float v0[2], float v1[2]) +{ +#ifdef __MATH_NEON + asm volatile ( + "vld1.32 {d2}, [%0] \n\t" //d2={x0,y0} + "vld1.32 {d4}, [%1] \n\t" //d4={x1,y1} + "vmul.f32 d0, d2, d4 \n\t" //d0 = d2*d4 + "vpadd.f32 d0, d0, d0 \n\t" //d0 = d[0] + d[1] + :: "r"(v0), "r"(v1) + : + ); +#endif +} + +float +dot2_neon_sfp(float v0[2], float v1[2]) +{ +#ifdef __MATH_NEON + dot2_neon_hfp(v0, v1); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return dot2_c(v0, v1); +#endif +}; + +void +normalize2_neon(float v[2], float d[2]) +{ +#ifdef __MATH_NEON + asm volatile ( + "vld1.32 d4, [%0] \n\t" //d4 = {x0,y0} + "vmul.f32 d0, d4, d4 \n\t" //d0 = d2*d2 + "vpadd.f32 d0, d0 \n\t" //d0 = d[0] + d[1] + + "vmov.f32 d1, d0 \n\t" //d1 = d0 + "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) + "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 + "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2 + "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 + "vmul.f32 d2, d0, d1 
\n\t" //d2 = d0 * d1 + "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2 + "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 + + "vmul.f32 d4, d4, d0[0] \n\t" //d4 = d4*d0[0] + "vst1.32 d4, [%1] \n\t" // + + :: "r"(v), "r"(d) + : "d0", "d1", "d2", "d3", "d4", "memory" + ); +#else + normalize2_c(v, d); +#endif +} + diff --git a/source/math_vec3.c b/source/math_vec3.c new file mode 100644 index 0000000000..998ff2e4d5 --- /dev/null +++ b/source/math_vec3.c @@ -0,0 +1,172 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +#include "math_neon.h" + +//vec3 scalar product +float +dot3_c(float v0[3], float v1[3]) +{ + float r; + r = v0[0]*v1[0]; + r += v0[1]*v1[1]; + r += v0[2]*v1[2]; + return r; +} + +void +cross3_c(float v0[3], float v1[3], float d[3]) +{ + d[0] = v0[1]*v1[2] - v0[2]*v1[1]; + d[1] = v0[2]*v1[0] - v0[0]*v1[2]; + d[2] = v0[0]*v1[1] - v0[1]*v1[0]; +} + +void +normalize3_c(float v[3], float d[3]) +{ + float b, c, x; + union { + float f; + int i; + } a; + + x = v[0]*v[0]; //x accumulates the squared length of v + x += v[1]*v[1]; + x += v[2]*v[2]; + + //fast invsqrt approx + a.f = x; + a.i = 0x5F3759DF - (a.i >> 1); //VRSQRTE + c = x * a.f; + b = (3.0f - c * a.f) * 0.5; //VRSQRTS + a.f = a.f * b; + c = x * a.f; + b = (3.0f - c * a.f) * 0.5; //VRSQRTS (second refinement step) + a.f = a.f * b; + + d[0] = v[0]*a.f; //scale each component by the approximated 1/sqrt(x) + d[1] = v[1]*a.f; + d[2] = v[2]*a.f; +} + + +float +dot3_neon_hfp(float v0[3], float v1[3]) +{ +#ifdef __MATH_NEON + asm volatile ( + "vld1.32 {d2}, [%0] \n\t" //d2={x0,y0} + "flds s6, [%0, #8] \n\t" //d3[0]={z0} + "vld1.32 {d4}, [%1] \n\t" //d4={x1,y1} + "flds s10, [%1, #8] \n\t" //d5[0]={z1} + + "vmul.f32 d0, d2, d4 \n\t" //d0= d2*d4 + "vpadd.f32 d0, d0, d0 \n\t" //d0 = d[0] + d[1] + "vmla.f32 d0, d3, d5 \n\t" //d0 = d0 + d3*d5 + :: "r"(v0), "r"(v1) + : "d0","d1","d2","d3","d4","d5" + ); +#endif +} + +float +dot3_neon_sfp(float v0[3], float v1[3]) +{ +#ifdef __MATH_NEON + dot3_neon_hfp(v0, v1); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return dot3_c(v0, v1); +#endif +}; + + +void cross3_neon(float v0[3], float v1[3], float d[3]) +{ +#ifdef __MATH_NEON + asm volatile ( + "flds s3, [%0] \n\t" //d1[1]={x0} + "add %0, %0, #4 \n\t" //%0 += 4 bytes, now points at y0 + "vld1.32 {d0}, [%0] \n\t" //d0={y0,z0} + "vmov.f32 s2, s1 \n\t" //d1[0]={z0} + + "flds s5, [%1] \n\t" //d2[1]={x1} + "add %1, %1, #4 \n\t" //%1 += 4 bytes, now points at y1 + "vld1.32 {d3}, [%1] \n\t" //d3={y1,z1} + "vmov.f32 s4, s7 \n\t" //d2[0]=d3[1] + + "vmul.f32 d4, d0, d2 \n\t" //d4=d0*d2 + "vmls.f32 d4, d1, d3 \n\t" //d4-=d1*d3 + + "vmul.f32 d5, d3, d1[1] \n\t" //d5=d3*d1[1] + "vmls.f32 d5, d0, d2[1] \n\t"
//d5-=d0*d2[1] + + "vst1.32 d4, [%2] \n\t" // + "add %2, %2, #8 \n\t" // + "fsts s10, [%2] \n\t" // + + : "+r"(v0), "+r"(v1), "+r"(d): + : "d0", "d1", "d2", "d3", "d4", "d5", "memory" + ); +#else + cross3_c(v0,v1,d); +#endif +} + +void +normalize3_neon(float v[3], float d[3]) +{ +#ifdef __MATH_NEON + asm volatile ( + "vld1.32 {d4}, [%0] \n\t" //d4={x0,y0} + "flds s10, [%0, #8] \n\t" //d5[0]={z0} + + "vmul.f32 d0, d4, d4 \n\t" //d0= d4*d4 + "vpadd.f32 d0, d0 \n\t" //d0 = d[0] + d[1] + "vmla.f32 d0, d5, d5 \n\t" //d0 = d0 + d5*d5 + + "vmov.f32 d1, d0 \n\t" //d1 = d0 + "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) + "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 + "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2 + "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 + "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 + "vrsqrts.f32 d3, d2, d0 \n\t" //d4 = (3 - d0 * d3) / 2 + "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d4 + + "vmul.f32 q2, q2, d0[0] \n\t" //d0= d2*d4 + "vst1.32 {d4}, [%1] \n\t" // + "fsts s10, [%1, #8] \n\t" // + + :: "r"(v), "r"(d) + : "d0", "d1", "d2", "d3", "d4", "d5", "memory" + ); +#else + normalize3_c(v, d); +#endif + +} + + diff --git a/source/math_vec4.c b/source/math_vec4.c new file mode 100644 index 0000000000..483fc57190 --- /dev/null +++ b/source/math_vec4.c @@ -0,0 +1,126 @@ +/* +The MIT License (MIT) + +Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com) + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. 
+ +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +#include "math_neon.h" + + +#ifdef __MATH_NEON +#include "arm_neon.h" +#endif + +//vec4 scalar product +float dot4_c(float v0[4], float v1[4]) +{ + float r; + r = v0[0]*v1[0]; + r += v0[1]*v1[1]; + r += v0[2]*v1[2]; + r += v0[3]*v1[3]; + return r; +} + +void normalize4_c(float v[4], float d[4]) +{ + float b, c, x; + union { + float f; + int i; + } a; + + x = v[0]*v[0]; + x += v[1]*v[1]; + x += v[2]*v[2]; + x += v[3]*v[3]; + + //fast invsqrt approx + a.f = x; + a.i = 0x5F3759DF - (a.i >> 1); //VRSQRTE + c = x * a.f; + b = (3.0f - c * a.f) * 0.5; //VRSQRTS + a.f = a.f * b; + c = x * a.f; + b = (3.0f - c * a.f) * 0.5; + a.f = a.f * b; + + d[0] = v[0]*a.f; + d[1] = v[1]*a.f; + d[2] = v[2]*a.f; + d[3] = v[3]*a.f; +} + +void normalize4_neon(float v[4], float d[4]) +{ +#ifdef __MATH_NEON + asm volatile ( + "vld1.32 {d4, d5}, [%0] \n\t" //d2={x0,y0}, d3={z0, w0} + "vmul.f32 d0, d4, d4 \n\t" //d0= d4*d4 + "vmla.f32 d0, d5, d5 \n\t" //d0 = d0 + d5*d5 + "vpadd.f32 d0, d0 \n\t" //d0 = d[0] + d[1] + + "vmov.f32 d1, d0 \n\t" //d1 = d0 + "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0) + "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 + "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2 + "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3 + "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1 + "vrsqrts.f32 d3, d2, d0 \n\t" //d4 = (3 - d0 * d3) / 2 + "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d4 + + "vmul.f32 q2, q2, d0[0] \n\t" //d0= d2*d4 + "vst1.32 {d4, d5}, [%1] \n\t" //d2={x0,y0}, d3={z0, w0} + + :: "r"(v), "r"(d) + : "d0", "d1", 
"d2", "d3", "d4", "d5", "memory" + ); +#else + normalize4_c(v, d); +#endif + +} + + +float dot4_neon_hfp(float v0[4], float v1[4]) +{ +#ifdef __MATH_NEON + asm volatile ( + "vld1.32 {d2, d3}, [%0] \n\t" //d2={x0,y0}, d3={z0, w0} + "vld1.32 {d4, d5}, [%1] \n\t" //d4={x1,y1}, d5={z1, w1} + "vmul.f32 d0, d2, d4 \n\t" //d0= d2*d4 + "vmla.f32 d0, d3, d5 \n\t" //d0 = d0 + d3*d5 + "vpadd.f32 d0, d0 \n\t" //d0 = d[0] + d[1] + :: "r"(v0), "r"(v1) : + ); +#endif +} + +float dot4_neon_sfp(float v0[4], float v1[4]) +{ +#ifdef __MATH_NEON + dot4_neon_hfp(v0, v1); + asm volatile ("vmov.f32 r0, s0 \n\t"); +#else + return dot4_c(v0, v1); +#endif +}; +