Merge pull request #606 from aliaspider/master

(PSP) VFPU optimized audio resampler and s16 <-> float conversions
2025-03-29 22:20:21 +00:00 · 2014-03-14 17:02:04 +01:00 · 2014-03-14 17:02:04 +01:00 · 7fa8920357
commit 7fa8920357
parent c52c8cd5b5 035e9b0919
8 changed files with 286 additions and 1 deletions
--- a/audio/cc_resampler.c
+++ b/audio/cc_resampler.c
@ -0,0 +1,167 @@
+/*
+ * Convoluted Cosine Resampler
+ * Copyright (C) 2014 - Ali Bouhlel ( aliaspider@gmail.com )
+ *
+ * licence: GPLv3
+ */
+
+#include "resampler.h"
+#include "../libretro.h"
+#include "../performance.h"
+#include <math.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+
+#include "../general.h"
+
+typedef struct rarch_CC_resampler
+{  /* Handle type returned by resampler_CC_init and released by
+    * resampler_CC_free; resampler_CC_process ignores the handle it gets. */
+   int dummy; /* only member; never read in this file (presumably a placeholder) */
+}rarch_CC_resampler_t;
+
+typedef struct audio_frame_float
+{  /* One frame made of two float samples; resampler_CC_process steps through
+    * its input and output buffers in units of this struct. */
+   float l; /* first sample of the frame (presumably the left channel) */
+   float r; /* second sample of the frame (presumably the right channel) */
+}audio_frame_float_t;
+
+typedef struct audio_frame_int16
+{  /* 16-bit integer counterpart of audio_frame_float; not referenced by any
+    * other code visible in this file. */
+   int16_t l; /* first sample of the frame (presumably the left channel) */
+   int16_t r; /* second sample of the frame (presumably the right channel) */
+}audio_frame_int16_t;
+
+
+#ifdef _MIPS_ARCH_ALLEGREX
+/*
+ * Resamples data->input_frames two-float frames from data->data_in into
+ * data->data_out and stores the number of frames written in
+ * data->output_frames.
+ *
+ * re_ is unused: the accumulator (c720), the fraction counter (s730) and the
+ * constants in c710 all live in VFPU registers. resampler_CC_init must have
+ * run first, and the state carries over from one call to the next.
+ *
+ * No bound is checked on data->data_out; the caller must size it for the
+ * frames produced at data->ratio.
+ */
+static void resampler_CC_process(void *re_, struct resampler_data *data)
+{
+   float ratio, fraction;
+
+   audio_frame_float_t *inp     = (audio_frame_float_t*)data->data_in;
+   audio_frame_float_t *inp_max = inp + data->input_frames;
+   audio_frame_float_t *outp    = (audio_frame_float_t*)data->data_out;
+
+   (void)re_;
+
+   // The loop below only tests for the end of input after consuming a frame,
+   // so an empty input would run past both buffers. Bail out before touching
+   // any state.
+   if (inp == inp_max)
+   {
+      data->output_frames = 0;
+      return;
+   }
+
+   // volatile: the asm blocks communicate through VFPU registers the compiler
+   // knows nothing about, so they must neither be dropped nor reordered.
+   __asm__ volatile (
+    ".set      push\n"
+    ".set      noreorder\n"
+
+    "mtv       %2,   s700              \n"   // 700 = data->ratio = b
+    "vrcp.s    s701, s700              \n"   // 701 = 1.0 / b
+    "vadd.s    s702, s700, s700        \n"   // 702 = 2 * b
+    "vmul.s    s703, s700, s710        \n"   // 703 = b * pi
+
+    "mfv       %0,   s701              \n"   // ratio    = 1.0 / b
+    "mfv       %1,   s730              \n"   // fraction = value left by the previous call
+
+    ".set      pop\n"
+    :"=r"(ratio),"=r"(fraction): "r"((float)data->ratio)
+   );
+
+   while(true)
+   {
+      // Consume input frames until the fraction counter reaches 1.0 / b.
+      while ((fraction < ratio))
+      {
+         // "memory": the asm reads the input frame through %1.
+         __asm__ volatile (
+         ".set      push               \n"
+         ".set      noreorder          \n"
+
+         "lv.s    s620, 0(%1)             \n"   // first sample of the input frame
+         "lv.s    s621, 4(%1)             \n"   // second sample of the input frame
+
+         "vsub.s  s731, s701, s730     \n"
+
+         "vadd.q  c600, c730[-X,Y,-X,Y], c730[1/2,1/2,-1/2,-1/2]\n"
+
+         "vmul.q  c610, c600, c700[Z,Z,Z,Z]  \n"   //*2*b
+         "vmul.q  c600, c600, c700[W,W,W,W]  \n"   //*b*pi
+         "vsin.q  c610, c610                 \n"
+         "vadd.q  c600, c600, c610           \n"
+
+         "vmul.q  c600[-1:1,-1:1,-1:1,-1:1], c600, c710[Y,Y,Y,Y]	\n"   //*(1.0 / pi)
+
+         "vsub.p  c600, c600, c602           \n"
+
+         "vmul.q  c620, c620[X,Y,X,Y], c600[X,X,Y,Y]  \n"   // weight the input frame
+
+         "vadd.q  c720, c720, c620           \n"   // accumulate into c720
+
+
+         "vadd.s  s730, s730, s730[1]  \n"   // one more input frame consumed
+         "mfv     %0,   s730           \n"
+
+         ".set      pop         \n"
+         :"=r"(fraction): "r"(inp): "memory"
+         );
+         inp++;
+         if (inp == inp_max)
+            goto done;
+      }
+
+      // Emit one output frame from the first half of the accumulator, then
+      // move the second half down and clear the rest.
+      // "memory": the asm writes the output frame through %1.
+      __asm__ volatile (
+      ".set    push                       \n"
+      ".set    noreorder                  \n"
+
+      "vmul.p  c720, c720, c720[1/2,1/2]  \n"
+      "sv.s    s720, 0(%1)                \n"
+      "sv.s    s721, 4(%1)                \n"
+      "vmov.q  c720, c720[Z,W,0,0]        \n"
+      "vsub.s  s730, s730, s701           \n"   // fraction -= 1.0 / b
+      "mfv     %0,   s730                 \n"
+
+      ".set    pop                        \n"
+      :"=r"(fraction): "r"(outp): "memory"
+      );
+      outp++;
+   }
+
+done:
+   data->output_frames = (outp - (audio_frame_float_t*)data->data_out);
+}
+#else
+#error "platform not supported"
+#endif
+
+/* Releases a handle returned by resampler_CC_init. Passing NULL is allowed. */
+static void resampler_CC_free(void *re_)
+{
+   /* free(NULL) is defined to do nothing, so no guard is needed. */
+   free(re_);
+}
+
+static void *resampler_CC_init(double bandwidth_mod)
+{  /* Allocates a zeroed handle and resets the VFPU registers that
+    * resampler_CC_process works with. bandwidth_mod is not used.
+    * Returns the handle, or NULL if the allocation fails.
+    * NOTE(review): the state is kept in VFPU registers rather than in the
+    * handle, so a second init looks like it would reset a resampler that is
+    * already running -- confirm only one instance is ever active. */
+   rarch_CC_resampler_t *re = (rarch_CC_resampler_t*)calloc(1, sizeof(rarch_CC_resampler_t));
+   if (!re)
+      return NULL;
+
+   __asm__ (
+    ".set      push\n"
+    ".set      noreorder\n"
+
+    "vcst.s    s710, VFPU_PI           \n"   // 710 = pi
+    "vcst.s    s711, VFPU_1_PI         \n"   // 711 = 1.0 / (pi)
+
+    "vzero.q   c720                    \n"   // c720: accumulator that resampler_CC_process adds into and stores from
+    "vzero.q   c730                    \n"   // c730: s730 is the fraction counter resampler_CC_process reads and updates
+
+    ".set      pop\n"
+   );
+
+   return re;
+}
+
+const rarch_resampler_t CC_resampler = { /* backend descriptor; rarch_resampler_t is declared in resampler.h (member list not visible here) */
+   resampler_CC_init,
+   resampler_CC_process,
+   resampler_CC_free,
+   "CC", /* same string config.def.h uses as the PSP default for audio_resampler */
+};
+
+
+
--- a/audio/resampler.c
+++ b/audio/resampler.c
@ -24,6 +24,9 @@

 static const rarch_resampler_t *backends[] = {
   &sinc_resampler,
+#if defined(PSP)
+   &CC_resampler, /* CC backend is only listed when building for PSP */
+#endif
   NULL,
 };

--- a/audio/resampler.h
+++ b/audio/resampler.h
@ -51,6 +51,9 @@ typedef struct rarch_resampler
 } rarch_resampler_t;

 extern const rarch_resampler_t sinc_resampler;
+#if defined(PSP)
+extern const rarch_resampler_t CC_resampler;
+#endif

 // Reallocs resampler. Will free previous handle before allocating a new one.
 // If ident is NULL, first resampler will be used.
--- a/audio/utils.c
+++ b/audio/utils.c
@ -163,6 +163,104 @@ static void audio_convert_float_to_s16_neon(int16_t *out, const float *in, size_
   audio_convert_float_to_s16_C(out + aligned_samples, in + aligned_samples,
         samples - aligned_samples);
 }
+#elif defined(_MIPS_ARCH_ALLEGREX)
+/*
+ * Converts `samples` signed 16-bit samples from `in` to float in `out`,
+ * multiplying each one by gain / 0x8000.
+ *
+ * Blocks of 16 samples go through the VFPU; the remaining samples are
+ * converted by the scalar loop at the end. The VFPU path stores with sv.q,
+ * which is why `out` is asserted to be 16 byte aligned in DEBUG builds, and
+ * loads with lv.s, which reads `in` one 32-bit word at a time.
+ */
+void audio_convert_s16_to_float_ALLEGREX(float *out,
+      const int16_t *in, size_t samples, float gain)
+{
+
+#ifdef DEBUG
+//   make sure the buffer is 16 byte aligned, this should be the default behaviour of malloc in the PSPSDK
+   rarch_assert(((uint32_t)out & 0xF) == 0);
+#endif
+   size_t i;
+   gain = gain / 0x8000;
+
+   // Park the scale factor in s200 for the vmscl.q in the loop below.
+   __asm__ volatile (
+   ".set    push                    \n"
+   ".set    noreorder               \n"
+   "mtv     %0, s200                \n"
+   ".set    pop                     \n"
+   ::"r"(gain)
+   );
+
+   for (i = 0; (i+16) <= samples; i+=16)
+   {
+      // "memory": the asm reads in[i..i+15] and writes out[i..i+15] through
+      // its pointer operands, which the compiler cannot see otherwise.
+      __asm__ volatile (
+      ".set    push                 \n"
+      ".set    noreorder            \n"
+
+      "lv.s    s100,  0(%0)         \n"
+      "lv.s    s101,  4(%0)         \n"
+      "lv.s    s110,  8(%0)         \n"
+      "lv.s    s111, 12(%0)         \n"
+      "lv.s    s120, 16(%0)         \n"
+      "lv.s    s121, 20(%0)         \n"
+      "lv.s    s130, 24(%0)         \n"
+      "lv.s    s131, 28(%0)         \n"
+
+      "vs2i.p  c100, c100           \n"
+      "vs2i.p  c110, c110           \n"
+      "vs2i.p  c120, c120           \n"
+      "vs2i.p  c130, c130           \n"
+
+      "vi2f.q  c100, c100, 16       \n"
+      "vi2f.q  c110, c110, 16       \n"
+      "vi2f.q  c120, c120, 16       \n"
+      "vi2f.q  c130, c130, 16       \n"
+
+      "vmscl.q e100, e100, s200     \n"
+
+      "sv.q    c100,  0(%1)         \n"
+      "sv.q    c110, 16(%1)         \n"
+      "sv.q    c120, 32(%1)         \n"
+      "sv.q    c130, 48(%1)         \n"
+
+      ".set    pop                  \n"
+      ::"r"(in+i),"r"(out+i):"memory"
+      );
+   }
+
+   // Scalar tail for the last (samples % 16) samples.
+   for (;i != samples; i++)
+      out[i] = (float)in[i] * gain;
+}
+
+/*
+ * Converts `samples` float samples from `in` to signed 16-bit samples in
+ * `out`.
+ *
+ * Blocks of 8 samples go through the VFPU (lv.q / sv.q, hence the 16 byte
+ * alignment asserted for both buffers in DEBUG builds). The remaining samples
+ * are scaled by 0x8000, truncated to an integer and clamped to
+ * [-0x8000, 0x7FFF] by the scalar loop at the end.
+ */
+void audio_convert_float_to_s16_ALLEGREX(int16_t *out,
+      const float *in, size_t samples)
+{
+#ifdef DEBUG
+//   make sure the buffers are 16 byte aligned, this should be the default behaviour of malloc in the PSPSDK
+   rarch_assert(((uint32_t)in  & 0xF) == 0);
+   rarch_assert(((uint32_t)out & 0xF) == 0);
+#endif
+
+   size_t i;
+   for (i = 0; (i+8) <= samples; i+=8)
+   {
+      // "memory": the asm reads in[i..i+7] and writes out[i..i+7] through
+      // its pointer operands, which the compiler cannot see otherwise.
+      __asm__ volatile (
+      ".set    push                 \n"
+      ".set    noreorder            \n"
+
+      "lv.q    c100,  0(%0)         \n"
+      "lv.q    c110,  16(%0)        \n"
+
+      "vf2in.q c100, c100, 31       \n"
+      "vf2in.q c110, c110, 31       \n"
+      "vi2s.q  c100, c100           \n"
+      "vi2s.q  c102, c110           \n"
+
+      "sv.q    c100,  0(%1)         \n"
+
+      ".set    pop                  \n"
+      ::"r"(in+i),"r"(out+i):"memory"
+      );
+   }
+
+   // Scalar tail for the last (samples % 8) samples.
+   for (;i != samples; i++)
+   {
+      int32_t val = (int32_t)(in[i] * 0x8000);
+      out[i] = (val > 0x7FFF) ? 0x7FFF : (val < -0x8000 ? -0x8000 : (int16_t)val);
+   }
+
+}
 #endif

 void audio_convert_init_simd(void)
--- a/audio/utils.h
+++ b/audio/utils.h
@ -51,7 +51,14 @@ void (*audio_convert_s16_to_float_arm)(float *out,
      const int16_t *in, size_t samples, float gain);
 void (*audio_convert_float_to_s16_arm)(int16_t *out,
      const float *in, size_t samples);
+#elif defined(_MIPS_ARCH_ALLEGREX)
+#define audio_convert_s16_to_float audio_convert_s16_to_float_ALLEGREX
+#define audio_convert_float_to_s16 audio_convert_float_to_s16_ALLEGREX

+void audio_convert_s16_to_float_ALLEGREX(float *out,
+      const int16_t *in, size_t samples, float gain);
+void audio_convert_float_to_s16_ALLEGREX(int16_t *out,
+      const float *in, size_t samples);
 #else
 #define audio_convert_s16_to_float audio_convert_s16_to_float_C
 #define audio_convert_float_to_s16 audio_convert_float_to_s16_C
--- a/config.def.h
+++ b/config.def.h
@ -447,7 +447,11 @@ static const int out_latency = 64;
 static const bool audio_sync = true;

 // Default resampler
+#if defined(PSP)
+static const char *audio_resampler = "CC"; /* PSP default: matches the name string in CC_resampler (audio/cc_resampler.c) */
+#else
 static const char *audio_resampler = "sinc";
+#endif

 // Experimental rate control
 #if defined(GEKKO) || !defined(RARCH_CONSOLE)
--- a/frontend/platform/platform_psp.c
+++ b/frontend/platform/platform_psp.c
@ -27,7 +27,7 @@
 #include "../../psp/sdk_defines.h"

 PSP_MODULE_INFO("RetroArch PSP", 0, 1, 1);
-PSP_MAIN_THREAD_ATTR(THREAD_ATTR_USER);
+PSP_MAIN_THREAD_ATTR(THREAD_ATTR_USER|THREAD_ATTR_VFPU);
 PSP_HEAP_SIZE_MAX();

 static int exit_callback(int arg1, int arg2, void *common)
--- a/griffin/griffin.c
+++ b/griffin/griffin.c
@ -358,6 +358,9 @@ AUDIO RESAMPLER
 ============================================================ */
 #include "../audio/resampler.c"
 #include "../audio/sinc.c"
+#ifdef PSP
+#include "../audio/cc_resampler.c"
+#endif

 /*============================================================
 CAMERA