diff --git a/gfx/drivers/ctr_gfx.c b/gfx/drivers/ctr_gfx.c
index 5f1e205ae8..09caa53539 100644
--- a/gfx/drivers/ctr_gfx.c
+++ b/gfx/drivers/ctr_gfx.c
@@ -202,7 +202,7 @@ static void* ctr_init(const video_info_t* video,
                             CTRGU_ATTRIBFMT(GPU_SHORT, 2) << 4,
                             sizeof(ctr_vertex_t));
    GPUCMD_Finalize();
-   GPUCMD_FlushAndRun(NULL);
+   ctrGuFlushAndRun(true);
    gspWaitForEvent(GSPEVENT_P3D, false);
 
    if (input && input_data)
@@ -214,7 +214,7 @@ static void* ctr_init(const video_info_t* video,
 
    return ctr;
 }
-//#define gspWaitForEvent(...)
+
 static bool ctr_frame(void* data, const void* frame,
       unsigned width, unsigned height, unsigned pitch, const char* msg)
 {
@@ -249,17 +249,17 @@ static bool ctr_frame(void* data, const void* frame,
    }
 
    frames++;
-   currentTick = osGetTime();
+   currentTick = svcGetSystemTick();
    uint32_t diff = currentTick - lastTick;
-   if(diff > 1000)
+   if(diff > CTR_CPU_TICKS_PER_SECOND)
    {
-      fps = (float)frames * (1000.0 / diff);
+      fps = (float)frames * ((float) CTR_CPU_TICKS_PER_SECOND / (float) diff);
       lastTick = currentTick;
       frames = 0;
    }
 
    printf("fps: %8.4f frames: %i\r", fps, total_frames++);
-   fflush(stdout);
+//   fflush(stdout);
 
    /* enable this to profile the core without video output */
 #if 0
@@ -267,60 +267,83 @@ static bool ctr_frame(void* data, const void* frame,
       goto end;
 #endif
 
+   svcWaitSynchronization(gspEvents[GSPEVENT_P3D], 20000000);
+   svcClearEvent(gspEvents[GSPEVENT_P3D]);
+   svcWaitSynchronization(gspEvents[GSPEVENT_PPF], 20000000);
+   svcClearEvent(gspEvents[GSPEVENT_PPF]);
+
+   gfxSwapBuffersGpu();
+
+   if (ctr->vsync)
+         gspWaitForEvent(GSPEVENT_VBlank0, true);
+
+   ctrGuSetMemoryFill(true, (u32*)CTR_GPU_FRAMEBUFFER, 0x00000000,
+                    (u32*)(CTR_GPU_FRAMEBUFFER + CTR_TOP_FRAMEBUFFER_WIDTH * CTR_TOP_FRAMEBUFFER_HEIGHT * sizeof(uint32_t)),
+                    0x201, (u32*)CTR_GPU_DEPTHBUFFER, 0x00000000,
+                    (u32*)(CTR_GPU_DEPTHBUFFER + CTR_TOP_FRAMEBUFFER_WIDTH * CTR_TOP_FRAMEBUFFER_HEIGHT * sizeof(uint32_t)),
+                    0x201);
+
    GPUCMD_SetBufferOffset(0);
 
+   if (width > ctr->texture_width)
+      width = ctr->texture_width;
+   if (height > ctr->texture_height)
+      height = ctr->texture_height;
+
    if(frame)
    {
-      int i;
-      uint16_t* dst = (uint16_t*)ctr->texture_linear;
-      const uint8_t* src = frame;
-      if (width > ctr->texture_width)
-         width = ctr->texture_width;
-      if (height > ctr->texture_height)
-         height = ctr->texture_height;
-      for (i = 0; i < height; i++)
+      if(((((u32)(frame)) >= 0x14000000 && ((u32)(frame)) < 0x1c000000)) /* frame in linear memory */
+         && !((u32)frame & 0x7F)                                         /* 128-byte aligned */
+         && !((pitch) & 0xF))                                            /* 16-byte aligned */
       {
-         memcpy(dst, src, width * sizeof(uint16_t));
-         dst += ctr->texture_width;
-         src += pitch;
+         /* can copy the buffer directly with the GPU */
+         ctrGuCopyImage(false, frame, pitch / 2, height, CTRGU_RGB565, false,
+                        ctr->texture_swizzled, ctr->texture_width, CTRGU_RGB565,  true);
       }
-      GSPGPU_FlushDataCache(NULL, ctr->texture_linear,
-                            ctr->texture_width * ctr->texture_height * sizeof(uint16_t));
+      else
+      {
+         int i;
+         uint16_t* dst = (uint16_t*)ctr->texture_linear;
+         const uint8_t* src = frame;
+         for (i = 0; i < height; i++)
+         {
+            memcpy(dst, src, width * sizeof(uint16_t));
+            dst += ctr->texture_width;
+            src += pitch;
+         }
+         GSPGPU_FlushDataCache(NULL, ctr->texture_linear,
+                               ctr->texture_width * ctr->texture_height * sizeof(uint16_t));
 
-      ctrGuCopyImage(ctr->texture_linear, ctr->texture_width, ctr->menu.texture_height, CTRGU_RGB565, false,
-                     ctr->texture_swizzled, ctr->texture_width, CTRGU_RGB565,  true);
-
-      gspWaitForEvent(GSPEVENT_PPF, false);
-
-
-      ctrGuSetTexture(GPU_TEXUNIT0, VIRT_TO_PHYS(ctr->texture_swizzled), ctr->texture_width, ctr->texture_height,
-                     GPU_TEXTURE_MAG_FILTER(GPU_LINEAR) | GPU_TEXTURE_MIN_FILTER(GPU_LINEAR) |
-                     GPU_TEXTURE_WRAP_S(GPU_CLAMP_TO_EDGE) | GPU_TEXTURE_WRAP_T(GPU_CLAMP_TO_EDGE),
-                     GPU_RGB565);
-
-      ctr->frame_coords->u = width;
-      ctr->frame_coords->v = height;
-      GSPGPU_FlushDataCache(NULL, (u8*)ctr->frame_coords, sizeof(ctr_vertex_t));
-
-      ctrGuSetAttributeBuffersAddress(VIRT_TO_PHYS(ctr->frame_coords));
-      ctrGuSetVertexShaderFloatUniform(0, (float*)&ctr->scale_vector, 1);
-      GPU_DrawArray(GPU_UNKPRIM, 1);
+         ctrGuCopyImage(false, ctr->texture_linear, ctr->texture_width, ctr->menu.texture_height, CTRGU_RGB565, false,
+                        ctr->texture_swizzled, ctr->texture_width, CTRGU_RGB565,  true);
 
+      }
 
    }
 
+
+   ctrGuSetTexture(GPU_TEXUNIT0, VIRT_TO_PHYS(ctr->texture_swizzled), ctr->texture_width, ctr->texture_height,
+                  GPU_TEXTURE_MAG_FILTER(GPU_LINEAR) | GPU_TEXTURE_MIN_FILTER(GPU_LINEAR) |
+                  GPU_TEXTURE_WRAP_S(GPU_CLAMP_TO_EDGE) | GPU_TEXTURE_WRAP_T(GPU_CLAMP_TO_EDGE),
+                  GPU_RGB565);
+
+   ctr->frame_coords->u = width;
+   ctr->frame_coords->v = height;
+   GSPGPU_FlushDataCache(NULL, (u8*)ctr->frame_coords, sizeof(ctr_vertex_t));
+
+   ctrGuSetAttributeBuffersAddress(VIRT_TO_PHYS(ctr->frame_coords));
+   ctrGuSetVertexShaderFloatUniform(0, (float*)&ctr->scale_vector, 1);
+   GPU_DrawArray(GPU_UNKPRIM, 1);
+
    if (ctr->menu_texture_enable)
    {
 
       GSPGPU_FlushDataCache(NULL, ctr->menu.texture_linear,
                             ctr->menu.texture_width * ctr->menu.texture_height * sizeof(uint16_t));
 
-      ctrGuCopyImage(ctr->menu.texture_linear, ctr->menu.texture_width, ctr->menu.texture_height, CTRGU_RGBA4444,false,
+      ctrGuCopyImage(false, ctr->menu.texture_linear, ctr->menu.texture_width, ctr->menu.texture_height, CTRGU_RGBA4444,false,
                      ctr->menu.texture_swizzled, ctr->menu.texture_width, CTRGU_RGBA4444,  true);
 
-      gspWaitForEvent(GSPEVENT_PPF, false);
-
-
       ctrGuSetTexture(GPU_TEXUNIT0, VIRT_TO_PHYS(ctr->menu.texture_swizzled), ctr->menu.texture_width, ctr->menu.texture_height,
                      GPU_TEXTURE_MAG_FILTER(GPU_LINEAR) | GPU_TEXTURE_MIN_FILTER(GPU_LINEAR) |
                      GPU_TEXTURE_WRAP_S(GPU_CLAMP_TO_EDGE) | GPU_TEXTURE_WRAP_T(GPU_CLAMP_TO_EDGE),
@@ -334,27 +357,14 @@ static bool ctr_frame(void* data, const void* frame,
 
    GPU_FinishDrawing();
    GPUCMD_Finalize();
-   GPUCMD_FlushAndRun(NULL);
-   gspWaitForEvent(GSPEVENT_P3D, false);
+   ctrGuFlushAndRun(true);
 
-   ctrGuDisplayTransfer(CTR_GPU_FRAMEBUFFER, 240,400, CTRGU_RGBA8,
+   ctrGuDisplayTransfer(true, CTR_GPU_FRAMEBUFFER, 240,400, CTRGU_RGBA8,
                         gfxGetFramebuffer(GFX_TOP, GFX_LEFT, NULL, NULL), 240,400,CTRGU_RGB8, CTRGU_MULTISAMPLE_NONE);
 
-   gspWaitForEvent(GSPEVENT_PPF, false);
-
-   GX_SetMemoryFill(NULL, (u32*)CTR_GPU_FRAMEBUFFER, 0x00000000,
-                    (u32*)(CTR_GPU_FRAMEBUFFER + CTR_TOP_FRAMEBUFFER_WIDTH * CTR_TOP_FRAMEBUFFER_HEIGHT * sizeof(uint32_t)),
-                    0x201, (u32*)CTR_GPU_DEPTHBUFFER, 0x00000000,
-                    (u32*)(CTR_GPU_DEPTHBUFFER + CTR_TOP_FRAMEBUFFER_WIDTH * CTR_TOP_FRAMEBUFFER_HEIGHT * sizeof(uint32_t)),
-                    0x201);
-
-   gspWaitForEvent(GSPEVENT_PSC0, false);
-   gfxSwapBuffersGpu();
-
-//   if (ctr->vsync)
-//      gspWaitForEvent(GSPEVENT_VBlank0, true);
 
 end:
+//   gspWaitForEvent(GSPEVENT_VBlank0, true);
    RARCH_PERFORMANCE_STOP(ctrframe_f);
    return true;
 }
diff --git a/gfx/drivers/ctr_gu.h b/gfx/drivers/ctr_gu.h
index 75fa426090..629f5537e4 100644
--- a/gfx/drivers/ctr_gu.h
+++ b/gfx/drivers/ctr_gu.h
@@ -48,36 +48,13 @@
 #define CTRGU_MULTISAMPLE_2x1       (1 << 24)
 #define CTRGU_MULTISAMPLE_2x2       (2 << 24)
 
-typedef struct
-{
-   uint32_t buffer[8];
-} gtrgu_gx_command_t;
+#define CTR_CPU_TICKS_PER_SECOND    268123480 /* tick count treated as one second; ctr_gfx.c compares svcGetSystemTick() deltas against it for the fps counter */
 
-__attribute__((always_inline))
-static INLINE int ctrGuWriteDisplayTransferCommand(gtrgu_gx_command_t* command,
-                                         void* src, int src_w, int src_h,
-                                         void* dst, int dst_w, int dst_h,
-                                         uint32_t flags)
-{
-   command->buffer[0] = 0x03; //CommandID
-   command->buffer[1] = (uint32_t)src;
-   command->buffer[2] = (uint32_t)dst;
-   command->buffer[3] = CTRGU_SIZE(src_w, src_h);
-   command->buffer[4] = CTRGU_SIZE(dst_w, dst_h);
-   command->buffer[5] = flags;
-   command->buffer[6] = 0x0;
-   command->buffer[7] = 0x0;
-
-   return 0;
-}
-
-__attribute__((always_inline))
-static INLINE int ctrGuSubmitGxCommand(u32* gxbuf, gtrgu_gx_command_t* command)
-{
-   if(!gxbuf) gxbuf = gxCmdBuf;
-
-   return GSPGPU_SubmitGxCommand(gxbuf, (u32*)command, NULL);
-}
+extern Handle gspEvents[GSPEVENT_MAX]; /* defined outside this header; ctr_gfx.c waits on and clears entries of this array directly */
+extern u32* gpuCmdBuf; /* command buffer that ctrGuFlushAndRun submits */
+extern u32 gpuCmdBufOffset; /* ctrGuFlushAndRun passes gpuCmdBufOffset*4 as the buffer size */
+extern u32 __linear_heap_size; /* passed together with __linear_heap as the second buffer in ctrGuFlushAndRun */
+extern u32* __linear_heap; /* start of the region handed to ctrGuSetCommandList_First for flushing */
 
 __attribute__((always_inline))
 static INLINE void ctrGuSetTexture(GPU_TEXUNIT unit, u32* data,
@@ -108,14 +85,68 @@ static INLINE void ctrGuSetTexture(GPU_TEXUNIT unit, u32* data,
    }
 }
 
+/* Packs GX command 0x05 (bit 24 set when `queued`) with three address/size
+ * pairs and hands the 8-word packet to GSPGPU_SubmitGxCommand, returning its Result. */
+__attribute__((always_inline))
+static INLINE Result ctrGuSetCommandList_First(bool queued, u32* buf0a, u32 buf0s, u32* buf1a, u32 buf1s, u32* buf2a, u32 buf2s)
+{
+   u32 packet[8] = {
+      0x05 | (queued ? 0x01000000 : 0x0), /* command ID */
+      (u32)buf0a, (u32)buf0s,             /* buf0 address, size */
+      (u32)buf1a, (u32)buf1s,             /* buf1 address, size */
+      (u32)buf2a, (u32)buf2s,             /* buf2 address, size */
+      0x0                                 /* last word is always zero */
+   };
+
+   return GSPGPU_SubmitGxCommand(gxCmdBuf, packet, NULL);
+}
+
+/* Packs GX command 0x01 (bit 24 set when `queued`) for one buffer and submits it; bit 0 and bit 1 of `flags` fill words 3 and 7. */
+__attribute__((always_inline))
+static INLINE Result ctrGuSetCommandList_Last(bool queued, u32* buf0a, u32 buf0s, u8 flags)
+{
+   u32 packet[8] = {
+      0x01 | (queued ? 0x01000000 : 0x0), /* command ID */
+      (u32)buf0a, (u32)buf0s,             /* buf0 address, size */
+      flags & 1,                          /* written to GSP module state */
+      0x0, 0x0, 0x0,                      /* words 4-6 are cleared */
+      (flags >> 1) & 1                    /* when non-zero, call svcFlushProcessDataCache() with the specified buffer */
+   };
+   return GSPGPU_SubmitGxCommand(gxCmdBuf, packet, NULL);
+}
+
+__attribute__((always_inline))
+static INLINE void ctrGuFlushAndRun(bool queued)
+{
+   //take advantage of ctrGuSetCommandList_First to flush gsp heap; sizes are gpuCmdBufOffset*4 (offset presumably counted in 32-bit words - TODO confirm)
+   ctrGuSetCommandList_First(queued, gpuCmdBuf, gpuCmdBufOffset*4, __linear_heap, __linear_heap_size, NULL, 0); //buf1 = whole linear heap, buf2 unused; Result is ignored
+   ctrGuSetCommandList_Last(queued, gpuCmdBuf, gpuCmdBufOffset*4, 0x0); //same command buffer again with flags = 0; Result is ignored
+}
+
+/* Submits GX command 0x02 (bit 24 set when `queued`) describing two fill regions (start address, fill value, end address, u16 width word each); returns the Result of GSPGPU_SubmitGxCommand. */
+__attribute__((always_inline))
+static INLINE Result ctrGuSetMemoryFill(bool queued, u32* buf0a, u32 buf0v, u32* buf0e, u16 width0, u32* buf1a, u32 buf1v, u32* buf1e, u16 width1)
+{
+   u32 gxCommand[0x8];
+   gxCommand[0]=0x02 | (queued? 0x01000000 : 0x0); //CommandID
+   gxCommand[1]=(u32)buf0a; //buf0 address
+   gxCommand[2]=buf0v; //buf0 value
+   gxCommand[3]=(u32)buf0e; //buf0 end addr
+   gxCommand[4]=(u32)buf1a; //buf1 address
+   gxCommand[5]=buf1v; //buf1 value
+   gxCommand[6]=(u32)buf1e; //buf1 end addr
+   gxCommand[7]=(u32)width0 | ((u32)width1 << 16); //widen first: a u16 promotes to signed int, so width1 >= 0x8000 would overflow the shift
+   return GSPGPU_SubmitGxCommand(gxCmdBuf, gxCommand, NULL);
+}
 
 __attribute__((always_inline))
 static INLINE Result ctrGuCopyImage
-      (void* src, int src_w, int src_h, int src_fmt, bool src_is_tiled,
-       void* dst, int dst_w,            int dst_fmt, bool dst_is_tiled)
+      (bool queued,
+       const void* src, int src_w, int src_h, int src_fmt, bool src_is_tiled,
+             void* dst, int dst_w,            int dst_fmt, bool dst_is_tiled)
 {
    u32 gxCommand[0x8];
-   gxCommand[0]=0x03; //CommandID
+   gxCommand[0]=0x03 | (queued? 0x01000000 : 0x0); //CommandID
    gxCommand[1]=(u32)src;
    gxCommand[2]=(u32)dst;
    gxCommand[3]=dst_w&0xFF8;
@@ -133,11 +164,12 @@ static INLINE Result ctrGuCopyImage
 
 __attribute__((always_inline))
 static INLINE Result ctrGuDisplayTransfer
-      (void* src, int src_w, int src_h, int src_fmt,
+     (bool queued,
+      void* src, int src_w, int src_h, int src_fmt,
       void* dst, int dst_w, int dst_h, int dst_fmt, int multisample_lvl)
 {
    u32 gxCommand[0x8];
-   gxCommand[0]=0x03; //CommandID
+   gxCommand[0]=0x03 | (queued? 0x01000000 : 0x0); //CommandID
    gxCommand[1]=(u32)src;
    gxCommand[2]=(u32)dst;
    gxCommand[3]=CTRGU_SIZE(dst_w, dst_h);