From 059970e971984643f6b5e137b06510334c33f245 Mon Sep 17 00:00:00 2001 From: Sonicadvance1 Date: Wed, 7 Oct 2009 04:00:29 +0000 Subject: [PATCH] Fix XFB converting, silly mistake. Disable Texture converting ATM since it fails. And if can't read work group size, fall to 64, which is what my 8600GTS does. Tested XFB in Melee intro, got around 10FPS faster than with CPU side git-svn-id: https://dolphin-emu.googlecode.com/svn/trunk@4371 8ced0084-cf51-0410-be5f-012b33b47a6e --- .../VideoCommon/Src/OpenCL/TextureDecoder.cpp | 10 ++++++---- Source/Core/VideoCommon/Src/XFBConvert.cpp | 19 ++++++++++--------- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/Source/Core/VideoCommon/Src/OpenCL/TextureDecoder.cpp b/Source/Core/VideoCommon/Src/OpenCL/TextureDecoder.cpp index 322d5dea64..e463d9d1f0 100644 --- a/Source/Core/VideoCommon/Src/OpenCL/TextureDecoder.cpp +++ b/Source/Core/VideoCommon/Src/OpenCL/TextureDecoder.cpp @@ -134,6 +134,7 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei default: return PC_TEX_FMT_NONE; }*/ + return PC_TEX_FMT_NONE; switch(texformat) { case GX_TF_I8: @@ -143,13 +144,13 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei printf("width %d, height %d\n", width, height); // Create the input and output arrays in device memory for our calculation // - cl_mem _dst = clCreateBuffer(OpenCL::g_context, CL_MEM_WRITE_ONLY, TexDecoder_GetTextureSizeInBytes(width, height, texformat), NULL, NULL); + cl_mem _dst = clCreateBuffer(OpenCL::g_context, CL_MEM_WRITE_ONLY, sizeof(unsigned char) * width * height, NULL, NULL); if (!dst) { printf("Error: Failed to allocate device memory!\n"); exit(1); } - cl_mem _src = clCreateBuffer(OpenCL::g_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, TexDecoder_GetTextureSizeInBytes(width, height, texformat), (void*)src, NULL); + cl_mem _src = clCreateBuffer(OpenCL::g_context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(unsigned 
char) * width * height, (void*)src, NULL); if (!src) { printf("Error: Failed to allocate device memory!\n"); @@ -174,8 +175,9 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); - exit(1); + local = 64; } + // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // @@ -193,7 +195,7 @@ PC_TexFormat TexDecoder_Decode_OpenCL(u8 *dst, const u8 *src, int width, int hei // Read back the results from the device to verify the output // - err = clEnqueueReadBuffer( OpenCL::g_cmdq, _dst, CL_TRUE, 0, TexDecoder_GetTextureSizeInBytes(width, height, texformat), dst, 0, NULL, NULL ); + err = clEnqueueReadBuffer( OpenCL::g_cmdq, _dst, CL_TRUE, 0, sizeof(unsigned char) * width * height, dst, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); diff --git a/Source/Core/VideoCommon/Src/XFBConvert.cpp b/Source/Core/VideoCommon/Src/XFBConvert.cpp index fc6fcdd99c..b5cb4f5ab8 100644 --- a/Source/Core/VideoCommon/Src/XFBConvert.cpp +++ b/Source/Core/VideoCommon/Src/XFBConvert.cpp @@ -87,15 +87,14 @@ const char *__ConvertToXFB = "__kernel void ConvertToXFB(__global unsigned int { \n \ const unsigned char *src = _pEFB;\n \ int id = get_global_id(0);\n \ - src += id * 8; \n \ + int srcOffset = id * 8; \n \ \n \ - int y1 = (((16843 * src[0]) + (33030 * src[1]) + (6423 * src[2])) >> 16) + 16; \n \ - int u1 = ((-(9699 * src[0]) - (19071 * src[1]) + (28770 * src[2])) >> 16) + 128;\n \ - src += 4;\n \ + int y1 = (((16843 * src[srcOffset]) + (33030 * src[srcOffset + 1]) + (6423 * src[srcOffset + 2])) >> 16) + 16; \n \ + int u1 = ((-(9699 * src[srcOffset]) - (19071 * src[srcOffset + 1]) + (28770 * src[srcOffset + 2])) >> 16) + 128;\n \ + srcOffset += 4;\n \ \n \ - int y2 = (((16843 * src[0]) + (33030 * src[1]) + (6423 * src[2])) >> 16) + 16;\n \ - 
int v2 = (((28770 * src[0]) - (24117 * src[1]) - (4653 * src[2])) >> 16) + 128;\n \ - src += 4;\n \ + int y2 = (((16843 * src[srcOffset]) + (33030 * src[srcOffset + 1]) + (6423 * src[srcOffset + 2])) >> 16) + 16;\n \ + int v2 = (((28770 * src[srcOffset]) - (24117 * src[srcOffset + 1]) - (4653 * src[srcOffset + 2])) >> 16) + 128;\n \ \n \ dst[id] = (v2 << 24) | (y2 << 16) | (u1 << 8) | (y1); \n \ } \n "; @@ -178,8 +177,9 @@ void ConvertFromXFB(u32 *dst, const u8* _pXFB, int width, int height) if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); - exit(1); + local = 32; } + // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // @@ -283,8 +283,9 @@ void ConvertToXFB(u32 *dst, const u8* _pEFB, int width, int height) if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); - exit(1); + local = 64; } + // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device //