// Copyright (C) 2003 Dolphin Project.

// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation, version 2.0.

// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License 2.0 for more details.

// A copy of the GPL 2.0 should have been included with the program.
// If not, see http://www.gnu.org/licenses/

// Official SVN repository and contact information can be found at
// http://code.google.com/p/dolphin-emu/

// Fast image conversion using OpenGL shaders.
// This kind of stuff would be a LOT nicer with OpenCL.

#include <math.h>

#include "TextureConverter.h"
#include "TextureConversionShader.h"
#include "TextureCache.h"
#include "PixelShaderCache.h"
#include "ProgramShaderCache.h"
#include "VertexShaderManager.h"
#include "FramebufferManager.h"
#include "Globals.h"
#include "VideoConfig.h"
#include "ImageWrite.h"
#include "Render.h"
#include "FileUtil.h"
#include "HW/Memmap.h"

namespace OGL
{

namespace TextureConverter
{

using OGL::TextureCache;

// Framebuffer object generated in Init() and bound for every encode/decode pass.
static GLuint s_texConvFrameBuffer = 0;
// Rectangle texture that DecodeToTexture() uploads the source memory into.
static GLuint s_srcTexture = 0;			// for decoding from RAM
// Dimensions of the storage last allocated for s_srcTexture with glTexImage2D;
// DecodeToTexture() uses glTexSubImage2D while the requested size still matches.
static GLuint s_srcTextureWidth = 0;
static GLuint s_srcTextureHeight = 0;
// Renderbuffer attached as the color target by EncodeToRamUsingShader().
static GLuint s_dstRenderBuffer = 0;	// for encoding to RAM

// Storage size requested for s_dstRenderBuffer in Init().
const int renderBufferWidth = 1024;
const int renderBufferHeight = 1024;

// Fragment programs built by CreateRgbToYuyvProgram() / CreateYuyvToRgbProgram().
static FRAGMENTSHADER s_rgbToYuyvProgram;
static FRAGMENTSHADER s_yuyvToRgbProgram;

// Not all slots are taken - but who cares.
// Table of lazily compiled encoding programs, indexed by texture copy format
// (see GetOrCreateEncodingShader()).
const u32 NUM_ENCODING_PROGRAMS = 64;
static FRAGMENTSHADER s_encodingPrograms[NUM_ENCODING_PROGRAMS];

// Vertex buffer / vertex array pairs holding the quad drawn by the encode and
// decode passes respectively.
static GLuint s_encode_VBO = 0;
static GLuint s_encode_VAO = 0;
static GLuint s_decode_VBO = 0;
static GLuint s_decode_VAO = 0;
// Source rectangle whose quad is currently stored in s_encode_VBO; the buffer is
// only re-uploaded when a different rectangle is requested.
static TargetRectangle s_cached_sourceRc;
// Source dimensions whose quad is currently stored in s_decode_VBO.
static int s_cached_srcWidth = 0;
static int s_cached_srcHeight = 0;

// Compiles s_rgbToYuyvProgram, the fragment shader used by EncodeToRamYUYV().
// For each fragment it samples the texel at the incoming coordinate (c0) and the
// one a texel to the right (c1) and writes
//   (Y(c1), U(average), Y(c0), V(average)) + (0.0625, 0.5, 0.0625, 0.5).
// Two source variants exist; which one is used depends on whether the backend
// reports support for GLSL binding layout qualifiers.
void CreateRgbToYuyvProgram()
{
	// Output is BGRA because that is slightly faster than RGBA.
	const char *shaderSource;

	if (g_ActiveConfig.backend_info.bSupportsGLSLBinding)
	{
		// Sampler unit is fixed in the shader via layout(binding = 0).
		shaderSource =
			"#version 330 compatibility\n"
			"#extension GL_ARB_texture_rectangle : enable\n"
			"#extension GL_ARB_shading_language_420pack : enable\n"
			"layout(binding = 0) uniform sampler2DRect samp0;\n"
			"void main()\n"
			"{\n"
			"  vec2 uv1 = vec2(gl_TexCoord[0].x + 1.0f, gl_TexCoord[0].y);\n"
			"  vec3 c0 = texture2DRect(samp0, gl_TexCoord[0].xy).rgb;\n"
			"  vec3 c1 = texture2DRect(samp0, uv1).rgb;\n"
			"  vec3 y_const = vec3(0.257f,0.504f,0.098f);\n"
			"  vec3 u_const = vec3(-0.148f,-0.291f,0.439f);\n"
			"  vec3 v_const = vec3(0.439f,-0.368f,-0.071f);\n"
			"  vec4 const3 = vec4(0.0625f,0.5f,0.0625f,0.5f);\n"
			"  vec3 c01 = (c0 + c1) * 0.5f;\n"
			"  gl_FragData[0] = vec4(dot(c1,y_const),dot(c01,u_const),dot(c0,y_const),dot(c01, v_const)) + const3;\n"
			"}\n";
	}
	else
	{
		// Same shader body without the binding qualifier.
		shaderSource =
			"#version 120\n"
			"#extension GL_ARB_texture_rectangle : enable\n"
			"uniform sampler2DRect samp0;\n"
			"void main()\n"
			"{\n"
			"  vec2 uv1 = vec2(gl_TexCoord[0].x + 1.0f, gl_TexCoord[0].y);\n"
			"  vec3 c0 = texture2DRect(samp0, gl_TexCoord[0].xy).rgb;\n"
			"  vec3 c1 = texture2DRect(samp0, uv1).rgb;\n"
			"  vec3 y_const = vec3(0.257f,0.504f,0.098f);\n"
			"  vec3 u_const = vec3(-0.148f,-0.291f,0.439f);\n"
			"  vec3 v_const = vec3(0.439f,-0.368f,-0.071f);\n"
			"  vec4 const3 = vec4(0.0625f,0.5f,0.0625f,0.5f);\n"
			"  vec3 c01 = (c0 + c1) * 0.5f;\n"
			"  gl_FragData[0] = vec4(dot(c1,y_const),dot(c01,u_const),dot(c0,y_const),dot(c01, v_const)) + const3;\n"
			"}\n";
	}

	const bool compiled = PixelShaderCache::CompilePixelShader(s_rgbToYuyvProgram, shaderSource);
	if (!compiled)
	{
		ERROR_LOG(VIDEO, "Failed to create RGB to YUYV fragment program.");
	}
}

// Compiles s_yuyvToRgbProgram, the fragment shader used by DecodeToTexture().
// The shader fetches one packed texel, picks its luma from the .b or .r channel
// depending on the fractional part of the x coordinate, takes the chroma from
// .g (U) and .a (V), and writes the converted RGB colour with alpha 1.0.
// Two source variants exist; which one is used depends on whether the backend
// reports support for GLSL binding layout qualifiers.
void CreateYuyvToRgbProgram()
{
	const char *shaderSource;

	if (g_ActiveConfig.backend_info.bSupportsGLSLBinding)
	{
		// Sampler unit is fixed in the shader via layout(binding = 0).
		shaderSource =
			"#version 330 compatibility\n"
			"#extension GL_ARB_texture_rectangle : enable\n"
			"#extension GL_ARB_shading_language_420pack : enable\n"
			"layout(binding = 0) uniform sampler2DRect samp0;\n"
			"void main()\n"
			"{\n"
			"  vec4 c0 = texture2DRect(samp0, gl_TexCoord[0].xy).rgba;\n"
			"  float f = step(0.5, fract(gl_TexCoord[0].x));\n"
			"  float y = mix(c0.b, c0.r, f);\n"
			"  float yComp = 1.164f * (y - 0.0625f);\n"
			"  float uComp = c0.g - 0.5f;\n"
			"  float vComp = c0.a - 0.5f;\n"
			"  gl_FragData[0] = vec4(yComp + (1.596f * vComp),\n"
			"                 yComp - (0.813f * vComp) - (0.391f * uComp),\n"
			"                 yComp + (2.018f * uComp),\n"
			"                 1.0f);\n"
			"}\n";
	}
	else
	{
		// Same shader body without the binding qualifier; the rectangle
		// texture extension is required only when the compiler defines it.
		shaderSource =
			"#version 120\n"
			"#ifdef GL_ARB_texture_rectangle\n"
			"#extension GL_ARB_texture_rectangle : require\n"
			"#endif\n"
			"uniform sampler2DRect samp0;\n"
			"void main()\n"
			"{\n"
			"  vec4 c0 = texture2DRect(samp0, gl_TexCoord[0].xy).rgba;\n"
			"  float f = step(0.5, fract(gl_TexCoord[0].x));\n"
			"  float y = mix(c0.b, c0.r, f);\n"
			"  float yComp = 1.164f * (y - 0.0625f);\n"
			"  float uComp = c0.g - 0.5f;\n"
			"  float vComp = c0.a - 0.5f;\n"
			"  gl_FragData[0] = vec4(yComp + (1.596f * vComp),\n"
			"                 yComp - (0.813f * vComp) - (0.391f * uComp),\n"
			"                 yComp + (2.018f * uComp),\n"
			"                 1.0f);\n"
			"}\n";
	}

	const bool compiled = PixelShaderCache::CompilePixelShader(s_yuyvToRgbProgram, shaderSource);
	if (!compiled)
	{
		ERROR_LOG(VIDEO, "Failed to create YUYV to RGB fragment program.");
	}
}

// Returns the encoding fragment program for the given texture copy format,
// generating and compiling it on first use and caching it in s_encodingPrograms.
//
// format - texture copy format; used directly as the index into the cache table.
//
// For an out-of-range format a panic alert is raised and slot 0 is returned.
// If compilation fails the returned entry still has glprogid == 0, which the
// caller in this file checks for.
FRAGMENTSHADER &GetOrCreateEncodingShader(u32 format)
{
	// Valid indices are 0 .. NUM_ENCODING_PROGRAMS - 1. The comparison must be
	// '>=': with '>' a format equal to NUM_ENCODING_PROGRAMS would index one
	// element past the end of s_encodingPrograms.
	if (format >= NUM_ENCODING_PROGRAMS)
	{
		PanicAlert("Unknown texture copy format: 0x%x\n", format);
		return s_encodingPrograms[0];
	}

	// A program id of 0 marks a slot that has not been compiled yet.
	if (s_encodingPrograms[format].glprogid == 0)
	{
		const char* shader = TextureConversionShader::GenerateEncodingShader(format, API_GLSL);

#if defined(_DEBUG) || defined(DEBUGFAST)
		// Optionally dump the generated source to the user's dump directory,
		// numbering the files in generation order.
		if (g_ActiveConfig.iLog & CONF_SAVESHADERS && shader)
		{
			static int counter = 0;
			char szTemp[MAX_PATH];
			sprintf(szTemp, "%senc_%04i.txt", File::GetUserPath(D_DUMP_IDX).c_str(), counter++);

			SaveData(szTemp, shader);
		}
#endif

		if (!PixelShaderCache::CompilePixelShader(s_encodingPrograms[format], shader)) {
			ERROR_LOG(VIDEO, "Failed to create encoding fragment program");
		}
	}
	return s_encodingPrograms[format];
}

void Init()
{
	glGenFramebuffersEXT(1, &s_texConvFrameBuffer);
	
	glGenBuffers(1, &s_encode_VBO );
	glGenVertexArrays(1, &s_encode_VAO );
	glBindBuffer(GL_ARRAY_BUFFER, s_encode_VBO );
	glBindVertexArray( s_encode_VAO );
	glEnableClientState(GL_VERTEX_ARRAY);
	glVertexPointer(2, GL_FLOAT, 4*sizeof(GLfloat), NULL);
	glClientActiveTexture(GL_TEXTURE0);
	glEnableClientState(GL_TEXTURE_COORD_ARRAY);
	glTexCoordPointer(2, GL_FLOAT, 4*sizeof(GLfloat), (GLfloat*)NULL + 2);
	s_cached_sourceRc.top = -1;
	s_cached_sourceRc.bottom = -1;
	s_cached_sourceRc.left = -1;
	s_cached_sourceRc.right = -1;
	
	glGenBuffers(1, &s_decode_VBO );
	glGenVertexArrays(1, &s_decode_VAO );
	glBindBuffer(GL_ARRAY_BUFFER, s_decode_VBO );
	glBindVertexArray( s_decode_VAO );
	s_cached_srcWidth = -1;
	s_cached_srcHeight = -1;
	glEnableClientState(GL_VERTEX_ARRAY);
	glVertexPointer(2, GL_FLOAT, sizeof(GLfloat)*4, NULL);
	glClientActiveTexture(GL_TEXTURE0);
	glEnableClientState(GL_TEXTURE_COORD_ARRAY);
	glTexCoordPointer(2, GL_FLOAT, sizeof(GLfloat)*4, (GLfloat*)NULL+2);
	
	// TODO: this after merging with graphic_update
	glBindBuffer(GL_ARRAY_BUFFER, 0);
	glBindVertexArray(0);

	glGenRenderbuffersEXT(1, &s_dstRenderBuffer);
	glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, s_dstRenderBuffer);
	
	glRenderbufferStorageEXT(GL_RENDERBUFFER_EXT, GL_RGBA, renderBufferWidth, renderBufferHeight);

	s_srcTextureWidth = 0;
	s_srcTextureHeight = 0;

	glGenTextures(1, &s_srcTexture);
	glBindTexture(GL_TEXTURE_RECTANGLE_ARB, s_srcTexture);
	glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
	glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);

	CreateRgbToYuyvProgram();
	CreateYuyvToRgbProgram();
}

// Releases the GL objects and shader programs created by Init() and by
// GetOrCreateEncodingShader(), then clears the texture, render buffer and
// framebuffer handles.
void Shutdown()
{
	// GL objects.
	glDeleteTextures(1, &s_srcTexture);
	glDeleteRenderbuffersEXT(1, &s_dstRenderBuffer);
	glDeleteFramebuffersEXT(1, &s_texConvFrameBuffer);
	glDeleteBuffers(1, &s_encode_VBO);
	glDeleteVertexArrays(1, &s_encode_VAO);
	glDeleteBuffers(1, &s_decode_VBO);
	glDeleteVertexArrays(1, &s_decode_VAO);

	// Fixed conversion programs.
	s_rgbToYuyvProgram.Destroy();
	s_yuyvToRgbProgram.Destroy();

	// Every slot of the encoding table, whether it was ever compiled or not.
	unsigned int slot = 0;
	while (slot < NUM_ENCODING_PROGRAMS)
	{
		s_encodingPrograms[slot].Destroy();
		slot++;
	}

	s_texConvFrameBuffer = 0;
	s_dstRenderBuffer = 0;
	s_srcTexture = 0;
}

// Draws a full-viewport quad textured with srcTexture into the converter's
// render buffer and reads the rendered pixels back into destAddr.
//
// srcTexture   - rectangle texture bound on texture unit 0 as the shader input.
// sourceRc     - texture coordinates assigned to the corners of the quad.
// destAddr     - destination of the glReadPixels readback (GL_BGRA, 4 bytes per pixel).
// dstWidth,
// dstHeight    - viewport size and readback size in pixels.
// readStride   - size in bytes of each chunk read back in the strided path below;
//                only used when toTexture is true.
// toTexture    - when true and readStride differs from the write stride taken
//                from bpmem, the result is read back chunk by chunk with
//                destAddr advancing by the write stride after each chunk.
// linearFilter - selects GL_LINEAR instead of GL_NEAREST sampling of srcTexture.
//
// This function does not select a shader program; the callers in this file do
// so before calling. The converter framebuffer is still bound on return.
void EncodeToRamUsingShader(GLuint srcTexture, const TargetRectangle& sourceRc,
					    u8* destAddr, int dstWidth, int dstHeight, int readStride,
						   	bool toTexture, bool linearFilter)
{


	// switch to texture converter frame buffer
	// attach render buffer as color destination
	FramebufferManager::SetFramebuffer(s_texConvFrameBuffer);

	glBindRenderbufferEXT(GL_RENDERBUFFER_EXT, s_dstRenderBuffer);
	glFramebufferRenderbufferEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, GL_RENDERBUFFER_EXT, s_dstRenderBuffer);
	GL_REPORT_ERRORD();

	// Only stage 0 is used by the conversion shaders.
	for (int i = 1; i < 8; ++i)
		TextureCache::DisableStage(i);

	// set source texture
	glActiveTexture(GL_TEXTURE0);
	glEnable(GL_TEXTURE_RECTANGLE_ARB);
	glBindTexture(GL_TEXTURE_RECTANGLE_ARB, srcTexture);

	if (linearFilter)
	{
		glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
		glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
	}
	else
	{
		glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
		glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
	}

	GL_REPORT_ERRORD();

	glViewport(0, 0, (GLsizei)dstWidth, (GLsizei)dstHeight);

	GL_REPORT_ERRORD();
	// The quad only depends on sourceRc, so the vertex buffer is re-uploaded
	// only when a different rectangle is requested.
	if(!(s_cached_sourceRc == sourceRc)) {
		// Interleaved as: position.xy, texcoord.xy
		GLfloat vertices[] = {
			-1.f, -1.f, 
			(float)sourceRc.left, (float)sourceRc.top,
			-1.f, 1.f,
			(float)sourceRc.left, (float)sourceRc.bottom,
			1.f, 1.f,
			(float)sourceRc.right, (float)sourceRc.bottom,
			1.f, -1.f,
			(float)sourceRc.right, (float)sourceRc.top
		};
		glBindBuffer(GL_ARRAY_BUFFER, s_encode_VBO );
		glBufferData(GL_ARRAY_BUFFER, 4*4*sizeof(GLfloat), vertices, GL_STREAM_DRAW);
		
		// TODO: this after merging with graphic_update
		glBindBuffer(GL_ARRAY_BUFFER, 0);
		
		s_cached_sourceRc = sourceRc;
	} 

	glBindVertexArray( s_encode_VAO );
	glDrawArrays(GL_TRIANGLE_FAN, 0, 4);
	
	// TODO: this after merging with graphic_update
	glBindVertexArray(0);
	
	GL_REPORT_ERRORD();

	// .. and then read back the results.
	// TODO: make this less slow.

	int writeStride = bpmem.copyMipMapStrideChannels * 32;

	if (writeStride != readStride && toTexture)
	{
		// writing to a texture of a different size

		// Number of rendered rows that make up one readStride-sized chunk
		// (4 bytes per pixel). A zero width, or a stride smaller than a single
		// row, yields no complete rows; dividing by that would be a division
		// by zero, so the readback is skipped in that case.
		int readHeight = 0;
		if (dstWidth > 0)
		{
			readHeight = readStride / dstWidth;
			readHeight /= 4; // 4 bytes per pixel
		}

		if (readHeight > 0)
		{
			int readStart = 0;
			int readLoops = dstHeight / readHeight;
			for (int i = 0; i < readLoops; i++)
			{
				glReadPixels(0, readStart, (GLsizei)dstWidth, (GLsizei)readHeight, GL_BGRA, GL_UNSIGNED_BYTE, destAddr);
				readStart += readHeight;
				destAddr += writeStride;
			}
		}
		else
		{
			ERROR_LOG(VIDEO, "Skipping strided readback: readStride=%d dstWidth=%d yields no complete rows", readStride, dstWidth);
		}
	}
	else
		glReadPixels(0, 0, (GLsizei)dstWidth, (GLsizei)dstHeight, GL_BGRA, GL_UNSIGNED_BYTE, destAddr);

	GL_REPORT_ERRORD();

}

// Encodes a region of source_texture into the texture copy format derived from
// the arguments and writes the result to emulated memory at 'address'.
//
// address         - destination address, resolved through Memory::GetPointer.
// source_texture  - GL texture sampled by the encoding shader.
// bFromZBuffer    - the copy comes from the depth buffer; selects the Z formats.
// bIsIntensityFmt - used when deciding whether the _GX_TF_CTF flag is added.
// copyfmt         - base copy format before the flag bits are applied.
// bScaleByHalf    - non-zero halves the copied width and height; it also enables
//                   linear filtering for copies that are not from the Z buffer.
// source          - EFB rectangle being copied.
//
// Returns the size in bytes reported by TexDecoder_GetTextureSizeInBytes for the
// copied area, or 0 when no encoding shader is available or the destination
// address does not resolve to a valid pointer.
int EncodeToRamFromTexture(u32 address,GLuint source_texture, bool bFromZBuffer, bool bIsIntensityFmt, u32 copyfmt, int bScaleByHalf, const EFBRectangle& source)
{
	u32 format = copyfmt;

	if (bFromZBuffer)
	{
		format |= _GX_TF_ZTF;
		if (copyfmt == 11)
			format = GX_TF_Z16;
		else if (format < GX_TF_Z8 || format > GX_TF_Z24X8)
			format |= _GX_TF_CTF;
	}
	else
		if (copyfmt > GX_TF_RGBA8 || (copyfmt < GX_TF_RGB565 && !bIsIntensityFmt))
			format |= _GX_TF_CTF;

	FRAGMENTSHADER& texconv_shader = GetOrCreateEncodingShader(format);
	if (texconv_shader.glprogid == 0)
		return 0;

	// The pointer is handed straight to glReadPixels further down, so an
	// unresolvable address must be rejected here (DecodeToTexture performs the
	// same check on its source address).
	u8 *dest_ptr = Memory::GetPointer(address);
	if (!dest_ptr)
	{
		WARN_LOG(VIDEO, "Tried to encode to invalid memory address");
		return 0;
	}

	int width = (source.right - source.left) >> bScaleByHalf;
	int height = (source.bottom - source.top) >> bScaleByHalf;

	int size_in_bytes = TexDecoder_GetTextureSizeInBytes(width, height, format);

	// Block dimensions minus one, used as masks for rounding up below.
	u16 blkW = TexDecoder_GetBlockWidthInTexels(format) - 1;
	u16 blkH = TexDecoder_GetBlockHeightInTexels(format) - 1;	
	u16 samples = TextureConversionShader::GetEncodedSampleCount(format);	

	// only copy on cache line boundaries
	// extra pixels are copied but not displayed in the resulting texture
	s32 expandedWidth = (width + blkW) & (~blkW);
	s32 expandedHeight = (height + blkH) & (~blkH);

	ProgramShaderCache::SetBothShaders(texconv_shader.glprogid, 0);
		
	float sampleStride = bScaleByHalf ? 2.f : 1.f;
	TextureConversionShader::SetShaderParameters((float)expandedWidth,
		(float)Renderer::EFBToScaledY(expandedHeight), // TODO: Why do we scale this?
		(float)Renderer::EFBToScaledX(source.left),
		(float)Renderer::EFBToScaledY(EFB_HEIGHT - source.top - expandedHeight),
		Renderer::EFBToScaledXf(sampleStride),
		Renderer::EFBToScaledYf(sampleStride));

	// Each rendered pixel carries 'samples' encoded texels, so the quad and the
	// readback are expandedWidth / samples pixels wide.
	TargetRectangle scaledSource;
	scaledSource.top = 0;
	scaledSource.bottom = expandedHeight;
	scaledSource.left = 0;
	scaledSource.right = expandedWidth / samples;
	// Bytes per block: 32, or 64 when the low nibble of the format is 6.
	int cacheBytes = 32;
	if ((format & 0x0f) == 6)
		cacheBytes = 64;

	int readStride = (expandedWidth * cacheBytes) /
		TexDecoder_GetBlockWidthInTexels(format);
	EncodeToRamUsingShader(source_texture, scaledSource,
		dest_ptr, expandedWidth / samples, expandedHeight, readStride,
		true, bScaleByHalf > 0 && !bFromZBuffer);
	return size_in_bytes; // TODO: D3D11 is calculating this value differently!

}

// Converts the sourceRc region of srcTexture with the RGB-to-YUYV program and
// reads the result back into destAddr. The renderer's API state is reset before
// the pass and restored afterwards.
void EncodeToRamYUYV(GLuint srcTexture, const TargetRectangle& sourceRc, u8* destAddr, int dstWidth, int dstHeight)
{
	g_renderer->ResetAPIState();

	ProgramShaderCache::SetBothShaders(s_rgbToYuyvProgram.glprogid, 0);

	// The shader samples two source texels per output pixel, so the pass
	// renders and reads back half as many pixels per row.
	const int packedWidth = dstWidth / 2;
	// The stride argument is only consulted for texture copies (toTexture).
	const int unusedReadStride = 0;
	const bool toTexture = false;
	const bool linearFilter = false;
	EncodeToRamUsingShader(srcTexture, sourceRc, destAddr, packedWidth, dstHeight,
		unusedReadStride, toTexture, linearFilter);

	// Undo the bindings made by the encode pass.
	FramebufferManager::SetFramebuffer(0);
	VertexShaderManager::SetViewportChanged();
	glBindTexture(GL_TEXTURE_RECTANGLE_ARB, 0);
	TextureCache::DisableStage(0);

	g_renderer->RestoreAPIState();
	GL_REPORT_ERRORD();
}


// Should be scale free.
// Uploads the memory at xfbAddr into the converter's source texture and renders
// it through the YUYV-to-RGB program into destTexture.
//
// xfbAddr     - source address, resolved through Memory::GetPointer; the call
//               logs a warning and does nothing if it does not resolve.
// srcWidth,
// srcHeight   - size of the decoded image in pixels (also the viewport size).
// destTexture - rectangle texture attached as the color target of the pass.
void DecodeToTexture(u32 xfbAddr, int srcWidth, int srcHeight, GLuint destTexture)
{
	u8* const xfbData = Memory::GetPointer(xfbAddr);
	if (xfbData == NULL)
	{
		WARN_LOG(VIDEO, "Tried to decode from invalid memory address");
		return;
	}

	// The source is uploaded as 4-byte texels at half the image width.
	const int packedWidth = srcWidth / 2;

	g_renderer->ResetAPIState(); // reset any game specific settings

	// switch to texture converter frame buffer
	// attach destTexture as color destination
	FramebufferManager::SetFramebuffer(s_texConvFrameBuffer);
	glBindTexture(GL_TEXTURE_RECTANGLE_ARB, destTexture);
	glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, GL_TEXTURE_RECTANGLE_ARB, destTexture, 0);

	GL_REPORT_FBO_ERROR();

	// Only stage 0 is used by the conversion shader.
	for (int stage = 1; stage < 8; ++stage)
		TextureCache::DisableStage(stage);

	// activate source texture
	// set the source memory as data for source texture
	glActiveTexture(GL_TEXTURE0);
	glEnable(GL_TEXTURE_RECTANGLE_ARB);
	glBindTexture(GL_TEXTURE_RECTANGLE_ARB, s_srcTexture);

	// TODO: make this less slow.  (How?)
	const bool storageMismatch =
		(GLsizei)s_srcTextureWidth != (GLsizei)packedWidth ||
		(GLsizei)s_srcTextureHeight != (GLsizei)srcHeight;
	if (storageMismatch)
	{
		// (Re)allocate the texture storage at the new size and remember it.
		glTexImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, GL_RGBA8, (GLsizei)packedWidth, (GLsizei)srcHeight,
				0, GL_BGRA, GL_UNSIGNED_BYTE, xfbData);
		s_srcTextureWidth = (GLsizei)packedWidth;
		s_srcTextureHeight = (GLsizei)srcHeight;
	}
	else
	{
		// Storage already has the right size; just replace its contents.
		glTexSubImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, 0, 0, s_srcTextureWidth, s_srcTextureHeight,
				GL_BGRA, GL_UNSIGNED_BYTE, xfbData);
	}

	glViewport(0, 0, srcWidth, srcHeight);
	ProgramShaderCache::SetBothShaders(s_yuyvToRgbProgram.glprogid, 0);

	GL_REPORT_ERRORD();

	// The quad only depends on the source size, so the vertex buffer is
	// re-uploaded only when the size differs from the cached one.
	const bool quadUpToDate = (s_cached_srcWidth == srcWidth) && (s_cached_srcHeight == srcHeight);
	if (!quadUpToDate)
	{
		const GLfloat texRight = (float)packedWidth;
		const GLfloat texBottom = (float)srcHeight;

		// Interleaved as: position.xy, texcoord.xy
		GLfloat vertices[] = {
			 1.f, -1.f,   texRight, texBottom,
			 1.f,  1.f,   texRight, 0.f,
			-1.f,  1.f,   0.f,      0.f,
			-1.f, -1.f,   0.f,      texBottom
		};

		glBindBuffer(GL_ARRAY_BUFFER, s_decode_VBO);
		glBufferData(GL_ARRAY_BUFFER, sizeof(vertices), vertices, GL_STREAM_DRAW);

		// TODO: this after merging with graphic_update
		glBindBuffer(GL_ARRAY_BUFFER, 0);

		s_cached_srcWidth = srcWidth;
		s_cached_srcHeight = srcHeight;
	}

	glBindVertexArray(s_decode_VAO);
	glDrawArrays(GL_TRIANGLE_FAN, 0, 4);

	// TODO: this after merging with graphic_update
	glBindVertexArray(0);

	GL_REPORT_ERRORD();

	// reset state
	glBindTexture(GL_TEXTURE_RECTANGLE_ARB, 0);
	glFramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT, GL_COLOR_ATTACHMENT0_EXT, GL_TEXTURE_RECTANGLE_ARB, 0, 0);
	TextureCache::DisableStage(0);

	VertexShaderManager::SetViewportChanged();

	FramebufferManager::SetFramebuffer(0);

	g_renderer->RestoreAPIState();
	GL_REPORT_ERRORD();
}

}  // namespace

}  // namespace OGL