diff --git a/Makefile b/Makefile
index f5efeb0449..e4b82ce5fa 100644
--- a/Makefile
+++ b/Makefile
@@ -17,6 +17,10 @@ AR      = $(PREFIX)-gcc-ar
 CFLAGS  = -g -Wl,-q -O2 -ffast-math -mtune=cortex-a9 -mfpu=neon -flto -ftree-vectorize -DSTB_DXT_IMPLEMENTATION
 ASFLAGS = $(CFLAGS)
 
+ifeq ($(NO_DEBUG),1)
+CFLAGS  += -DSKIP_ERROR_HANDLING
+endif
+
 all: $(TARGET).a
 
 $(TARGET).a: $(OBJS)
diff --git a/source/custom_shaders.c b/source/custom_shaders.c
index d3d5beeb34..7ff792b2e4 100644
--- a/source/custom_shaders.c
+++ b/source/custom_shaders.c
@@ -357,8 +357,7 @@ void glUniformMatrix4fv(GLint location, GLsizei count, GLboolean transpose, cons
  * ------------------------------
  */
 
-// Equivalent of glBindAttribLocation but for sceGxm architecture
-void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type) {
+void vglBindPackedAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type, GLuint offset) {
 	// Grabbing passed program
 	program *p = &progs[prog - 1];
 	SceGxmVertexAttribute *attributes = &p->attr[index];
@@ -369,7 +368,7 @@ void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const
 
 	// Setting stream index and offset values
 	attributes->streamIndex = index;
-	attributes->offset = 0;
+	attributes->offset = offset;
 
 	// Detecting attribute format and size
 	int bpe;
@@ -396,7 +395,12 @@ void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const
 		p->attr_num = index + 1;
 }
 
-// Equivalent of glVertexAttribLocation but for sceGxm architecture
+// Equivalent of glBindAttribLocation but for sceGxm architecture; forwards to vglBindPackedAttribLocation with an attribute offset of 0
+void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type) {
+	vglBindPackedAttribLocation(prog, index, name, num, type, 0);
+}
+
+// Equivalent of glVertexAttribPointer but for sceGxm architecture
 void vglVertexAttribPointer(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, GLuint count, const GLvoid *pointer) {
 #ifndef SKIP_ERROR_HANDLING
 	// Error handling
diff --git a/source/textures.c b/source/textures.c
index 8533e38f0f..a16d58cfec 100644
--- a/source/textures.c
+++ b/source/textures.c
@@ -101,6 +101,7 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt
 
 	SceGxmTextureFormat tex_format;
 	uint8_t data_bpp = 0;
+	uint8_t fast_store = GL_FALSE;
 
 	// Support for legacy GL1.0 internalFormat
 	switch (internalFormat) {
@@ -157,7 +158,8 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt
 		switch (type) {
 		case GL_UNSIGNED_BYTE:
 			data_bpp = 3;
-			read_cb = readRGB;
+			if (internalFormat == GL_RGB) fast_store = GL_TRUE;
+			else read_cb = readRGB;
 			break;
 		default:
 			error = GL_INVALID_ENUM;
@@ -168,7 +170,8 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt
 		switch (type) {
 		case GL_UNSIGNED_BYTE:
 			data_bpp = 4;
-			read_cb = readRGBA;
+			if (internalFormat == GL_RGBA) fast_store = GL_TRUE;
+			else read_cb = readRGBA;
 			break;
 		case GL_UNSIGNED_SHORT_5_5_5_1:
 			data_bpp = 2;
@@ -236,7 +239,7 @@ void glTexImage2D(GLenum target, GLint level, GLint internalFormat, GLsizei widt
 		tex->type = internalFormat;
 		tex->write_cb = write_cb;
 		if (level == 0)
-			if (tex->write_cb) gpu_alloc_texture(width, height, tex_format, data, tex, data_bpp, read_cb, write_cb);
+			if (tex->write_cb) gpu_alloc_texture(width, height, tex_format, data, tex, data_bpp, read_cb, write_cb, fast_store);
 			else gpu_alloc_compressed_texture(width, height, tex_format, data, tex, data_bpp, read_cb);
 		else {
 			gpu_alloc_mipmaps(level, tex);
diff --git a/source/utils/gpu_utils.c b/source/utils/gpu_utils.c
index 17ff1fb82f..8497a8de7d 100644
--- a/source/utils/gpu_utils.c
+++ b/source/utils/gpu_utils.c
@@ -255,7 +255,7 @@ void gpu_free_texture(texture *tex) {
 	tex->valid = 0;
 }
 
-void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t)) {
+void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t), uint8_t fast_store) {
 	// If there's already a texture in passed texture object we first dealloc it
 	if (tex->valid)
 		gpu_free_texture(tex);
@@ -274,13 +274,22 @@ void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const
 			int i, j;
 			uint8_t *src = (uint8_t *)data;
 			uint8_t *dst;
-			for (i = 0; i < h; i++) {
-				dst = ((uint8_t *)texture_data) + (ALIGN(w, 8) * bpp) * i;
-				for (j = 0; j < w; j++) {
-					uint32_t clr = read_cb(src);
-					write_cb(dst, clr);
-					src += src_bpp;
-					dst += bpp;
+			if (fast_store) { // Internal format and data format match, so each row can be copied with memcpy instead of going through the per-pixel callbacks
+				uint32_t line_size = w * bpp;
+				for (i = 0; i < h; i++) {
+					dst = ((uint8_t *)texture_data) + (ALIGN(w, 8) * bpp) * i;
+					memcpy(dst, src, line_size);
+					src += line_size;
+				}
+			} else { // Internal and data formats differ, so fall back to the slower per-pixel read/write callbacks
+				for (i = 0; i < h; i++) {
+					dst = ((uint8_t *)texture_data) + (ALIGN(w, 8) * bpp) * i;
+					for (j = 0; j < w; j++) {
+						uint32_t clr = read_cb(src);
+						write_cb(dst, clr);
+						src += src_bpp;
+						dst += bpp;
+					}
 				}
 			}
 		} else
diff --git a/source/utils/gpu_utils.h b/source/utils/gpu_utils.h
index d5a93efc87..15deb02c5e 100644
--- a/source/utils/gpu_utils.h
+++ b/source/utils/gpu_utils.h
@@ -82,7 +82,7 @@ void gpu_pool_init(uint32_t temp_pool_size);
 int tex_format_to_bytespp(SceGxmTextureFormat format);
 
 // Alloc a texture
-void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t));
+void gpu_alloc_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *), void (*write_cb)(void *, uint32_t), uint8_t fast_store);
 
 // Alloc a compresseed texture
 void gpu_alloc_compressed_texture(uint32_t w, uint32_t h, SceGxmTextureFormat format, const void *data, texture *tex, uint8_t src_bpp, uint32_t (*read_cb)(void *));
diff --git a/source/vitaGL.h b/source/vitaGL.h
index 4cfe1d5721..6498d8dc5b 100644
--- a/source/vitaGL.h
+++ b/source/vitaGL.h
@@ -378,6 +378,7 @@ void vglVertexPointerMapped(const GLvoid *pointer);
 
 // VGL_EXT_gxp_shaders extension implementation
 void vglBindAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type);
+void vglBindPackedAttribLocation(GLuint prog, GLuint index, const GLchar *name, const GLuint num, const GLenum type, GLuint offset);
 void vglVertexAttribPointer(GLuint index, GLint size, GLenum type, GLboolean normalized, GLsizei stride, GLuint count, const GLvoid *pointer);
 void vglVertexAttribPointerMapped(GLuint index, const GLvoid *pointer);